Merge pull request #1 from KasparasKralikas/duplicate-pairs-similarity
Duplicate Bug Report Detection Model
Commit
6a7e5fafe2
.gitignore
@@ -129,3 +129,4 @@ training_dataset.csv
vecs.tsv
meta.tsv
models
datasets
102 bug_model.py
@@ -1,24 +1,77 @@
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import math
import IPython.display as display


class BugModel:

    model = None
    history = None
    modelPath = 'models/bug_model'
    learning_rate = 0.0008
    steady_epochs = 3
    decay_rate = 0.75

    def constructModel(self, vocab_size, embedding_dim, max_length):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    def construct_model(self, vocab_size, embedding_dim, max_length, dropout, embedding_matrix):
        # Siamese architecture: both descriptions share the same (frozen) embedding and Bi-LSTM encoder.
        input1 = tf.keras.Input(shape=(max_length,))
        input2 = tf.keras.Input(shape=(max_length,))

        embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix))
        embedding.trainable = False

        embedding1 = embedding(input1)
        embedding2 = embedding(input2)

        bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim))

        vector1 = bi_lstm(embedding1)
        vector1 = tf.keras.layers.Flatten()(vector1)
        vector2 = bi_lstm(embedding2)
        vector2 = tf.keras.layers.Flatten()(vector2)

        # Similarity features: squared element-wise difference, difference of squares, cosine distance.
        x3 = tf.keras.layers.Subtract()([vector1, vector2])
        x3 = tf.keras.layers.Multiply()([x3, x3])

        x1 = tf.keras.layers.Multiply()([vector1, vector1])
        x2 = tf.keras.layers.Multiply()([vector2, vector2])

        x4 = tf.keras.layers.Subtract()([x1, x2])

        x5 = tf.keras.layers.Lambda(self.cosine_distance, output_shape=self.cos_dist_output_shape)([x1, x2])

        x = tf.keras.layers.Concatenate(axis=-1)([x5, x4, x3])

        x = tf.keras.layers.Dense(embedding_dim * 2, activation='relu')(x)
        x = tf.keras.layers.Dropout(rate=dropout)(x)

        #x = tf.keras.layers.Dense(embedding_dim * 2, activation='relu')(x)
        #x = tf.keras.layers.Dropout(rate=dropout)(x)

        pred = tf.keras.layers.Dense(1, activation='sigmoid')(x)

        self.model = tf.keras.Model(inputs=[input1, input2], outputs=pred)

        metrics = ['accuracy', tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.AUC(name='AUC')]
        self.model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), metrics=metrics)
        print(self.model.summary())

    def fit_model(self, training_data, training_labels, testing_data, testing_labels, num_epochs):
        self.history = self.model.fit(training_data, training_labels, epochs=num_epochs, validation_data=(testing_data, testing_labels), verbose=2)

    #def lr_decay(self, epoch):
    #    return self.learning_rate * math.pow(self.decay_rate, epoch)

    def lr_decay(self, epoch):
        # Hold the learning rate for the first steady_epochs epochs, then decay it exponentially.
        if epoch < self.steady_epochs:
            return self.learning_rate
        else:
            return self.learning_rate * tf.math.exp(self.decay_rate * (self.steady_epochs - epoch))

    def fit_model(self, training_data, training_labels, testing_data, testing_labels, num_epochs, class_weight):
        lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(self.lr_decay, verbose=True)
        self.plot_learning_rate(self.lr_decay, num_epochs)
        self.history = self.model.fit(training_data, training_labels, epochs=num_epochs, validation_data=(testing_data, testing_labels), verbose=2, class_weight=class_weight, callbacks=[lr_decay_callback])

    def predict(self, data):
        return self.model.predict(data)
@@ -27,18 +80,39 @@ class BugModel:
        self.model.save(self.modelPath)

    def load_model(self):
        self.model = tf.keras.models.load_model(self.modelPath)
        self.model = tf.keras.models.load_model(self.modelPath, custom_objects={'cosine_distance': self.cosine_distance, 'cos_dist_output_shape': self.cos_dist_output_shape})
        self.model.summary()

    def plot_graph(self, string):
        plt.plot(self.history.history[string])
        plt.plot(self.history.history['val_'+string])
        plt.xlabel("Epochs")
        plt.xlabel('Epochs')
        plt.ylabel(string)
        plt.legend([string, 'val_'+string])
        plt.show()

    def plot_graphs(self):
        self.plot_graph("accuracy")
        self.plot_graph("loss")
        self.plot_graph('accuracy')
        self.plot_graph('recall')
        self.plot_graph('precision')
        self.plot_graph('AUC')
        self.plot_graph('loss')

    def plot_learning_rate(self, lr_func, epochs):
        xx = np.arange(epochs+1, dtype=float)
        y = [lr_func(x) for x in xx]
        plt.xlabel('Epochs')
        plt.ylabel('Learning Rate')
        plt.plot(xx, y)
        plt.show()

    def cosine_distance(self, vests):
        # Negative mean product of the L2-normalized vectors, i.e. negative cosine similarity.
        x, y = vests
        x = tf.keras.backend.l2_normalize(x, axis=-1)
        y = tf.keras.backend.l2_normalize(y, axis=-1)
        return -tf.keras.backend.mean(x * y, axis=-1, keepdims=True)

    def cos_dist_output_shape(self, shapes):
        shape1, shape2 = shapes
        return (shape1[0], 1)
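For context, a minimal usage sketch of the new Siamese model (not part of the commit; the toy dimensions, random embedding matrix and random padded pairs are assumptions for illustration — the real inputs are built by bug_model_client.py below):

import numpy as np
from bug_model import BugModel

# Hypothetical toy dimensions; the real values come from the fitted tokenizer and the custom embedding vectors.
vocab_size, embedding_dim, max_length = 1000, 50, 100
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

# Random integer-encoded, padded description pairs and duplicate labels, just to exercise the API.
X1 = np.random.randint(0, vocab_size, size=(32, max_length))
X2 = np.random.randint(0, vocab_size, size=(32, max_length))
y = np.random.randint(0, 2, size=(32,))

bug_model = BugModel()
bug_model.construct_model(vocab_size, embedding_dim, max_length, dropout=0.2, embedding_matrix=embedding_matrix)
bug_model.fit_model([X1, X2], y, [X1, X2], y, num_epochs=2, class_weight={0: 1, 1: 2})
print(bug_model.predict([X1[:4], X2[:4]]))  # one duplicate probability per pair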
bug_model_client.py (new file)
@@ -0,0 +1,161 @@
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from text_preprocessor import clean_text, text_to_padded
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import io
import pickle
from sklearn.model_selection import train_test_split
from bug_model import BugModel


class BugModelClient:

    oov_token = '<OOV>'
    vocab_size = None
    embedding_dim = 50
    training_portion = 0.8
    max_length = 100
    num_epochs = 8
    dropout = 0.2

    class_weight = {0: 1, 1: 2}

    data_path = 'datasets/training_dataset_pairs.csv'
    tokenizer_path = 'models/tokenizer.pickle'
    custom_glove_path = 'datasets/custom_glove_50d.txt'

    data = None
    training_size = None
    word_index = None
    tokenizer = None
    embedding_matrix = None

    bug_model = BugModel()

    def init_data(self, data_count):
        self.data = pd.read_csv(self.data_path, sep=',')
        self.data = self.data[:data_count]
        print(len(self.data.index))
        self.data['clean_description_1'] = self.clean_descriptions(self.data['description_1'])
        self.data['clean_description_2'] = self.clean_descriptions(self.data['description_2'])
        self.training_size = int(len(self.data.index) * self.training_portion)

        X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(self.data['clean_description_1'], self.data['clean_description_2'], self.data['duplicates'], test_size=0.2)

        self.tokenizer = Tokenizer(oov_token=self.oov_token)
        self.tokenizer.fit_on_texts(X1_train)
        self.tokenizer.fit_on_texts(X2_train)
        self.word_index = self.tokenizer.word_index
        print(len(self.word_index))
        self.vocab_size = len(self.word_index) + 1

        X1_train = np.array(text_to_padded(X1_train, self.tokenizer, self.max_length))
        X1_test = np.array(text_to_padded(X1_test, self.tokenizer, self.max_length))
        X2_train = np.array(text_to_padded(X2_train, self.tokenizer, self.max_length))
        X2_test = np.array(text_to_padded(X2_test, self.tokenizer, self.max_length))

        self.X1_train = X1_train
        self.X1_test = X1_test
        self.X2_train = X2_train
        self.X2_test = X2_test
        self.y_train = y_train
        self.y_test = y_test

    def prepare_embedding(self):
        # Map every word in the tokenizer vocabulary to its pre-trained vector.
        embeddings_index = dict()
        f = open(self.custom_glove_path, encoding='utf8')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        print('Loaded %s word vectors.' % len(embeddings_index))
        embeddings_matrix = np.zeros((self.vocab_size, self.embedding_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embeddings_matrix[i] = embedding_vector
        self.embedding_matrix = embeddings_matrix

    def save_tokenizer(self):
        with open(self.tokenizer_path, 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load_tokenizer(self):
        with open(self.tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index) + 1
        print('Loaded tokenizer with %s words.' % self.vocab_size)

    def clean_descriptions(self, descriptions):
        clean_descriptions = descriptions.apply(lambda x: clean_text(x))
        return clean_descriptions

    def train_model(self):
        self.bug_model.construct_model(self.vocab_size, self.embedding_dim, self.max_length, self.dropout, self.embedding_matrix)
        self.bug_model.fit_model([self.X1_train, self.X2_train], self.y_train, [self.X1_test, self.X2_test], self.y_test, self.num_epochs, self.class_weight)

    def plot_graphs(self):
        self.bug_model.plot_graphs()

    def save_model(self):
        self.bug_model.save_model()
        self.save_tokenizer()

    def load_model(self):
        self.bug_model.load_model()
        self.load_tokenizer()

    def predict(self, descriptions1, descriptions2):
        descriptions1 = np.array(text_to_padded(self.clean_descriptions(descriptions1), self.tokenizer, self.max_length))
        descriptions2 = np.array(text_to_padded(self.clean_descriptions(descriptions2), self.tokenizer, self.max_length))
        return self.bug_model.predict([descriptions1, descriptions2])

    def validate_predict_top_k(self, descriptions, labels, master_labels, all_descriptions, all_labels, all_master_labels, k):
        # Rank every candidate report by predicted duplicate probability for each description,
        # keep the top k distinct master reports and record whether the prediction counts as correct.
        descriptions = np.array(text_to_padded(self.clean_descriptions(descriptions), self.tokenizer, self.max_length))
        all_descriptions = np.array(text_to_padded(self.clean_descriptions(all_descriptions), self.tokenizer, self.max_length))
        print(labels)
        all_predictions = []
        for index, description in enumerate(descriptions):
            print(index)
            description_repeated = np.full((len(all_descriptions), self.max_length), description)
            predictions = self.bug_model.predict([description_repeated, all_descriptions])
            predictions = np.array([prediction[0] for prediction in predictions])
            predictions_top_indices = (-predictions).argsort()
            prediction_summary = []
            top_k_master_labels = []
            for pred_index in predictions_top_indices:
                if len(top_k_master_labels) >= k:
                    break
                if all_master_labels[pred_index] not in top_k_master_labels:
                    top_k_master_labels.append(all_master_labels[pred_index])
                    prediction_summary.append({'case_id': all_labels[pred_index], 'master_id': all_master_labels[pred_index], 'probability': predictions[pred_index]})
            did_predict = master_labels[index] in top_k_master_labels if master_labels[index] != labels[index] else master_labels[index] not in top_k_master_labels
            for n, pred_index in enumerate(predictions_top_indices):
                if all_master_labels[pred_index] == master_labels[index]:
                    print('Correct target for {} with id {} in position {} with probability of {}'.format(labels[index], all_labels[pred_index], n, predictions[pred_index]))
            all_predictions.append({
                'case_id': labels[index],
                'master_id': master_labels[index],
                'predictions': prediction_summary,
                'correct': did_predict
            })
        return {'predictions': all_predictions, 'recall': len([prediction for prediction in all_predictions if prediction['correct'] == True]) / len(all_predictions)}
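A minimal sketch of how the client could be used for a one-off prediction once the model and tokenizer have been saved (the two descriptions are made-up examples):

import pandas as pd
from bug_model_client import BugModelClient

client = BugModelClient()
client.load_model()  # restores models/bug_model and models/tokenizer.pickle

# Hypothetical bug descriptions; predict() returns one duplicate probability per pair.
descriptions1 = pd.Series(['App crashes when saving a report'])
descriptions2 = pd.Series(['Saving a report causes the application to crash'])
print(client.predict(descriptions1, descriptions2))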
(new file)
@@ -0,0 +1,41 @@
import numpy as np
import pandas as pd
import os
import re
import time

from gensim.models import Word2Vec
from tqdm import tqdm

from text_preprocessor import clean_text

dataset_path = 'datasets/training_dataset_embeddings.csv'
custom_glove_path = 'datasets/custom_glove_50d.txt'

tqdm.pandas()

df = pd.read_csv(dataset_path)

df['clean_description'] = df['description'].apply(lambda x: clean_text(x))

descriptions = df['clean_description']

train_descriptions = list(descriptions.progress_apply(str.split).values)

start_time = time.time()

# Train skip-gram Word2Vec vectors (gensim 3.x API) on the tokenized descriptions.
model = Word2Vec(sentences=train_descriptions,
                 sg=1,
                 size=50,
                 workers=4)

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')

print(len(model.wv.vocab.keys()))

print(model.wv.vector_size)

model.wv.save_word2vec_format(custom_glove_path)
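An optional sanity check (gensim 3.x API; the query word is hypothetical) that could be appended to the script to eyeball the learned vectors:

# Nearest neighbours of a frequent word give a quick feel for embedding quality.
query = 'crash'  # assumption: any word known to be in the vocabulary works here
if query in model.wv.vocab:
    print(model.wv.most_similar(query, topn=10))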
61 main.py
@@ -2,55 +2,36 @@ from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from text_preprocessor import clean_text, text_to_padded
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
import io
from bug_model import BugModel
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

oov_token = '<OOV>'
vocab_size = 10000
embedding_dim = 16
training_portion = 0.8
max_length = 800
num_epochs = 30
from sklearn.model_selection import train_test_split

data_full = pd.read_csv('datasets/training_dataset.csv', sep=',')
from bug_model_client import BugModelClient

# smaller dataset for testing
data_full = data_full[:20000]
import time

data_full['cleaned_description'] = data_full['description'].apply(lambda x: clean_text(x))

training_size = int(len(data_full.index) * training_portion)
bug_model_client = BugModelClient()
bug_model_client.init_data(30000)
bug_model_client.prepare_embedding()
bug_model_client.train_model()
bug_model_client.save_model()
bug_model_client.plot_graphs()
bug_model_client.load_model()

training_descriptions = data_full['cleaned_description'][0:training_size]
training_labels = data_full['is_bug'][0:training_size]
testing_descriptions = data_full['cleaned_description'][training_size:]
testing_labels = data_full['is_bug'][training_size:]
all_bugs = pd.read_csv('datasets/bugs_dataset.csv', sep=',')

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_descriptions)
word_index = tokenizer.word_index
new_bugs = pd.read_csv('datasets/bugs_dataset_testing.csv', sep=',')[:500]
new_bugs.reset_index(inplace=True)

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

training_padded = text_to_padded(training_descriptions, tokenizer, max_length)
testing_padded = text_to_padded(testing_descriptions, tokenizer, max_length)

training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

bug_model = BugModel()

bug_model.constructModel(vocab_size, embedding_dim, max_length)

bug_model.fit_model(training_padded, training_labels, training_padded, training_labels, num_epochs)

bug_model.plot_graphs()

bug_model.save_model()
start = time.time()
bug_model_client.validate_predict_top_k(new_bugs['description'], new_bugs['case_id'], new_bugs['master_id_label'], all_bugs['description'], all_bugs['case_id'], all_bugs['master_id_label'], 20)
end = time.time()
print(end - start)
text_preprocessor.py
@@ -5,7 +5,7 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences

punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()

trunctating_type='post'
padding_type='post'

@@ -14,7 +14,8 @@ def clean_text(text):
    text = str(text).lower()
    text = ''.join([char for char in text if char not in punctuation])
    tokens = re.split(r'\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    #text = [wnl.lemmatize(word) for word in tokens if word not in stopwords]
    text = [word for word in tokens if word not in stopwords]
    return ' '.join(text)

def text_to_padded(text, tokenizer, max_length):
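With this change clean_text keeps unstemmed tokens: the text is lower-cased, punctuation is stripped and stop words are removed. A rough illustration (the exact output depends on the NLTK stop-word list):

from text_preprocessor import clean_text

print(clean_text('The app crashes when I click the Save button!!'))
# expected to be roughly: 'app crashes click save button'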