Merge pull request #1 from KasparasKralikas/duplicate-pairs-similarity

Duplicate Bug Report Detection Model
This commit is contained in:
KasparasKralikas 2020-06-17 18:47:12 +03:00 committed by GitHub
Parents 1375eeeae6 468c32b890
Commit 6a7e5fafe2
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 315 additions and 56 deletions

1
.gitignore Vendored
View file

@@ -129,3 +129,4 @@ training_dataset.csv
vecs.tsv
meta.tsv
models
datasets

bug_model.py
View file

@@ -1,24 +1,77 @@
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import math
import IPython.display as display


class BugModel:

    model = None
    history = None
    modelPath = 'models/bug_model'
    learning_rate = 0.0008
    steady_epochs = 3
    decay_rate = 0.75

    def constructModel(self, vocab_size, embedding_dim, max_length):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    def construct_model(self, vocab_size, embedding_dim, max_length, dropout, embedding_matrix):
        # Siamese architecture: both inputs share the same frozen embedding and the same bi-LSTM encoder.
        input1 = tf.keras.Input(shape=(max_length,))
        input2 = tf.keras.Input(shape=(max_length,))
        embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix))
        embedding.trainable = False
        embedding1 = embedding(input1)
        embedding2 = embedding(input2)
        bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim))
        vector1 = bi_lstm(embedding1)
        vector1 = tf.keras.layers.Flatten()(vector1)
        vector2 = bi_lstm(embedding2)
        vector2 = tf.keras.layers.Flatten()(vector2)
        # Similarity features: squared element-wise difference, difference of the squared vectors,
        # and cosine distance between the squared vectors.
        x3 = tf.keras.layers.Subtract()([vector1, vector2])
        x3 = tf.keras.layers.Multiply()([x3, x3])
        x1 = tf.keras.layers.Multiply()([vector1, vector1])
        x2 = tf.keras.layers.Multiply()([vector2, vector2])
        x4 = tf.keras.layers.Subtract()([x1, x2])
        x5 = tf.keras.layers.Lambda(self.cosine_distance, output_shape=self.cos_dist_output_shape)([x1, x2])
        x = tf.keras.layers.Concatenate(axis=-1)([x5, x4, x3])
        x = tf.keras.layers.Dense(embedding_dim * 2, activation='relu')(x)
        x = tf.keras.layers.Dropout(rate=dropout)(x)
        # x = tf.keras.layers.Dense(embedding_dim * 2, activation='relu')(x)
        # x = tf.keras.layers.Dropout(rate=dropout)(x)
        pred = tf.keras.layers.Dense(1, activation='sigmoid')(x)
        self.model = tf.keras.Model(inputs=[input1, input2], outputs=pred)
        metrics = ['accuracy', tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.AUC(name='AUC')]
        self.model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), metrics=metrics)
        print(self.model.summary())

    def fit_model(self, training_data, training_labels, testing_data, testing_labels, num_epochs):
        self.history = self.model.fit(training_data, training_labels, epochs=num_epochs, validation_data=(testing_data, testing_labels), verbose=2)

    # def lr_decay(self, epoch):
    #     return self.learning_rate * math.pow(self.decay_rate, epoch)

    def lr_decay(self, epoch):
        # Keep the base learning rate for the first steady_epochs epochs, then decay it exponentially.
        if epoch < self.steady_epochs:
            return self.learning_rate
        else:
            return self.learning_rate * tf.math.exp(self.decay_rate * (self.steady_epochs - epoch))

    def fit_model(self, training_data, training_labels, testing_data, testing_labels, num_epochs, class_weight):
        lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(self.lr_decay, verbose=True)
        self.plot_learning_rate(self.lr_decay, num_epochs)
        self.history = self.model.fit(training_data, training_labels, epochs=num_epochs, validation_data=(testing_data, testing_labels), verbose=2, class_weight=class_weight, callbacks=[lr_decay_callback])

    def predict(self, data):
        return self.model.predict(data)
@@ -27,18 +80,39 @@ class BugModel:
        self.model.save(self.modelPath)

    def load_model(self):
        self.model = tf.keras.models.load_model(self.modelPath)
        self.model = tf.keras.models.load_model(self.modelPath, custom_objects={'cosine_distance': self.cosine_distance, 'cos_dist_output_shape': self.cos_dist_output_shape})
        self.model.summary()

    def plot_graph(self, string):
        plt.plot(self.history.history[string])
        plt.plot(self.history.history['val_'+string])
        plt.xlabel("Epochs")
        plt.xlabel('Epochs')
        plt.ylabel(string)
        plt.legend([string, 'val_'+string])
        plt.show()

    def plot_graphs(self):
        self.plot_graph("accuracy")
        self.plot_graph("loss")
        self.plot_graph('accuracy')
        self.plot_graph('recall')
        self.plot_graph('precision')
        self.plot_graph('AUC')
        self.plot_graph('loss')

    def plot_learning_rate(self, lr_func, epochs):
        # Plot the learning-rate schedule across epochs before training starts.
        xx = np.arange(epochs + 1, dtype=float)
        y = [lr_func(x) for x in xx]
        plt.xlabel('Epochs')
        plt.ylabel('Learning Rate')
        plt.plot(xx, y)
        plt.show()

    def cosine_distance(self, vests):
        # Negative mean of the element-wise product of the L2-normalised inputs
        # (a scaled negative cosine similarity).
        x, y = vests
        x = tf.keras.backend.l2_normalize(x, axis=-1)
        y = tf.keras.backend.l2_normalize(y, axis=-1)
        return -tf.keras.backend.mean(x * y, axis=-1, keepdims=True)

    def cos_dist_output_shape(self, shapes):
        shape1, shape2 = shapes
        return (shape1[0], 1)
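
A minimal sketch (not part of this PR) of exercising the two-input model above with made-up dimensions and random data, just to illustrate the siamese interface; in the real pipeline the sizes and the embedding matrix come from the tokenizer and the custom word2vec vectors:

import numpy as np
from bug_model import BugModel

# Hypothetical toy sizes, used only for illustration.
vocab_size, embedding_dim, max_length = 1000, 50, 100
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

model = BugModel()
model.construct_model(vocab_size, embedding_dim, max_length, dropout=0.2, embedding_matrix=embedding_matrix)

# One padded token-id sequence per side of the pair, batch of 8.
left = np.random.randint(1, vocab_size, size=(8, max_length))
right = np.random.randint(1, vocab_size, size=(8, max_length))
print(model.predict([left, right]).shape)  # (8, 1): one duplicate probability per pair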

161
bug_model_client.py Normal file
View file

@@ -0,0 +1,161 @@
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from text_preprocessor import clean_text, text_to_padded
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import io
import pickle
from sklearn.model_selection import train_test_split
from bug_model import BugModel


class BugModelClient:

    oov_token = '<OOV>'
    vocab_size = None
    embedding_dim = 50
    training_portion = 0.8
    max_length = 100
    num_epochs = 8
    dropout = 0.2
    class_weight = {0: 1, 1: 2}
    data_path = 'datasets/training_dataset_pairs.csv'
    tokenizer_path = 'models/tokenizer.pickle'
    custom_glove_path = 'datasets/custom_glove_50d.txt'
    data = None
    training_size = None
    word_index = None
    tokenizer = None
    embedding_matrix = None
    bug_model = BugModel()

    def init_data(self, data_count):
        self.data = pd.read_csv(self.data_path, sep=',')
        self.data = self.data[:data_count]
        print(len(self.data.index))
        self.data['clean_description_1'] = self.clean_descriptions(self.data['description_1'])
        self.data['clean_description_2'] = self.clean_descriptions(self.data['description_2'])
        self.training_size = int(len(self.data.index) * self.training_portion)
        X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(self.data['clean_description_1'], self.data['clean_description_2'], self.data['duplicates'], test_size=0.2)
        # Fit the tokenizer on the training descriptions of both sides of the pair.
        self.tokenizer = Tokenizer(oov_token=self.oov_token)
        self.tokenizer.fit_on_texts(X1_train)
        self.tokenizer.fit_on_texts(X2_train)
        self.word_index = self.tokenizer.word_index
        print(len(self.word_index))
        self.vocab_size = len(self.word_index) + 1
        X1_train = np.array(text_to_padded(X1_train, self.tokenizer, self.max_length))
        X1_test = np.array(text_to_padded(X1_test, self.tokenizer, self.max_length))
        X2_train = np.array(text_to_padded(X2_train, self.tokenizer, self.max_length))
        X2_test = np.array(text_to_padded(X2_test, self.tokenizer, self.max_length))
        self.X1_train = X1_train
        self.X1_test = X1_test
        self.X2_train = X2_train
        self.X2_test = X2_test
        self.y_train = y_train
        self.y_test = y_test

    def prepare_embedding(self):
        # Build the embedding matrix from the custom word2vec vectors (word2vec text format).
        embeddings_index = dict()
        f = open(self.custom_glove_path, encoding='utf8')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        print('Loaded %s word vectors.' % len(embeddings_index))
        embeddings_matrix = np.zeros((self.vocab_size, self.embedding_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embeddings_matrix[i] = embedding_vector
        self.embedding_matrix = embeddings_matrix

    def save_tokenizer(self):
        with open(self.tokenizer_path, 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load_tokenizer(self):
        with open(self.tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index) + 1
        print('Loaded tokenizer with %s words.' % self.vocab_size)

    def clean_descriptions(self, descriptions):
        clean_descriptions = descriptions.apply(lambda x: clean_text(x))
        return clean_descriptions

    def train_model(self):
        self.bug_model.construct_model(self.vocab_size, self.embedding_dim, self.max_length, self.dropout, self.embedding_matrix)
        self.bug_model.fit_model([self.X1_train, self.X2_train], self.y_train, [self.X1_test, self.X2_test], self.y_test, self.num_epochs, self.class_weight)

    def plot_graphs(self):
        self.bug_model.plot_graphs()

    def save_model(self):
        self.bug_model.save_model()
        self.save_tokenizer()

    def load_model(self):
        self.bug_model.load_model()
        self.load_tokenizer()

    def predict(self, descriptions1, descriptions2):
        descriptions1 = np.array(text_to_padded(self.clean_descriptions(descriptions1), self.tokenizer, self.max_length))
        descriptions2 = np.array(text_to_padded(self.clean_descriptions(descriptions2), self.tokenizer, self.max_length))
        return self.bug_model.predict([descriptions1, descriptions2])

    def validate_predict_top_k(self, descriptions, labels, master_labels, all_descriptions, all_labels, all_master_labels, k):
        # For every new report, score it against every known report and keep the k highest-scoring distinct master reports.
        descriptions = np.array(text_to_padded(self.clean_descriptions(descriptions), self.tokenizer, self.max_length))
        all_descriptions = np.array(text_to_padded(self.clean_descriptions(all_descriptions), self.tokenizer, self.max_length))
        print(labels)
        all_predictions = []
        for index, description in enumerate(descriptions):
            print(index)
            description_repeated = np.full((len(all_descriptions), self.max_length), description)
            predictions = self.bug_model.predict([description_repeated, all_descriptions])
            predictions = np.array([prediction[0] for prediction in predictions])
            predictions_top_indices = (-predictions).argsort()
            prediction_summary = []
            top_k_master_labels = []
            for pred_index in predictions_top_indices:
                if len(top_k_master_labels) >= k:
                    break
                if all_master_labels[pred_index] not in top_k_master_labels:
                    top_k_master_labels.append(all_master_labels[pred_index])
                    prediction_summary.append({'case_id': all_labels[pred_index], 'master_id': all_master_labels[pred_index], 'probability': predictions[pred_index]})
            # A report counts as correctly handled if its master is in the top k (for duplicates)
            # or absent from it (for reports that are their own master).
            did_predict = master_labels[index] in top_k_master_labels if master_labels[index] != labels[index] else master_labels[index] not in top_k_master_labels
            for n, pred_index in enumerate(predictions_top_indices):
                if all_master_labels[pred_index] == master_labels[index]:
                    print('Correct target for {} with id {} in position {} with probability of {}'.format(labels[index], all_labels[pred_index], n, predictions[pred_index]))
            all_predictions.append({
                'case_id': labels[index],
                'master_id': master_labels[index],
                'predictions': prediction_summary,
                'correct': did_predict
            })
        return {'predictions': all_predictions, 'recall': len([prediction for prediction in all_predictions if prediction['correct'] == True]) / len(all_predictions)}
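
A minimal inference sketch (not part of this PR) for the client above, assuming a model and tokenizer have already been trained and saved; the bug descriptions are invented:

import pandas as pd
from bug_model_client import BugModelClient

client = BugModelClient()
client.load_model()  # restores 'models/bug_model' and 'models/tokenizer.pickle'

# Invented descriptions; each aligned pair gets one duplicate probability.
left = pd.Series(['app crashes when saving a file', 'login button does nothing'])
right = pd.Series(['application crash on file save', 'search results are empty'])

for (a, b), prob in zip(zip(left, right), client.predict(left, right)):
    print(a, '<->', b, float(prob[0]))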

41
custom_embeddings.py Normal file
View file

@@ -0,0 +1,41 @@
import numpy as np
import pandas as pd
import os
import re
import time
from gensim.models import Word2Vec
from tqdm import tqdm
from text_preprocessor import clean_text
dataset_path = 'datasets/training_dataset_embeddings.csv'
custom_glove_path = 'datasets/custom_glove_50d.txt'
tqdm.pandas()
df = pd.read_csv(dataset_path)
df['clean_description'] = df['description'].apply(lambda x: clean_text(x))
descriptions = df['clean_description']
train_descriptions = list(descriptions.progress_apply(str.split).values)
start_time = time.time()
# Train skip-gram (sg=1) word2vec vectors; size=50 matches the model's embedding_dim.
model = Word2Vec(sentences=train_descriptions,
                 sg=1,
                 size=50,
                 workers=4)
print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')

print(len(model.wv.vocab.keys()))
print(model.wv.vector_size)

# Save in word2vec text format so the training client can load it into an embedding matrix.
model.wv.save_word2vec_format(custom_glove_path)
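
Since model.wv.save_word2vec_format writes a plain word2vec-format text file, the vectors could later be reloaded and inspected like this (a sketch, not part of this PR; 'crash' is just an assumed vocabulary word):

from gensim.models import KeyedVectors

# Reload the custom 50-dimensional vectors written above.
wv = KeyedVectors.load_word2vec_format('datasets/custom_glove_50d.txt')
print(wv.vector_size)            # 50
print(wv.most_similar('crash'))  # nearest neighbours, if 'crash' is in the vocabulary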

61
main.py
View file

@@ -2,55 +2,36 @@ from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from text_preprocessor import clean_text, text_to_padded
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
import io
from bug_model import BugModel
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
oov_token = '<OOV>'
vocab_size = 10000
embedding_dim = 16
training_portion = 0.8
max_length = 800
num_epochs = 30
from sklearn.model_selection import train_test_split
data_full = pd.read_csv('datasets/training_dataset.csv', sep=',')
from bug_model_client import BugModelClient
# smaller dataset for testing
data_full = data_full[:20000]
import time
data_full['cleaned_description'] = data_full['description'].apply(lambda x: clean_text(x))
training_size = int(len(data_full.index) * training_portion)
bug_model_client = BugModelClient()
bug_model_client.init_data(30000)
bug_model_client.prepare_embedding()
bug_model_client.train_model()
bug_model_client.save_model()
bug_model_client.plot_graphs()
bug_model_client.load_model()
training_descriptions = data_full['cleaned_description'][0:training_size]
training_labels = data_full['is_bug'][0:training_size]
testing_descriptions = data_full['cleaned_description'][training_size:]
testing_labels = data_full['is_bug'][training_size:]
all_bugs = pd.read_csv('datasets/bugs_dataset.csv', sep=',')
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_descriptions)
word_index = tokenizer.word_index
new_bugs = pd.read_csv('datasets/bugs_dataset_testing.csv', sep=',')[:500]
new_bugs.reset_index(inplace=True)
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
training_padded = text_to_padded(training_descriptions, tokenizer, max_length)
testing_padded = text_to_padded(testing_descriptions, tokenizer, max_length)
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)
bug_model = BugModel()
bug_model.constructModel(vocab_size, embedding_dim, max_length)
bug_model.fit_model(training_padded, training_labels, training_padded, training_labels, num_epochs)
bug_model.plot_graphs()
bug_model.save_model()
start = time.time()
bug_model_client.validate_predict_top_k(new_bugs['description'], new_bugs['case_id'], new_bugs['master_id_label'], all_bugs['description'], all_bugs['case_id'], all_bugs['master_id_label'], 20)
end = time.time()
print(end - start)
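
Since validate_predict_top_k returns a dict with the per-report prediction summaries and the overall recall, the call above could also capture its result, e.g. (a sketch, not part of this PR):

results = bug_model_client.validate_predict_top_k(
    new_bugs['description'], new_bugs['case_id'], new_bugs['master_id_label'],
    all_bugs['description'], all_bugs['case_id'], all_bugs['master_id_label'], 20)
print('top-20 recall:', results['recall'])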

text_preprocessor.py
View file

@@ -5,7 +5,7 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()
trunctating_type='post'
padding_type='post'
@@ -14,7 +14,8 @@ def clean_text(text):
    text = str(text).lower()
    text = ''.join([char for char in text if char not in punctuation])
    tokens = re.split(r'\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    # text = [wnl.lemmatize(word) for word in tokens if word not in stopwords]
    text = [word for word in tokens if word not in stopwords]
    return ' '.join(text)
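
# Illustration (not part of the diff): with stemming disabled, clean_text roughly maps a
# made-up description such as 'The App crashes when clicking the Save button!' to
# 'app crashes clicking save button' (lowercased, punctuation stripped, stopwords removed).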
def text_to_padded(text, tokenizer, max_length):