From 87cb3dce0d59d3d3496f6d60f4dcd6a624ad4321 Mon Sep 17 00:00:00 2001
From: Harshit chittora
Date: Mon, 10 Feb 2020 15:37:08 +0530
Subject: [PATCH] Duplicate/similarity detection using Elasticsearch (#1269)

---
 bugbug/similarity.py           | 69 ++++++++++++++++++++++++++++++++--
 extra-nlp-requirements.txt     |  1 +
 scripts/evaluate_similarity.py |  7 ++++
 scripts/similarity_query.py    | 33 ++++++++--------
 scripts/similarity_trainer.py  | 26 ++++++++-----
 5 files changed, 107 insertions(+), 29 deletions(-)

diff --git a/bugbug/similarity.py b/bugbug/similarity.py
index 628a0864..1fea6096 100644
--- a/bugbug/similarity.py
+++ b/bugbug/similarity.py
@@ -33,6 +33,8 @@ try:
     from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     from gensim.summarization.bm25 import BM25
     from gensim.corpora import Dictionary
+    from elasticsearch.helpers import bulk
+    from elasticsearch import Elasticsearch
     from nltk.corpus import stopwords
     from nltk.stem.porter import PorterStemmer
     from nltk.tokenize import word_tokenize
@@ -62,10 +64,15 @@ class BaseSimilarity(abc.ABC):
 
         self.nltk_tokenizer = nltk_tokenizer
 
-    def get_text(self, bug):
-        return "{} {}".format(bug["summary"], bug["comments"][0]["text"])
+    def get_text(self, bug, all_comments=False):
+        if all_comments:
+            comments = " ".join(comment["text"] for comment in bug["comments"])
+        else:
+            comments = bug["comments"][0]["text"]
 
-    def text_preprocess(self, text, lemmatization=False, join=False):
+        return "{} {}".format(bug["summary"], comments)
+
+    def text_preprocess(self, text, stemming=True, lemmatization=False, join=False):
         for func in self.cleanup_functions:
             text = func(text)
 
@@ -74,7 +81,7 @@
 
         if lemmatization:
             text = [word.lemma_ for word in nlp(text)]
-        else:
+        elif stemming:
             ps = PorterStemmer()
             tokenized_text = (
                 word_tokenize(text.lower())
@@ -86,6 +93,8 @@
                 for word in tokenized_text
                 if word not in set(stopwords.words("english")) and len(word) > 1
             ]
+        else:
+            text = text.split()
 
         if join:
             return " ".join(word for word in text)
@@ -626,6 +635,57 @@
         raise NotImplementedError
 
 
+class ElasticSearchSimilarity(BaseSimilarity):
+    def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
+        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
+        self.elastic_search = Elasticsearch()
+        assert (
+            self.elastic_search.ping()
+        ), "Check if Elastic Search Server is running by visiting http://localhost:9200"
+
+    def make_documents(self):
+        for bug in bugzilla.get_bugs():
+            yield {
+                "_index": "bugbug",
+                "_type": "_doc",
+                "bug_id": bug["id"],
+                "description": self.text_preprocess(
+                    self.get_text(bug, all_comments=True), stemming=False, join=True
+                ),
+            }
+
+    def index(self):
+        self.elastic_search.indices.delete(index="bugbug", ignore=[400, 404])
+        bulk(self.elastic_search, self.make_documents())
+
+    def get_similar_bugs(self, query):
+        find_similar = self.text_preprocess(
+            self.get_text(query, all_comments=True), stemming=False, join=True
+        )
+
+        es_query = {
+            "more_like_this": {
+                "fields": ["description"],
+                "like": find_similar,
+                "min_term_freq": 1,
+                "max_query_terms": 25,
+                "min_doc_freq": 2,
+            }
+        }
+
+        result = self.elastic_search.search(index="bugbug", body={"query": es_query})
+
+        top_similar = [
+            result["hits"]["hits"][i]["_source"]["bug_id"]
+            for i in range(len(result["hits"]["hits"]))
+            if result["hits"]["hits"][i]["_source"]["bug_id"] != query["id"]
+        ]
+        return top_similar
+
+    def get_distance(self, query1, query2):
+        raise NotImplementedError
+
+
 model_name_to_class = {
     "lsi": LSISimilarity,
     "neighbors_tfidf": NeighborsSimilarity,
@@ -635,4 +695,5 @@ model_name_to_class = {
    "word2vec_softcos": Word2VecSoftCosSimilarity,
    "bm25": BM25Similarity,
    "lda": LDASimilarity,
+    "elasticsearch": ElasticSearchSimilarity,
 }
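For readers who want to poke at the retrieval step directly, the more_like_this query at the core of get_similar_bugs() above can be issued standalone. A minimal sketch, assuming a local Elasticsearch 7.x node on the default http://localhost:9200 and a "bugbug" index built as in this patch; the query text is a made-up example:

    from elasticsearch import Elasticsearch

    es = Elasticsearch()  # connects to http://localhost:9200 by default

    # Hypothetical free-text description standing in for a preprocessed bug.
    query_text = "browser crashes when opening a new tab"

    es_query = {
        "more_like_this": {
            "fields": ["description"],  # match against the indexed description field
            "like": query_text,  # the text to find neighbours of
            "min_term_freq": 1,  # a term must occur at least once in the input
            "max_query_terms": 25,  # cap on the number of query terms selected
            "min_doc_freq": 2,  # ignore terms seen in fewer than two documents
        }
    }

    result = es.search(index="bugbug", body={"query": es_query})
    for hit in result["hits"]["hits"]:
        print(hit["_source"]["bug_id"], hit["_score"])

Hits come back ranked by relevance score; the patch keeps only the bug_id fields and filters out the query bug itself.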
diff --git a/extra-nlp-requirements.txt b/extra-nlp-requirements.txt
index ffdee7c1..0b0e983f 100644
--- a/extra-nlp-requirements.txt
+++ b/extra-nlp-requirements.txt
@@ -1,3 +1,4 @@
+elasticsearch==7.5.1
 gensim==3.8.1
 nltk==3.4.5
 pyemd==0.5.1
diff --git a/scripts/evaluate_similarity.py b/scripts/evaluate_similarity.py
index f3fc7a5b..5c0981b6 100644
--- a/scripts/evaluate_similarity.py
+++ b/scripts/evaluate_similarity.py
@@ -31,6 +31,11 @@
         dest="nltk_tokenizer",
         default=False,
     )
+    parser.add_argument(
+        "--index",
+        help="Create/Recreate a database in Elastic Search",
+        action="store_true",
+    )
 
     return parser.parse_args(args)
 
@@ -45,6 +50,8 @@
 
     model = similarity.model_name_to_class[args.algorithm](
         cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
     )
+    if args.algorithm == "elasticsearch" and args.index:
+        model.index()
 
     model.evaluation()
 
diff --git a/scripts/similarity_query.py b/scripts/similarity_query.py
index 47f9fd54..50f75a93 100644
--- a/scripts/similarity_query.py
+++ b/scripts/similarity_query.py
@@ -32,24 +32,27 @@
 
 
 def main(args):
-    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+    if args.algorithm == "elasticsearch":
+        model = similarity.model_name_to_class[args.algorithm]()
+    else:
+        model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
 
-    if not os.path.exists(model_file_name):
-        logger.info(f"{model_file_name} does not exist. Downloading the model....")
-        try:
-            download_check_etag(URL.format(model_file_name))
-        except requests.HTTPError:
-            logger.error(
-                f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
-            )
-            raise SystemExit(1)
+        if not os.path.exists(model_file_name):
+            logger.info(f"{model_file_name} does not exist. Downloading the model....")
+            try:
+                download_check_etag(URL.format(model_file_name))
+            except requests.HTTPError:
+                logger.error(
+                    f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
+                )
+                raise SystemExit(1)
 
-        zstd_decompress(model_file_name)
-        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"
+        zstd_decompress(model_file_name)
+        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"
 
-    model = similarity.model_name_to_class[args.algorithm].load(
-        f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
-    )
+        model = similarity.model_name_to_class[args.algorithm].load(
+            f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+        )
 
     bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])
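The --index flag added above in scripts/evaluate_similarity.py delegates to ElasticSearchSimilarity.index(): drop any existing index, then bulk-load documents from a generator. The same pattern in isolation, with a hypothetical two-bug corpus standing in for bugzilla.get_bugs():

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    es = Elasticsearch()

    # Hypothetical in-memory corpus; the real code streams bugs from bugzilla.get_bugs().
    bugs = [
        {"id": 1, "text": "browser crashes on startup"},
        {"id": 2, "text": "tab freezes while loading a video"},
    ]

    def make_documents():
        for bug in bugs:
            # Keys other than the underscore-prefixed metadata become the document source.
            yield {
                "_index": "bugbug",
                "_type": "_doc",
                "bug_id": bug["id"],
                "description": bug["text"],
            }

    # Drop any stale index first; ignoring 400/404 means a missing index is not an error.
    es.indices.delete(index="bugbug", ignore=[400, 404])
    bulk(es, make_documents())

The ignore=[400, 404] argument is what makes the rebuild idempotent, so --index can be passed on the very first run as well.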
diff --git a/scripts/similarity_trainer.py b/scripts/similarity_trainer.py
index 8c34c7fe..4208dfe0 100644
--- a/scripts/similarity_trainer.py
+++ b/scripts/similarity_trainer.py
@@ -48,20 +48,26 @@ def main():
 
     assert db.download(bugzilla.BUGS_DB)
 
-    if args.algorithm == "neighbors_tfidf_bigrams":
-        model = similarity.model_name_to_class[args.algorithm](
-            vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
-            cleanup_urls=args.cleanup_urls,
-            nltk_tokenizer=args.nltk_tokenizer,
-        )
-    else:
+    if args.algorithm == "elasticsearch":
         model = similarity.model_name_to_class[args.algorithm](
             cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
         )
+        model.index()
+    else:
+        if args.algorithm == "neighbors_tfidf_bigrams":
+            model = similarity.model_name_to_class[args.algorithm](
+                vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
+                cleanup_urls=args.cleanup_urls,
+                nltk_tokenizer=args.nltk_tokenizer,
+            )
+        else:
+            model = similarity.model_name_to_class[args.algorithm](
+                cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
+            )
 
-    path = model.save()
-    assert os.path.exists(path)
-    zstd_compress(path)
+        path = model.save()
+        assert os.path.exists(path)
+        zstd_compress(path)
 
 
 if __name__ == "__main__":
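Taken together, training (indexing) and querying with the new backend reduce to roughly the following. A sketch only: it assumes a running local Elasticsearch node, an already-downloaded bugs database, and a hypothetical bug id:

    from bugbug import bugzilla, similarity

    # "elasticsearch" is the key registered in model_name_to_class above.
    model = similarity.model_name_to_class["elasticsearch"]()
    model.index()  # recreate the "bugbug" index from all known bugs

    bug_id = 123456  # hypothetical; any bug id present in the bugs database
    bug = bugzilla.get(bug_id)[bug_id]

    # Returns the ids of similar bugs, excluding the query bug itself.
    print(model.get_similar_bugs(bug))

Note that, unlike the other backends, nothing is written to disk: the Elasticsearch index itself plays the role of the trained model, which is why the trainer skips save()/zstd_compress() for this algorithm.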