Duplicate/similarity detection using Elasticsearch (#1269)

This commit is contained in:
Harshit chittora 2020-02-10 15:37:08 +05:30 committed by GitHub
Parent 7a1d2457ef
Commit 87cb3dce0d
5 changed files with 107 additions and 29 deletions

View file

@@ -33,6 +33,8 @@ try:
     from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     from gensim.summarization.bm25 import BM25
     from gensim.corpora import Dictionary
+    from elasticsearch.helpers import bulk
+    from elasticsearch import Elasticsearch
     from nltk.corpus import stopwords
     from nltk.stem.porter import PorterStemmer
     from nltk.tokenize import word_tokenize
@@ -62,10 +64,15 @@ class BaseSimilarity(abc.ABC):
         self.nltk_tokenizer = nltk_tokenizer

-    def get_text(self, bug):
-        return "{} {}".format(bug["summary"], bug["comments"][0]["text"])
+    def get_text(self, bug, all_comments=False):
+        if all_comments:
+            comments = " ".join(comment["text"] for comment in bug["comments"])
+        else:
+            comments = bug["comments"][0]["text"]
+
+        return "{} {}".format(bug["summary"], comments)

-    def text_preprocess(self, text, lemmatization=False, join=False):
+    def text_preprocess(self, text, stemming=True, lemmatization=False, join=False):
         for func in self.cleanup_functions:
             text = func(text)
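
For context, the effect of the new all_comments flag on a toy bug dict (a standalone illustration; the dict shape mirrors the bugs DB used throughout bugbug):

bug = {
    "summary": "Crash on startup",
    "comments": [
        {"text": "STR: open the browser"},
        {"text": "Confirmed on Windows"},
    ],
}
# all_comments=False (the old behavior): summary plus the first comment only
#   -> "Crash on startup STR: open the browser"
# all_comments=True (used by the Elasticsearch path below): summary plus
# every comment joined with spaces
#   -> "Crash on startup STR: open the browser Confirmed on Windows"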
@@ -74,7 +81,7 @@ class BaseSimilarity(abc.ABC):
         if lemmatization:
             text = [word.lemma_ for word in nlp(text)]
-        else:
+        elif stemming:
             ps = PorterStemmer()
             tokenized_text = (
                 word_tokenize(text.lower())
@@ -86,6 +93,8 @@ class BaseSimilarity(abc.ABC):
                 for word in tokenized_text
                 if word not in set(stopwords.words("english")) and len(word) > 1
             ]
+        else:
+            text = text.split()

         if join:
             return " ".join(word for word in text)
@@ -626,6 +635,57 @@ class LDASimilarity(BaseSimilarity):
         raise NotImplementedError


+class ElasticSearchSimilarity(BaseSimilarity):
+    def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
+        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
+        self.elastic_search = Elasticsearch()
+        assert (
+            self.elastic_search.ping()
+        ), "Check if the Elasticsearch server is running by visiting http://localhost:9200"
+
+    def make_documents(self):
+        # Yield one document per bug: summary plus all comments, cleaned but
+        # unstemmed, since Elasticsearch applies its own analyzer.
+        for bug in bugzilla.get_bugs():
+            yield {
+                "_index": "bugbug",
+                "_type": "_doc",
+                "bug_id": bug["id"],
+                "description": self.text_preprocess(
+                    self.get_text(bug, all_comments=True), stemming=False, join=True
+                ),
+            }
+
+    def index(self):
+        # Drop any existing index (ignoring "not found") and bulk-load all bugs.
+        self.elastic_search.indices.delete(index="bugbug", ignore=[400, 404])
+        bulk(self.elastic_search, self.make_documents())
+
+    def get_similar_bugs(self, query):
+        find_similar = self.text_preprocess(
+            self.get_text(query, all_comments=True), stemming=False, join=True
+        )
+
+        es_query = {
+            "more_like_this": {
+                "fields": ["description"],
+                "like": find_similar,
+                "min_term_freq": 1,
+                "max_query_terms": 25,
+                "min_doc_freq": 2,
+            }
+        }
+
+        result = self.elastic_search.search(index="bugbug", body={"query": es_query})
+
+        # Return the matched bug ids, excluding the query bug itself.
+        top_similar = [
+            hit["_source"]["bug_id"]
+            for hit in result["hits"]["hits"]
+            if hit["_source"]["bug_id"] != query["id"]
+        ]
+        return top_similar
+
+    def get_distance(self, query1, query2):
+        raise NotImplementedError
+
+
 model_name_to_class = {
     "lsi": LSISimilarity,
     "neighbors_tfidf": NeighborsSimilarity,
@@ -635,4 +695,5 @@ model_name_to_class = {
     "word2vec_softcos": Word2VecSoftCosSimilarity,
     "bm25": BM25Similarity,
     "lda": LDASimilarity,
+    "elasticsearch": ElasticSearchSimilarity,
 }
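
The retrieval itself is delegated to Elasticsearch's more_like_this query, which picks the most distinctive terms of the input text (per the parameters above: terms appearing at least once in the input and in at least 2 indexed documents, capped at 25 query terms) and scores indexed documents against them. A standalone sketch of the same lookup, assuming a local server and an already populated "bugbug" index:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # default endpoint: http://localhost:9200

es_query = {
    "more_like_this": {
        "fields": ["description"],
        "like": "browser crashes on startup after the latest update",
        "min_term_freq": 1,
        "max_query_terms": 25,
        "min_doc_freq": 2,
    }
}
result = es.search(index="bugbug", body={"query": es_query})
for hit in result["hits"]["hits"]:
    print(hit["_source"]["bug_id"], hit["_score"])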

View file

@@ -1,3 +1,4 @@
+elasticsearch==7.5.1
 gensim==3.8.1
 nltk==3.4.5
 pyemd==0.5.1
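
Note that the elasticsearch-py client's major version must match the server's major version, so the 7.5.1 pin assumes a 7.x server. A quick compatibility check, assuming a default local server:

import elasticsearch
from elasticsearch import Elasticsearch

print(elasticsearch.__version__)  # client version tuple, e.g. (7, 5, 1)
print(Elasticsearch().info()["version"]["number"])  # server version, should be 7.x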

View file

@@ -31,6 +31,11 @@ def parse_args(args):
         dest="nltk_tokenizer",
         default=False,
     )
+    parser.add_argument(
+        "--index",
+        help="Create/recreate the Elasticsearch index",
+        action="store_true",
+    )
     return parser.parse_args(args)
@@ -45,6 +50,8 @@ def main(args):
     model = similarity.model_name_to_class[args.algorithm](
         cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
     )
+    if args.algorithm == "elasticsearch" and args.index:
+        model.index()

     model.evaluation()
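
A self-contained sketch of the new flag's gating logic (argparse wiring only; the evaluation script's real argument set is larger, and these names are assumed from the diff):

import argparse

def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm", default="elasticsearch")
    parser.add_argument(
        "--index",
        help="Create/recreate the Elasticsearch index",
        action="store_true",
    )
    return parser.parse_args(args)

args = parse_args(["--algorithm", "elasticsearch", "--index"])
if args.algorithm == "elasticsearch" and args.index:
    # In the real script this calls model.index() before model.evaluation().
    print("would rebuild the index, then evaluate")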

View file

@@ -32,24 +32,27 @@ def parse_args(args):

 def main(args):
-    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+    if args.algorithm == "elasticsearch":
+        model = similarity.model_name_to_class[args.algorithm]()
+    else:
+        model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"

-    if not os.path.exists(model_file_name):
-        logger.info(f"{model_file_name} does not exist. Downloading the model....")
-        try:
-            download_check_etag(URL.format(model_file_name))
-        except requests.HTTPError:
-            logger.error(
-                f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
-            )
-            raise SystemExit(1)
+        if not os.path.exists(model_file_name):
+            logger.info(f"{model_file_name} does not exist. Downloading the model....")
+            try:
+                download_check_etag(URL.format(model_file_name))
+            except requests.HTTPError:
+                logger.error(
+                    f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
+                )
+                raise SystemExit(1)

-        zstd_decompress(model_file_name)
-        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"
+            zstd_decompress(model_file_name)
+            assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

-    model = similarity.model_name_to_class[args.algorithm].load(
-        f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
-    )
+        model = similarity.model_name_to_class[args.algorithm].load(
+            f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+        )

     bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])
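
The Elasticsearch branch skips the model download entirely because there is no serialized model file: queries go to the live index. Downstream usage is the same for both branches; a sketch, assuming a running server, a populated index, and a hypothetical bug id:

from bugbug import bugzilla, similarity

model = similarity.model_name_to_class["elasticsearch"]()
bug_id = 123456  # hypothetical
bug = bugzilla.get(bug_id)[bug_id]
print(model.get_similar_bugs(bug))  # ids of likely duplicates, query bug excluded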

View file

@@ -48,20 +48,26 @@ def main():
     assert db.download(bugzilla.BUGS_DB)

-    if args.algorithm == "neighbors_tfidf_bigrams":
-        model = similarity.model_name_to_class[args.algorithm](
-            vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
-            cleanup_urls=args.cleanup_urls,
-            nltk_tokenizer=args.nltk_tokenizer,
-        )
-    else:
+    if args.algorithm == "elasticsearch":
         model = similarity.model_name_to_class[args.algorithm](
             cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
         )
+        model.index()
+    else:
+        if args.algorithm == "neighbors_tfidf_bigrams":
+            model = similarity.model_name_to_class[args.algorithm](
+                vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
+                cleanup_urls=args.cleanup_urls,
+                nltk_tokenizer=args.nltk_tokenizer,
+            )
+        else:
+            model = similarity.model_name_to_class[args.algorithm](
+                cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
+            )

-    path = model.save()
-    assert os.path.exists(path)
-    zstd_compress(path)
+        path = model.save()
+        assert os.path.exists(path)
+        zstd_compress(path)


 if __name__ == "__main__":
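
For the Elasticsearch backend the trainer produces no artifact to save or compress; "training" amounts to (re)building the index, roughly:

from bugbug import similarity

model = similarity.model_name_to_class["elasticsearch"]()
model.index()  # drops and rebuilds the "bugbug" index; no model file is produced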