Duplicate/similarity detection using Elasticsearch (#1269)

This commit is contained in:
Harshit chittora 2020-02-10 15:37:08 +05:30 committed by GitHub
Parent 7a1d2457ef
Commit 87cb3dce0d
5 changed files: 107 additions and 29 deletions
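The change adds an Elasticsearch-backed similarity model for duplicate-bug detection: every bug's summary and comments are bulk-indexed into a local "bugbug" index, and candidate duplicates are retrieved with a more_like_this query. The minimal sketch below illustrates that mechanism outside of bugbug; it assumes elasticsearch-py 7.x and a server on http://localhost:9200, and the sample documents and the min_doc_freq=1 setting are made up for a two-document toy index (the patch itself uses min_doc_freq=2).

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()  # defaults to http://localhost:9200
es.indices.delete(index="bugbug", ignore=[400, 404])  # start from a clean index

# Two toy "bugs"; the real patch streams every bug from bugzilla.get_bugs()
docs = [
    {"_index": "bugbug", "_type": "_doc", "bug_id": 1,
     "description": "crash on startup when the profile is corrupt"},
    {"_index": "bugbug", "_type": "_doc", "bug_id": 2,
     "description": "startup crash with corrupted profile data"},
]
bulk(es, docs)
es.indices.refresh(index="bugbug")  # make the new documents searchable right away

query = {
    "more_like_this": {
        "fields": ["description"],
        "like": "browser crashes at startup with a broken profile",
        "min_term_freq": 1,
        "min_doc_freq": 1,  # the patch uses 2; lowered here for the toy index
    }
}
result = es.search(index="bugbug", body={"query": query})
print([hit["_source"]["bug_id"] for hit in result["hits"]["hits"]])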

View file

@@ -33,6 +33,8 @@ try:
     from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     from gensim.summarization.bm25 import BM25
     from gensim.corpora import Dictionary
+    from elasticsearch.helpers import bulk
+    from elasticsearch import Elasticsearch
     from nltk.corpus import stopwords
     from nltk.stem.porter import PorterStemmer
     from nltk.tokenize import word_tokenize
@@ -62,10 +64,15 @@ class BaseSimilarity(abc.ABC):
         self.nltk_tokenizer = nltk_tokenizer
 
-    def get_text(self, bug):
-        return "{} {}".format(bug["summary"], bug["comments"][0]["text"])
+    def get_text(self, bug, all_comments=False):
+        if all_comments:
+            comments = " ".join(comment["text"] for comment in bug["comments"])
+        else:
+            comments = bug["comments"][0]["text"]
+
+        return "{} {}".format(bug["summary"], comments)
 
-    def text_preprocess(self, text, lemmatization=False, join=False):
+    def text_preprocess(self, text, stemming=True, lemmatization=False, join=False):
 
         for func in self.cleanup_functions:
             text = func(text)
@@ -74,7 +81,7 @@ class BaseSimilarity(abc.ABC):
         if lemmatization:
             text = [word.lemma_ for word in nlp(text)]
-        else:
+        elif stemming:
             ps = PorterStemmer()
             tokenized_text = (
                 word_tokenize(text.lower())
@@ -86,6 +93,8 @@ class BaseSimilarity(abc.ABC):
                 for word in tokenized_text
                 if word not in set(stopwords.words("english")) and len(word) > 1
             ]
+        else:
+            text = text.split()
 
         if join:
             return " ".join(word for word in text)
@@ -626,6 +635,57 @@ class LDASimilarity(BaseSimilarity):
         raise NotImplementedError
 
 
+class ElasticSearchSimilarity(BaseSimilarity):
+    def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
+        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
+        self.elastic_search = Elasticsearch()
+        assert (
+            self.elastic_search.ping()
+        ), "Check if Elastic Search Server is running by visiting http://localhost:9200"
+
+    def make_documents(self):
+        for bug in bugzilla.get_bugs():
+            yield {
+                "_index": "bugbug",
+                "_type": "_doc",
+                "bug_id": bug["id"],
+                "description": self.text_preprocess(
+                    self.get_text(bug, all_comments=True), stemming=False, join=True
+                ),
+            }
+
+    def index(self):
+        self.elastic_search.indices.delete(index="bugbug", ignore=[400, 404])
+        bulk(self.elastic_search, self.make_documents())
+
+    def get_similar_bugs(self, query):
+        find_similar = self.text_preprocess(
+            self.get_text(query, all_comments=True), stemming=False, join=True
+        )
+
+        es_query = {
+            "more_like_this": {
+                "fields": ["description"],
+                "like": find_similar,
+                "min_term_freq": 1,
+                "max_query_terms": 25,
+                "min_doc_freq": 2,
+            }
+        }
+
+        result = self.elastic_search.search(index="bugbug", body={"query": es_query})
+
+        top_similar = [
+            result["hits"]["hits"][i]["_source"]["bug_id"]
+            for i in range(len(result["hits"]["hits"]))
+            if result["hits"]["hits"][i]["_source"]["bug_id"] != query["id"]
+        ]
+        return top_similar
+
+    def get_distance(self, query1, query2):
+        raise NotImplementedError
+
+
 model_name_to_class = {
     "lsi": LSISimilarity,
     "neighbors_tfidf": NeighborsSimilarity,
@@ -635,4 +695,5 @@ model_name_to_class = {
     "word2vec_softcos": Word2VecSoftCosSimilarity,
     "bm25": BM25Similarity,
     "lda": LDASimilarity,
+    "elasticsearch": ElasticSearchSimilarity,
 }
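For context, a hedged usage sketch of the new class end to end: the class and method names come from the diff above, while the rest assumes a running local Elasticsearch server and a previously downloaded bugs database backing bugbug.bugzilla.get_bugs().

from bugbug import bugzilla, similarity

model = similarity.model_name_to_class["elasticsearch"]()  # __init__ asserts the server answers a ping
model.index()  # drop and rebuild the "bugbug" index from every bug's summary and comments

bug = next(bugzilla.get_bugs())     # any bug dict with "id", "summary" and "comments"
print(model.get_similar_bugs(bug))  # bug ids ranked by more_like_this relevance, query bug excluded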

View file

@@ -1,3 +1,4 @@
+elasticsearch==7.5.1
 gensim==3.8.1
 nltk==3.4.5
 pyemd==0.5.1

View file

@@ -31,6 +31,11 @@ def parse_args(args):
         dest="nltk_tokenizer",
         default=False,
     )
+    parser.add_argument(
+        "--index",
+        help="Create/Recreate a database in Elastic Search",
+        action="store_true",
+    )
 
     return parser.parse_args(args)
@@ -45,6 +50,8 @@ def main(args):
     model = similarity.model_name_to_class[args.algorithm](
         cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
     )
+    if args.algorithm == "elasticsearch" and args.index:
+        model.index()
 
     model.evaluation()

View file

@@ -32,24 +32,27 @@ def parse_args(args):
 def main(args):
-    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+    if args.algorithm == "elasticsearch":
+        model = similarity.model_name_to_class[args.algorithm]()
+    else:
+        model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
 
-    if not os.path.exists(model_file_name):
-        logger.info(f"{model_file_name} does not exist. Downloading the model....")
-        try:
-            download_check_etag(URL.format(model_file_name))
-        except requests.HTTPError:
-            logger.error(
-                f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
-            )
-            raise SystemExit(1)
-
-        zstd_decompress(model_file_name)
-        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"
-
-    model = similarity.model_name_to_class[args.algorithm].load(
-        f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
-    )
+        if not os.path.exists(model_file_name):
+            logger.info(f"{model_file_name} does not exist. Downloading the model....")
+            try:
+                download_check_etag(URL.format(model_file_name))
+            except requests.HTTPError:
+                logger.error(
+                    f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
+                )
+                raise SystemExit(1)
+
+            zstd_decompress(model_file_name)
+            assert os.path.exists(model_file_name), "Decompressed file doesn't exist"
+
+        model = similarity.model_name_to_class[args.algorithm].load(
+            f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+        )
 
     bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])

View file

@@ -48,20 +48,26 @@ def main():
     assert db.download(bugzilla.BUGS_DB)
 
-    if args.algorithm == "neighbors_tfidf_bigrams":
-        model = similarity.model_name_to_class[args.algorithm](
-            vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
-            cleanup_urls=args.cleanup_urls,
-            nltk_tokenizer=args.nltk_tokenizer,
-        )
-    else:
+    if args.algorithm == "elasticsearch":
         model = similarity.model_name_to_class[args.algorithm](
             cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
         )
+        model.index()
+    else:
+        if args.algorithm == "neighbors_tfidf_bigrams":
+            model = similarity.model_name_to_class[args.algorithm](
+                vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
+                cleanup_urls=args.cleanup_urls,
+                nltk_tokenizer=args.nltk_tokenizer,
+            )
+        else:
+            model = similarity.model_name_to_class[args.algorithm](
+                cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
+            )
 
-    path = model.save()
-    assert os.path.exists(path)
-    zstd_compress(path)
+        path = model.save()
+        assert os.path.exists(path)
+        zstd_compress(path)
 
 
 if __name__ == "__main__":