Mirror of https://github.com/mozilla/bugbug.git

Duplicate/similarity detection using Elasticsearch (#1269)

Parent: 7a1d2457ef
Commit: 87cb3dce0d
@@ -33,6 +33,8 @@ try:
     from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     from gensim.summarization.bm25 import BM25
     from gensim.corpora import Dictionary
+    from elasticsearch.helpers import bulk
+    from elasticsearch import Elasticsearch
     from nltk.corpus import stopwords
     from nltk.stem.porter import PorterStemmer
     from nltk.tokenize import word_tokenize
@@ -62,10 +64,15 @@ class BaseSimilarity(abc.ABC):

         self.nltk_tokenizer = nltk_tokenizer

-    def get_text(self, bug):
-        return "{} {}".format(bug["summary"], bug["comments"][0]["text"])
+    def get_text(self, bug, all_comments=False):
+        if all_comments:
+            comments = " ".join(comment["text"] for comment in bug["comments"])
+        else:
+            comments = bug["comments"][0]["text"]
+
+        return "{} {}".format(bug["summary"], comments)

-    def text_preprocess(self, text, lemmatization=False, join=False):
+    def text_preprocess(self, text, stemming=True, lemmatization=False, join=False):

         for func in self.cleanup_functions:
             text = func(text)
@@ -74,7 +81,7 @@ class BaseSimilarity(abc.ABC):

         if lemmatization:
             text = [word.lemma_ for word in nlp(text)]
-        else:
+        elif stemming:
             ps = PorterStemmer()
             tokenized_text = (
                 word_tokenize(text.lower())
@@ -86,6 +93,8 @@ class BaseSimilarity(abc.ABC):
                 for word in tokenized_text
                 if word not in set(stopwords.words("english")) and len(word) > 1
             ]
+        else:
+            text = text.split()

         if join:
             return " ".join(word for word in text)
@@ -626,6 +635,57 @@ class LDASimilarity(BaseSimilarity):
         raise NotImplementedError


+class ElasticSearchSimilarity(BaseSimilarity):
+    def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
+        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
+        self.elastic_search = Elasticsearch()
+        assert (
+            self.elastic_search.ping()
+        ), "Check if Elastic Search Server is running by visiting http://localhost:9200"
+
+    def make_documents(self):
+        for bug in bugzilla.get_bugs():
+            yield {
+                "_index": "bugbug",
+                "_type": "_doc",
+                "bug_id": bug["id"],
+                "description": self.text_preprocess(
+                    self.get_text(bug, all_comments=True), stemming=False, join=True
+                ),
+            }
+
+    def index(self):
+        self.elastic_search.indices.delete(index="bugbug", ignore=[400, 404])
+        bulk(self.elastic_search, self.make_documents())
+
+    def get_similar_bugs(self, query):
+        find_similar = self.text_preprocess(
+            self.get_text(query, all_comments=True), stemming=False, join=True
+        )
+
+        es_query = {
+            "more_like_this": {
+                "fields": ["description"],
+                "like": find_similar,
+                "min_term_freq": 1,
+                "max_query_terms": 25,
+                "min_doc_freq": 2,
+            }
+        }
+
+        result = self.elastic_search.search(index="bugbug", body={"query": es_query})
+
+        top_similar = [
+            result["hits"]["hits"][i]["_source"]["bug_id"]
+            for i in range(len(result["hits"]["hits"]))
+            if result["hits"]["hits"][i]["_source"]["bug_id"] != query["id"]
+        ]
+        return top_similar
+
+    def get_distance(self, query1, query2):
+        raise NotImplementedError
+
+
 model_name_to_class = {
     "lsi": LSISimilarity,
     "neighbors_tfidf": NeighborsSimilarity,
@@ -635,4 +695,5 @@ model_name_to_class = {
     "word2vec_softcos": Word2VecSoftCosSimilarity,
     "bm25": BM25Similarity,
     "lda": LDASimilarity,
+    "elasticsearch": ElasticSearchSimilarity,
 }
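Taken together, the new backend is driven roughly as follows. This is a minimal sketch, not part of the commit: it assumes an Elasticsearch server reachable at http://localhost:9200 and a locally available bugs database for bugzilla.get_bugs(); the bug dict is illustrative.

from bugbug import similarity

# Instantiating the model pings the local Elasticsearch node and raises an
# assertion error if no server is reachable.
model = similarity.model_name_to_class["elasticsearch"]()

# Drop and rebuild the "bugbug" index, bulk-loading one document per bug with
# the preprocessed summary plus all comments in its "description" field.
model.index()

# Illustrative bug dict shaped like the Bugzilla data get_text() expects.
bug = {
    "id": 12345,
    "summary": "Crash on startup",
    "comments": [{"text": "The browser crashes right after launch."}],
}

# IDs of the most similar bugs, ranked by a more_like_this query and with the
# query bug itself filtered out.
similar_bug_ids = model.get_similar_bugs(bug)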
@@ -1,3 +1,4 @@
+elasticsearch==7.5.1
 gensim==3.8.1
 nltk==3.4.5
 pyemd==0.5.1
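The newly pinned elasticsearch client is the same one get_similar_bugs drives, and the request it builds is a plain more_like_this query. A sketch of the equivalent raw search call follows (the like string is illustrative; the parameter comments reflect standard Elasticsearch more_like_this semantics):

from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to the local node at http://localhost:9200

body = {
    "query": {
        "more_like_this": {
            "fields": ["description"],       # match against the indexed bug text
            "like": "crash on startup ...",  # preprocessed text of the query bug
            "min_term_freq": 1,              # a term must occur at least once in the query text
            "max_query_terms": 25,           # cap on the number of query terms selected
            "min_doc_freq": 2,               # ignore terms found in fewer than two documents
        }
    }
}

hits = es.search(index="bugbug", body=body)["hits"]["hits"]
bug_ids = [hit["_source"]["bug_id"] for hit in hits]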
@@ -31,6 +31,11 @@ def parse_args(args):
         dest="nltk_tokenizer",
         default=False,
     )
+    parser.add_argument(
+        "--index",
+        help="Create/Recreate a database in Elastic Search",
+        action="store_true",
+    )
     return parser.parse_args(args)


@@ -45,6 +50,8 @@ def main(args):
     model = similarity.model_name_to_class[args.algorithm](
         cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
     )
+    if args.algorithm == "elasticsearch" and args.index:
+        model.index()

     model.evaluation()

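With the new flag, the evaluation entry point special-cases the Elasticsearch backend: the index is rebuilt only when --index is passed, and there is no model file involved. Roughly, as a sketch with argument parsing elided (evaluation() is defined on the similarity classes elsewhere in similarity.py, outside this diff):

model = similarity.model_name_to_class["elasticsearch"](
    cleanup_urls=True, nltk_tokenizer=False
)
model.index()       # only when --index was passed on the command line
model.evaluation()  # same evaluation path as the other similarity models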
@@ -32,24 +32,27 @@ def parse_args(args):

 def main(args):

-    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+    if args.algorithm == "elasticsearch":
+        model = similarity.model_name_to_class[args.algorithm]()
+    else:
+        model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"

         if not os.path.exists(model_file_name):
             logger.info(f"{model_file_name} does not exist. Downloading the model....")
             try:
                 download_check_etag(URL.format(model_file_name))
             except requests.HTTPError:
                 logger.error(
                     f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
                 )
                 raise SystemExit(1)

         zstd_decompress(model_file_name)
         assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

         model = similarity.model_name_to_class[args.algorithm].load(
             f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
         )

     bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])

@@ -48,20 +48,26 @@ def main():

     assert db.download(bugzilla.BUGS_DB)

-    if args.algorithm == "neighbors_tfidf_bigrams":
-        model = similarity.model_name_to_class[args.algorithm](
-            vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
-            cleanup_urls=args.cleanup_urls,
-            nltk_tokenizer=args.nltk_tokenizer,
-        )
-    else:
+    if args.algorithm == "elasticsearch":
         model = similarity.model_name_to_class[args.algorithm](
             cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
         )
+        model.index()
+    else:
+        if args.algorithm == "neighbors_tfidf_bigrams":
+            model = similarity.model_name_to_class[args.algorithm](
+                vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
+                cleanup_urls=args.cleanup_urls,
+                nltk_tokenizer=args.nltk_tokenizer,
+            )
+        else:
+            model = similarity.model_name_to_class[args.algorithm](
+                cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
+            )

-    path = model.save()
-    assert os.path.exists(path)
-    zstd_compress(path)
+        path = model.save()
+        assert os.path.exists(path)
+        zstd_compress(path)


 if __name__ == "__main__":