Mirror of https://github.com/mozilla/bugbug.git
Duplicate/similarity detection using Elasticsearch (#1269)
This commit is contained in:
Parent: 7a1d2457ef
Commit: 87cb3dce0d
@@ -33,6 +33,8 @@ try:
     from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     from gensim.summarization.bm25 import BM25
     from gensim.corpora import Dictionary
+    from elasticsearch.helpers import bulk
+    from elasticsearch import Elasticsearch
     from nltk.corpus import stopwords
     from nltk.stem.porter import PorterStemmer
     from nltk.tokenize import word_tokenize
@@ -62,10 +64,15 @@ class BaseSimilarity(abc.ABC):
         self.nltk_tokenizer = nltk_tokenizer
 
-    def get_text(self, bug):
-        return "{} {}".format(bug["summary"], bug["comments"][0]["text"])
+    def get_text(self, bug, all_comments=False):
+        if all_comments:
+            comments = " ".join(comment["text"] for comment in bug["comments"])
+        else:
+            comments = bug["comments"][0]["text"]
 
-    def text_preprocess(self, text, lemmatization=False, join=False):
+        return "{} {}".format(bug["summary"], comments)
+
+    def text_preprocess(self, text, stemming=True, lemmatization=False, join=False):
 
         for func in self.cleanup_functions:
             text = func(text)
 
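For reference, the reworked get_text reads cleanly on its own; here is a standalone restatement with a toy bug dict (the field layout matches what the hunk above uses, the example values are made up):

def get_text(bug, all_comments=False):
    # Mirrors the updated BaseSimilarity.get_text from the hunk above.
    if all_comments:
        comments = " ".join(comment["text"] for comment in bug["comments"])
    else:
        comments = bug["comments"][0]["text"]
    return "{} {}".format(bug["summary"], comments)

bug = {
    "summary": "Crash on startup",
    "comments": [{"text": "Steps to reproduce: ..."}, {"text": "Also happens on beta."}],
}
print(get_text(bug))                     # summary + first comment only
print(get_text(bug, all_comments=True))  # summary + all comments, space-joined
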
@@ -74,7 +81,7 @@ class BaseSimilarity(abc.ABC):
 
         if lemmatization:
             text = [word.lemma_ for word in nlp(text)]
-        else:
+        elif stemming:
             ps = PorterStemmer()
             tokenized_text = (
                 word_tokenize(text.lower())
@@ -86,6 +93,8 @@ class BaseSimilarity(abc.ABC):
                 for word in tokenized_text
                 if word not in set(stopwords.words("english")) and len(word) > 1
             ]
+        else:
+            text = text.split()
 
         if join:
             return " ".join(word for word in text)
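The stemming branch is only partly visible across these hunks; a trimmed, standalone sketch of how the stemming / plain-split paths are meant to fit together, assuming ps.stem is applied to each remaining token as in the surrounding code (the spaCy lemmatization branch and the self.nltk_tokenizer switch are omitted here, and the NLTK punkt/stopwords data must be installed):

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


def preprocess(text, stemming=True, join=False):
    # stemming=True: lowercase, tokenize, drop stopwords/short tokens, stem.
    if stemming:
        ps = PorterStemmer()
        text = [
            ps.stem(word)
            for word in word_tokenize(text.lower())
            if word not in set(stopwords.words("english")) and len(word) > 1
        ]
    # stemming=False (the Elasticsearch path): keep raw whitespace tokens.
    else:
        text = text.split()
    return " ".join(word for word in text) if join else text
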
@@ -626,6 +635,57 @@ class LDASimilarity(BaseSimilarity):
         raise NotImplementedError
 
 
+class ElasticSearchSimilarity(BaseSimilarity):
+    def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
+        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
+        self.elastic_search = Elasticsearch()
+        assert (
+            self.elastic_search.ping()
+        ), "Check if Elastic Search Server is running by visiting http://localhost:9200"
+
+    def make_documents(self):
+        for bug in bugzilla.get_bugs():
+            yield {
+                "_index": "bugbug",
+                "_type": "_doc",
+                "bug_id": bug["id"],
+                "description": self.text_preprocess(
+                    self.get_text(bug, all_comments=True), stemming=False, join=True
+                ),
+            }
+
+    def index(self):
+        self.elastic_search.indices.delete(index="bugbug", ignore=[400, 404])
+        bulk(self.elastic_search, self.make_documents())
+
+    def get_similar_bugs(self, query):
+        find_similar = self.text_preprocess(
+            self.get_text(query, all_comments=True), stemming=False, join=True
+        )
+
+        es_query = {
+            "more_like_this": {
+                "fields": ["description"],
+                "like": find_similar,
+                "min_term_freq": 1,
+                "max_query_terms": 25,
+                "min_doc_freq": 2,
+            }
+        }
+
+        result = self.elastic_search.search(index="bugbug", body={"query": es_query})
+
+        top_similar = [
+            result["hits"]["hits"][i]["_source"]["bug_id"]
+            for i in range(len(result["hits"]["hits"]))
+            if result["hits"]["hits"][i]["_source"]["bug_id"] != query["id"]
+        ]
+        return top_similar
+
+    def get_distance(self, query1, query2):
+        raise NotImplementedError
+
+
 model_name_to_class = {
     "lsi": LSISimilarity,
     "neighbors_tfidf": NeighborsSimilarity,
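get_similar_bugs boils down to a single more_like_this search; the same request issued directly with the elasticsearch client against the "bugbug" index that index() builds (a local node on http://localhost:9200 is assumed, and the query text is a made-up placeholder):

from elasticsearch import Elasticsearch

es = Elasticsearch()  # default local node, as in ElasticSearchSimilarity.__init__
body = {
    "query": {
        "more_like_this": {
            "fields": ["description"],
            "like": "crash when opening a new tab",  # placeholder query text
            "min_term_freq": 1,
            "max_query_terms": 25,
            "min_doc_freq": 2,
        }
    }
}
result = es.search(index="bugbug", body=body)
similar_bug_ids = [hit["_source"]["bug_id"] for hit in result["hits"]["hits"]]
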
@@ -635,4 +695,5 @@ model_name_to_class = {
     "word2vec_softcos": Word2VecSoftCosSimilarity,
     "bm25": BM25Similarity,
     "lda": LDASimilarity,
+    "elasticsearch": ElasticSearchSimilarity,
 }
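Putting the pieces together, a minimal end-to-end sketch of the new backend, assuming a running local Elasticsearch, a populated bugs database, and imports as in the repo's scripts (1234567 is a placeholder bug id):

from bugbug import bugzilla, similarity

model = similarity.model_name_to_class["elasticsearch"](
    cleanup_urls=True, nltk_tokenizer=False
)
model.index()  # drop and rebuild the "bugbug" index from bugzilla.get_bugs()

bug = bugzilla.get(1234567)[1234567]  # placeholder bug id
print(model.get_similar_bugs(bug))    # ids of bugs with similar descriptions
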
@@ -1,3 +1,4 @@
+elasticsearch==7.5.1
 gensim==3.8.1
 nltk==3.4.5
 pyemd==0.5.1
@@ -31,6 +31,11 @@ def parse_args(args):
         dest="nltk_tokenizer",
         default=False,
     )
+    parser.add_argument(
+        "--index",
+        help="Create/Recreate a database in Elastic Search",
+        action="store_true",
+    )
     return parser.parse_args(args)
 
 
@@ -45,6 +50,8 @@ def main(args):
     model = similarity.model_name_to_class[args.algorithm](
         cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
     )
+    if args.algorithm == "elasticsearch" and args.index:
+        model.index()
 
     model.evaluation()
 
@@ -32,24 +32,27 @@ def parse_args(args):
 
 def main(args):
 
-    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+    if args.algorithm == "elasticsearch":
+        model = similarity.model_name_to_class[args.algorithm]()
+    else:
+        model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
 
-    if not os.path.exists(model_file_name):
-        logger.info(f"{model_file_name} does not exist. Downloading the model....")
-        try:
-            download_check_etag(URL.format(model_file_name))
-        except requests.HTTPError:
-            logger.error(
-                f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
-            )
-            raise SystemExit(1)
+        if not os.path.exists(model_file_name):
+            logger.info(f"{model_file_name} does not exist. Downloading the model....")
+            try:
+                download_check_etag(URL.format(model_file_name))
+            except requests.HTTPError:
+                logger.error(
+                    f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
+                )
+                raise SystemExit(1)
 
-        zstd_decompress(model_file_name)
-        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"
+            zstd_decompress(model_file_name)
+            assert os.path.exists(model_file_name), "Decompressed file doesn't exist"
 
-    model = similarity.model_name_to_class[args.algorithm].load(
-        f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
-    )
+        model = similarity.model_name_to_class[args.algorithm].load(
+            f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
+        )
 
     bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])
 
@@ -48,20 +48,26 @@ def main():
 
     assert db.download(bugzilla.BUGS_DB)
 
-    if args.algorithm == "neighbors_tfidf_bigrams":
-        model = similarity.model_name_to_class[args.algorithm](
-            vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
-            cleanup_urls=args.cleanup_urls,
-            nltk_tokenizer=args.nltk_tokenizer,
-        )
-    else:
-        model = similarity.model_name_to_class[args.algorithm](
-            cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
-        )
+    if args.algorithm == "elasticsearch":
+        model = similarity.model_name_to_class[args.algorithm](
+            cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
+        )
+        model.index()
+    else:
+        if args.algorithm == "neighbors_tfidf_bigrams":
+            model = similarity.model_name_to_class[args.algorithm](
+                vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
+                cleanup_urls=args.cleanup_urls,
+                nltk_tokenizer=args.nltk_tokenizer,
+            )
+        else:
+            model = similarity.model_name_to_class[args.algorithm](
+                cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer
+            )
 
-    path = model.save()
-    assert os.path.exists(path)
-    zstd_compress(path)
+        path = model.save()
+        assert os.path.exists(path)
+        zstd_compress(path)
 
 
 if __name__ == "__main__":