Use re.compile to speed up feature cleanups (#351)

Fixes #338.
This commit is contained in:
Ayush Shridhar 2019-05-09 18:39:26 +05:30 коммит произвёл Marco
Родитель 4954fe38d4
Коммит c440db7315
10 изменённых файлов: 219 добавлений и 198 удалений

Просмотреть файл

@ -6,172 +6,193 @@
import re
def url(text):
    """Replace URLs in *text* with placeholder tokens.

    Links whose host part starts with ``hg.mozilla``, ``searchfox`` or
    ``dxr.mozilla`` become ``__CODE_REFERENCE_URL__``; every remaining run
    of non-whitespace characters starting with ``http`` becomes ``__URL__``.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with the URLs substituted.
    """
    # The dots are escaped so that only a literal "." is accepted; a bare "."
    # would also match look-alike hosts such as "hgxmozilla".
    text = re.sub(
        r"https?://(hg\.mozilla|searchfox|dxr\.mozilla)\S+",
        "__CODE_REFERENCE_URL__",
        text,
    )
    # Code-reference links have already been replaced, so whatever still
    # starts with "http" is handled as a generic URL.
    return re.sub(r"http\S+", "__URL__", text)
class url(object):
def __init__(self):
self.reference_url = re.compile(
r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+"
)
self.url = re.compile(r"http\S+")
def fileref(text):
    """Replace source-file names in *text* with ``__FILE_REFERENCE__``.

    A file name is a run of word characters followed by one of the
    extensions ``.py``, ``.json``, ``.js``, ``.jsm``, ``.html``, ``.css``,
    ``.c``, ``.cpp`` or ``.h``, ending on a word boundary.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every matching file name substituted.
    """
    source_file_name = r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b"
    cleaned = re.sub(source_file_name, "__FILE_REFERENCE__", text)
    return cleaned
def responses(text):
    """Blank out everything from a ``>`` to the end of its line.

    Each ``>`` that is followed by at least one non-newline character is
    replaced, together with the rest of that line, by a single space.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with those spans substituted.
    """
    quoted_tail = ">[^\n]+"
    cleaned = re.sub(quoted_tail, " ", text)
    return cleaned
def hex(text):
    """Replace ``0x``-prefixed hexadecimal literals with ``__HEX_NUMBER__``.

    A literal is ``0x`` or ``0X`` followed by one or more hexadecimal
    digits, delimited by word boundaries on both sides.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every hexadecimal literal substituted.
    """
    hex_literal = r"\b0[xX][0-9a-fA-F]+\b"
    cleaned = re.sub(hex_literal, "__HEX_NUMBER__", text)
    return cleaned
# Regex alternation of the library file names that must never be masked;
# only the dots are escaped so that they match literally.
FIREFOX_DLLS_MATCH = "|".join(
    [
        "libmozwayland.so",
        "libssl3.so",
        "libnssdbm3.so",
        "liblgpllibs.so",
        "libmozavutil.so",
        "libxul.so",
        "libmozgtk.so",
        "libnssckbi.so",
        "libclearkey.dylib",
        "libmozsqlite3.so",
        "libplc4.so",
        "libsmime3.so",
        "libclearkey.so",
        "libnssutil3.so",
        "libnss3.so",
        "libplds4.so",
        "libfreeblpriv3.so",
        "libsoftokn3.so",
        "libmozgtk.so",
        "libmozavcodec.so",
        "libnspr4.so",
        "IA2Marshal.dll",
        "lgpllibs.dll",
        "libEGL.dll",
        "libGLESv2.dll",
        "libmozsandbox.so",
        "AccessibleHandler.dll",
        "AccessibleMarshal.dll",
        "api-ms-win-core-console-l1-1-0.dll",
        "api-ms-win-core-datetime-l1-1-0.dll",
        "api-ms-win-core-debug-l1-1-0.dll",
        "api-ms-win-core-errorhandling-l1-1-0.dll",
        "api-ms-win-core-file-l1-1-0.dll",
        "api-ms-win-core-file-l1-2-0.dll",
        "api-ms-win-core-file-l2-1-0.dll",
        "api-ms-win-core-handle-l1-1-0.dll",
        "api-ms-win-core-heap-l1-1-0.dll",
        "api-ms-win-core-interlocked-l1-1-0.dll",
        "api-ms-win-core-libraryloader-l1-1-0.dll",
        "api-ms-win-core-localization-l1-2-0.dll",
        "api-ms-win-core-memory-l1-1-0.dll",
        "api-ms-win-core-namedpipe-l1-1-0.dll",
        "api-ms-win-core-processenvironment-l1-1-0.dll",
        "api-ms-win-core-processthreads-l1-1-0.dll",
        "api-ms-win-core-processthreads-l1-1-1.dll",
        "api-ms-win-core-profile-l1-1-0.dll",
        "api-ms-win-core-rtlsupport-l1-1-0.dll",
        "api-ms-win-core-string-l1-1-0.dll",
        "api-ms-win-core-synch-l1-1-0.dll",
        "api-ms-win-core-synch-l1-2-0.dll",
        "api-ms-win-core-sysinfo-l1-1-0.dll",
        "api-ms-win-core-timezone-l1-1-0.dll",
        "api-ms-win-core-util-l1-1-0.dll",
        "api-ms-win-crt-conio-l1-1-0.dll",
        "api-ms-win-crt-convert-l1-1-0.dll",
        "api-ms-win-crt-environment-l1-1-0.dll",
        "api-ms-win-crt-filesystem-l1-1-0.dll",
        "api-ms-win-crt-heap-l1-1-0.dll",
        "api-ms-win-crt-locale-l1-1-0.dll",
        "api-ms-win-crt-math-l1-1-0.dll",
        "api-ms-win-crt-multibyte-l1-1-0.dll",
        "api-ms-win-crt-private-l1-1-0.dll",
        "api-ms-win-crt-process-l1-1-0.dll",
        "api-ms-win-crt-runtime-l1-1-0.dll",
        "api-ms-win-crt-stdio-l1-1-0.dll",
        "api-ms-win-crt-string-l1-1-0.dll",
        "api-ms-win-crt-time-l1-1-0.dll",
        "api-ms-win-crt-utility-l1-1-0.dll",
        "d3dcompiler_47.dll",
        "freebl3.dll",
        "mozavcodec.dll",
        "mozavutil.dll",
        "mozglue.dll",
        "msvcp140.dll",
        "nss3.dll",
        "nssckbi.dll",
        "nssdbm3.dll",
        "qipcap64.dll",
        "softokn3.dll",
        "ucrtbase.dll",
        "vcruntime140.dll",
        "xul.dll",
        "clearkey.dll",
        "libfreebl3.dylib",
        "liblgpllibs.dylib",
        "libmozavcodec.dylib",
        "libmozavutil.dylib",
        "libmozglue.dylib",
        "libnss3.dylib",
        "libnssckbi.dylib",
        "libnssdbm3.dylib",
        "libplugin_child_interpose.dylib",
        "libsoftokn3.dylib",
    ]
).replace(".", r"\.")

# Compiled once at import time. Group 1 captures a whole listed library name;
# the second alternative matches any other "<word>.dll/.so/.dylib" token.
# Listed names are matched (and consumed) instead of being excluded with a
# negative lookahead: "\w+" cannot cross "-", so a lookahead only protects a
# hyphenated name at its first character and the last segment ("0.dll" in
# "api-ms-win-core-file-l1-1-0.dll") would still be masked.
_DLL_NAME_PATTERN = re.compile(
    fr"\b(?:({FIREFOX_DLLS_MATCH})|\w+(?:\.dll|\.so|\.dylib))\b"
)


def _mask_unless_listed(match):
    """Return a listed library name unchanged and ``__DLL_NAME__`` otherwise."""
    return match.group(0) if match.group(1) else "__DLL_NAME__"


def dll(text):
    """Mask library file names that are not listed in ``FIREFOX_DLLS_MATCH``.

    Tokens made of word characters followed by ``.dll``, ``.so`` or
    ``.dylib`` are replaced by ``__DLL_NAME__``; names listed in
    ``FIREFOX_DLLS_MATCH`` (including the hyphenated ones) are left intact.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with the unlisted library names substituted.
    """
    return _DLL_NAME_PATTERN.sub(_mask_unless_listed, text)
def synonyms(text):
synonyms = [
("safemode", ["safemode", "safe mode"]),
("str", ["str", "steps to reproduce", "repro steps"]),
("uaf", ["uaf", "use after free", "use-after-free"]),
("asan", ["asan", "address sanitizer", "addresssanitizer"]),
(
"permafailure",
[
"permafailure",
"permafailing",
"permafail",
"perma failure",
"perma failing",
"perma fail",
"perma-failure",
"perma-failing",
"perma-fail",
],
),
("spec", ["spec", "specification"]),
]
for synonym_group, synonym_list in synonyms:
text = re.sub(
"|".join(fr"\b{synonym}\b" for synonym in synonym_list),
synonym_group,
text,
flags=re.IGNORECASE,
def __call__(self, text):
return self.url.sub(
"__URL__", self.reference_url.sub("__CODE_REFERENCE_URL__", text)
)
return text
class fileref(object):
    """Callable that replaces source-file names with ``__FILE_REFERENCE__``.

    The regular expression is compiled once, when the instance is created,
    and reused by every call.
    """

    def __init__(self):
        # Word characters followed by one of the listed extensions, ending on
        # a word boundary.
        source_file_name = r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b"
        self.pattern = re.compile(source_file_name)

    def __call__(self, text):
        """Return *text* with every matching file name substituted."""
        cleaned = self.pattern.sub("__FILE_REFERENCE__", text)
        return cleaned
def crash(text):
    """Replace ``bp-`` crash identifiers with ``__CRASH_STATS_LINK__``.

    An identifier is ``bp-`` followed by dash-separated groups of 8, 4, 4
    and 4 lowercase hexadecimal characters, then a final group made of 6
    lowercase hexadecimal characters and 6 decimal digits, ending on a word
    boundary.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every crash identifier substituted.
    """
    crash_id = r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b"
    cleaned = re.sub(crash_id, "__CRASH_STATS_LINK__", text)
    return cleaned
class responses(object):
    """Callable that blanks out everything from a ``>`` to the end of its line.

    The regular expression is compiled once, when the instance is created,
    and reused by every call.
    """

    def __init__(self):
        # A ">" followed by at least one character other than a newline.
        quoted_tail = ">[^\n]+"
        self.pattern = re.compile(quoted_tail)

    def __call__(self, text):
        """Return *text* with each matching span replaced by one space."""
        cleaned = self.pattern.sub(" ", text)
        return cleaned
class hex(object):
    """Callable that replaces ``0x`` hexadecimal literals with ``__HEX_NUMBER__``.

    The regular expression is compiled once, when the instance is created,
    and reused by every call.
    """

    def __init__(self):
        # "0x" or "0X" followed by hexadecimal digits, between word boundaries.
        hex_literal = r"\b0[xX][0-9a-fA-F]+\b"
        self.pattern = re.compile(hex_literal)

    def __call__(self, text):
        """Return *text* with every hexadecimal literal substituted."""
        cleaned = self.pattern.sub("__HEX_NUMBER__", text)
        return cleaned
class dll(object):
    """Callable that masks library file names not on the built-in keep list.

    Tokens made of word characters followed by ``.dll``, ``.so`` or
    ``.dylib`` are replaced by ``__DLL_NAME__``; the names listed in
    ``__init__`` (including the hyphenated ones) are left intact.

    ``self.pattern`` matches listed names as well (captured in group 1) and
    is meant to be applied through ``__call__``, which decides per match
    whether to keep or mask it.
    """

    def __init__(self):
        keep_list = [
            "libmozwayland.so",
            "libssl3.so",
            "libnssdbm3.so",
            "liblgpllibs.so",
            "libmozavutil.so",
            "libxul.so",
            "libmozgtk.so",
            "libnssckbi.so",
            "libclearkey.dylib",
            "libmozsqlite3.so",
            "libplc4.so",
            "libsmime3.so",
            "libclearkey.so",
            "libnssutil3.so",
            "libnss3.so",
            "libplds4.so",
            "libfreeblpriv3.so",
            "libsoftokn3.so",
            "libmozavcodec.so",
            "libnspr4.so",
            "IA2Marshal.dll",
            "lgpllibs.dll",
            "libEGL.dll",
            "libGLESv2.dll",
            "libmozsandbox.so",
            "AccessibleHandler.dll",
            "AccessibleMarshal.dll",
            "api-ms-win-core-console-l1-1-0.dll",
            "api-ms-win-core-datetime-l1-1-0.dll",
            "api-ms-win-core-debug-l1-1-0.dll",
            "api-ms-win-core-errorhandling-l1-1-0.dll",
            "api-ms-win-core-file-l1-1-0.dll",
            "api-ms-win-core-file-l1-2-0.dll",
            "api-ms-win-core-file-l2-1-0.dll",
            "api-ms-win-core-handle-l1-1-0.dll",
            "api-ms-win-core-heap-l1-1-0.dll",
            "api-ms-win-core-interlocked-l1-1-0.dll",
            "api-ms-win-core-libraryloader-l1-1-0.dll",
            "api-ms-win-core-localization-l1-2-0.dll",
            "api-ms-win-core-memory-l1-1-0.dll",
            "api-ms-win-core-namedpipe-l1-1-0.dll",
            "api-ms-win-core-processenvironment-l1-1-0.dll",
            "api-ms-win-core-processthreads-l1-1-0.dll",
            "api-ms-win-core-processthreads-l1-1-1.dll",
            "api-ms-win-core-profile-l1-1-0.dll",
            "api-ms-win-core-rtlsupport-l1-1-0.dll",
            "api-ms-win-core-string-l1-1-0.dll",
            "api-ms-win-core-synch-l1-1-0.dll",
            "api-ms-win-core-synch-l1-2-0.dll",
            "api-ms-win-core-sysinfo-l1-1-0.dll",
            "api-ms-win-core-timezone-l1-1-0.dll",
            "api-ms-win-core-util-l1-1-0.dll",
            "api-ms-win-crt-conio-l1-1-0.dll",
            "api-ms-win-crt-convert-l1-1-0.dll",
            "api-ms-win-crt-environment-l1-1-0.dll",
            "api-ms-win-crt-filesystem-l1-1-0.dll",
            "api-ms-win-crt-heap-l1-1-0.dll",
            "api-ms-win-crt-locale-l1-1-0.dll",
            "api-ms-win-crt-math-l1-1-0.dll",
            "api-ms-win-crt-multibyte-l1-1-0.dll",
            "api-ms-win-crt-private-l1-1-0.dll",
            "api-ms-win-crt-process-l1-1-0.dll",
            "api-ms-win-crt-runtime-l1-1-0.dll",
            "api-ms-win-crt-stdio-l1-1-0.dll",
            "api-ms-win-crt-string-l1-1-0.dll",
            "api-ms-win-crt-time-l1-1-0.dll",
            "api-ms-win-crt-utility-l1-1-0.dll",
            "d3dcompiler_47.dll",
            "freebl3.dll",
            "mozavcodec.dll",
            "mozavutil.dll",
            "mozglue.dll",
            "msvcp140.dll",
            "nss3.dll",
            "nssckbi.dll",
            "nssdbm3.dll",
            "qipcap64.dll",
            "softokn3.dll",
            "ucrtbase.dll",
            "vcruntime140.dll",
            "xul.dll",
            "clearkey.dll",
            "libfreebl3.dylib",
            "liblgpllibs.dylib",
            "libmozavcodec.dylib",
            "libmozavutil.dylib",
            "libmozglue.dylib",
            "libnss3.dylib",
            "libnssckbi.dylib",
            "libnssdbm3.dylib",
            "libplugin_child_interpose.dylib",
            "libsoftokn3.dylib",
        ]
        FIREFOX_DLLS_MATCH = "|".join(re.escape(name) for name in keep_list)
        # Listed names are matched (and consumed) by group 1 instead of being
        # excluded with a negative lookahead: "\w+" cannot cross "-", so a
        # lookahead only protects a hyphenated name at its first character and
        # the last segment ("0.dll" in "api-ms-win-core-file-l1-1-0.dll")
        # would still be masked.
        self.pattern = re.compile(
            fr"\b(?:({FIREFOX_DLLS_MATCH})|\w+(?:\.dll|\.so|\.dylib))\b"
        )

    @staticmethod
    def _mask_unless_listed(match):
        """Return a listed library name unchanged and ``__DLL_NAME__`` otherwise."""
        return match.group(0) if match.group(1) else "__DLL_NAME__"

    def __call__(self, text):
        """Return *text* with every unlisted library name substituted."""
        return self.pattern.sub(self._mask_unless_listed, text)
class synonyms(object):
    """Callable that rewrites known synonyms to one canonical spelling.

    ``self.pattern`` maps each canonical term to a case-insensitive regular
    expression matching any of its variants as whole words. The expressions
    are compiled once, when the instance is created.
    """

    def __init__(self):
        canonical_to_variants = (
            ("safemode", ["safemode", "safe mode"]),
            ("str", ["str", "steps to reproduce", "repro steps"]),
            ("uaf", ["uaf", "use after free", "use-after-free"]),
            ("asan", ["asan", "address sanitizer", "addresssanitizer"]),
            (
                "permafailure",
                [
                    "permafailure",
                    "permafailing",
                    "permafail",
                    "perma failure",
                    "perma failing",
                    "perma fail",
                    "perma-failure",
                    "perma-failing",
                    "perma-fail",
                ],
            ),
            ("spec", ["spec", "specification"]),
        )
        self.pattern = {
            canonical: re.compile(
                "|".join(fr"\b{variant}\b" for variant in variants),
                flags=re.IGNORECASE,
            )
            for canonical, variants in canonical_to_variants
        }

    def __call__(self, text):
        """Return *text* with every variant replaced by its canonical term."""
        # Groups are applied one after another, in the order they were defined.
        for canonical, regex in self.pattern.items():
            text = regex.sub(canonical, text)
        return text
class crash(object):
    """Callable that replaces ``bp-`` crash identifiers with ``__CRASH_STATS_LINK__``.

    The regular expression is compiled once, when the instance is created,
    and reused by every call.
    """

    def __init__(self):
        # "bp-" followed by hex groups of 8-4-4-4 characters and a final group
        # of 6 hex characters plus 6 decimal digits, ending on a word boundary.
        crash_id = r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b"
        self.pattern = re.compile(crash_id)

    def __call__(self, text):
        """Return *text* with every crash identifier substituted."""
        cleaned = self.pattern.sub("__CRASH_STATS_LINK__", text)
        return cleaned

Просмотреть файл

@ -48,9 +48,9 @@ class AssigneeModel(BugModel):
]
cleanup_functions = [
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
feature_cleanup.fileref(),
feature_cleanup.url(),
feature_cleanup.synonyms(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -41,9 +41,9 @@ class BackoutModel(CommitModel):
]
cleanup_functions = [
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
feature_cleanup.fileref(),
feature_cleanup.url(),
feature_cleanup.synonyms(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -77,9 +77,9 @@ class ComponentModel(BugModel):
]
cleanup_functions = [
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
feature_cleanup.fileref(),
feature_cleanup.url(),
feature_cleanup.synonyms(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -47,9 +47,9 @@ class DefectModel(BugModel):
feature_extractors.append(bug_features.had_severity_enhancement())
cleanup_functions = [
feature_cleanup.url,
feature_cleanup.fileref,
feature_cleanup.synonyms,
feature_cleanup.url(),
feature_cleanup.fileref(),
feature_cleanup.synonyms(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -41,9 +41,9 @@ class DevDocNeededModel(BugModel):
]
cleanup_functions = [
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
feature_cleanup.fileref(),
feature_cleanup.url(),
feature_cleanup.synonyms(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -36,9 +36,9 @@ class QANeededModel(BugModel):
]
cleanup_functions = [
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
feature_cleanup.fileref(),
feature_cleanup.url(),
feature_cleanup.synonyms(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -47,12 +47,12 @@ class TrackingModel(BugModel):
]
cleanup_functions = [
feature_cleanup.url,
feature_cleanup.fileref,
feature_cleanup.hex,
feature_cleanup.dll,
feature_cleanup.synonyms,
feature_cleanup.crash,
feature_cleanup.url(),
feature_cleanup.fileref(),
feature_cleanup.hex(),
feature_cleanup.dll(),
feature_cleanup.synonyms(),
feature_cleanup.crash(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -36,9 +36,9 @@ class UpliftModel(BugModel):
]
cleanup_functions = [
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
feature_cleanup.fileref(),
feature_cleanup.url(),
feature_cleanup.synonyms(),
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -26,7 +26,7 @@ def test_url():
),
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.url(orig_text) == cleaned_text
assert feature_cleanup.url()(orig_text) == cleaned_text
def test_fileref():
@ -37,7 +37,7 @@ def test_fileref():
)
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.fileref(orig_text) == cleaned_text
assert feature_cleanup.fileref()(orig_text) == cleaned_text
def test_responses():
@ -57,7 +57,7 @@ def test_responses():
),
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.responses(orig_text) == cleaned_text
assert feature_cleanup.responses()(orig_text) == cleaned_text
def test_hex():
@ -72,7 +72,7 @@ def test_hex():
),
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.hex(orig_text) == cleaned_text
assert feature_cleanup.hex()(orig_text) == cleaned_text
def test_dll():
@ -100,7 +100,7 @@ def test_dll():
),
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.dll(orig_text) == cleaned_text
assert feature_cleanup.dll()(orig_text) == cleaned_text
def test_synonyms():
@ -118,7 +118,7 @@ def test_synonyms():
("found via address sanitizer or asan", "found via asan or asan"),
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.synonyms(orig_text) == cleaned_text
assert feature_cleanup.synonyms()(orig_text) == cleaned_text
def test_crash():
@ -133,4 +133,4 @@ def test_crash():
),
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.crash(orig_text) == cleaned_text
assert feature_cleanup.crash()(orig_text) == cleaned_text