From c440db73151cf778a1677845ce91ecdb2119189b Mon Sep 17 00:00:00 2001 From: Ayush Shridhar Date: Thu, 9 May 2019 18:39:26 +0530 Subject: [PATCH] Use re.compile to speedup feature cleanups (#351) Fixes #338. --- bugbug/feature_cleanup.py | 349 ++++++++++++++++++---------------- bugbug/models/assignee.py | 6 +- bugbug/models/backout.py | 6 +- bugbug/models/component.py | 6 +- bugbug/models/defect.py | 6 +- bugbug/models/devdocneeded.py | 6 +- bugbug/models/qaneeded.py | 6 +- bugbug/models/tracking.py | 12 +- bugbug/models/uplift.py | 6 +- tests/test_feature_cleanup.py | 14 +- 10 files changed, 219 insertions(+), 198 deletions(-) diff --git a/bugbug/feature_cleanup.py b/bugbug/feature_cleanup.py index 6519b7b6..e5cbbf40 100644 --- a/bugbug/feature_cleanup.py +++ b/bugbug/feature_cleanup.py @@ -6,172 +6,193 @@ import re -def url(text): - text = re.sub( - r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+", - "__CODE_REFERENCE_URL__", - text, - ) - return re.sub(r"http\S+", "__URL__", text) +class url(object): + def __init__(self): + self.reference_url = re.compile( + r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+" + ) + self.url = re.compile(r"http\S+") - -def fileref(text): - return re.sub( - r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b", - "__FILE_REFERENCE__", - text, - ) - - -def responses(text): - return re.sub(">[^\n]+", " ", text) - - -def hex(text): - return re.sub(r"\b0[xX][0-9a-fA-F]+\b", "__HEX_NUMBER__", text) - - -FIREFOX_DLLS_MATCH = "|".join( - [ - "libmozwayland.so", - "libssl3.so", - "libnssdbm3.so", - "liblgpllibs.so", - "libmozavutil.so", - "libxul.so", - "libmozgtk.so", - "libnssckbi.so", - "libclearkey.dylib", - "libmozsqlite3.so", - "libplc4.so", - "libsmime3.so", - "libclearkey.so", - "libnssutil3.so", - "libnss3.so", - "libplds4.so", - "libfreeblpriv3.so", - "libsoftokn3.so", - "libmozgtk.so", - "libmozavcodec.so", - "libnspr4.so", - "IA2Marshal.dll", - "lgpllibs.dll", - "libEGL.dll", - "libGLESv2.dll", - "libmozsandbox.so", - "AccessibleHandler.dll", - "AccessibleMarshal.dll", - "api-ms-win-core-console-l1-1-0.dll", - "api-ms-win-core-datetime-l1-1-0.dll", - "api-ms-win-core-debug-l1-1-0.dll", - "api-ms-win-core-errorhandling-l1-1-0.dll", - "api-ms-win-core-file-l1-1-0.dll", - "api-ms-win-core-file-l1-2-0.dll", - "api-ms-win-core-file-l2-1-0.dll", - "api-ms-win-core-handle-l1-1-0.dll", - "api-ms-win-core-heap-l1-1-0.dll", - "api-ms-win-core-interlocked-l1-1-0.dll", - "api-ms-win-core-libraryloader-l1-1-0.dll", - "api-ms-win-core-localization-l1-2-0.dll", - "api-ms-win-core-memory-l1-1-0.dll", - "api-ms-win-core-namedpipe-l1-1-0.dll", - "api-ms-win-core-processenvironment-l1-1-0.dll", - "api-ms-win-core-processthreads-l1-1-0.dll", - "api-ms-win-core-processthreads-l1-1-1.dll", - "api-ms-win-core-profile-l1-1-0.dll", - "api-ms-win-core-rtlsupport-l1-1-0.dll", - "api-ms-win-core-string-l1-1-0.dll", - "api-ms-win-core-synch-l1-1-0.dll", - "api-ms-win-core-synch-l1-2-0.dll", - "api-ms-win-core-sysinfo-l1-1-0.dll", - "api-ms-win-core-timezone-l1-1-0.dll", - "api-ms-win-core-util-l1-1-0.dll", - "api-ms-win-crt-conio-l1-1-0.dll", - "api-ms-win-crt-convert-l1-1-0.dll", - "api-ms-win-crt-environment-l1-1-0.dll", - "api-ms-win-crt-filesystem-l1-1-0.dll", - "api-ms-win-crt-heap-l1-1-0.dll", - "api-ms-win-crt-locale-l1-1-0.dll", - "api-ms-win-crt-math-l1-1-0.dll", - "api-ms-win-crt-multibyte-l1-1-0.dll", - "api-ms-win-crt-private-l1-1-0.dll", - "api-ms-win-crt-process-l1-1-0.dll", - "api-ms-win-crt-runtime-l1-1-0.dll", - "api-ms-win-crt-stdio-l1-1-0.dll", - "api-ms-win-crt-string-l1-1-0.dll", - "api-ms-win-crt-time-l1-1-0.dll", - "api-ms-win-crt-utility-l1-1-0.dll", - "d3dcompiler_47.dll", - "freebl3.dll", - "mozavcodec.dll", - "mozavutil.dll", - "mozglue.dll", - "msvcp140.dll", - "nss3.dll", - "nssckbi.dll", - "nssdbm3.dll", - "qipcap64.dll", - "softokn3.dll", - "ucrtbase.dll", - "vcruntime140.dll", - "xul.dll", - "clearkey.dll", - "libfreebl3.dylib", - "liblgpllibs.dylib", - "libmozavcodec.dylib", - "libmozavutil.dylib", - "libmozglue.dylib", - "libnss3.dylib", - "libnssckbi.dylib", - "libnssdbm3.dylib", - "libplugin_child_interpose.dylib", - "libsoftokn3.dylib", - ] -).replace(".", r"\.") - - -def dll(text): - regex = fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b" - return re.sub(regex, "__DLL_NAME__", text) - - -def synonyms(text): - synonyms = [ - ("safemode", ["safemode", "safe mode"]), - ("str", ["str", "steps to reproduce", "repro steps"]), - ("uaf", ["uaf", "use after free", "use-after-free"]), - ("asan", ["asan", "address sanitizer", "addresssanitizer"]), - ( - "permafailure", - [ - "permafailure", - "permafailing", - "permafail", - "perma failure", - "perma failing", - "perma fail", - "perma-failure", - "perma-failing", - "perma-fail", - ], - ), - ("spec", ["spec", "specification"]), - ] - - for synonym_group, synonym_list in synonyms: - text = re.sub( - "|".join(fr"\b{synonym}\b" for synonym in synonym_list), - synonym_group, - text, - flags=re.IGNORECASE, + def __call__(self, text): + return self.url.sub( + "__URL__", self.reference_url.sub("__CODE_REFERENCE_URL__", text) ) - return text + +class fileref(object): + def __init__(self): + self.pattern = re.compile( + r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b" + ) + + def __call__(self, text): + return self.pattern.sub("__FILE_REFERENCE__", text) -def crash(text): - return re.sub( - r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b", - "__CRASH_STATS_LINK__", - text, - ) +class responses(object): + def __init__(self): + self.pattern = re.compile(">[^\n]+") + + def __call__(self, text): + return self.pattern.sub(" ", text) + + +class hex(object): + def __init__(self): + self.pattern = re.compile(r"\b0[xX][0-9a-fA-F]+\b") + + def __call__(self, text): + return self.pattern.sub("__HEX_NUMBER__", text) + + +class dll(object): + def __init__(self): + FIREFOX_DLLS_MATCH = "|".join( + [ + "libmozwayland.so", + "libssl3.so", + "libnssdbm3.so", + "liblgpllibs.so", + "libmozavutil.so", + "libxul.so", + "libmozgtk.so", + "libnssckbi.so", + "libclearkey.dylib", + "libmozsqlite3.so", + "libplc4.so", + "libsmime3.so", + "libclearkey.so", + "libnssutil3.so", + "libnss3.so", + "libplds4.so", + "libfreeblpriv3.so", + "libsoftokn3.so", + "libmozgtk.so", + "libmozavcodec.so", + "libnspr4.so", + "IA2Marshal.dll", + "lgpllibs.dll", + "libEGL.dll", + "libGLESv2.dll", + "libmozsandbox.so", + "AccessibleHandler.dll", + "AccessibleMarshal.dll", + "api-ms-win-core-console-l1-1-0.dll", + "api-ms-win-core-datetime-l1-1-0.dll", + "api-ms-win-core-debug-l1-1-0.dll", + "api-ms-win-core-errorhandling-l1-1-0.dll", + "api-ms-win-core-file-l1-1-0.dll", + "api-ms-win-core-file-l1-2-0.dll", + "api-ms-win-core-file-l2-1-0.dll", + "api-ms-win-core-handle-l1-1-0.dll", + "api-ms-win-core-heap-l1-1-0.dll", + "api-ms-win-core-interlocked-l1-1-0.dll", + "api-ms-win-core-libraryloader-l1-1-0.dll", + "api-ms-win-core-localization-l1-2-0.dll", + "api-ms-win-core-memory-l1-1-0.dll", + "api-ms-win-core-namedpipe-l1-1-0.dll", + "api-ms-win-core-processenvironment-l1-1-0.dll", + "api-ms-win-core-processthreads-l1-1-0.dll", + "api-ms-win-core-processthreads-l1-1-1.dll", + "api-ms-win-core-profile-l1-1-0.dll", + "api-ms-win-core-rtlsupport-l1-1-0.dll", + "api-ms-win-core-string-l1-1-0.dll", + "api-ms-win-core-synch-l1-1-0.dll", + "api-ms-win-core-synch-l1-2-0.dll", + "api-ms-win-core-sysinfo-l1-1-0.dll", + "api-ms-win-core-timezone-l1-1-0.dll", + "api-ms-win-core-util-l1-1-0.dll", + "api-ms-win-crt-conio-l1-1-0.dll", + "api-ms-win-crt-convert-l1-1-0.dll", + "api-ms-win-crt-environment-l1-1-0.dll", + "api-ms-win-crt-filesystem-l1-1-0.dll", + "api-ms-win-crt-heap-l1-1-0.dll", + "api-ms-win-crt-locale-l1-1-0.dll", + "api-ms-win-crt-math-l1-1-0.dll", + "api-ms-win-crt-multibyte-l1-1-0.dll", + "api-ms-win-crt-private-l1-1-0.dll", + "api-ms-win-crt-process-l1-1-0.dll", + "api-ms-win-crt-runtime-l1-1-0.dll", + "api-ms-win-crt-stdio-l1-1-0.dll", + "api-ms-win-crt-string-l1-1-0.dll", + "api-ms-win-crt-time-l1-1-0.dll", + "api-ms-win-crt-utility-l1-1-0.dll", + "d3dcompiler_47.dll", + "freebl3.dll", + "mozavcodec.dll", + "mozavutil.dll", + "mozglue.dll", + "msvcp140.dll", + "nss3.dll", + "nssckbi.dll", + "nssdbm3.dll", + "qipcap64.dll", + "softokn3.dll", + "ucrtbase.dll", + "vcruntime140.dll", + "xul.dll", + "clearkey.dll", + "libfreebl3.dylib", + "liblgpllibs.dylib", + "libmozavcodec.dylib", + "libmozavutil.dylib", + "libmozglue.dylib", + "libnss3.dylib", + "libnssckbi.dylib", + "libnssdbm3.dylib", + "libplugin_child_interpose.dylib", + "libsoftokn3.dylib", + ] + ).replace(".", r"\.") + self.pattern = re.compile( + fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b" + ) + + def __call__(self, text): + return self.pattern.sub("__DLL_NAME__", text) + + +class synonyms(object): + def __init__(self): + synonyms = [ + ("safemode", ["safemode", "safe mode"]), + ("str", ["str", "steps to reproduce", "repro steps"]), + ("uaf", ["uaf", "use after free", "use-after-free"]), + ("asan", ["asan", "address sanitizer", "addresssanitizer"]), + ( + "permafailure", + [ + "permafailure", + "permafailing", + "permafail", + "perma failure", + "perma failing", + "perma fail", + "perma-failure", + "perma-failing", + "perma-fail", + ], + ), + ("spec", ["spec", "specification"]), + ] + self.pattern = {} + for synonym_group, synonym_list in synonyms: + self.pattern[synonym_group] = re.compile( + "|".join(fr"\b{synonym}\b" for synonym in synonym_list), + flags=re.IGNORECASE, + ) + + def __call__(self, text): + for synonym_group in self.pattern: + text = self.pattern[synonym_group].sub(synonym_group, text) + return text + + +class crash(object): + def __init__(self): + self.pattern = re.compile( + r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b" + ) + + def __call__(self, text): + return self.pattern.sub("__CRASH_STATS_LINK__", text) diff --git a/bugbug/models/assignee.py b/bugbug/models/assignee.py index 3f3c62b0..54ef9004 100644 --- a/bugbug/models/assignee.py +++ b/bugbug/models/assignee.py @@ -48,9 +48,9 @@ class AssigneeModel(BugModel): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/backout.py b/bugbug/models/backout.py index f75962aa..dbf04e04 100644 --- a/bugbug/models/backout.py +++ b/bugbug/models/backout.py @@ -41,9 +41,9 @@ class BackoutModel(CommitModel): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/component.py b/bugbug/models/component.py index 46cf9052..38a37db2 100644 --- a/bugbug/models/component.py +++ b/bugbug/models/component.py @@ -77,9 +77,9 @@ class ComponentModel(BugModel): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/defect.py b/bugbug/models/defect.py index a156df59..6d5726bc 100644 --- a/bugbug/models/defect.py +++ b/bugbug/models/defect.py @@ -47,9 +47,9 @@ class DefectModel(BugModel): feature_extractors.append(bug_features.had_severity_enhancement()) cleanup_functions = [ - feature_cleanup.url, - feature_cleanup.fileref, - feature_cleanup.synonyms, + feature_cleanup.url(), + feature_cleanup.fileref(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/devdocneeded.py b/bugbug/models/devdocneeded.py index 13d77f71..d6fde013 100644 --- a/bugbug/models/devdocneeded.py +++ b/bugbug/models/devdocneeded.py @@ -41,9 +41,9 @@ class DevDocNeededModel(BugModel): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/qaneeded.py b/bugbug/models/qaneeded.py index 1a3098e4..2bbf12ca 100644 --- a/bugbug/models/qaneeded.py +++ b/bugbug/models/qaneeded.py @@ -36,9 +36,9 @@ class QANeededModel(BugModel): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/tracking.py b/bugbug/models/tracking.py index d35d292f..9af7230b 100644 --- a/bugbug/models/tracking.py +++ b/bugbug/models/tracking.py @@ -47,12 +47,12 @@ class TrackingModel(BugModel): ] cleanup_functions = [ - feature_cleanup.url, - feature_cleanup.fileref, - feature_cleanup.hex, - feature_cleanup.dll, - feature_cleanup.synonyms, - feature_cleanup.crash, + feature_cleanup.url(), + feature_cleanup.fileref(), + feature_cleanup.hex(), + feature_cleanup.dll(), + feature_cleanup.synonyms(), + feature_cleanup.crash(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/uplift.py b/bugbug/models/uplift.py index 984fde5f..0dc7d008 100644 --- a/bugbug/models/uplift.py +++ b/bugbug/models/uplift.py @@ -36,9 +36,9 @@ class UpliftModel(BugModel): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/tests/test_feature_cleanup.py b/tests/test_feature_cleanup.py index f4791747..edfbe593 100644 --- a/tests/test_feature_cleanup.py +++ b/tests/test_feature_cleanup.py @@ -26,7 +26,7 @@ def test_url(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.url(orig_text) == cleaned_text + assert feature_cleanup.url()(orig_text) == cleaned_text def test_fileref(): @@ -37,7 +37,7 @@ def test_fileref(): ) ] for orig_text, cleaned_text in tests: - assert feature_cleanup.fileref(orig_text) == cleaned_text + assert feature_cleanup.fileref()(orig_text) == cleaned_text def test_responses(): @@ -57,7 +57,7 @@ def test_responses(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.responses(orig_text) == cleaned_text + assert feature_cleanup.responses()(orig_text) == cleaned_text def test_hex(): @@ -72,7 +72,7 @@ def test_hex(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.hex(orig_text) == cleaned_text + assert feature_cleanup.hex()(orig_text) == cleaned_text def test_dll(): @@ -100,7 +100,7 @@ def test_dll(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.dll(orig_text) == cleaned_text + assert feature_cleanup.dll()(orig_text) == cleaned_text def test_synonyms(): @@ -118,7 +118,7 @@ def test_synonyms(): ("found via address sanitizer or asan", "found via asan or asan"), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.synonyms(orig_text) == cleaned_text + assert feature_cleanup.synonyms()(orig_text) == cleaned_text def test_crash(): @@ -133,4 +133,4 @@ def test_crash(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.crash(orig_text) == cleaned_text + assert feature_cleanup.crash()(orig_text) == cleaned_text