Add a model to identify invalid reports for Firefox in-product reporter (#3790)

This commit is contained in:
Ksenia 2023-11-18 20:56:31 -05:00 коммит произвёл GitHub
Родитель d454491af9
Коммит 97a0f86dd6
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
7 изменённых файлов: 283 добавлений и 2 удалений

Просмотреть файл

@ -198,3 +198,66 @@ class crash(object):
def __call__(self, text):
    """Return *text* with every match of ``self.pattern`` replaced.

    Each match is substituted with the ``__CRASH_STATS_LINK__`` placeholder.
    """
    placeholder = "__CRASH_STATS_LINK__"
    replaced = self.pattern.sub(placeholder, text)
    return replaced
class CleanCompatibilityReportDescription(object):
    """Callable that reduces a compatibility report body to its free-form text.

    Calling an instance first deletes known boilerplate fragments (collapsed
    ``<details>`` sections, the webcompat.com footer, console-log links,
    screenshots and watcher lists) and then returns the description and/or
    the steps to reproduce found in what is left.
    """

    def __init__(self):
        # Boilerplate fragments, removed in this order before any field is
        # extracted: (name, regular expression, flags).
        boilerplate = (
            ("details", r"<details>.*?</details>", re.DOTALL),
            (
                "footer",
                r"_From \[webcompat\.com\]\(https://webcompat\.com/\) with ❤_",
                0,
            ),
            (
                "link",
                r"\[View console log messages\]\(https://webcompat\.com/console_logs/.*?\)",
                0,
            ),
            ("screenshot", r"\[\!\[Screenshot Description\]\(.*?\)\]\(.*?\)", 0),
            (
                "screenshot_md",
                r'\*\*Screenshot\*\*\s*\r?\n\<img width="[\d]+" alt="[^"]*" src="https?://[^"]+"[^>]*>',
                0,
            ),
            ("watchers", r"\*\*Watchers:\*\*(?:\r?\n@[\w-]+)+", 0),
        )
        self.sub_patterns = {
            name: re.compile(expression, flags)
            for name, expression, flags in boilerplate
        }

        # Fields read from the cleaned text; each pattern has one capture group.
        fields = (
            ("description", r"\*\*Description\*\*: (.*?)\n"),
            ("problem_type", r"\*\*Problem type\*\*: (.*?)\n"),
            ("steps", r"\*\*Steps to Reproduce\*\*:?(.*)"),
        )
        self.extract_patterns = {
            name: re.compile(expression, re.DOTALL) for name, expression in fields
        }

        # Descriptions treated as pre-defined choices rather than user text:
        # when one of them is found (and the problem type is not
        # "Something else") only the steps are returned by __call__.
        self.default_problems = {
            "Browser unsupported",
            "Buttons or links not working",
            "Desktop site instead of mobile site",
            "Images not loaded",
            "Items are misaligned",
            "Items are overlapped",
            "Items not fully visible",
            "Media controls are broken or missing",
            "Missing items",
            "Page not loading correctly",
            "Problems with Captcha",
            "The video or audio does not play",
            "There is no audio",
            "There is no video",
            "Unable to login",
            "Unable to type",
        }

    def _extract_and_strip(self, pattern, text):
        """Return the stripped first group of *pattern* in *text*, or ``""``."""
        found = pattern.search(text)
        if found is None:
            return ""
        return found.group(1).strip()

    def __call__(self, text):
        """Return the cleaned-up description and steps contained in *text*.

        The result is the steps alone when the description is one of
        ``default_problems`` and the problem type is not "Something else";
        otherwise it is the description, followed by ``"\\n "`` and the steps
        when steps are present.
        """
        cleaned = text
        for boilerplate in self.sub_patterns.values():
            cleaned = boilerplate.sub("", cleaned)

        patterns = self.extract_patterns
        problem_type = self._extract_and_strip(patterns["problem_type"], cleaned)
        description = self._extract_and_strip(patterns["description"], cleaned)
        steps = self._extract_and_strip(patterns["steps"], cleaned)

        is_default_choice = (
            problem_type != "Something else"
            and description in self.default_problems
        )
        if is_default_choice:
            return steps
        if steps:
            return f"{description}\n {steps}"
        return description

Просмотреть файл

@ -21,6 +21,7 @@ MODELS = {
"devdocneeded": "bugbug.models.devdocneeded.DevDocNeededModel",
"duplicate": "bugbug.models.duplicate.DuplicateModel",
"fixtime": "bugbug.models.fixtime.FixTimeModel",
"invalidcompatibilityreport": "bugbug.models.invalid_compatibility_report.InvalidCompatibilityReportModel",
"needsdiagnosis": "bugbug.models.needsdiagnosis.NeedsDiagnosisModel",
"qaneeded": "bugbug.models.qaneeded.QANeededModel",
"rcatype": "bugbug.models.rcatype.RCATypeModel",

Просмотреть файл

@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from bugbug import feature_cleanup, issue_features, utils
from bugbug.model import IssueModel
# Module-level logger; get_labels uses it to report how many issues got each label.
logger = logging.getLogger(__name__)
class InvalidCompatibilityReportModel(IssueModel):
    """Binary model flagging invalid reports among ``webcompat/web-bugs`` issues.

    The only input of the classifier is the vectorized ``first_comment``
    column produced by the issue extractor.
    """

    def __init__(self, lemmatization=False):
        super().__init__(
            owner="webcompat", repo="web-bugs", lemmatization=lemmatization
        )

        self.calculate_importance = False

        # The extractor gets no feature extractors and no cleanup functions;
        # cleanup is applied separately in items_gen.
        report_extractor = issue_features.IssueExtractor([], [], rollback=False)
        text_columns = ColumnTransformer(
            [
                (
                    "first_comment",
                    self.text_vectorizer(min_df=0.0001),
                    "first_comment",
                ),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [
                ("report_extractor", report_extractor),
                ("union", text_columns),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())

    def items_gen(self, classes):
        """Yield ``(issue, label)`` pairs with the issue body cleaned up.

        Each issue yielded by the parent implementation is copied and its
        ``"body"`` replaced by the output of
        ``CleanCompatibilityReportDescription``.
        """
        # The cleanup lives here rather than in the extraction pipeline so
        # that it is not applied during classification: the text structure of
        # GitHub issues differs from that of reports.
        strip_boilerplate = feature_cleanup.CleanCompatibilityReportDescription()

        for issue, label in super().items_gen(classes):
            cleaned_issue = dict(issue)
            cleaned_issue["body"] = strip_boilerplate(issue["body"])
            yield cleaned_issue, label

    def get_labels(self):
        """Build the training labels from the GitHub issues.

        Returns:
            A ``(classes, [0, 1])`` tuple where ``classes`` maps an issue
            number to ``1`` (milestone "invalid" or "incomplete" together
            with the "wcrt-invalid" label) or ``0`` (the issue was at some
            point milestoned "needsdiagnosis" or "moved"). Issues matching
            neither rule are left out.
        """
        invalid_milestones = ("invalid", "incomplete")
        valid_milestones = ("needsdiagnosis", "moved")

        classes = {}
        for issue in self.github.get_issues():
            if not (issue["title"] and issue["body"]):
                continue

            # Issues still awaiting moderation carry a placeholder title and
            # no meaningful body, so they are skipped.
            if issue["title"] == "In the moderation queue.":
                continue

            milestone = issue["milestone"]
            if (
                milestone
                and milestone["title"] in invalid_milestones
                and any(tag["name"] == "wcrt-invalid" for tag in issue["labels"])
            ):
                classes[issue["number"]] = 1
                continue

            if any(
                event["event"] == "milestoned"
                and event["milestone"]["title"] in valid_milestones
                for event in issue["events"]
            ):
                classes[issue["number"]] = 0

        assigned = list(classes.values())
        logger.info("%d issues have been moved to invalid", assigned.count(1))
        logger.info("%d issues have not been moved to invalid", assigned.count(0))

        return classes, [0, 1]

    def get_feature_names(self):
        """Return the output feature names of the pipeline's "union" step."""
        union_step = self.extraction_pipeline.named_steps["union"]
        return union_step.get_feature_names_out()

Просмотреть файл

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
import argparse
import os
from logging import INFO, basicConfig, getLogger
import numpy as np
import requests
from bugbug.models import get_model_class
from bugbug.utils import download_model
# Configure root logging at INFO level so the messages logged by this script are emitted.
basicConfig(level=INFO)
# Module-level logger used by classify_reports.
logger = getLogger(__name__)
def classify_reports(model_name: str, report_text: str) -> None:
    """Classify *report_text* with the named model and log the prediction.

    The model is loaded from the local file ``"<model_name>model"``; when
    that file is missing a download is attempted first.

    Args:
        model_name: Name of the model to load.
        report_text: Text used as the body of the report to classify.

    Raises:
        SystemExit: With code 1 when the model download fails with an HTTP
            error.
    """
    model_file_name = f"{model_name}model"

    model_is_local = os.path.exists(model_file_name)
    if not model_is_local:
        logger.info("%s does not exist. Downloading the model....", model_file_name)
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model = get_model_class(model_name).load(model_file_name)

    logger.info("%s", report_text)

    report = dict(body=report_text, title="")

    if not model.calculate_importance:
        probas = model.classify(report, probabilities=True, importances=False)
    else:
        probas, importance = model.classify(
            report, probabilities=True, importances=True
        )
        model.print_feature_importances(
            importance["importances"], class_probabilities=probas
        )

    # A single report was submitted, so only the first row is of interest.
    probability = probas[0]
    pred_index = np.argmax(probability)

    if len(probability) <= 2:
        # Two-class output: index 1 is reported as the positive class.
        pred_class = "Positive" if pred_index == 1 else "Negative"
    else:
        pred_class = model.le.inverse_transform([pred_index])[0]

    logger.info("%s %s", pred_class, probability)

    # Blocks until a line is read from standard input.
    input()
def main() -> None:
    """Parse the command line and classify the supplied report text.

    Command line:
        model: Positional name of the model to use.
        --report-text: Text of the report to classify (required).

    Raises:
        SystemExit: Raised by argparse (exit status 2) when the arguments
            are invalid, including when ``--report-text`` is missing.
    """
    description = "Perform evaluation of user report using the specified model"
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("model", type=str, help="Which model to use for evaluation")
    parser.add_argument(
        "--report-text",
        help="Report text to classify",
        type=str,
        # Without this the option defaults to None and classify_reports would
        # be handed None as the report body; fail early with a usage message.
        required=True,
    )

    args = parser.parse_args()

    classify_reports(args.model, args.report_text)


if __name__ == "__main__":
    main()

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -134,3 +134,29 @@ def test_crash():
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.crash()(orig_text) == cleaned_text
def test_clean_compatibility_report_description():
# Each entry pairs a full report body with the text that
# CleanCompatibilityReportDescription is expected to return for it.
tests = [
# Description is one of the default problems and the problem type is not
# "Something else": only the steps to reproduce are expected.
(
'<!-- @browser: Firefox 117.0 -->\n<!-- @ua_header: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0 -->\n<!-- @reported_with: unknown -->\n<!-- @public_url: https://github.com/webcompat/web-bugs/issues/126685 -->\n\n**URL**: https://www.lequipe.fr/explore/video/la-course-en-tete/20177528\n\n**Browser / Version**: Firefox 117.0\n**Operating System**: Windows 10\n**Tested Another Browser**: Yes Chrome\n\n**Problem type**: Video or audio doesn\'t play\n**Description**: Media controls are broken or missing\n**Steps to Reproduce**:\nVideo is starting but we cannot use the video panel control. It working on Brave.\r\n<details>\r\n <summary>View the screenshot</summary>\r\n <img alt="Screenshot" src="https://webcompat.com/uploads/2023/9/501af310-e646-4b2c-8eb9-7f21ce8725fe.jpg">\r\n </details>\n\n<details>\n<summary>Browser Configuration</summary>\n<ul>\n <li>None</li>\n</ul>\n</details>\n\n_From [webcompat.com](https://webcompat.com/) with ❤_',
"Video is starting but we cannot use the video panel control. It working on Brave.",
),
# Problem type is "Something else": the description followed by the steps
# is expected.
(
"<!-- @browser: Firefox Mobile 120.0 -->\n<!-- @ua_header: Mozilla/5.0 (Android 10; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0 -->\n<!-- @reported_with: unknown -->\n<!-- @public_url: https://github.com/webcompat/web-bugs/issues/128961 -->\n\n**URL**: https://www.jianshu.com/p/ba52ec38ac51\n\n**Browser / Version**: Firefox Mobile 120.0\n**Operating System**: Android 10\n**Tested Another Browser**: Yes Edge\n\n**Problem type**: Something else\n**Description**: Couldn't scroll down\n**Steps to Reproduce**:\nScroll down the page, then scroll to top, scroll down again, the page couldn't scroll (will always back to top). \n\n<details>\n<summary>Browser Configuration</summary>\n<ul>\n <li>None</li>\n</ul>\n</details>\n\n_From [webcompat.com](https://webcompat.com/) with ❤_",
"Couldn't scroll down\n Scroll down the page, then scroll to top, scroll down again, the page couldn't scroll (will always back to top).",
),
# Body without "**Description**: " and "**Problem type**: " fields: the
# expected text is an empty description, "\n " and the steps, with the
# trailing screenshot markup removed.
(
'**URL**:\r\nhttps://samarabags.com/collections/all-bags/products/the-jewelry-box?variant=40390455820322\r\n\r\n**Browser/Version**:\r\nFirefox 112.0.2\r\n\r\n**Operating System**:\r\nMacOS Ventura 13.3.1 (a) (22E772610a)\r\nPrivate window\r\n\r\n**What seems to be the trouble?(Required)**\r\n- [ ] Desktop site instead of mobile site\r\n- [ ] Mobile site is not usable\r\n- [ ] Video doesn\'t play\r\n- [X] Layout is messed up\r\n- [X] Text is not visible\r\n- [ ] Something else (Add details below)\r\n\r\n**Steps to Reproduce**\r\n\r\n1. Navigate to: (www.samarabags.com)\r\n2. Select a product and open its page.\r\n\r\n*__Expected Behavior:__*\r\nThe customer review, Instagram and the footer are visible.\r\n\r\n*__Actual Behavior:__*\r\nAnything below the product\'s image is just blank. "This page slowing down Firefox" message appears on the top.\r\n\r\n**Screenshot**\r\n<img width="1510" alt="Screenshot 2023-05-12 at 6 24 29 PM" src="https://github.com/webcompat/web-bugs/assets/1740517/20423943-c0a2-42b4-a763-ff814fa48ecb">\r\n',
'\n 1. Navigate to: (www.samarabags.com)\r\n2. Select a product and open its page.\r\n\r\n*__Expected Behavior:__*\r\nThe customer review, Instagram and the footer are visible.\r\n\r\n*__Actual Behavior:__*\r\nAnything below the product\'s image is just blank. "This page slowing down Firefox" message appears on the top.',
),
# CRLF line endings with a default-problem description: only the steps are
# expected, with the watchers list and <details> sections removed.
(
'<!-- @browser: Firefox Nightly 108.0a1 (2022-10-18) -->\r\n<!-- @ua_header: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0 -->\r\n<!-- @reported_with: unknown -->\r\n\r\n**URL**: https://dlive.tv/s/dashboard#0\r\n\r\n**Browser / Version**: Firefox Nightly 108.0a1 (2022-10-18)\r\n**Operating System**: Windows 10\r\n**Tested Another Browser**: Yes Chrome\r\n\r\n**Problem type**: Design is broken\r\n**Description**: Items are misaligned\r\n\r\n**Prerequisites**: \r\nAccount created and signed in.\r\n\r\n**Steps to Reproduce**:\r\n1. Navigate to https://dlive.tv/s/dashboard#0\r\n2. Type in a message in the "Chat". \r\n3. Observe text alignment. \r\n\r\n**Expected Behavior:**\r\nThe text is centered in the message field.\r\n\r\n**Actual Behavior:**\r\nThe text is aligned on the top side of the message field.\r\n\r\n**Notes:**\r\n1. The issue is not reproducible on Chrome.\r\n2. The issue is also reproducible on Firefox Release.\r\n3. The issue is also reproducible for the hint text in the message field.\r\n3. Screenshot attached. \r\n\r\n**Watchers:**\r\n@softvision-oana-arbuzov\r\n@softvision-raul-bucata\r\n@sv-calin \r\n<details>\r\n <summary>View the screenshot</summary>\r\n <img alt="Screenshot" src="https://webcompat.com/uploads/2022/10/b4a296a5-ee2f-4a18-a5da-b1e20ee8d27d.jpg">\r\n </details>\r\n\r\n<details>\r\n<summary>Browser Configuration</summary>\r\n<ul>\r\n <li>None</li>\r\n</ul>\r\n</details>\r\n\r\n_From [webcompat.com](https://webcompat.com/) with ❤_',
'1. Navigate to https://dlive.tv/s/dashboard#0\r\n2. Type in a message in the "Chat". \r\n3. Observe text alignment. \r\n\r\n**Expected Behavior:**\r\nThe text is centered in the message field.\r\n\r\n**Actual Behavior:**\r\nThe text is aligned on the top side of the message field.\r\n\r\n**Notes:**\r\n1. The issue is not reproducible on Chrome.\r\n2. The issue is also reproducible on Firefox Release.\r\n3. The issue is also reproducible for the hint text in the message field.\r\n3. Screenshot attached.',
),
]
# A fresh cleaner instance is created for every case.
for orig_text, cleaned_text in tests:
assert (
feature_cleanup.CleanCompatibilityReportDescription()(orig_text)
== cleaned_text
)

Просмотреть файл

@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug.models.invalid_compatibility_report import InvalidCompatibilityReportModel
def test_get_invalid_labels():
    """Spot-check the labels get_labels assigns to four known issue numbers."""
    labels, _ = InvalidCompatibilityReportModel().get_labels()

    # Issues expected to carry a truthy label.
    for number in (70960, 70978):
        assert labels[number]

    # Issues expected to carry a falsy label.
    for number in (71052, 71011):
        assert not labels[number]