Fix bug suggestions with PostgreSQL (#7988)

* Use trigram similarity instead of FTS

* Skip escaping special characters

* Order results by match ranking on Postgres
---------

Co-authored-by: Bastien Abadie <bastien@nextcairn.com>
Co-authored-by: Sebastian Hengst <aryx.github@gmx-topmail.de>
This commit is contained in:
Valentin Rigal 2024-03-26 16:59:06 +01:00 коммит произвёл GitHub
Родитель ccd8a1bc25
Коммит 931e8bbaa7
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
4 изменённых файлов: 115 добавлений и 14 удалений

Просмотреть файл

@ -1,10 +1,11 @@
import pytest
from unittest.mock import patch
from tests.test_utils import add_log_response
from treeherder.etl.jobs import store_job_data
from treeherder.etl.push import store_push_data
from treeherder.model.error_summary import get_error_summary
from treeherder.model.models import Job, TextLogError
from treeherder.model.error_summary import get_error_summary, bug_suggestions_line
from treeherder.model.models import Job, TextLogError, Bugscache
from ..sampledata import SampleData
@ -62,3 +63,86 @@ def test_create_error_summary(
)
for failure_line in bug_suggestions:
assert set(failure_line.keys()) == expected_keys
@pytest.mark.django_db
@patch(
    "treeherder.model.error_summary.get_error_search_term_and_path",
    return_value={
        "search_term": ["browser_dbg-pretty-print-inline-scripts.js"],
        "path_end": "devtools/client/debugger/test/mochitest/browser_dbg-pretty-print-inline-scripts.js",
    },
)
def test_bug_suggestion_line(
    search_mock, failure_classifications, jobs_with_local_log, sample_push, test_repository
):
    """
    Check that the similarity between the search term (often a test name)
    extracted from the failure line and each bug summary drives the ranking.

    Should every bug end up with the same rank, the bug we expect would not
    be part of the returned rows, because its bug ID is the highest one.
    """
    # Fields of the single bug which really matches the mocked search term.
    matching_id = 1775819
    matching_keywords = "intermittent-failure,regression,test-verify-fail"
    matching_whiteboard = "[retriggered][stockwell unknown]"
    matching_summary = (
        "Intermittent devtools/client/debugger/test/mochitest/browser_dbg-pretty-print-inline-scripts.js "
        "| single tracking bug"
    )
    last_modified = "2010-01-01 00:00:00"

    # Store the push and its jobs, every job being flagged as a test failure.
    store_push_data(test_repository, sample_push)
    push_revision = sample_push[0]["revision"]
    for job_blob in jobs_with_local_log:
        job_blob["job"]["result"] = "testfailed"
        job_blob["revision"] = push_revision
    store_job_data(test_repository, jobs_with_local_log)

    stored_job = Job.objects.get(id=1)

    Bugscache.objects.create(
        id=matching_id,
        status="2",
        keywords=matching_keywords,
        whiteboard=matching_whiteboard,
        summary=matching_summary,
        modified=last_modified,
    )

    # Add 50 unrelated bugs whose IDs are all lower than the matching one.
    # The bug suggestions query fetches at most 50 rows, so if the ranking
    # were wrong (e.g. the same rank for every row), the matching bug
    # created above would be dropped from the results.
    unrelated_bugs = []
    for index in range(50):
        unrelated_bugs.append(
            Bugscache(
                id=100 + index,
                status="2",
                keywords="intermittent-failure,intermittent-testcase",
                summary=(
                    f"Intermittent devtools/client/debugger/test/mochitest/browser_unrelated-{index}.js "
                    "| single tracking bug"
                ),
                modified=last_modified,
            )
        )
    Bugscache.objects.bulk_create(unrelated_bugs)

    first_error = stored_job.text_log_error.first()
    day_key = str(stored_job.submit_time.date())
    suggestion, _ = bug_suggestions_line(
        first_error,
        project=stored_job.repository,
        logdate=stored_job.submit_time,
        term_cache={},
        line_cache={day_key: {}},
        revision=stored_job.push.revision,
    )

    # Only the bug whose summary contains the searched test name is expected.
    expected_open_recent = [
        {
            "crash_signature": "",
            "dupe_of": None,
            "id": matching_id,
            "keywords": matching_keywords,
            "resolution": "",
            "status": "2",
            "whiteboard": matching_whiteboard,
            "summary": matching_summary,
        }
    ]
    assert suggestion["bugs"]["open_recent"] == expected_open_recent

Просмотреть файл

@ -135,7 +135,7 @@
{
"status": "NEW",
"id": 1054669,
"summary": "Intermittent test_switch_frame.py TestSwitchFrame.test_should_be_able_to_carry_on_working_if_the_frame_is_deleted_from_under_us | TimeoutException: TimeoutException: Connection timed out",
"summary": "Intermittent test_switch_frame.py TestSwitchFrame.test_should_be_able_to_carry_on_working_if_the_frame_is_deleted_from_under_us | TimeoutException",
"dupe_of": null,
"duplicates": [],
"cf_crash_signature": "",

Просмотреть файл

@ -0,0 +1,13 @@
# Generated by Django 4.1.13 on 2024-03-25 16:15
from django.db import migrations
from django.contrib.postgres.operations import TrigramExtension
class Migration(migrations.Migration):
    """Run Django's ``TrigramExtension`` operation after ``0030_group_durations``."""

    # This migration must be applied after the group durations migration.
    dependencies = [("model", "0030_group_durations")]

    # Single operation: the PostgreSQL trigram extension setup from Django.
    operations = [
        TrigramExtension(),
    ]

Просмотреть файл

@ -12,12 +12,12 @@ warnings.filterwarnings("ignore", category=DeprecationWarning, module="newrelic"
import newrelic.agent
from django.conf import settings
from django.contrib.auth.models import User
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from django.core.cache import cache
from django.core.exceptions import ObjectDoesNotExist
from django.core.validators import MinLengthValidator
from django.db import models, transaction
from django.db.models import Count, Max, Min, Q, Subquery
from django.contrib.postgres.search import TrigramSimilarity
from django.db.utils import ProgrammingError
from django.forms import model_to_dict
from django.utils import timezone
@ -248,11 +248,11 @@ class Bugscache(models.Model):
def search(cls, search_term):
max_size = 50
# Do not wrap a string in quotes to search as a phrase;
# see https://bugzilla.mozilla.org/show_bug.cgi?id=1704311
search_term_fulltext = cls.sanitized_search_term(search_term)
if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.mysql":
# Do not wrap a string in quotes to search as a phrase;
# see https://bugzilla.mozilla.org/show_bug.cgi?id=1704311
search_term_fulltext = cls.sanitized_search_term(search_term)
# Substitute escape and wildcard characters, so the search term is used
# literally in the LIKE statement.
search_term_like = (
@ -275,12 +275,16 @@ class Bugscache(models.Model):
[search_term_fulltext, search_term_like, max_size],
)
else:
# On PostgreSQL we can use the full text search features
vector = SearchVector("summary")
query = SearchQuery(search_term_fulltext)
recent_qs = Bugscache.objects.annotate(rank=SearchRank(vector, query)).order_by(
"-rank", "id"
)[0:max_size]
# On PostgreSQL we can use the ORM directly, but NOT the full text search,
# as its ranking algorithm expects English words, not paths.
# So we use standard pattern matching AND trigram similarity, which compares
# sequences of characters instead of words.
# Django already escapes special characters, so we do not need to handle that here.
recent_qs = (
Bugscache.objects.filter(summary__icontains=search_term)
.annotate(similarity=TrigramSimilarity("summary", search_term))
.order_by("-similarity")[0:max_size]
)
exclude_fields = ["modified", "processed_update"]
try: