convert matchers to functions

Now that matchers are all confined to a single method, it's easier to handle them as functions.
2018-06-01 13:59:21 +01:00 · 2018-06-01 13:59:21 +01:00 · 0cb2387540
--- a/tests/autoclassify/test_autoclassify.py
+++ b/tests/autoclassify/test_autoclassify.py
@ -0,0 +1,8 @@
+from treeherder.autoclassify.autoclassify import get_matchers
+
+
+def test_get_matchers():
+    matchers = list(get_matchers())
+
+    assert len(matchers) == 3
+    assert all(m.__name__.endswith('_matcher') for m in matchers)
--- a/tests/autoclassify/test_classify_failures.py
+++ b/tests/autoclassify/test_classify_failures.py
@ -1,7 +1,7 @@
 from treeherder.autoclassify.autoclassify import match_errors
-from treeherder.autoclassify.matchers import (CrashSignatureMatcher,
-                                              ElasticSearchTestMatcher,
-                                              PreciseTestMatcher)
+from treeherder.autoclassify.matchers import (crash_signature_matcher,
+                                              elasticsearch_matcher,
+                                              precise_matcher)
 from treeherder.model.models import (BugJobMap,
                                     ClassifiedFailure,
                                     JobNote,
@ -39,7 +39,7 @@ def test_classify_test_failure(text_log_errors_failure_lines,
             (test_line, {"message": "message2"})]
    test_error_lines, test_failure_lines = create_lines(test_job_2, lines)

-    do_autoclassify(test_job_2, test_failure_lines, [PreciseTestMatcher])
+    do_autoclassify(test_job_2, test_failure_lines, [precise_matcher])

    expected_classified = test_error_lines[:2], test_failure_lines[:2]
    expected_unclassified = test_error_lines[2:], test_failure_lines[2:]
@ -65,7 +65,7 @@ def test_no_autoclassify_job_success(text_log_errors_failure_lines,
             (test_line, {"message": "message2"})]
    test_error_lines, test_failure_lines = create_lines(test_job_2, lines)

-    do_autoclassify(test_job_2, test_failure_lines, [PreciseTestMatcher], status="success")
+    do_autoclassify(test_job_2, test_failure_lines, [precise_matcher], status="success")

    expected_classified = [], []
    expected_unclassified = test_error_lines, test_failure_lines
@ -89,7 +89,7 @@ def test_autoclassify_update_job_classification(failure_lines, classified_failur
    lines = [(test_line, {})]
    _, test_failure_lines = create_lines(test_job_2, lines)

-    do_autoclassify(test_job_2, test_failure_lines, [PreciseTestMatcher])
+    do_autoclassify(test_job_2, test_failure_lines, [precise_matcher])

    assert JobNote.objects.filter(job=test_job_2).count() == 1

@ -107,7 +107,7 @@ def test_autoclassify_no_update_job_classification(test_job, test_job_2,
                                line="Some error that isn't in the structured logs",
                                line_number=2)

-    do_autoclassify(test_job_2, test_failure_lines, [PreciseTestMatcher])
+    do_autoclassify(test_job_2, test_failure_lines, [precise_matcher])

    assert JobNote.objects.filter(job=test_job_2).count() == 0

@ -200,7 +200,7 @@ def test_classify_skip_ignore(test_job_2,
                                         [(test_line, {}),
                                          (test_line, {"subtest": "subtest2"})])

-    do_autoclassify(test_job_2, test_failure_lines, [PreciseTestMatcher])
+    do_autoclassify(test_job_2, test_failure_lines, [precise_matcher])

    expected_classified = test_failure_lines[:1]
    expected_unclassified = test_failure_lines[1:]
@ -222,7 +222,7 @@ def test_classify_es(test_job_2, failure_lines, classified_failures):
                                          (test_line, {"status": "TIMEOUT"}),
                                          (test_line, {"expected": "ERROR"})])

-    do_autoclassify(test_job_2, test_failure_lines, [ElasticSearchTestMatcher])
+    do_autoclassify(test_job_2, test_failure_lines, [elasticsearch_matcher])

    expected_classified = test_failure_lines[:4]
    expected_unclassified = test_failure_lines[4:]
@ -242,16 +242,16 @@ def test_classify_multiple(test_job_2, failure_lines, classified_failures):
    expected_classified_precise = [test_failure_lines[0]]
    expected_classified_fuzzy = [test_failure_lines[1]]

-    do_autoclassify(test_job_2, test_failure_lines, [PreciseTestMatcher,
-                                                     ElasticSearchTestMatcher])
+    do_autoclassify(test_job_2, test_failure_lines, [precise_matcher,
+                                                     elasticsearch_matcher])

    for actual, expected in zip(expected_classified_precise, classified_failures):
        assert list(actual.error.classified_failures.values_list('id', flat=True)) == [expected.id]
-        assert actual.error.matches.first().matcher_name == "PreciseTestMatcher"
+        assert actual.error.matches.first().matcher_name == "precise_matcher"

    for actual, expected in zip(expected_classified_fuzzy, classified_failures):
        assert list(actual.error.classified_failures.values_list('id', flat=True)) == [expected.id]
-        assert actual.error.matches.first().matcher_name == "ElasticSearchTestMatcher"
+        assert actual.error.matches.first().matcher_name == "elasticsearch_matcher"


 def test_classify_crash(test_repository, test_job, test_job_2, test_matcher):
@ -269,7 +269,7 @@ def test_classify_crash(test_repository, test_job, test_job_2, test_matcher):
                                     classified_failure=classified_failure,
                                     matcher_name=test_matcher.__class__.__name__,
                                     score=1.0)
-    do_autoclassify(test_job_2, failure_lines, [CrashSignatureMatcher])
+    do_autoclassify(test_job_2, failure_lines, [crash_signature_matcher])

    expected_classified = failure_lines[0:2]
    expected_unclassified = failure_lines[2:]
--- a/tests/autoclassify/test_matchers.py
+++ b/tests/autoclassify/test_matchers.py
@ -2,8 +2,8 @@ from decimal import Decimal

 from first import first

-from treeherder.autoclassify.matchers import (PreciseTestMatcher,
-                                              score_matches)
+from treeherder.autoclassify.matchers import precise_matcher
+from treeherder.autoclassify.utils import score_matches
 from treeherder.model.models import (FailureLine,
                                     TextLogErrorMatch,
                                     TextLogErrorMetadata)
@ -12,10 +12,10 @@ from .utils import (create_failure_lines,
                    create_text_log_errors)


-def test_precise_test_matcher_with_matches(classified_failures):
+def test_precise_matcher_with_matches(classified_failures):
    tle = TextLogErrorMatch.objects.first().text_log_error

-    results = PreciseTestMatcher().query_best(tle)
+    results = precise_matcher(tle)
    score, classified_failure_id = first(results)

    match = tle.matches.first()
@ -23,7 +23,7 @@ def test_precise_test_matcher_with_matches(classified_failures):
    assert score == match.score


-def test_precise_test_matcher_without_matches(test_job, test_matcher):
+def test_precise_matcher_without_matches(test_job, test_matcher):
    # create an error log group to match against
    data1 = {
        'action': 'test_result',
@ -50,7 +50,7 @@ def test_precise_test_matcher_without_matches(test_job, test_matcher):
    TextLogErrorMetadata.objects.create(text_log_error=tle1, failure_line=failure_line1)
    TextLogErrorMetadata.objects.create(text_log_error=tle2, failure_line=failure_line2)

-    output = PreciseTestMatcher().query_best(tle2)
+    output = precise_matcher(tle2)
    assert output is None  # we should have no matches


--- a/treeherder/autoclassify/autoclassify.py
+++ b/treeherder/autoclassify/autoclassify.py
@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import inspect
 import logging

 from django.db.utils import IntegrityError
@ -6,7 +7,6 @@ from first import first

 from treeherder.model.models import (Job,
                                     JobNote,
-                                     Matcher,
                                     TextLogError,
                                     TextLogErrorMatch)

@ -18,6 +18,26 @@ AUTOCLASSIFY_CUTOFF_RATIO = 0.7
 AUTOCLASSIFY_GOOD_ENOUGH_RATIO = 0.9


+def get_matchers():
+    """
+    Get matcher functions from treeherder.autoclassify.matchers
+
+    We classify matchers as any function in treeherder.autoclassify.matchers
+    with a name ending in _matcher.  This is currently overkill but protects
+    against the unwary engineer adding new functions to the matchers module
+    that shouldn't be treated as matchers.
+    """
+    from . import matchers
+
+    def is_matcher_func(member):
+        return inspect.isfunction(member) and member.__name__.endswith("_matcher")
+
+    members = inspect.getmembers(matchers, is_matcher_func)
+
+    for name, func in members:
+        yield func
+
+
 def match_errors(job, matchers=None):
    # Only try to autoclassify where we have a failure status; sometimes there can be
    # error lines even in jobs marked as passing.
@ -42,7 +62,7 @@ def match_errors(job, matchers=None):
        return

    if matchers is None:
-        matchers = Matcher.__subclasses__()
+        matchers = get_matchers()

    try:
        matches = list(find_best_matches(errors, matchers))
@ -89,18 +109,16 @@ def find_all_matches(text_log_error, matchers):

    Returns *unsaved* TextLogErrorMatch instances.
    """
-    for matcher_class in matchers:
-        matcher = matcher_class()
-
+    for matcher_func in matchers:
+        matches = matcher_func(text_log_error)
        # matches: iterator of (score, ClassifiedFailure.id)
-        matches = matcher.query_best(text_log_error)
        if not matches:
            continue

        for score, classified_failure_id in matches:
            yield TextLogErrorMatch(
                score=score,
-                matcher_name=matcher.__class__.__name__,
+                matcher_name=matcher_func.__name__,
                classified_failure_id=classified_failure_id,
                text_log_error=text_log_error,
            )
--- a/treeherder/autoclassify/matchers.py
+++ b/treeherder/autoclassify/matchers.py
@ -1,15 +1,12 @@
 from __future__ import division

 import logging
-from abc import (ABCMeta,
-                 abstractmethod)
 from difflib import SequenceMatcher
 from itertools import chain

 import newrelic.agent
 from django.conf import settings
 from django.db.models import Q
-from six import add_metaclass

 from treeherder.model.models import TextLogErrorMatch
 from treeherder.services.elasticsearch import search
@ -21,179 +18,164 @@ from .utils import (score_matches,
 logger = logging.getLogger(__name__)


-@add_metaclass(ABCMeta)
-class Matcher(object):
-    """Parent class for Matchers, providing the interface for query_best"""
-    @abstractmethod
-    def query_best(self, text_log_error):
-        """All child classes must implement this method."""
-        pass
+@newrelic.agent.function_trace()
+def precise_matcher(text_log_error):
+    """Query for TextLogErrorMatches identical to matches of the given TextLogError."""
+    failure_line = text_log_error.metadata.failure_line
+    logger.debug("Looking for test match in failure %d", failure_line.id)
+
+    if failure_line.action != "test_result" or failure_line.message is None:
+        return
+
+    f = {
+        'text_log_error___metadata__failure_line__action': 'test_result',
+        'text_log_error___metadata__failure_line__test': failure_line.test,
+        'text_log_error___metadata__failure_line__subtest': failure_line.subtest,
+        'text_log_error___metadata__failure_line__status': failure_line.status,
+        'text_log_error___metadata__failure_line__expected': failure_line.expected,
+        'text_log_error___metadata__failure_line__message': failure_line.message
+    }
+    qwargs = (
+        Q(text_log_error___metadata__best_classification=None)
+        & (Q(text_log_error___metadata__best_is_verified=True)
+           | Q(text_log_error__step__job=text_log_error.step.job))
+    )
+    qs = (TextLogErrorMatch.objects.filter(**f)
+                                   .exclude(qwargs)
+                                   .order_by('-score', '-classified_failure'))
+
+    if not qs:
+        return
+
+    # chunk through the QuerySet because it could potentially be very large
+    # time bound each call to the scoring function to avoid job timeouts
+    # returns an iterable of (score, classified_failure_id) tuples
+    chunks = chunked_qs_reverse(qs, chunk_size=20000)
+    return chain.from_iterable(time_boxed(score_matches, chunks, time_budget=500))


-class PreciseTestMatcher(Matcher):
-    """Matcher that looks for existing failures with identical tests and identical error message."""
-    @newrelic.agent.function_trace()
-    def query_best(self, text_log_error):
-        """Query for TextLogErrorMatches identical to matches of the given TextLogError."""
-        failure_line = text_log_error.metadata.failure_line
-        logger.debug("Looking for test match in failure %d", failure_line.id)
+@newrelic.agent.function_trace()
+def elasticsearch_matcher(text_log_error):
+    """
+    Query Elasticsearch and score the results.

-        if failure_line.action != "test_result" or failure_line.message is None:
-            return
+    Uses a filtered search checking test, status, expected, and the message
+    as a phrase query with non-alphabet tokens removed.
+    """
+    if not settings.ELASTICSEARCH_URL:
+        return []

-        f = {
-            'text_log_error___metadata__failure_line__action': 'test_result',
-            'text_log_error___metadata__failure_line__test': failure_line.test,
-            'text_log_error___metadata__failure_line__subtest': failure_line.subtest,
-            'text_log_error___metadata__failure_line__status': failure_line.status,
-            'text_log_error___metadata__failure_line__expected': failure_line.expected,
-            'text_log_error___metadata__failure_line__message': failure_line.message
-        }
-        qwargs = (
-            Q(text_log_error___metadata__best_classification=None)
-            & (Q(text_log_error___metadata__best_is_verified=True)
-               | Q(text_log_error__step__job=text_log_error.step.job))
-        )
-        qs = (TextLogErrorMatch.objects.filter(**f)
-                                       .exclude(qwargs)
-                                       .order_by('-score', '-classified_failure'))
+    failure_line = text_log_error.metadata.failure_line

-        if not qs:
-            return
+    if failure_line.action != "test_result" or not failure_line.message:
+        logger.debug("Skipped elasticsearch matching")
+        return

-        # chunk through the QuerySet because it could potentially be very large
-        # time bound each call to the scoring function to avoid job timeouts
-        # returns an iterable of (score, classified_failure_id) tuples
-        chunks = chunked_qs_reverse(qs, chunk_size=20000)
-        return chain.from_iterable(time_boxed(score_matches, chunks, time_budget=500))
+    filters = [
+        {'term': {'test': failure_line.test}},
+        {'term': {'status': failure_line.status}},
+        {'term': {'expected': failure_line.expected}},
+        {'exists': {'field': 'best_classification'}}
+    ]
+    if failure_line.subtest:
+        query = filters.append({'term': {'subtest': failure_line.subtest}})

-
-class ElasticSearchTestMatcher(Matcher):
-    """Looks for existing failures using Elasticsearch."""
-    @newrelic.agent.function_trace()
-    def query_best(self, text_log_error):
-        """
-        Query Elasticsearch and score the results.
-
-        Uses a filtered search checking test, status, expected, and the message
-        as a phrase query with non-alphabet tokens removed.
-        """
-        if not settings.ELASTICSEARCH_URL:
-            return []
-
-        failure_line = text_log_error.metadata.failure_line
-
-        if failure_line.action != "test_result" or not failure_line.message:
-            logger.debug("Skipped elasticsearch matching")
-            return
-
-        filters = [
-            {'term': {'test': failure_line.test}},
-            {'term': {'status': failure_line.status}},
-            {'term': {'expected': failure_line.expected}},
-            {'exists': {'field': 'best_classification'}}
-        ]
-        if failure_line.subtest:
-            query = filters.append({'term': {'subtest': failure_line.subtest}})
-
-        query = {
-            'query': {
-                'bool': {
-                    'filter': filters,
-                    'must': [{
-                        'match_phrase': {
-                            'message': failure_line.message[:1024],
-                        },
-                    }],
-                },
+    query = {
+        'query': {
+            'bool': {
+                'filter': filters,
+                'must': [{
+                    'match_phrase': {
+                        'message': failure_line.message[:1024],
+                    },
+                }],
            },
-        }
+        },
+    }

-        try:
-            results = search(query)
-        except Exception:
-            logger.error("Elasticsearch lookup failed: %s %s %s %s %s",
-                         failure_line.test, failure_line.subtest, failure_line.status,
-                         failure_line.expected, failure_line.message)
-            raise
+    try:
+        results = search(query)
+    except Exception:
+        logger.error("Elasticsearch lookup failed: %s %s %s %s %s",
+                     failure_line.test, failure_line.subtest, failure_line.status,
+                     failure_line.expected, failure_line.message)
+        raise

-        if len(results) > 1:
-            args = (
-                text_log_error.id,
-                failure_line.id,
-                len(results),
-            )
-            logger.info('text_log_error=%i failure_line=%i Elasticsearch produced %i results' % args)
-            newrelic.agent.record_custom_event('es_matches', {
-                'num_results': len(results),
-                'text_log_error_id': text_log_error.id,
-                'failure_line_id': failure_line.id,
-            })
-
-        scorer = MatchScorer(failure_line.message)
-        matches = [(item, item['message']) for item in results]
-        best_match = scorer.best_match(matches)
-        if not best_match:
-            return
-
-        score, es_result = best_match
-        # TODO: score all results and return
-        # TODO: just return results with score above cut off?
-        return [(score, es_result['best_classification'])]
-
-
-class CrashSignatureMatcher(Matcher):
-    """Matcher that looks for crashes with identical signature."""
-    @newrelic.agent.function_trace()
-    def query_best(self, text_log_error):
-        """
-        Query for TextLogErrorMatches with the same crash signature.
-
-        Produces two queries, first checking if the same test produces matches
-        and secondly checking without the same test but lowering the produced
-        scores.
-        """
-        failure_line = text_log_error.metadata.failure_line
-
-        if (failure_line.action != "crash" or
-            failure_line.signature is None or
-            failure_line.signature == "None"):
-            return
-
-        f = {
-            'text_log_error___metadata__failure_line__action': 'crash',
-            'text_log_error___metadata__failure_line__signature': failure_line.signature,
-        }
-        qwargs = (
-            Q(text_log_error___metadata__best_classification=None)
-            & (Q(text_log_error___metadata__best_is_verified=True)
-               | Q(text_log_error__step__job=text_log_error.step.job))
+    if len(results) > 1:
+        args = (
+            text_log_error.id,
+            failure_line.id,
+            len(results),
        )
-        qs = (TextLogErrorMatch.objects.filter(**f)
-                                       .exclude(qwargs)
-                                       .select_related('text_log_error', 'text_log_error___metadata')
-                                       .order_by('-score', '-classified_failure'))
+        logger.info('text_log_error=%i failure_line=%i Elasticsearch produced %i results' % args)
+        newrelic.agent.record_custom_event('es_matches', {
+            'num_results': len(results),
+            'text_log_error_id': text_log_error.id,
+            'failure_line_id': failure_line.id,
+        })

-        size = 20000
-        time_budget = 500
+    scorer = MatchScorer(failure_line.message)
+    matches = [(item, item['message']) for item in results]
+    best_match = scorer.best_match(matches)
+    if not best_match:
+        return

-        # See if we can get any matches when filtering by the same test
-        first_attempt = qs.filter(text_log_error___metadata__failure_line__test=failure_line.test)
-        chunks = chunked_qs_reverse(first_attempt, chunk_size=size)
-        scored_matches = chain.from_iterable(time_boxed(score_matches, chunks, time_budget))
-        if scored_matches:
-            return scored_matches
+    score, es_result = best_match
+    # TODO: score all results and return
+    # TODO: just return results with score above cut off?
+    return [(score, es_result['best_classification'])]

-        # try again without filtering to the test but applying a .8 score multiplyer
-        chunks = chunked_qs_reverse(qs, chunk_size=size)
-        scored_matches = chain.from_iterable(time_boxed(
-            score_matches,
-            chunks,
-            time_budget,
-            score_multiplier=(8, 10),
-        ))
+
+@newrelic.agent.function_trace()
+def crash_signature_matcher(text_log_error):
+    """
+    Query for TextLogErrorMatches with the same crash signature.
+
+    Produces two queries, first checking if the same test produces matches
+    and secondly checking without the same test but lowering the produced
+    scores.
+    """
+    failure_line = text_log_error.metadata.failure_line
+
+    if (failure_line.action != "crash" or
+        failure_line.signature is None or
+        failure_line.signature == "None"):
+        return
+
+    f = {
+        'text_log_error___metadata__failure_line__action': 'crash',
+        'text_log_error___metadata__failure_line__signature': failure_line.signature,
+    }
+    qwargs = (
+        Q(text_log_error___metadata__best_classification=None)
+        & (Q(text_log_error___metadata__best_is_verified=True)
+           | Q(text_log_error__step__job=text_log_error.step.job))
+    )
+    qs = (TextLogErrorMatch.objects.filter(**f)
+                                   .exclude(qwargs)
+                                   .select_related('text_log_error', 'text_log_error___metadata')
+                                   .order_by('-score', '-classified_failure'))
+
+    size = 20000
+    time_budget = 500
+
+    # See if we can get any matches when filtering by the same test
+    first_attempt = qs.filter(text_log_error___metadata__failure_line__test=failure_line.test)
+    chunks = chunked_qs_reverse(first_attempt, chunk_size=size)
+    scored_matches = chain.from_iterable(time_boxed(score_matches, chunks, time_budget))
+    if scored_matches:
        return scored_matches

+    # try again without filtering to the test but applying a .8 score multiplier
+    chunks = chunked_qs_reverse(qs, chunk_size=size)
+    scored_matches = chain.from_iterable(time_boxed(
+        score_matches,
+        chunks,
+        time_budget,
+        score_multiplier=(8, 10),
+    ))
+    return scored_matches
+

 class MatchScorer(object):
    """Simple scorer for similarity of strings based on python's difflib SequenceMatcher."""