Scripts to analyze validation results (bug 1183396)

Fixes #613
2015-07-22 15:47:41 -05:00 · 2015-07-22 15:47:41 -05:00 · 81a97dc1f0
--- a/scripts/fetch_validation_data.py
+++ b/scripts/fetch_validation_data.py
@ -0,0 +1,71 @@
+"""
+Fetch data from the olympia database for validation results and unlisted
+addons for use with the validations.py script.
+
+Expected environment variables:
+    MYSQL_HOST - The MySQL host.
+    MYSQL_USER - The MySQL username.
+    MYSQL_PASSWORD - The MySQL password.
+
+Actions supported:
+    validations - Fetch validation data for the last 30 days and write it to
+        the filesystem in files named `validations/YYYY-MM-DD.txt`.
+    unlisted - Fetch all unlisted addon guids and write the results to
+        `validations/unlisted-addons.txt`.
+
+Usage:
+    python fetch_validation_data.py <action>
+"""
+
+import os
+import sys
+from datetime import datetime, timedelta
+
+import MySQLdb
+
+# strftime format shared by the SQL date prefix and the output file names.
+date_format = '%Y-%m-%d'
+# The connection is opened as soon as this module is loaded. Credentials are
+# read from the environment, so a missing variable raises KeyError here.
+db = MySQLdb.connect(host=os.environ['MYSQL_HOST'],
+                     user=os.environ['MYSQL_USER'],
+                     passwd=os.environ['MYSQL_PASSWORD'],
+                     db="addons_mozilla_org")
+# Single module-level cursor reused by every fetch function below.
+cursor = db.cursor()
+
+# The `%s` placeholder is bound by cursor.execute() to a 'YYYY-MM-DD%' prefix
+# pattern (see fetch_data_for_date). Rows without validation data are skipped
+# and the newest uploads come first.
+QUERY_FORMAT = """
+    SELECT validation
+    FROM file_uploads
+    WHERE created LIKE %s
+    AND validation IS NOT NULL
+    ORDER BY created DESC;
+"""
+
+
+def fetch_data_for_date(date):
+    """Dump one day's validation results to `validations/YYYY-MM-DD.txt`.
+
+    `date` is any object with a `strftime` method. Each validation value
+    returned by QUERY_FORMAT is written on its own line; the `validations`
+    directory must already exist because it is not created here.
+    """
+    day = date.strftime(date_format)
+    print 'Fetching for {date}'.format(date=day)
+    # The trailing wildcard makes the LIKE clause match any `created` value
+    # that starts with this day.
+    cursor.execute(QUERY_FORMAT, [day + '%'])
+    with open('validations/{date}.txt'.format(date=day), 'w') as out:
+        for (validation,) in cursor:
+            out.write(validation)
+            out.write('\n')
+
+
+def fetch_unlisted_addon_ids():
+    """Write the guid of every unlisted addon to a text file, one per line.
+
+    Output goes to `validations/unlisted-addons.txt`; the `validations`
+    directory must already exist because it is not created here.
+    """
+    print 'Fetching unlisted addons'
+    query = ('SELECT guid FROM addons WHERE is_listed=0 '
+             'AND guid IS NOT NULL;')
+    cursor.execute(query)
+    with open('validations/unlisted-addons.txt', 'w') as out:
+        for (guid,) in cursor:
+            out.write(guid)
+            out.write('\n')
+
+if __name__ == '__main__':
+    # Exactly one argument is expected; anything else is treated as an
+    # unknown (empty) action.
+    action = sys.argv[1] if len(sys.argv) == 2 else ''
+    if action == 'validations':
+        today = datetime.today()
+        # Oldest day first: 30 days ago through yesterday (today is excluded).
+        for days_ago in range(30, 0, -1):
+            fetch_data_for_date(today - timedelta(days=days_ago))
+    elif action == 'unlisted':
+        fetch_unlisted_addon_ids()
+    else:
+        print 'Unknown action "{action}"'.format(action=action)
+        # Report the failure through the exit status so shell callers and
+        # automation can detect it.
+        sys.exit(1)
--- a/scripts/test_validations.py
+++ b/scripts/test_validations.py
@ -0,0 +1,42 @@
+from validations import (parse_validations, severe_validations,
+                         unlisted_validations)
+
+# Explicitly flagged as not listed in its own metadata.
+TEST_ADDON_LISTED_FALSE = {'metadata': {'listed': False, 'id': 'wat'}}
+# Carries no `listed` flag; the tests below expect it to be selected only when
+# its id is in the unlisted-addons set passed to unlisted_validations.
+TEST_ADDON_UNLISTED_ID = {'metadata': {'id': 'baz'}}
+# Input mix: explicitly listed, explicitly unlisted and unflagged addons.
+TEST_ADDONS = [
+    {'metadata': {'listed': True, 'id': 'yo'}},
+    TEST_ADDON_LISTED_FALSE,
+    {'metadata': {'id': 'foobar'}},
+    TEST_ADDON_UNLISTED_ID,
+]
+
+
+def test_parse_validations():
+    """Each input line is decoded from JSON, in input order."""
+    lines = ['{"foo":"bar"}\n', '["baz",1,{"wat":99}]\n']
+    expected = [{'foo': 'bar'}, ['baz', 1, {'wat': 99}]]
+    assert list(parse_validations(lines)) == expected
+
+
+def test_unlisted_validations_without_unlisted_addons():
+    """With an empty guid set only `listed: False` metadata is selected."""
+    selected = list(unlisted_validations(TEST_ADDONS, set()))
+    assert selected == [TEST_ADDON_LISTED_FALSE]
+
+
+def test_unlisted_validations_with_unlisted_addons():
+    """Addons whose id is in the guid set are selected as well."""
+    known_unlisted = set(['baz', 'wat'])
+    selected = list(unlisted_validations(TEST_ADDONS, known_unlisted))
+    assert selected == [TEST_ADDON_LISTED_FALSE, TEST_ADDON_UNLISTED_ID]
+
+
+def test_severe_validations():
+    """Only non-zero low/medium/high signing counts mark a result severe."""
+    def summary(**counts):
+        # Start from an all-zero summary and override the given levels.
+        levels = {'high': 0, 'medium': 0, 'trivial': 0, 'low': 0}
+        levels.update(counts)
+        return {'signing_summary': levels}
+
+    nope = summary()
+    minor = summary(low=1)
+    trivial = summary(trivial=1)
+    severe = summary(high=10)
+    validations = [nope, trivial, minor, nope, severe, nope]
+    assert list(severe_validations(validations)) == [minor, severe]
--- a/scripts/validations.py
+++ b/scripts/validations.py
@ -0,0 +1,145 @@
+"""
+Process validation data retrieved using fetch_validation_data.py. Two types
+of data are expected. A file at `validations/unlisted-addons.txt` that contains
+the guid of each unlisted addon and input on STDIN which has the validation
+JSON data for each validation to check. See fetch_validation_data.py for how
+this data is retrieved. Results are returned on STDOUT.
+
+The following reports are supported:
+    * count - Return signing errors ordered by addon unique frequency in the
+        format: `error.id.dot.separated total_count unique_addon_count`.
+    * context - Return the context for the 5 most common signing errors in
+        the JSON format: `{"context": ["", ...], "error": "error.id"}`.
+
+Usage:
+    cat my-test-data-*.txt | python validations.py <report> > results.txt
+"""
+
+import itertools
+import json
+import sys
+
+# Report names accepted on the command line; main() selects the final
+# pipeline steps based on which one is given.
+ACTION_CONTEXT = 'context'
+ACTION_COUNT = 'count'
+# Every valid report name; used to validate argv and to build the usage text.
+ACTIONS = (ACTION_CONTEXT, ACTION_COUNT)
+
+
+def parse_validations(results):
+    """Lazily decode an iterable of JSON strings, one document per item.
+
+    Blank or whitespace-only items (for example a stray empty line between
+    concatenated data files) are skipped instead of aborting the whole run
+    with a decoding error.
+
+    Returns a generator of the decoded objects, in input order. Malformed
+    non-blank JSON still raises ValueError when the generator reaches it.
+    """
+    return (json.loads(result) for result in results if result.strip())
+
+
+def unlisted_validations(results, unlisted_addons=None):
+    """Lazily keep only the validation results that belong to unlisted addons.
+
+    A result is kept when its metadata has an `id` and either its `listed`
+    flag is falsy (a missing flag counts as listed) or the id is a member of
+    `unlisted_addons`. When `unlisted_addons` is None the guids are loaded
+    with get_unlisted_addons() at call time.
+    """
+    if unlisted_addons is None:
+        unlisted_addons = get_unlisted_addons()
+
+    def is_unlisted(metadata):
+        if 'id' not in metadata:
+            return False
+        if not metadata.get('listed', True):
+            return True
+        return metadata['id'] in unlisted_addons
+
+    return (result for result in results if is_unlisted(result['metadata']))
+
+
+def severe_validations(results):
+    """Lazily keep results with at least one high, medium or low signing issue.
+
+    The `trivial` count in `signing_summary` is ignored.
+    """
+    def is_severe(summary):
+        return any(summary[level] > 0 for level in ('high', 'medium', 'low'))
+
+    return (result
+            for result in results
+            if is_severe(result['signing_summary']))
+
+
+def error_messages(results):
+    """Yield one flat record per message that carries a `signing_severity`.
+
+    Each record has the addon id (`addon`), the message id parts joined with
+    dots (`message_id`) and the message `context`. Messages without a
+    `signing_severity` key are dropped.
+    """
+    for result in results:
+        for message in result['messages']:
+            if 'signing_severity' not in message:
+                continue
+            yield {
+                'addon': result['metadata']['id'],
+                'message_id': '.'.join(message['id']),
+                'context': message['context'],
+            }
+
+
+def sort_by_message(results):
+    """Return a new list of the records ordered by their `message_id`.
+
+    The sort is stable, so records with the same id keep their input order.
+    """
+    def message_id(record):
+        return record['message_id']
+
+    ordered = list(results)
+    ordered.sort(key=message_id)
+    return ordered
+
+
+def group_by_message(results):
+    """Group consecutive records that share a `message_id`.
+
+    Returns the `itertools.groupby` iterator of `(message_id, records)`
+    pairs. Only adjacent records are grouped, so the input should already be
+    sorted by `message_id`.
+    """
+    def message_id(record):
+        return record['message_id']
+
+    return itertools.groupby(results, message_id)
+
+
+def extract_error_results(results):
+    """Summarise each `(error, messages)` group as a dict.
+
+    Yields dicts with the error id (`error`), the number of messages
+    (`total`), the number of distinct addons among them (`unique`) and the
+    list of every message context (`contexts`).
+    """
+    for error_id, group in results:
+        # Materialise the group first: it is walked more than once below and
+        # may be a one-shot iterator (e.g. one produced by itertools.groupby).
+        occurrences = list(group)
+        addons = set(entry['addon'] for entry in occurrences)
+        contexts = [entry['context'] for entry in occurrences]
+        yield {
+            'error': error_id,
+            'total': len(occurrences),
+            'unique': len(addons),
+            'contexts': contexts,
+        }
+
+
+def sort_results_by_unique(results):
+    """Return a new list of the summaries, highest `unique` count first.
+
+    Summaries with equal counts keep their input order.
+    """
+    def unique_count(summary):
+        return summary['unique']
+
+    ranked = list(results)
+    ranked.sort(key=unique_count, reverse=True)
+    return ranked
+
+
+def format_error_count(results):
+    """Lazily render each summary as `error total unique`, space separated."""
+    template = '{error} {total} {unique}'
+    return (template.format(**summary) for summary in results)
+
+
+def format_contexts(results):
+    """Yield one JSON document per context of every summary.
+
+    Each document is an object with the summary's `error` id and a single
+    `context` taken from its `contexts` list.
+    """
+    for summary in results:
+        for context in summary['contexts']:
+            document = {
+                'error': summary['error'],
+                'context': context,
+            }
+            yield json.dumps(document)
+
+
+def get_unlisted_addons():
+    """Load the unlisted addon guids from `validations/unlisted-addons.txt`.
+
+    Returns a set with one entry per line of the file, with surrounding
+    whitespace stripped from each line.
+    """
+    guids = set()
+    with open('validations/unlisted-addons.txt') as guid_file:
+        for line in guid_file:
+            guids.add(line.strip())
+    return guids
+
+
+def main(action):
+    """Run the report named by `action` over the validations read from STDIN.
+
+    `action` must be ACTION_CONTEXT or ACTION_COUNT; any other value raises
+    ValueError before any input is processed.
+    """
+    if action == ACTION_CONTEXT:
+        # Only get context for the top 5 errors (they're already sorted by
+        # unique occurrences so we can just take the first 5).
+        final_steps = [
+            lambda results: itertools.islice(results, 5),
+            format_contexts,
+        ]
+    elif action == ACTION_COUNT:
+        final_steps = [format_error_count]
+    else:
+        raise ValueError('{0} is not a valid action'.format(action))
+
+    # Steps shared by every report: parse, filter, flatten, then aggregate.
+    common_steps = [
+        parse_validations,
+        unlisted_validations,
+        severe_validations,
+        error_messages,
+        sort_by_message,
+        group_by_message,
+        extract_error_results,
+        sort_results_by_unique,
+    ]
+    process_pipeline(common_steps + final_steps)
+
+
+def process_pipeline(pipeline):
+    """Feed STDIN through each callable in `pipeline` and print the result.
+
+    Every step receives the previous step's return value (the first one
+    receives `sys.stdin`); each item of the final value is printed on its own
+    line.
+    """
+    stream = sys.stdin
+    for stage in pipeline:
+        stream = stage(stream)
+
+    for line in stream:
+        print line
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2 or sys.argv[1] not in ACTIONS:
+        # Usage goes to STDERR: STDOUT carries the report and is normally
+        # redirected to a file, where a usage message would be mistaken for
+        # results.
+        sys.stderr.write("""Usage: python {name} <action>
+    action: {actions}
+    values are read from STDIN
+""".format(name=sys.argv[0], actions='|'.join(ACTIONS)))
+        sys.exit(1)
+    else:
+        main(sys.argv[1])