Bug 1194830: Add a runnable_job API endpoint

This creates a 'runnable_job' table in the database, as well as an API endpoint at /api/project/{branch}/runnable_jobs listing all existing buildbot jobs and their symbols. A new daily task 'fetch_allthethings' is added to update this table.
2015-10-15 14:47:22 -03:00 · 2015-10-15 14:47:22 -03:00 · 5d9e430cac
--- a/2
+++ b/2
@ -4,6 +4,6 @@ worker_pushlog: newrelic-admin run-program celery -A treeherder worker -Q pushlo
 worker_buildapi_pending: newrelic-admin run-program celery -A treeherder worker -Q buildapi_pending --maxtasksperchild=20 --concurrency=5
 worker_buildapi_running: newrelic-admin run-program celery -A treeherder worker -Q buildapi_running --maxtasksperchild=20 --concurrency=5
 worker_buildapi_4hr: newrelic-admin run-program celery -A treeherder worker -Q buildapi_4hr --maxtasksperchild=20 --concurrency=1
-worker_default: newrelic-admin run-program celery -A treeherder worker -Q default,cycle_data,calculate_eta,fetch_bugs --maxtasksperchild=50 --concurrency=3
+worker_default: newrelic-admin run-program celery -A treeherder worker -Q default,cycle_data,calculate_eta,fetch_bugs,fetch_allthethings --maxtasksperchild=50 --concurrency=3
 worker_hp: newrelic-admin run-program celery -A treeherder worker -Q classification_mirroring,publish_to_pulse --maxtasksperchild=50 --concurrency=1
 worker_log_parser: newrelic-admin run-program celery -A treeherder worker -Q log_parser_fail,log_parser,log_parser_hp,log_parser_json --maxtasksperchild=50 --concurrency=5
--- a/bin/run_celery_worker
+++ b/bin/run_celery_worker
@ -19,6 +19,6 @@ if [ ! -f $LOGFILE ]; then
 fi

 exec $NEWRELIC_ADMIN celery -A treeherder worker -c 3 \
-     -Q default,cycle_data,calculate_eta,fetch_bugs,autoclassify,detect_intermittents \
+     -Q default,cycle_data,calculate_eta,fetch_bugs,autoclassify,detect_intermittents,fetch_allthethings \
     -E --maxtasksperchild=500 \
     --logfile=$LOGFILE -l INFO -n default.%h
--- a/treeherder/config/settings.py
+++ b/treeherder/config/settings.py
@ -184,6 +184,7 @@ CELERY_QUEUES = (
    Queue('buildapi_pending', Exchange('default'), routing_key='buildapi_pending'),
    Queue('buildapi_running', Exchange('default'), routing_key='buildapi_running'),
    Queue('buildapi_4hr', Exchange('default'), routing_key='buildapi_4hr'),
+    Queue('fetch_allthethings', Exchange('default'), routing_key='fetch_allthethings'),
    Queue('cycle_data', Exchange('default'), routing_key='cycle_data'),
    Queue('calculate_eta', Exchange('default'), routing_key='calculate_eta'),
    Queue('fetch_bugs', Exchange('default'), routing_key='fetch_bugs'),
@ -232,6 +233,14 @@ CELERYBEAT_SCHEDULE = {
            "queue": "buildapi_4hr"
        }
    },
+    'fetch-allthethings-every-day': {
+        'task': 'fetch-allthethings',
+        'schedule': timedelta(days=1),
+        'relative': True,
+        'options': {
+            'queue': "fetch_allthethings"
+        }
+    },
    'cycle-data-every-day': {
        'task': 'cycle-data',
        'schedule': timedelta(days=1),
@ -291,6 +300,7 @@ SITE_URL = os.environ.get("SITE_URL", "http://local.treeherder.mozilla.org")
 BUILDAPI_PENDING_URL = "https://secure.pub.build.mozilla.org/builddata/buildjson/builds-pending.js"
 BUILDAPI_RUNNING_URL = "https://secure.pub.build.mozilla.org/builddata/buildjson/builds-running.js"
 BUILDAPI_BUILDS4H_URL = "https://secure.pub.build.mozilla.org/builddata/buildjson/builds-4hr.js.gz"
+ALLTHETHINGS_URL = "https://secure.pub.build.mozilla.org/builddata/reports/allthethings.json"

 # the max size of a posted request to treeherder client during Buildbot
 # data job ingestion.
--- a/treeherder/etl/allthethings.py
+++ b/treeherder/etl/allthethings.py
@ -0,0 +1,109 @@
+import collections
+import logging
+from hashlib import sha1
+
+from django.conf import settings
+
+from treeherder.etl.buildbot import get_symbols_and_platforms
+from treeherder.etl.mixins import JsonExtractorMixin
+from treeherder.model.models import (BuildPlatform,
+                                     JobGroup,
+                                     JobType,
+                                     MachinePlatform,
+                                     Option,
+                                     OptionCollection,
+                                     Repository,
+                                     RunnableJob)
+
+logger = logging.getLogger(__name__)
+
+
+class AllthethingsTransformerMixin:
+    """Mixin that groups the builders of allthethings.json by branch."""
+
+    def transform(self, extracted_content):
+        """
+        Build a mapping of branch name to the list of jobs on that branch.
+
+        Each key of ``extracted_content['builders']`` is a buildername; it
+        is turned into a job dict by ``get_symbols_and_platforms`` and the
+        branch found under the builder's ``properties`` is stored on that
+        dict under the ``branch`` key.
+
+        Returns a ``collections.defaultdict(list)`` keyed by branch name.
+        """
+        logger.info('About to import allthethings.json builder data.')
+
+        grouped_jobs = collections.defaultdict(list)
+        builders = extracted_content['builders']
+
+        for buildername, builder_info in builders.iteritems():
+            job_info = get_symbols_and_platforms(buildername)
+
+            branch_name = builder_info['properties']['branch']
+            job_info['branch'] = branch_name
+            grouped_jobs[branch_name].append(job_info)
+
+        return grouped_jobs
+
+
+class RunnableJobsProcess(JsonExtractorMixin,
+                          AllthethingsTransformerMixin):
+    """
+    ETL process that downloads allthethings.json and records its builders
+    as RunnableJob rows for every active repository.
+    """
+
+    # XXX: Copied from refdata.py. What is the best place for this?
+    def get_option_collection_hash(self, options):
+        """Return the sha1 hex digest of the sorted, concatenated options."""
+        return sha1(''.join(sorted(options))).hexdigest()
+
+    def _load_job(self, repo, datum):
+        """Create the reference data for one job dict and upsert its row."""
+        # XXX: refdata.py truncates those fields at 25 characters.
+        # Should we do the same?
+        build_platform = BuildPlatform.objects.get_or_create(
+            os_name=datum['build_os'],
+            platform=datum['build_platform'],
+            architecture=datum['build_architecture']
+        )[0]
+
+        machine_platform = MachinePlatform.objects.get_or_create(
+            os_name=datum['machine_platform_os'],
+            platform=datum['platform'],
+            architecture=datum['machine_platform_architecture']
+        )[0]
+
+        job_group = JobGroup.objects.get_or_create(
+            name=datum['job_group_name'],
+            symbol=datum['job_group_symbol']
+        )[0]
+
+        job_type = JobType.objects.get_or_create(
+            name=datum['job_type_name'],
+            symbol=datum['job_type_symbol'],
+            job_group=job_group
+        )[0]
+
+        option_names = datum['option_collection'].keys()
+        collection_hash = self.get_option_collection_hash(option_names)
+
+        # One OptionCollection row per option, all sharing the same hash.
+        for option_name in option_names:
+            option = Option.objects.get_or_create(name=option_name)[0]
+            OptionCollection.objects.get_or_create(
+                option_collection_hash=collection_hash,
+                option=option)
+
+        # Saving the row also refreshes its last_touched field.
+        job_fields = {
+            'build_platform': build_platform,
+            'machine_platform': machine_platform,
+            'job_type': job_type,
+            'option_collection_hash': collection_hash,
+            'repository': repo,
+        }
+        RunnableJob.objects.update_or_create(
+            ref_data_name=datum['ref_data_name'],
+            build_system_type=datum['build_system_type'],
+            defaults=job_fields)
+
+    def load(self, jobs_per_branch):
+        """
+        Store the jobs of every active repository.
+
+        ``jobs_per_branch`` maps a branch name to a list of job dicts;
+        branches that do not match the name of an active repository are
+        ignored.
+        """
+        for repo in Repository.objects.filter(active_status='active'):
+            # Some active repositories might not have any buildbot
+            # builders.
+            if repo.name in jobs_per_branch:
+                for datum in jobs_per_branch[repo.name]:
+                    self._load_job(repo, datum)
+
+    def run(self):
+        """Extract, transform and load the content of ALLTHETHINGS_URL."""
+        allthethings = self.extract(settings.ALLTHETHINGS_URL)
+        self.load(self.transform(allthethings))
--- a/treeherder/etl/buildbot.py
+++ b/treeherder/etl/buildbot.py
@ -1072,3 +1072,29 @@ def get_symbol(name, bn):
        return n

    return "{0}{1}".format(s, n)
+
+
+def get_symbols_and_platforms(buildername):
+    """
+    Build the job description dict derived from a buildername.
+
+    The platform, name and build type details all come from the
+    ``extract_*`` helpers; any detail they do not provide defaults to an
+    empty string. The build and machine platform entries are filled from
+    the same platform information.
+    """
+    platform_info = extract_platform_info(buildername)
+    job_name_info = extract_name_info(buildername)
+
+    type_name = job_name_info.get('name', '')
+    type_symbol = job_name_info.get('job_symbol', '')
+    group_name = job_name_info.get('group_name', '')
+    group_symbol = job_name_info.get('group_symbol', '')
+
+    os_platform = platform_info.get('os_platform', '')
+    os_name = platform_info.get('os', '')
+    architecture = platform_info.get('arch', '')
+
+    build_type = extract_build_type(buildername)
+
+    return {
+        'ref_data_name': buildername,
+        'build_system_type': 'buildbot',
+        'job_type_name': type_name,
+        'job_type_symbol': type_symbol,
+        'job_group_name': group_name,
+        'job_group_symbol': group_symbol,
+        'build_platform': os_platform,
+        'build_os': os_name,
+        'build_architecture': architecture,
+        'platform': os_platform,
+        'machine_platform_os': os_name,
+        'machine_platform_architecture': architecture,
+        'option_collection': {build_type: True},
+        'job_coalesced_to_guid': None,
+    }
--- a/treeherder/etl/tasks/buildapi_tasks.py
+++ b/treeherder/etl/tasks/buildapi_tasks.py
@ -3,6 +3,7 @@ This module contains
 """
 from celery import task

+from treeherder.etl.allthethings import RunnableJobsProcess
 from treeherder.etl.buildapi import (Builds4hJobsProcess,
                                     PendingJobsProcess,
                                     RunningJobsProcess)
@ -34,6 +35,14 @@ def fetch_buildapi_build4h():
    Builds4hJobsProcess().run()


+@task(name='fetch-allthethings', time_limit=10 * 60)
+def fetch_allthethings():
+    """
+    Run the allthethings ETL process: fetch the possible jobs and load
+    them.
+    """
+    runnable_jobs_process = RunnableJobsProcess()
+    runnable_jobs_process.run()
+
+
@task(name='fetch-push-logs')
 def fetch_push_logs():
    """
--- a/treeherder/model/migrations/0004_add_runnable_job_table.py
+++ b/treeherder/model/migrations/0004_add_runnable_job_table.py
@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Create the ``runnable_job`` table for the ``RunnableJob`` model."""
+
+    # Depends on migration 0003_auto_20151111_0942 of the 'model' app.
+    dependencies = [
+        ('model', '0003_auto_20151111_0942'),
+    ]
+
+    operations = [
+        # Create the model with its four foreign keys (build platform, job
+        # type, machine platform and repository) and its scalar columns.
+        migrations.CreateModel(
+            name='RunnableJob',
+            fields=[
+                ('id', models.AutoField(serialize=False, primary_key=True)),
+                ('option_collection_hash', models.CharField(max_length=64L)),
+                ('ref_data_name', models.CharField(max_length=255L)),
+                ('build_system_type', models.CharField(max_length=25L)),
+                ('last_touched', models.DateTimeField(auto_now=True)),
+                ('build_platform', models.ForeignKey(to='model.BuildPlatform')),
+                ('job_type', models.ForeignKey(to='model.JobType')),
+                ('machine_platform', models.ForeignKey(to='model.MachinePlatform')),
+                ('repository', models.ForeignKey(to='model.Repository')),
+            ],
+            options={
+                'db_table': 'runnable_job',
+            },
+        ),
+        # A (ref_data_name, build_system_type) pair may only be stored once.
+        migrations.AlterUniqueTogether(
+            name='runnablejob',
+            unique_together=set([('ref_data_name', 'build_system_type')]),
+        ),
+    ]
--- a/treeherder/model/models.py
+++ b/treeherder/model/models.py
@ -671,3 +671,27 @@ class FailureMatch(models.Model):
        unique_together = (
            ('failure_line', 'classified_failure', 'matcher')
        )
+
+
+@python_2_unicode_compatible
+class RunnableJob(models.Model):
+    """
+    A job known to a build system, stored in the ``runnable_job`` table.
+
+    A row is identified by its ``ref_data_name`` together with its
+    ``build_system_type`` and points at the platforms, job type and
+    repository it belongs to.
+    """
+    id = models.AutoField(primary_key=True)
+    build_platform = models.ForeignKey(BuildPlatform)
+    machine_platform = models.ForeignKey(MachinePlatform)
+    job_type = models.ForeignKey(JobType)
+    # Stored as a plain string; this is not a foreign key.
+    option_collection_hash = models.CharField(max_length=64L)
+    ref_data_name = models.CharField(max_length=255L)
+    build_system_type = models.CharField(max_length=25L)
+    repository = models.ForeignKey(Repository)
+    # auto_now=True: set to the current time whenever the row is saved.
+    last_touched = models.DateTimeField(auto_now=True)
+
+    class Meta:
+        db_table = 'runnable_job'
+        # The parentheses below do not nest: this is one flat tuple of two
+        # field names, which Django accepts as a single uniqueness
+        # constraint over both fields.
+        unique_together = (
+            ('ref_data_name', 'build_system_type')
+        )
+
+    def __str__(self):
+        # Rendered as "<id> <ref_data_name> <build_system_type>".
+        return "{0} {1} {2}".format(self.id,
+                                    self.ref_data_name,
+                                    self.build_system_type)
--- a/treeherder/model/tasks.py
+++ b/treeherder/model/tasks.py
@ -108,6 +108,22 @@ def publish_resultset_action(project, action, resultset_id, requester, times=1):
    )


+@task(name='publish-resultset-runnable-job-action')
+def publish_resultset_runnable_job_action(project, resultset_id, requester,
+                                          buildernames):
+    """
+    Publish a resultset runnable-job action through the pulse publisher.
+
+    Nothing is published when ``pulse_connection.get_publisher()`` returns
+    a falsy value.
+    """
+    publisher = pulse_connection.get_publisher()
+    if publisher:
+        message = dict(
+            version=1,
+            project=project,
+            requester=requester,
+            resultset_id=resultset_id,
+            buildernames=buildernames,
+        )
+        publisher.resultset_runnable_job_action(**message)
+
+
@task(name='publish-resultset')
 def publish_resultset(project, ids):
    # If we don't have a publisher (because of missing configs), then we can't
--- a/treeherder/webapp/api/runnable_jobs.py
+++ b/treeherder/webapp/api/runnable_jobs.py
@ -0,0 +1,65 @@
+import datetime
+
+from rest_framework import viewsets
+from rest_framework.response import Response
+
+from treeherder.model import models
+
+
+class RunnableJobsViewSet(viewsets.ViewSet):
+    """
+    This viewset is responsible for the runnable_jobs endpoint.
+
+    """
+
+    def list(self, request, project):
+        """
+        GET method implementation for list of all runnable buildbot jobs
+
+        ``project`` is the repository name taken from the URL. Only jobs
+        of that repository whose ``last_touched`` lies within the last
+        week are returned. The response body holds a ``meta`` dict
+        (repository, offset, count) and a ``results`` list with one dict
+        per runnable job.
+
+        Responds with HTTP 404 when no repository is named ``project``.
+        """
+        try:
+            repository = models.Repository.objects.get(name=project)
+        except models.Repository.DoesNotExist:
+            # An unknown project is a client error, not a server failure.
+            return Response(
+                {"detail": "No project with name {0}".format(project)},
+                status=404)
+
+        # Group the option names by collection hash in a single pass, so
+        # that each job below needs one dict lookup instead of a rescan of
+        # every OptionCollection row. Names keep the queryset's order.
+        option_rows = models.OptionCollection.objects.all().select_related(
+            'option').values_list('option__name', 'option_collection_hash')
+        option_names_by_hash = {}
+        for option_name, col_hash in option_rows:
+            option_names_by_hash.setdefault(col_hash, []).append(option_name)
+
+        runnable_jobs = models.RunnableJob.objects.filter(
+            repository=repository,
+            last_touched__gte=datetime.datetime.now() - datetime.timedelta(weeks=1)
+        ).select_related('build_platform', 'machine_platform',
+                         'job_type', 'job_type__job_group')
+
+        ret = []
+        for datum in runnable_jobs:
+            # A hash without any known option yields an empty string.
+            options = ' '.join(
+                option_names_by_hash.get(datum.option_collection_hash, []))
+
+            ret.append({
+                'build_platform_id': datum.build_platform.id,
+                'build_platform': datum.build_platform.platform,
+                'build_os': datum.build_platform.os_name,
+                'build_architecture': datum.build_platform.architecture,
+                'machine_platform_id': datum.machine_platform.id,
+                'platform': datum.machine_platform.platform,
+                'machine_platform_os': datum.machine_platform.os_name,
+                'machine_platform_architecture': datum.machine_platform.architecture,
+                'job_group_id': datum.job_type.job_group.id,
+                'job_group_name': datum.job_type.job_group.name,
+                'job_group_symbol': datum.job_type.job_group.symbol,
+                'job_group_description': datum.job_type.job_group.description,
+                'job_type_id': datum.job_type.id,
+                'job_type_name': datum.job_type.name,
+                'job_type_symbol': datum.job_type.symbol,
+                'job_type_description': datum.job_type.description,
+                'option_collection_hash': datum.option_collection_hash,
+                'ref_data_name': datum.ref_data_name,
+                'build_system_type': datum.build_system_type,
+                'platform_option': options,
+                'job_coalesced_to_guid': None,
+                'state': 'runnable',
+                'result': 'runnable'})
+
+        response_body = dict(meta={"repository": project,
+                                   "offset": 0,
+                                   "count": len(ret)},
+                             results=ret)
+
+        return Response(response_body)
--- a/treeherder/webapp/api/urls.py
+++ b/treeherder/webapp/api/urls.py
@ -11,7 +11,8 @@ from treeherder.webapp.api import (artifact,
                                   note,
                                   performance_data,
                                   refdata,
-                                   resultset)
+                                   resultset,
+                                   runnable_jobs)

 # router for views that are bound to a project
 # i.e. all those views that don't involve reference data
@ -23,6 +24,12 @@ project_bound_router.register(
    base_name='jobs',
 )

+project_bound_router.register(
+    r'runnable_jobs',
+    runnable_jobs.RunnableJobsViewSet,
+    base_name='runnable_jobs',
+)
+
 project_bound_router.register(
    r'resultset',
    resultset.ResultSetViewSet,