From 08c04ae79ab0b4d4eac8b68e1657bae758209125 Mon Sep 17 00:00:00 2001
From: Cameron Dawson
Date: Wed, 15 Oct 2014 10:37:09 -0700
Subject: [PATCH] bug 1077136 - ingest bad pushes as 'onhold' so we don't keep
 trying to ingest them

---
 treeherder/etl/buildapi.py       | 51 +++++++++++---------------------
 treeherder/etl/common.py         | 37 +++++++++++++++++++++++
 treeherder/etl/pushlog.py        | 12 ++++----
 treeherder/model/derived/jobs.py | 11 +++++++
 treeherder/model/sql/jobs.json   |  7 +++--
 5 files changed, 77 insertions(+), 41 deletions(-)

diff --git a/treeherder/etl/buildapi.py b/treeherder/etl/buildapi.py
index 7fd47f50e..065e485d4 100644
--- a/treeherder/etl/buildapi.py
+++ b/treeherder/etl/buildapi.py
@@ -131,18 +131,13 @@ class Builds4hTransformerMixin(object):
             artifact_build = copy.deepcopy(build)
 
             try:
-                branch = revisions_lookup[project]
-                try:
-                    resultset = branch[prop['revision']]
-                except KeyError:
-                    # we don't have the resultset for this build/job yet
-                    # we need to queue fetching that resultset
-                    if prop['revision'] not in ["Unknown", None]:
-                        missing_resultsets[project].add(prop['revision'])
-
-                    continue
+                resultset = common.get_resultset(project,
+                                                 revisions_lookup,
+                                                 prop['revision'],
+                                                 missing_resultsets,
+                                                 logger)
             except KeyError:
-                # this branch is not one of those we care about
+                # skip this job, at least at this point
                 continue
 
             treeherder_data = {
@@ -303,18 +298,13 @@ class PendingTransformerMixin(object):
 
             for revision, jobs in revisions.items():
 
                 try:
-                    branch = revisions_lookup[project]
-                    try:
-                        resultset = branch[revision]
-                    except KeyError:
-                        # we don't have the resultset for this build/job yet
-                        # we need to queue fetching that resultset
-                        if revision not in ["Unknown", None]:
-                            missing_resultsets[project].add(revision)
-
-                        continue
+                    resultset = common.get_resultset(project,
+                                                     revisions_lookup,
+                                                     revision,
+                                                     missing_resultsets,
+                                                     logger)
                 except KeyError:
-                    # this branch is not one of those we care about
+                    # skip this job, at least at this point
                     continue
 
                 # using project and revision form the revision lookups
@@ -433,18 +423,13 @@ class RunningTransformerMixin(object):
 
             for revision, jobs in revisions.items():
                 try:
-                    branch = revisions_lookup[project]
-                    try:
-                        resultset = branch[revision]
-                    except KeyError:
-                        # we don't have the resultset for this build/job yet
-                        # we need to queue fetching that resultset
-                        if revision not in ["Unknown", None]:
-                            missing_resultsets[project].add(revision)
-
-                        continue
+                    resultset = common.get_resultset(project,
+                                                     revisions_lookup,
+                                                     revision,
+                                                     missing_resultsets,
+                                                     logger)
                 except KeyError:
-                    # this branch is not one of those we care about
+                    # skip this job, at least at this point
                     continue
 
                 # using project and revision form the revision lookups
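All three hunks above delegate the nested lookup to a single helper in
treeherder/etl/common.py (added just below) and treat KeyError as "skip this
job for now".  As a rough sketch of the data the helper works against, the two
structures look something like this; the project name, revision strings and
timestamps are invented, and the per-resultset fields mirror the columns
selected by get_revision_resultset_lookup later in this patch:

from collections import defaultdict

# project -> revision -> resultset row, roughly what
# JobsModel.get_revision_resultset_lookup() hands back (illustrative values only)
revisions_lookup = {
    "mozilla-inbound": {
        "1111aaaa2222": {
            "id": 10,
            "revision_hash": "deadbeefcafe",
            "push_timestamp": 1413390000,
            "active_status": "active",   # placeholder pushes carry 'onhold'
        },
    },
}

# project -> set of revisions whose resultsets still need to be fetched from
# json-pushes; get_resultset() adds to this as a side effect
missing_resultsets = defaultdict(set)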
diff --git a/treeherder/etl/common.py b/treeherder/etl/common.py
index 56b5bfbdb..b3f80d69a 100644
--- a/treeherder/etl/common.py
+++ b/treeherder/etl/common.py
@@ -163,3 +163,40 @@ def fetch_missing_resultsets(source, missing_resultsets, logger):
             missing_resultsets,
             ex
         ))
+
+
+def get_resultset(project, revisions_lookup, revision, missing_resultsets, logger):
+    """
+    Get the resultset out of the revisions_lookup for the given revision.
+
+    This is a little complex due to our attempts to get missing resultsets
+    in case we see jobs that, for one reason or another, we didn't get the
+    resultset from json-pushes.
+
+    This raises a KeyError if the project or revision isn't found in the
+    lookup, or if the resultset is not active; either way the caller
+    should skip the job.
+    """
+    branch = revisions_lookup[project]
+
+    try:
+        resultset = branch[revision]
+    except KeyError:
+        # we don't have the resultset for this build/job yet,
+        # so queue fetching it
+        if revision not in ["Unknown", None]:
+            missing_resultsets[project].add(revision)
+        raise
+
+    # the resultset may have been ingested as non-active: for example,
+    # pending/running/builds4hr can report a bad revision (from the wrong
+    # repo), and we ingest such resultsets as 'onhold' so we don't keep
+    # re-trying to find them when we hit jobs like this.  Whatever the
+    # reason it is inactive, we don't want to ingest jobs for it, so
+    # raise KeyError to tell the caller to skip this job.
+    if resultset.get("active_status", "active") != "active":
+        logger.warn(("Skipping job for non-active "
+                     "resultset/revision: {0}").format(revision))
+        raise KeyError(
+            "non-active resultset for revision: {0}".format(revision))
+
+    return resultset
diff --git a/treeherder/etl/pushlog.py b/treeherder/etl/pushlog.py
index b23eaaa52..0b0439e45 100644
--- a/treeherder/etl/pushlog.py
+++ b/treeherder/etl/pushlog.py
@@ -110,13 +110,14 @@ class MissingHgPushlogProcess(HgPushlogTransformerMixin,
                               OAuthLoaderMixin):
 
     def extract(self, url, resultset):
-        # we will sometimes get here because builds4hr/pending/running have a
-        # job with a resultset that json-pushes doesn't know about.  Seems
-        # odd, but it happens.  So we just ingest
-        logger.info("extracting missing resultsets: {0}".format(url))
         response = requests.get(url, timeout=settings.TREEHERDER_REQUESTS_TIMEOUT)
         if response.status_code == 404:
+            # we will sometimes get here because builds4hr/pending/running have
+            # a job with a resultset that json-pushes doesn't know about.  It
+            # seems odd, but it happens, so we ingest a placeholder for it.
+            logger.error("no pushlog in json-pushes. generating a placeholder: {0}".format(url))
+
             # we want to make a "fake" resultset, because json-pushes doesn't
             # know about it.  This is what TBPL does
             return {
@@ -132,7 +133,8 @@ class MissingHgPushlogProcess(HgPushlogTransformerMixin,
                             "desc": "Pushlog not found at {0}".format(url)
                         }
                     ],
-                    "user": "Unknown"
+                    "user": "Unknown",
+                    "active_status": "onhold"
                 }
             }
         else:
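A minimal usage sketch for the helper added above, assuming the get_resultset()
from the treeherder.etl.common hunk is importable; the project, revisions and
timestamps are invented:

import logging
from collections import defaultdict

from treeherder.etl.common import get_resultset

logger = logging.getLogger(__name__)

revisions_lookup = {
    "mozilla-inbound": {
        "1111aaaa2222": {"id": 10, "push_timestamp": 1413390000,
                         "active_status": "active"},
        "3333bbbb4444": {"id": 11, "push_timestamp": 1413391111,
                         "active_status": "onhold"},   # bad push, kept as a placeholder
    }
}
missing_resultsets = defaultdict(set)

active = []
for rev in ["1111aaaa2222", "3333bbbb4444", "5555cccc6666"]:
    try:
        active.append(get_resultset("mozilla-inbound", revisions_lookup, rev,
                                    missing_resultsets, logger))
    except KeyError:
        continue   # onhold placeholder or unknown revision: the job is skipped

assert len(active) == 1
# only the genuinely unknown revision is queued for a later pushlog fetch
assert missing_resultsets["mozilla-inbound"] == {"5555cccc6666"}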
""" replacement = ",".join(["%s"] * len(revision_list)) @@ -2405,6 +2415,7 @@ class JobsModel(TreeherderModelBase): result.get('author', 'unknown@somewhere.com'), result['revision_hash'], result['push_timestamp'], + result.get('active_status', 'active'), result['revision_hash'] ] ) diff --git a/treeherder/model/sql/jobs.json b/treeherder/model/sql/jobs.json index f1d09bebd..5b8b13f98 100644 --- a/treeherder/model/sql/jobs.json +++ b/treeherder/model/sql/jobs.json @@ -125,8 +125,8 @@ }, "set_result_set":{ - "sql":"INSERT INTO `result_set` (`author`, `revision_hash`,`push_timestamp`) - SELECT ?,?,? + "sql":"INSERT INTO `result_set` (`author`, `revision_hash`,`push_timestamp`, `active_status`) + SELECT ?,?,?,? FROM DUAL WHERE NOT EXISTS ( SELECT `revision_hash`, `push_timestamp` @@ -760,6 +760,7 @@ rs.id, rs.revision_hash, rs.push_timestamp, + rs.active_status, revision.id as revision_id, revision.revision FROM result_set AS rs @@ -767,7 +768,7 @@ ON rs.id = revision_map.result_set_id INNER JOIN revision ON revision_map.revision_id = revision.id - WHERE rs.active_status = 'active' + WHERE 1 REP0 ORDER BY rs.push_timestamp DESC LIMIT ?,?