bug 1077136 - ingest bad pushes as 'onhold' so we don't keep trying to ingest them

This commit is contained in:
Cameron Dawson 2014-10-15 10:37:09 -07:00
Родитель 914ef7c7f1
Коммит 08c04ae79a
5 изменённых файлов: 77 добавлений и 41 удалений

Просмотреть файл

@ -131,18 +131,13 @@ class Builds4hTransformerMixin(object):
artifact_build = copy.deepcopy(build)
try:
branch = revisions_lookup[project]
try:
resultset = branch[prop['revision']]
except KeyError:
# we don't have the resultset for this build/job yet
# we need to queue fetching that resultset
if prop['revision'] not in ["Unknown", None]:
missing_resultsets[project].add(prop['revision'])
continue
resultset = common.get_resultset(project,
revisions_lookup,
prop['revision'],
missing_resultsets,
logger)
except KeyError:
# this branch is not one of those we care about
# skip this job, at least at this point
continue
treeherder_data = {
@ -303,18 +298,13 @@ class PendingTransformerMixin(object):
for revision, jobs in revisions.items():
try:
branch = revisions_lookup[project]
try:
resultset = branch[revision]
except KeyError:
# we don't have the resultset for this build/job yet
# we need to queue fetching that resultset
if revision not in ["Unknown", None]:
missing_resultsets[project].add(revision)
continue
resultset = common.get_resultset(project,
revisions_lookup,
revision,
missing_resultsets,
logger)
except KeyError:
# this branch is not one of those we care about
# skip this job, at least at this point
continue
# using project and revision form the revision lookups
@ -433,18 +423,13 @@ class RunningTransformerMixin(object):
for revision, jobs in revisions.items():
try:
branch = revisions_lookup[project]
try:
resultset = branch[revision]
except KeyError:
# we don't have the resultset for this build/job yet
# we need to queue fetching that resultset
if revision not in ["Unknown", None]:
missing_resultsets[project].add(revision)
continue
resultset = common.get_resultset(project,
revisions_lookup,
revision,
missing_resultsets,
logger)
except KeyError:
# this branch is not one of those we care about
# skip this job, at least at this point
continue
# using project and revision form the revision lookups

Просмотреть файл

@ -163,3 +163,40 @@ def fetch_missing_resultsets(source, missing_resultsets, logger):
missing_resultsets,
ex
))
def get_resultset(project, revisions_lookup, revision, missing_resultsets, logger):
"""
Get the resultset out of the revisions_lookup for the given revision.
This is a little complex due to our attempts to get missing resultsets
in case we see jobs that, for one reason or another, we didn't get the
resultset from json-pushes.
This may raise a KeyError if the project or revision isn't found in the
lookup.. This signals that the job should be skipped
"""
branch = revisions_lookup[project]
# we can ingest resultsets that are not active for various
# reasons. One would be that the data from pending/running/
# builds4hr may have a bad revision (from the wrong repo).
# in this case, we ingest the resultset as inactive so we
# don't keep re-trying to find it when we hit jobs like this.
# Or, the resultset could be inactive for other reasons.
# Either way, we don't want to ingest jobs for it.
if branch.get("active_status", "active") != "active":
logger.warn(("Skipping job for non-active "
"resultset/revision: {0}").format(
revision))
try:
resultset = branch[revision]
except KeyError as ex:
# we don't have the resultset for this build/job yet
# we need to queue fetching that resultset
if revision not in ["Unknown", None]:
missing_resultsets[project].add(revision)
raise ex
return resultset

Просмотреть файл

@ -110,13 +110,14 @@ class MissingHgPushlogProcess(HgPushlogTransformerMixin,
OAuthLoaderMixin):
def extract(self, url, resultset):
# we will sometimes get here because builds4hr/pending/running have a
# job with a resultset that json-pushes doesn't know about. Seems
# odd, but it happens. So we just ingest
logger.info("extracting missing resultsets: {0}".format(url))
response = requests.get(url, timeout=settings.TREEHERDER_REQUESTS_TIMEOUT)
if response.status_code == 404:
# we will sometimes get here because builds4hr/pending/running have a
# job with a resultset that json-pushes doesn't know about. Seems
# odd, but it happens. So we just ingest
logger.error("no pushlog in json-pushes. generating a placeholder: {0}".format(url))
# we want to make a "fake" resultset, because json-pushes doesn't
# know about it. This is what TBPL does
return {
@ -132,7 +133,8 @@ class MissingHgPushlogProcess(HgPushlogTransformerMixin,
"desc": "Pushlog not found at {0}".format(url)
}
],
"user": "Unknown"
"user": "Unknown",
"active_status": "onhold"
}
}
else:

Просмотреть файл

@ -967,6 +967,16 @@ class JobsModel(TreeherderModelBase):
def get_revision_resultset_lookup(self, revision_list):
"""
Create a list of revision->resultset lookups from a list of revision
This will retrieve non-active resultsets as well. Some of the data
ingested has mixed up revisions that show for jobs, but are not in
the right repository in builds4hr/running/pending. So we ingest those
bad resultsets/revisions as non-active so that we don't keep trying
to re-ingest them. Allowing this query to retrieve non ``active``
resultsets means we will avoid re-doing that work by detacting that
we've already ingested it.
But we skip ingesting the job, because the resultset is not active.
"""
replacement = ",".join(["%s"] * len(revision_list))
@ -2405,6 +2415,7 @@ class JobsModel(TreeherderModelBase):
result.get('author', 'unknown@somewhere.com'),
result['revision_hash'],
result['push_timestamp'],
result.get('active_status', 'active'),
result['revision_hash']
]
)

Просмотреть файл

@ -125,8 +125,8 @@
},
"set_result_set":{
"sql":"INSERT INTO `result_set` (`author`, `revision_hash`,`push_timestamp`)
SELECT ?,?,?
"sql":"INSERT INTO `result_set` (`author`, `revision_hash`,`push_timestamp`, `active_status`)
SELECT ?,?,?,?
FROM DUAL
WHERE NOT EXISTS (
SELECT `revision_hash`, `push_timestamp`
@ -760,6 +760,7 @@
rs.id,
rs.revision_hash,
rs.push_timestamp,
rs.active_status,
revision.id as revision_id,
revision.revision
FROM result_set AS rs
@ -767,7 +768,7 @@
ON rs.id = revision_map.result_set_id
INNER JOIN revision
ON revision_map.revision_id = revision.id
WHERE rs.active_status = 'active'
WHERE 1
REP0
ORDER BY rs.push_timestamp DESC
LIMIT ?,?