diff --git a/treeherder/etl/common.py b/treeherder/etl/common.py
index cebae8771..56b5bfbdb 100644
--- a/treeherder/etl/common.py
+++ b/treeherder/etl/common.py
@@ -159,4 +159,7 @@ def fetch_missing_resultsets(source, missing_resultsets, logger):
     try:
         fetch_missing_push_logs.apply_async(args=[missing_resultsets])
     except Exception as ex:
-        logger.error("error fetching missing resultsets: {0}".format(ex))
+        logger.exception("error fetching missing resultsets: {0}, {1}".format(
+            missing_resultsets,
+            ex
+        ))
diff --git a/treeherder/etl/mixins.py b/treeherder/etl/mixins.py
index 257498873..84cd9e033 100644
--- a/treeherder/etl/mixins.py
+++ b/treeherder/etl/mixins.py
@@ -129,6 +129,7 @@ class OAuthLoaderMixin(object):
                 oauth_secret=credentials.get('consumer_secret', None)
             )
 
+            logger.info("collection loading request: {0}".format(th_request.get_uri(th_collections[project].endpoint_base)))
             response = th_request.post(th_collections[project])
 
             if not response or response.status != 200:
diff --git a/treeherder/etl/pushlog.py b/treeherder/etl/pushlog.py
index 831d20d81..b23eaaa52 100644
--- a/treeherder/etl/pushlog.py
+++ b/treeherder/etl/pushlog.py
@@ -1,5 +1,6 @@
 from django.core.cache import cache
 from django.conf import settings
+import time
 
 import requests
 import logging
@@ -108,16 +109,40 @@ class HgPushlogProcess(HgPushlogTransformerMixin,
 class MissingHgPushlogProcess(HgPushlogTransformerMixin,
                               OAuthLoaderMixin):
 
-    def extract(self, url):
+    def extract(self, url, resultset):
+        # we will sometimes get here because builds4hr/pending/running have a
+        # job with a resultset that json-pushes doesn't know about.  Seems
+        # odd, but it happens.  So we just ingest it anyway.
+        logger.info("extracting missing resultsets: {0}".format(url))
         response = requests.get(url, timeout=settings.TREEHERDER_REQUESTS_TIMEOUT)
-        response.raise_for_status()
+        if response.status_code == 404:
+            # we want to make a "fake" resultset, because json-pushes doesn't
+            # know about it.  This is what TBPL does
+            return {
+                "00001": {
+                    "date": int(time.time()),
+                    "changesets": [
+                        {
+                            "node": resultset,
+                            "files": [],
+                            "tags": [],
+                            "author": "Unknown",
+                            "branch": "default",
+                            "desc": "Pushlog not found at {0}".format(url)
+                        }
+                    ],
+                    "user": "Unknown"
+                }
+            }
+        else:
+            response.raise_for_status()
         return response.json()
 
-    def run(self, source_url, repository):
+    def run(self, source_url, repository, resultset):
         try:
-            extracted_content = self.extract(source_url)
+            extracted_content = self.extract(source_url, resultset)
             if extracted_content:
@@ -125,16 +150,22 @@ class MissingHgPushlogProcess(HgPushlogTransformerMixin,
                     extracted_content,
                     repository
                 )
-                logger.info("loading missing resultsets: {0}".format(transformed))
+
+                for project, coll in transformed.iteritems():
+                    logger.info("loading missing resultsets for {0}: {1}".format(
+                        project,
+                        coll.to_json()))
+
                 self.load(transformed)
+                logger.info("done loading missing resultsets for {0}".format(repository))
             else:
                 assert extracted_content, (
                     "Got no content response for missing resultsets: {0}".format(
                         source_url)
                 )
         except Exception as ex:
-            logger.error("error fetching missing resultsets: {0}".format(
-                ex
+            logger.exception("error loading missing resultsets: {0}".format(
+                source_url
             ))
 
 
 class GitPushlogTransformerMixin(object):
diff --git a/treeherder/etl/tasks/cleanup_tasks.py b/treeherder/etl/tasks/cleanup_tasks.py
index 80fc33b1f..198efbd51 100644
--- a/treeherder/etl/tasks/cleanup_tasks.py
+++ b/treeherder/etl/tasks/cleanup_tasks.py
@@ -16,20 +16,23 @@ def fetch_missing_push_logs(missing_pushlogs):
     try:
         repos = filter(lambda x: x['url'], rdm.get_all_repository_info())
         for repo in repos:
-            if repo['dvcs_type'] == 'hg':
-                fetch_missing_hg_push_logs.apply_async(args=(
-                    repo['name'],
-                    repo['url'],
-                    missing_pushlogs[repo['name']]
-                ),
-                    routing_key='pushlog'
+            if repo['dvcs_type'] == 'hg' and repo['name'] in missing_pushlogs:
+                # we must get them one at a time, because if ANY are missing
+                # from json-pushes, it'll return a 404 for the group.
+                for resultset in missing_pushlogs[repo['name']]:
+                    fetch_missing_hg_push_logs.apply_async(args=(
+                        repo['name'],
+                        repo['url'],
+                        resultset
+                    ),
+                        routing_key='pushlog'
                     )
     finally:
         rdm.disconnect()
 
 
 @task(name='fetch-missing-hg-push-logs', time_limit=3*60)
-def fetch_missing_hg_push_logs(repo_name, repo_url, resultsets):
+def fetch_missing_hg_push_logs(repo_name, repo_url, resultset):
     """
     Run a HgPushlog etl process
@@ -37,10 +40,10 @@ def fetch_missing_hg_push_logs(repo_name, repo_url, resultsets):
     """
     process = MissingHgPushlogProcess()
 
-    changesetParam = urllib.urlencode({"changeset": resultsets}, True)
+    changesetParam = urllib.urlencode({"changeset": resultset}, True)
     url_str = repo_url + '/json-pushes/?full=1&' + changesetParam
 
     logger.info("fetching missing resultsets: {0}".format(url_str))
-    process.run(url_str, repo_name)
+    process.run(url_str, repo_name, resultset)
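
Note (not part of the patch): a minimal sketch of the json-pushes URL construction that fetch_missing_hg_push_logs performs after this change, with a hypothetical repo URL and changeset id. It also shows why the old list-based call had to be split up: urlencode with doseq=True expands a list into repeated changeset= parameters, and per the comment in cleanup_tasks.py, json-pushes returns a 404 for the whole group if any one changeset is unknown.

    # Sketch only (Python 2): per-changeset json-pushes URL as built above.
    # repo_url and resultset are hypothetical example values.
    import urllib

    repo_url = "https://hg.mozilla.org/mozilla-central"   # hypothetical
    resultset = "a1b2c3d4e5f6"                             # hypothetical changeset id

    changesetParam = urllib.urlencode({"changeset": resultset}, True)
    url_str = repo_url + '/json-pushes/?full=1&' + changesetParam
    # url_str == "https://hg.mozilla.org/mozilla-central/json-pushes/?full=1&changeset=a1b2c3d4e5f6"

    # Pre-patch, a whole list was passed; doseq=True expands it to repeated params:
    # urllib.urlencode({"changeset": ["a1b2c3", "d4e5f6"]}, True)
    # == "changeset=a1b2c3&changeset=d4e5f6"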