From f5817334d86e08836874b00ecac38fd6e4479f3c Mon Sep 17 00:00:00 2001
From: Ed Morley
Date: Wed, 20 Jul 2016 14:57:23 +0100
Subject: [PATCH] Bug 1284289 - Manually report exceptions that caused tasks to
 retry

See in-code comment for more details.

Once this is deployed, I'll use the New Relic web configuration page to add
`celery.exceptions:Retry` to the ignore list. (Contrary to the linked New
Relic docs, this cannot be done via newrelic.ini, since the server-side
config takes preference once server-side mode is enabled.)
---
 treeherder/workers/task.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/treeherder/workers/task.py b/treeherder/workers/task.py
index 764846953..7e7ab4d42 100644
--- a/treeherder/workers/task.py
+++ b/treeherder/workers/task.py
@@ -2,6 +2,7 @@ import random
 import zlib
 from functools import wraps
 
+import newrelic.agent
 from celery import task
 from django.db.utils import (IntegrityError,
                              ProgrammingError)
@@ -30,10 +31,25 @@ class retryable_task(object):
             except self.NON_RETRYABLE_EXCEPTIONS:
                 raise
             except Exception as e:
+                number_of_prior_retries = task_func.request.retries
+                # Whilst the New Relic agent does report the exception that caused a retry,
+                # it does so in a form like:
+                # `celery.exceptions:Retry: Retry in 640s: error('Error -3 while decompressing: incorrect header check',)`
+                # ...which causes all retry exceptions to be lumped together in the same
+                # `celery.exceptions:Retry` group. The original exception is then only
+                # reported to New Relic once the max number of retries has been reached.
+                # As such we manually report the retried exceptions to New Relic here, so
+                # that the original exception is shown verbatim immediately, and then filter
+                # out the automatic `celery.exceptions:Retry` exceptions via the web UI. See:
+                # https://docs.newrelic.com/docs/agents/python-agent/back-end-services/python-agent-celery#ignoring-task-retry-errors
+                params = {
+                    "number_of_prior_retries": number_of_prior_retries,
+                }
+                newrelic.agent.record_exception(params=params)
                 # Implement exponential backoff with some randomness to prevent
                 # thundering herd type problems. Constant factor chosen so we get
                 # reasonable pause between the fastest retries.
-                timeout = 10 * int(random.uniform(1.9, 2.1) ** task_func.request.retries)
+                timeout = 10 * int(random.uniform(1.9, 2.1) ** number_of_prior_retries)
                 raise task_func.retry(exc=e, countdown=timeout)
 
         task_func = task(*self.task_args, **self.task_kwargs)(inner)
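
For reference, the retry delays produced by the backoff formula in the patch can
be previewed with a small standalone sketch. This is illustrative only and
assumes nothing beyond the Python standard library; the backoff_seconds helper
name is made up for the example and does not appear in the patch.

    import random

    def backoff_seconds(number_of_prior_retries):
        # Same formula as the patch: a 10 second base and a factor of roughly 2,
        # jittered between 1.9 and 2.1 so that many failing tasks do not all
        # retry at exactly the same moment (thundering herd).
        return 10 * int(random.uniform(1.9, 2.1) ** number_of_prior_retries)

    for attempt in range(7):
        print(attempt, backoff_seconds(attempt))

    # Approximate delay ranges per attempt: 10, 10-20, 30-40, 60-90, 130-190,
    # 240-400 and 470-850 seconds; the "Retry in 640s" quoted in the in-code
    # comment corresponds to the sixth retry.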