From f5817334d86e08836874b00ecac38fd6e4479f3c Mon Sep 17 00:00:00 2001
From: Ed Morley
Date: Wed, 20 Jul 2016 14:57:23 +0100
Subject: [PATCH] Bug 1284289 - Manually report exceptions that caused tasks to
 retry

See in-code comment for more details.

Once this is deployed, I'll use the New Relic web configuration page to add
`celery.exceptions:Retry` to the ignore list. (Contrary to the linked New
Relic docs, this cannot be done via newrelic.ini, since the server-side
config takes preference once server-side mode is enabled.)
---
 treeherder/workers/task.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/treeherder/workers/task.py b/treeherder/workers/task.py
index 764846953..7e7ab4d42 100644
--- a/treeherder/workers/task.py
+++ b/treeherder/workers/task.py
@@ -2,6 +2,7 @@ import random
 import zlib
 from functools import wraps
 
+import newrelic.agent
 from celery import task
 from django.db.utils import (IntegrityError,
                              ProgrammingError)
@@ -30,10 +31,25 @@ class retryable_task(object):
             except self.NON_RETRYABLE_EXCEPTIONS:
                 raise
             except Exception as e:
+                number_of_prior_retries = task_func.request.retries
+                # Whilst the New Relic agent does report the exception that caused a retry,
+                # it does so in a form like:
+                # `celery.exceptions:Retry: Retry in 640s: error('Error -3 while decompressing: incorrect header check',)`
+                # ...which causes all retry exceptions to be lumped together in the same
+                # `celery.exceptions:Retry` group. The original exception is then only
+                # reported to New Relic once the max number of retries has been reached.
+                # As such we manually report the retried exceptions to New Relic here, so
+                # that the original exception is shown verbatim immediately, and then filter
+                # out the automatic `celery.exceptions:Retry` exceptions via the web UI. See:
+                # https://docs.newrelic.com/docs/agents/python-agent/back-end-services/python-agent-celery#ignoring-task-retry-errors
+                params = {
+                    "number_of_prior_retries": number_of_prior_retries,
+                }
+                newrelic.agent.record_exception(params=params)
                 # Implement exponential backoff with some randomness to prevent
                 # thundering herd type problems. Constant factor chosen so we get
                 # reasonable pause between the fastest retries.
-                timeout = 10 * int(random.uniform(1.9, 2.1) ** task_func.request.retries)
+                timeout = 10 * int(random.uniform(1.9, 2.1) ** number_of_prior_retries)
                 raise task_func.retry(exc=e, countdown=timeout)
 
         task_func = task(*self.task_args, **self.task_kwargs)(inner)
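
For reference, the retry delays produced by the backoff formula in the patch can
be previewed with a small standalone sketch. This is illustrative only and
assumes nothing beyond the Python standard library; the backoff_seconds helper
name is made up for the example and does not appear in the patch.

    import random

    def backoff_seconds(number_of_prior_retries):
        # Same formula as the patch: a 10 second base and a factor of roughly 2,
        # jittered between 1.9 and 2.1 so that many failing tasks do not all
        # retry at exactly the same moment (thundering herd).
        return 10 * int(random.uniform(1.9, 2.1) ** number_of_prior_retries)

    for attempt in range(7):
        print(attempt, backoff_seconds(attempt))

    # Approximate delay ranges per attempt: 10, 10-20, 30-40, 60-90, 130-190,
    # 240-400 and 470-850 seconds; the "Retry in 640s" quoted in the in-code
    # comment corresponds to the sixth retry.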