Bug 1295997 - Skip parsing logs whose compressed size exceeds 5MB (#4700)

Occasionally failing build/test runs can fail in such a way that results
in a significant amount of log spam and therefore log files that are
hundreds of MB in size each. This can cause log parsing backlogs,
particularly when many jobs on the same push fail in such a way.

The log parser now checks the `Content-Length` of log files prior to
streaming them, and skips the download/parse if it exceeds the set
threshold. The frontend has been adjusted to display an appropriate
message explaining why the parsed log is not available.

The threshold has been set to 5MB, since:
* the 99th percentile of download size on New Relic was ~2.8MB:
  https://insights.newrelic.com/accounts/677903/dashboards/339080
* `Content-Length` is the size of the log prior to decompression, and
  the chronic logspam cases have been known to have compression ratios
  of 20-50x, which would translate to an uncompressed size limit of
  up to 250MB (which is already much larger than buildbot's former 50MB
  uncompressed size limit).
This commit is contained in:
Ed Morley 2019-02-25 19:04:38 +00:00 committed by GitHub
Parent 047c90228e
Commit 52d6017c5b
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
11 changed files: 88 additions and 11 deletions

View file

@@ -1,7 +1,10 @@
import pytest
import responses
from tests.test_utils import add_log_response
from treeherder.log_parser.artifactbuildercollection import ArtifactBuilderCollection
from treeherder.log_parser.artifactbuildercollection import (ArtifactBuilderCollection,
LogSizeException,
MAX_DOWNLOAD_SIZE_IN_BYTES)
from treeherder.log_parser.artifactbuilders import BuildbotLogViewArtifactBuilder
@@ -60,3 +63,22 @@ def test_all_builders_complete():
}
assert exp == lpc.artifacts
@responses.activate
def test_log_download_size_limit():
    """A log whose Content-Length exceeds the limit must raise LogSizeException."""
    log_url = 'http://foo.tld/fake_large_log.tar.gz'
    # Advertise a Content-Length one byte over the allowed maximum.
    oversized_length = str(MAX_DOWNLOAD_SIZE_IN_BYTES + 1)
    responses.add(
        responses.GET,
        log_url,
        body='',
        adding_headers={'Content-Encoding': 'gzip', 'Content-Length': oversized_length},
    )
    collection = ArtifactBuilderCollection(log_url)
    with pytest.raises(LogSizeException):
        collection.parse()

View file

@@ -220,12 +220,14 @@ def add_log_response(filename):
log_url = "http://my-log.mozilla.org/{}".format(filename)
with open(log_path, 'rb') as log_file:
content = log_file.read()
responses.add(
responses.GET,
log_url,
body=log_file.read(),
body=content,
adding_headers={
"Content-Encoding": "gzip",
'Content-Encoding': 'gzip',
'Content-Length': str(len(content)),
}
)
return log_url

View file

@@ -6,6 +6,9 @@ from .artifactbuilders import (BuildbotJobArtifactBuilder,
BuildbotLogViewArtifactBuilder,
BuildbotPerformanceDataArtifactBuilder)
# Max log size in bytes we will download (prior to decompression).
MAX_DOWNLOAD_SIZE_IN_BYTES = 5 * 1024 * 1024
class ArtifactBuilderCollection(object):
"""
@@ -86,16 +89,21 @@ BuildbotPerformanceDataArtifactBuilder
building the ``artifact`` as we go.
"""
with make_request(self.url, stream=True) as response:
download_size_in_bytes = int(response.headers.get('Content-Length', -1))
# Temporary annotation of log size to help set thresholds in bug 1295997.
newrelic.agent.add_custom_parameter(
'unstructured_log_size',
int(response.headers.get('Content-Length', -1))
download_size_in_bytes
)
newrelic.agent.add_custom_parameter(
'unstructured_log_encoding',
response.headers.get('Content-Encoding', 'None')
)
if download_size_in_bytes > MAX_DOWNLOAD_SIZE_IN_BYTES:
raise LogSizeException('Download size of %i bytes exceeds limit' % download_size_in_bytes)
# Lines must be explicitly decoded since `iter_lines()`` returns bytes by default
# and we cannot use its `decode_unicode=True` mode, since otherwise Unicode newline
# characters such as `\u0085` (which can appear in test output) are treated the same
@@ -116,3 +124,7 @@ BuildbotPerformanceDataArtifactBuilder
if name == 'performance_data' and not artifact[name]:
continue
self.artifacts[name] = artifact
class LogSizeException(Exception):
    """Raised when a log's Content-Length exceeds MAX_DOWNLOAD_SIZE_IN_BYTES."""

View file

@@ -41,9 +41,9 @@ def parse_logs(job_id, job_log_ids, priority):
newrelic.agent.add_custom_parameter("job_log_%s_url" % job_log.name, job_log.url)
logger.debug("parser_task for %s", job_log.id)
# Don't parse jobs which have already been parsed.
if job_log.status == JobLog.PARSED:
logger.info("%s log already parsed", job_log.id)
# Only parse logs which haven't yet been processed or else failed on the last attempt.
if job_log.status not in (JobLog.PENDING, JobLog.FAILED):
logger.info('Skipping parsing for job %s since log already processed', job_log.id)
continue
parser = parser_tasks.get(job_log.name)

View file

@@ -5,7 +5,8 @@ from requests.exceptions import HTTPError
from treeherder.etl.artifact import (serialize_artifact_json_blobs,
store_job_artifacts)
from treeherder.log_parser.artifactbuildercollection import ArtifactBuilderCollection
from treeherder.log_parser.artifactbuildercollection import (ArtifactBuilderCollection,
LogSizeException)
from treeherder.model.models import JobLog
logger = logging.getLogger(__name__)
@@ -36,6 +37,10 @@ def post_log_artifacts(job_log):
try:
artifact_list = extract_text_log_artifacts(job_log)
except LogSizeException as e:
job_log.update_status(JobLog.SKIPPED_SIZE)
logger.warning('Skipping parsing log for %s: %s', job_log.id, e)
return
except Exception as e:
job_log.update_status(JobLog.FAILED)

View file

@@ -0,0 +1,18 @@
# Generated by Django 2.0.13 on 2019-02-25 14:09
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the 'skipped-size' (3) choice to JobLog.status (bug 1295997)."""

    # Must apply after the push-revision index migration.
    dependencies = [
        ('model', '0013_add_index_to_push_revision'),
    ]

    operations = [
        # Extend the status choices; existing rows keep their current values
        # and the default remains 0 ('pending').
        migrations.AlterField(
            model_name='joblog',
            name='status',
            field=models.IntegerField(choices=[(0, 'pending'), (1, 'parsed'), (2, 'failed'), (3, 'skipped-size')], default=0),
        ),
    ]

View file

@@ -724,10 +724,14 @@ class JobLog(models.Model):
PENDING = 0
PARSED = 1
FAILED = 2
SKIPPED_SIZE = 3
STATUSES = ((PENDING, 'pending'),
(PARSED, 'parsed'),
(FAILED, 'failed'))
STATUSES = (
(PENDING, 'pending'),
(PARSED, 'parsed'),
(FAILED, 'failed'),
(SKIPPED_SIZE, 'skipped-size'),
)
job = models.ForeignKey(Job, on_delete=models.CASCADE, related_name="job_log")
name = models.CharField(max_length=50)

View file

@@ -50,6 +50,9 @@ class ActionBar extends React.PureComponent {
case 'failed':
notify('Log parsing has failed, log viewer is unavailable', 'warning');
break;
case 'skipped-size':
notify('Log parsing was skipped, log viewer is unavailable', 'warning');
break;
case 'unavailable':
notify('No logs available for this job', 'info');
break;

View file

@@ -18,6 +18,11 @@ function getLogUrlProps(logUrl, logViewerUrl, logViewerFullUrl) {
className: 'disabled',
title: 'Log parsing has failed',
};
case 'skipped-size':
return {
className: 'disabled',
title: 'Log parsing was skipped',
};
case 'pending':
return {
className: 'disabled',

View file

@@ -112,6 +112,8 @@ class AutoclassifyTab extends React.Component {
return 'Logs not fully parsed, please wait';
case 'failed':
return 'Log parsing failed';
case 'skipped-size':
return 'Log parsing was skipped since the log file exceeds the size limit';
case 'no_logs':
return 'No errors logged';
case 'error':

View file

@@ -119,6 +119,10 @@ class FailureSummaryTab extends React.Component {
<ListItem text="Log parsing failed. Unable to generate failure summary." />
)}
{!bugSuggestionsLoading && logParseStatus === 'skipped-size' && (
<ListItem text="Log parsing was skipped since the log exceeds the size limit." />
)}
{!bugSuggestionsLoading && !logs.length && (
<ListItem text="No logs available for this job." />
)}