Bug 1295997 - Skip parsing logs whose compressed size exceeds 5MB (#4700)

Occasionally failing build/test runs can fail in such a way that results
in a significant amount of log spam and therefore log files that are
hundreds of MB in size each. This can cause log parsing backlogs,
particularly when many jobs on the same push fail in such a way.

The log parser now checks the `Content-Length` of log files prior to
streaming them, and skips the download/parse if it exceeds the set
threshold. The frontend has been adjusted to display an appropriate
message explaining why the parsed log is not available.

The threshold has been set to 5MB, since:
* the 99th percentile of download size on New Relic was ~2.8MB:
  https://insights.newrelic.com/accounts/677903/dashboards/339080
* `Content-Length` is the size of the log prior to decompression, and
  the chronic logspam cases have been known to have compression ratios
  of 20-50x, which would translate to an uncompressed size limit of
  up to 250MB (which is already much larger than buildbot's former 50MB
  uncompressed size limit).
This commit is contained in:
Ed Morley 2019-02-25 19:04:38 +00:00 committed by GitHub
Parent 047c90228e
Commit 52d6017c5b
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
11 changed files: 88 additions and 11 deletions

View file

@@ -1,7 +1,10 @@
import pytest
import responses
from tests.test_utils import add_log_response
from treeherder.log_parser.artifactbuildercollection import ArtifactBuilderCollection
from treeherder.log_parser.artifactbuildercollection import (ArtifactBuilderCollection,
LogSizeException,
MAX_DOWNLOAD_SIZE_IN_BYTES)
from treeherder.log_parser.artifactbuilders import BuildbotLogViewArtifactBuilder
@@ -60,3 +63,22 @@ def test_all_builders_complete():
}
assert exp == lpc.artifacts
@responses.activate
def test_log_download_size_limit():
    """A log whose Content-Length exceeds the limit must raise LogSizeException."""
    log_url = 'http://foo.tld/fake_large_log.tar.gz'
    # Advertise a Content-Length one byte over the allowed maximum.
    oversized_length = str(MAX_DOWNLOAD_SIZE_IN_BYTES + 1)
    responses.add(
        responses.GET,
        log_url,
        body='',
        adding_headers={'Content-Encoding': 'gzip', 'Content-Length': oversized_length},
    )
    collection = ArtifactBuilderCollection(log_url)
    with pytest.raises(LogSizeException):
        collection.parse()

View file

@@ -220,12 +220,14 @@ def add_log_response(filename):
log_url = "http://my-log.mozilla.org/{}".format(filename)
with open(log_path, 'rb') as log_file:
content = log_file.read()
responses.add(
responses.GET,
log_url,
body=log_file.read(),
body=content,
adding_headers={
"Content-Encoding": "gzip",
'Content-Encoding': 'gzip',
'Content-Length': str(len(content)),
}
)
return log_url

View file

@@ -6,6 +6,9 @@ from .artifactbuilders import (BuildbotJobArtifactBuilder,
BuildbotLogViewArtifactBuilder,
BuildbotPerformanceDataArtifactBuilder)
# Max log size in bytes we will download (prior to decompression).
MAX_DOWNLOAD_SIZE_IN_BYTES = 5 * 1024 * 1024
class ArtifactBuilderCollection(object):
"""
@@ -86,16 +89,21 @@ BuildbotPerformanceDataArtifactBuilder
building the ``artifact`` as we go.
"""
with make_request(self.url, stream=True) as response:
download_size_in_bytes = int(response.headers.get('Content-Length', -1))
# Temporary annotation of log size to help set thresholds in bug 1295997.
newrelic.agent.add_custom_parameter(
'unstructured_log_size',
int(response.headers.get('Content-Length', -1))
download_size_in_bytes
)
newrelic.agent.add_custom_parameter(
'unstructured_log_encoding',
response.headers.get('Content-Encoding', 'None')
)
if download_size_in_bytes > MAX_DOWNLOAD_SIZE_IN_BYTES:
raise LogSizeException('Download size of %i bytes exceeds limit' % download_size_in_bytes)
# Lines must be explicitly decoded since `iter_lines()`` returns bytes by default
# and we cannot use its `decode_unicode=True` mode, since otherwise Unicode newline
# characters such as `\u0085` (which can appear in test output) are treated the same
@@ -116,3 +124,7 @@ BuildbotPerformanceDataArtifactBuilder
if name == 'performance_data' and not artifact[name]:
continue
self.artifacts[name] = artifact
class LogSizeException(Exception):
    """Raised when a log's Content-Length exceeds MAX_DOWNLOAD_SIZE_IN_BYTES."""

View file

@@ -41,9 +41,9 @@ def parse_logs(job_id, job_log_ids, priority):
newrelic.agent.add_custom_parameter("job_log_%s_url" % job_log.name, job_log.url)
logger.debug("parser_task for %s", job_log.id)
# Don't parse jobs which have already been parsed.
if job_log.status == JobLog.PARSED:
logger.info("%s log already parsed", job_log.id)
# Only parse logs which haven't yet been processed or else failed on the last attempt.
if job_log.status not in (JobLog.PENDING, JobLog.FAILED):
logger.info('Skipping parsing for job %s since log already processed', job_log.id)
continue
parser = parser_tasks.get(job_log.name)

View file

@@ -5,7 +5,8 @@ from requests.exceptions import HTTPError
from treeherder.etl.artifact import (serialize_artifact_json_blobs,
store_job_artifacts)
from treeherder.log_parser.artifactbuildercollection import ArtifactBuilderCollection
from treeherder.log_parser.artifactbuildercollection import (ArtifactBuilderCollection,
LogSizeException)
from treeherder.model.models import JobLog
logger = logging.getLogger(__name__)
@@ -36,6 +37,10 @@ def post_log_artifacts(job_log):
try:
artifact_list = extract_text_log_artifacts(job_log)
except LogSizeException as e:
job_log.update_status(JobLog.SKIPPED_SIZE)
logger.warning('Skipping parsing log for %s: %s', job_log.id, e)
return
except Exception as e:
job_log.update_status(JobLog.FAILED)

View file

@@ -0,0 +1,18 @@
# Generated by Django 2.0.13 on 2019-02-25 14:09
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the 'skipped-size' (3) choice to JobLog.status (bug 1295997)."""

    # Must apply after the push-revision index migration.
    dependencies = [
        ('model', '0013_add_index_to_push_revision'),
    ]

    operations = [
        # Extend the status choices; existing rows keep their current values
        # and the default remains 0 ('pending').
        migrations.AlterField(
            model_name='joblog',
            name='status',
            field=models.IntegerField(choices=[(0, 'pending'), (1, 'parsed'), (2, 'failed'), (3, 'skipped-size')], default=0),
        ),
    ]

View file

@@ -724,10 +724,14 @@ class JobLog(models.Model):
PENDING = 0
PARSED = 1
FAILED = 2
SKIPPED_SIZE = 3
STATUSES = ((PENDING, 'pending'),
(PARSED, 'parsed'),
(FAILED, 'failed'))
STATUSES = (
(PENDING, 'pending'),
(PARSED, 'parsed'),
(FAILED, 'failed'),
(SKIPPED_SIZE, 'skipped-size'),
)
job = models.ForeignKey(Job, on_delete=models.CASCADE, related_name="job_log")
name = models.CharField(max_length=50)

View file

@@ -50,6 +50,9 @@ class ActionBar extends React.PureComponent {
case 'failed':
notify('Log parsing has failed, log viewer is unavailable', 'warning');
break;
case 'skipped-size':
notify('Log parsing was skipped, log viewer is unavailable', 'warning');
break;
case 'unavailable':
notify('No logs available for this job', 'info');
break;

View file

@@ -18,6 +18,11 @@ function getLogUrlProps(logUrl, logViewerUrl, logViewerFullUrl) {
className: 'disabled',
title: 'Log parsing has failed',
};
case 'skipped-size':
return {
className: 'disabled',
title: 'Log parsing was skipped',
};
case 'pending':
return {
className: 'disabled',

View file

@@ -112,6 +112,8 @@ class AutoclassifyTab extends React.Component {
return 'Logs not fully parsed, please wait';
case 'failed':
return 'Log parsing failed';
case 'skipped-size':
return 'Log parsing was skipped since the log file exceeds the size limit';
case 'no_logs':
return 'No errors logged';
case 'error':

View file

@@ -119,6 +119,10 @@ class FailureSummaryTab extends React.Component {
<ListItem text="Log parsing failed. Unable to generate failure summary." />
)}
{!bugSuggestionsLoading && logParseStatus === 'skipped-size' && (
<ListItem text="Log parsing was skipped since the log exceeds the size limit." />
)}
{!bugSuggestionsLoading && !logs.length && (
<ListItem text="No logs available for this job." />
)}