Bug 1198786 - Stop storing performance artifacts

They were never used for anything and take up a lot of space. They
also don't fit into the new performance model we're working on. We
can always bring back the useful bits later.
William Lachance 2015-08-28 10:49:29 -04:00
Parent 0dec2d5049
Commit 7440089508
13 changed files with 102 additions and 423 deletions

View file

@@ -1,73 +1,106 @@
+import copy
 import json
-import zlib
+import unittest
 from tests.sampledata import SampleData
 from treeherder.etl.perf_data_adapters import TalosDataAdapter
-def test_adapt_and_load():
+class TalosDataAdapterTest(unittest.TestCase):
-talos_perf_data = SampleData.get_talos_perf_data()
+def test_adapt_and_load(self):
+tda = TalosDataAdapter()
+talos_perf_data = SampleData.get_talos_perf_data()
-result_count = 0
-for datum in talos_perf_data:
+for talos_datum in talos_perf_data:
-datum = {
-"job_guid": 'oqiwy0q847365qiu',
-"name": "test",
-"type": "test",
-"blob": datum
-}
-job_data = {
-"oqiwy0q847365qiu": {
-"id": 1,
-"result_set_id": 1,
-"push_timestamp": 1402692388
+datum = {
+"job_guid": 'oqiwy0q847365qiu',
+"name": "test",
+"type": "test",
+"blob": talos_datum
+}
-}
-reference_data = {
-"property1": "value1",
-"property2": "value2",
-"property3": "value3"
-}
+job_data = {
+"oqiwy0q847365qiu": {
+"id": 1,
+"result_set_id": 1,
+"push_timestamp": 1402692388
+}
+}
-# one extra result for the summary series
-result_count += len(datum['blob']["results"]) + 1
+reference_data = {
+"property1": "value1",
+"property2": "value2",
+"property3": "value3"
+}
-# we create one performance series per counter
-if 'talos_counters' in datum['blob']:
-result_count += len(datum['blob']["talos_counters"])
-# Mimic production environment, the blobs are serialized
-# when the web service receives them
-datum['blob'] = json.dumps({'talos_data': [datum['blob']]})
-tda = TalosDataAdapter()
-tda.adapt_and_load(reference_data, job_data, datum)
+# Mimic production environment, the blobs are serialized
+# when the web service receives them
+datum['blob'] = json.dumps({'talos_data': [datum['blob']]})
+tda.adapt_and_load(reference_data, job_data, datum)
+# base: subtests + one extra result for the summary series
+expected_result_count = len(talos_datum["results"]) + 1
-# we upload a summary with a suite and subtest values, +1 for suite
-if 'summary' in datum['blob']:
-results = json.loads(zlib.decompress(tda.performance_artifact_placeholders[-1][4]))
-data = json.loads(datum['blob'])['talos_data'][0]
-assert results["blob"]["performance_series"]["geomean"] == data['summary']['suite']
+# we create one performance series per counter
+if 'talos_counters' in talos_datum:
+expected_result_count += len(talos_datum["talos_counters"])
-# deal with the subtests now
-for i in range(0, len(data['summary']['subtests'])):
-subresults = json.loads(zlib.decompress(tda.performance_artifact_placeholders[-1 - i][4]))
-if 'subtest_signatures' in subresults["blob"]['signature_properties']:
-# ignore summary signatures
-continue
+# result count == number of signatures
+self.assertEqual(expected_result_count, len(tda.signatures.keys()))
-subdata = data['summary']['subtests'][subresults["blob"]['signature_properties']['test']]
-for datatype in ['min', 'max', 'mean', 'median', 'std']:
-assert subdata[datatype] == subresults["blob"]["performance_series"][datatype]
-if 'value' in subdata.keys():
-assert subdata['value'] == subresults["blob"]["performance_series"]['value']
-else:
-# FIXME: the talos data blob we're currently using contains datums with summaries and those without
-# we should probably test non-summarized data as well
-pass
+# verify that we have signatures for the subtests
+signature_placeholders = copy.copy(
+tda.signature_property_placeholders)
+for (testname, results) in talos_datum["results"].iteritems():
+signature_placeholder = filter(
+lambda p: p[2] == testname, signature_placeholders)
+self.assertEqual(len(signature_placeholder), 1)
+signature_hash = signature_placeholder[0][0]
+perfdata = tda.signatures[signature_hash][0]
+if talos_datum.get('summary'):
+# if we have a summary, ensure the subtest summary values made
+# it in
+for measure in ['min', 'max', 'std', 'mean', 'median']:
+self.assertEqual(
+round(talos_datum['summary']['subtests'][testname][measure], 2),
+perfdata[measure])
+else:
+# this is an old style talos blob without a summary. these are going
+# away, so I'm not going to bother testing the correctness. however
+# let's at least verify that some values are being generated here
+for measure in ['min', 'max', 'std', 'mean', 'median']:
+self.assertTrue(perfdata[measure])
-assert result_count == len(tda.performance_artifact_placeholders)
+# filter out this signature from data to process
+signature_placeholders = filter(
+lambda p: p[0] != signature_hash, signature_placeholders)
+# if we have counters, verify that the series for them is as expected
+for (counter, results) in talos_datum.get('talos_counters',
+{}).iteritems():
+signature_placeholder = filter(
+lambda p: p[2] == counter, signature_placeholders)
+self.assertEqual(len(signature_placeholder), 1)
+signature_hash = signature_placeholder[0][0]
+perfdata = tda.signatures[signature_hash][0]
+for measure in ['max', 'mean']:
+self.assertEqual(round(float(results[measure]), 2),
+perfdata[measure])
+# filter out this signature from data to process
+signature_placeholders = filter(
+lambda p: p[0] != signature_hash, signature_placeholders)
+# we should be left with just summary signature placeholders
+self.assertEqual(len(signature_placeholders), 2)
+perfdata = tda.signatures[signature_placeholders[0][0]][0]
+if talos_datum.get('summary'):
+self.assertEqual(round(talos_datum['summary']['suite'], 2),
+perfdata['geomean'])
+else:
+# old style talos blob without summary. again, going away,
+# but let's at least test that we have the 'geomean' value
+# generated
+self.assertTrue(perfdata['geomean'])
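
For orientation, here is a condensed, stand-alone version of the flow the rewritten test exercises. It is a sketch, not part of the commit: identifiers come from the diff above, and running it assumes a treeherder checkout of this era (Python 2).

import json

from tests.sampledata import SampleData
from treeherder.etl.perf_data_adapters import TalosDataAdapter

tda = TalosDataAdapter()
talos_datum = SampleData.get_talos_perf_data()[0]
datum = {
    "job_guid": 'oqiwy0q847365qiu',
    "name": "test",
    "type": "test",
    # production serializes blobs before the adapter sees them
    "blob": json.dumps({'talos_data': [talos_datum]})
}
job_data = {"oqiwy0q847365qiu": {"id": 1, "result_set_id": 1,
                                 "push_timestamp": 1402692388}}
reference_data = {"property1": "value1", "property2": "value2",
                  "property3": "value3"}

tda.adapt_and_load(reference_data, job_data, datum)
# after this commit the adapter only accumulates series data:
# one signature per subtest, plus counters and a summary series
print(len(tda.signatures))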

View file

@@ -7,8 +7,6 @@ import simplejson as json
from django.conf import settings
from django.utils.six import BytesIO
from treeherder.model.derived import ArtifactsModel
from ..sampledata import SampleData
@@ -27,18 +25,6 @@ def jobs_with_local_log(initial_data):
return [job]
@pytest.fixture
def jobs_with_local_talos_log(initial_data):
sample_data = SampleData()
url = sample_data.get_performance_logs()[0]
job = sample_data.job_data[0]
# substitute the log url with a local url
job['job']['log_references'][0]['url'] = url
return [job]
@pytest.fixture
def jobs_with_local_mozlog_log(initial_data):
log = ("plain-chunked_raw.log")
@@ -152,24 +138,6 @@ def test_parse_mozlog_log(jm, initial_data, jobs_with_local_mozlog_log,
assert len(fails) == 3
def test_parse_talos_log(jm, test_project, initial_data, jobs_with_local_talos_log,
sample_resultset, mock_post_json,
mock_get_remote_content):
"""
check that performance job_artifacts get inserted when running
a parse_log task for a talos job
"""
jm.store_result_set_data(sample_resultset)
jobs = jobs_with_local_talos_log
jm.store_job_data(jobs)
with ArtifactsModel(test_project) as artifacts_model:
artifact_list = artifacts_model.get_performance_artifact_list(0, 10)
assert len(artifact_list) >= 1 # should parse out at least one perf artifact
def test_bug_suggestions_artifact(jm, initial_data, jobs_with_local_log,
sample_resultset, mock_post_json,
mock_get_remote_content

View file

@@ -385,44 +385,6 @@ def test_get_job_data(jm, test_project, refdata, sample_data, initial_data,
assert len(job_data) == target_len
def test_store_performance_artifact(
jm, test_project, refdata, sample_data, sample_resultset, initial_data,
mock_log_parser):
tp_data = test_utils.ingest_talos_performance_data(
jm, refdata, sample_data, sample_resultset
)
job_ids = tp_data['job_ids']
perf_data = tp_data['perf_data']
for index, d in enumerate(perf_data):
perf_data[index]['blob'] = json.dumps({'talos_data': [d['blob']]})
with ArtifactsModel(test_project) as artifacts_model:
artifacts_model.store_performance_artifact(job_ids, perf_data)
replace = [','.join(['%s'] * len(job_ids))]
performance_artifact_signatures = jm.get_dhub().execute(
proc="jobs.selects.get_performance_artifact",
debug_show=jm.DEBUG,
placeholders=job_ids,
replace=replace,
return_type='set',
key_column='series_signature')
series_signatures = jm.get_dhub().execute(
proc="jobs.selects.get_all_series_signatures",
return_type='set',
key_column='signature',
debug_show=jm.DEBUG)
jm.disconnect()
assert performance_artifact_signatures == series_signatures
def test_store_performance_series(jm, test_project):
# basic case: everything works as expected
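
A note on the query plumbing in the removed test: the REP0 token in the stored SQL (see the datasource JSON further down) is expanded by the caller. A minimal sketch of the pattern, lifted from the deleted lines above; `jm` stands in for the JobsModel fixture:

# one %s placeholder per job id; the string is substituted for REP0
job_ids = [1, 2, 3]
replace = [','.join(['%s'] * len(job_ids))]  # -> "%s,%s,%s"
# signatures = jm.get_dhub().execute(
#     proc="jobs.selects.get_performance_artifact",
#     placeholders=job_ids,
#     replace=replace,
#     return_type='set',
#     key_column='series_signature')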

View file

@@ -336,40 +336,3 @@ def clean_job_blob_dict(job):
pass # no problem
return job
def ingest_talos_performance_data(jm, refdata, sample_data, sample_resultset):
talos_perf_data = sample_data.get_talos_perf_data()
job_data = sample_data.job_data[:20]
do_job_ingestion(
jm, refdata, job_data, sample_resultset, False)
job_guids = map(lambda job: job['job']['job_guid'], job_data)
job_id_lookup = jm.get_job_ids_by_guid(job_guids)
job_ids = map(lambda job_guid: job_id_lookup[job_guid]['id'], job_guids)
# Dynamically map the job_guids to the talos test objects
# so that reference data will exist for the talos blobs
talos_perf_index_max = len(talos_perf_data)
talos_perf_index = 0
perf_data = []
for job_guid in job_guids:
perf_data.append({
"job_guid": job_guid,
"name": "talos",
"type": "performance",
"blob": talos_perf_data[talos_perf_index]
})
# cycle through the talos perf indexes so we test all of
# the sample structures
if talos_perf_index == talos_perf_index_max - 1:
talos_perf_index = 0
else:
talos_perf_index += 1
return {
"job_ids": job_ids,
"perf_data": perf_data
}
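
The removed helper's index juggling is plain round-robin assignment of sample blobs to jobs. An equivalent sketch (guids and blobs here are hypothetical, not from the sample data):

from itertools import cycle

job_guids = ['guid-a', 'guid-b', 'guid-c']        # hypothetical
talos_blobs = [{'results': {}}, {'results': {}}]  # hypothetical
perf_data = [{"job_guid": guid,
              "name": "talos",
              "type": "performance",
              "blob": blob}
             for guid, blob in zip(job_guids, cycle(talos_blobs))]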

View file

@@ -5,7 +5,6 @@ def test_project_endpoint(webapp, eleven_jobs_stored, jm):
url = '/api/project/%s' % jm.project
resp = webapp.get(url)
assert resp.json['max_job_id'] == 11
assert resp.json['max_performance_artifact_id'] == 0
def test_project_endpoint_does_not_exist(webapp, eleven_jobs_stored, jm):

View file

@@ -1,6 +1,5 @@
import logging
import math
import zlib
from hashlib import sha1
import simplejson as json
@@ -42,41 +41,6 @@ class PerformanceDataAdapter(object):
"required": ["results", "test_build", "testrun", "test_machine"]
}
"""
name = test suite name
type = perf_test | perf_aux
perf_aux can have any structure
"""
self.treeherder_perf_test_schema = {
"title": "Treeherder Schema",
"type": "object",
"properties": {
"job_guid": {"type": "string"},
"name": {"type": "string"},
"type": {"type": "string"},
"blob": {
"type": "object",
"properties": {
"date": {"type": "integer"}, # time test was run
"series_properties": {"type": "object"},
"series_signature": {"type": "string"},
"testsuite": {"type": "string"},
"test": {"type": "string"},
"replicates": {"type": "array"},
"performance_series": {"type": "object"},
"metadata": {"type": "object"} # (holds 'options' from talos data & various auxiliary data including 'test_aux', 'talox_aux', 'results_aux', and 'results_xperf')
},
"required": [
"date", "series_signature", "testsuite",
]
}
},
"required": ["blob", "job_guid", "name", "type"]
}
@staticmethod
def _round(num):
# Use a precision of .2f for all numbers stored
@@ -180,9 +144,9 @@ class PerformanceDataAdapter(object):
return sha.hexdigest()
-def _add_performance_artifact(self, job_id, series_signature,
-signature_properties, obj,
-name, testname, testdata):
+def _add_performance_placeholder(self, series_signature,
+signature_properties,
+testdata):
if series_signature not in self.signatures:
self.signatures[series_signature] = []
@@ -196,13 +160,6 @@ class PerformanceDataAdapter(object):
signature_properties[signature_property],
])
self.performance_artifact_placeholders.append([
job_id,
series_signature,
name,
testname,
zlib.compress(json.dumps(obj))
])
self.signatures[series_signature].append(testdata)
@@ -219,7 +176,6 @@ class TalosDataAdapter(PerformanceDataAdapter):
self.adapted_data = []
self.signatures = {}
self.performance_artifact_placeholders = []
self.signature_property_placeholders = []
@staticmethod
@@ -278,8 +234,6 @@ class TalosDataAdapter(PerformanceDataAdapter):
validate(talos_datum, self.datazilla_schema)
_job_guid = datum["job_guid"]
_name = datum["name"]
_type = "performance"
_suite = talos_datum["testrun"]["suite"]
# data for performance series
@@ -318,16 +272,8 @@ class TalosDataAdapter(PerformanceDataAdapter):
talos_datum["talos_counters"][_test]))
continue
-obj = self._get_base_perf_obj(_job_guid, _name, _type,
-talos_datum,
-series_signature,
-signature_properties,
-series_data)
-obj['test'] = _test
-validate(obj, self.treeherder_perf_test_schema)
-self._add_performance_artifact(job_id, series_signature,
-signature_properties, obj,
-_name, _test, series_data)
+self._add_performance_placeholder(
+series_signature, signature_properties, series_data)
subtest_signatures = []
@@ -353,18 +299,8 @@ class TalosDataAdapter(PerformanceDataAdapter):
series_data = self._extract_test_data(series_data,
summary_data)
-obj = self._get_base_perf_obj(_job_guid, _name, _type,
-talos_datum,
-series_signature,
-signature_properties,
-series_data)
-obj['test'] = _test
-obj['replicates'] = talos_datum["results"][_test]
-validate(obj, self.treeherder_perf_test_schema)
-self._add_performance_artifact(job_id, series_signature,
-signature_properties, obj,
-_name, _test, series_data)
+self._add_performance_placeholder(
+series_signature, signature_properties, series_data)
if subtest_signatures:
# summary series
@@ -383,16 +319,9 @@ class TalosDataAdapter(PerformanceDataAdapter):
summary_data = self._extract_summary_data(summary_data,
talos_datum["summary"])
-obj = self._get_base_perf_obj(_job_guid, _name, _type,
-talos_datum,
-summary_signature,
-summary_properties,
-summary_data)
-validate(obj, self.treeherder_perf_test_schema)
-self._add_performance_artifact(job_id, summary_signature,
-summary_properties, obj,
-_name, 'summary', summary_data)
+self._add_performance_placeholder(
+summary_signature, summary_properties,
+summary_data)
def submit_tasks(self, project):
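
Pieced together from the fragments above, the replacement method keeps only the series bookkeeping. A hedged re-sketch of what `_add_performance_placeholder` does (the [signature, property, value] row layout is inferred from the test's use of p[0] and p[2]; the class name here is hypothetical):

class PerfSeriesCollector(object):
    def __init__(self):
        self.signatures = {}
        self.signature_property_placeholders = []

    def _add_performance_placeholder(self, series_signature,
                                     signature_properties, testdata):
        # remember each new series and the properties that define it
        if series_signature not in self.signatures:
            self.signatures[series_signature] = []
            for prop in signature_properties:
                self.signature_property_placeholders.append(
                    [series_signature, prop, signature_properties[prop]])
        self.signatures[series_signature].append(testdata)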

View file

@@ -25,13 +25,6 @@ class ArtifactsModel(TreeherderModelBase):
"job_id": "job_id",
"name": "name",
"type": "type"
},
"performance_artifact": {
"id": "id",
"job_id": "job_id",
"series_signature": "series_signature",
"name": "name",
"type": "type"
}
}
@@ -85,48 +78,6 @@ class ArtifactsModel(TreeherderModelBase):
return data
def get_performance_artifact_list(self, offset, limit, conditions=None):
"""
Retrieve a list of performance artifacts. The conditions parameter is a
dict containing a set of conditions for each key. e.g.:
{
'job_id': set([('IN', (1, 2))])
}
"""
replace_str, placeholders = self._process_conditions(
conditions, self.INDEXED_COLUMNS['performance_artifact']
)
repl = [replace_str]
proc = "jobs.selects.get_performance_artifact_list"
data = self.execute(
proc=proc,
replace=repl,
placeholders=placeholders,
limit=limit,
offset=offset,
debug_show=self.DEBUG,
)
for artifact in data:
artifact["blob"] = utils.decompress_if_needed(artifact["blob"])
# performance artifacts are always json encoded
artifact["blob"] = json.loads(artifact["blob"])
return data
def get_max_performance_artifact_id(self):
"""Get the maximum performance artifact id."""
data = self.execute(
proc="jobs.selects.get_max_performance_artifact_id",
debug_show=self.DEBUG,
)
return int(data[0]['max_id'] or 0)
def store_job_artifact(self, artifact_placeholders):
"""
Store a list of job_artifacts given a list of placeholders
@@ -171,12 +122,6 @@ class ArtifactsModel(TreeherderModelBase):
# adapt and load data into placeholder structures
tda.adapt_and_load(ref_data, job_data, perf_data)
self.execute(
proc="jobs.inserts.set_performance_artifact",
debug_show=self.DEBUG,
placeholders=tda.performance_artifact_placeholders,
executemany=True)
self.execute(
proc='jobs.inserts.set_series_signature',
debug_show=self.DEBUG,

View file

@@ -103,7 +103,6 @@ class JobsModel(TreeherderModelBase):
# to their ids.
JOBS_CYCLE_TARGETS = [
"jobs.deletes.cycle_job_artifact",
"jobs.deletes.cycle_performance_artifact",
"jobs.deletes.cycle_job_log_url",
"jobs.deletes.cycle_job_note",
"jobs.deletes.cycle_bug_job_map",

View file

@@ -55,11 +55,6 @@
"sql":"DELETE FROM result_set WHERE id IN (REP0)",
"host_type": "master_host"
},
"cycle_performance_artifact":{
"sql":"DELETE FROM performance_artifact WHERE job_id IN (REP0)",
"host_type": "master_host"
}
},
"inserts":{
@@ -95,12 +90,6 @@
"host_type":"master_host"
},
"set_performance_artifact":{
"sql":"INSERT INTO `performance_artifact` (`job_id`, `series_signature`, `name`, `type`, `blob`)
VALUES (?,?,?,?,?)",
"host_type":"master_host"
},
"set_series_signature":{
"sql":"INSERT INTO `series_signature` (`signature`, `property`, `value`)
SELECT ?,?,?
@@ -371,30 +360,6 @@
"host_type":"read_host"
},
"get_performance_artifact":{
"sql":"SELECT `id`, `job_id`, `series_signature`, `name`, `type`, `blob`
FROM `performance_artifact`
WHERE `job_id` IN (REP0) AND `active_status` = 'active'",
"host_type":"read_host"
},
"get_max_performance_artifact_id":{
"sql":"SELECT max(`id`) as max_id from `performance_artifact`",
"host_type": "read_host"
},
"get_performance_artifact_list": {
"sql":"SELECT `id`, `job_id`, `series_signature`, `name`, `type`, `blob`
FROM `performance_artifact`
WHERE `active_status` = 'active'
REP0
ORDER BY id DESC, name",
"host_type":"read_host"
},
"get_performance_series": {

View file

@@ -234,46 +234,6 @@ CREATE TABLE `job_artifact` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
/*!40101 SET character_set_client = @saved_cs_client */;
DROP TABLE IF EXISTS `performance_artifact`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
/**************************
* Table: performance_artifact
*
* Holds any kind of performance related data associated with a job. Data can be structured
* JSON data or binary in the case of an image.
*
* Population Method: dynamic from incoming data
*
* Example Data:
*
* job_id - References job.id
* series_signature - References series_signature.signature. A hash of the property values defining a series.
* name - Name of artifact data.
* type - json | img | ...
* blob - Artifact data
**************************/
CREATE TABLE `performance_artifact` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
`job_id` bigint(20) unsigned NOT NULL,
`failure_classification_id` int(10) unsigned DEFAULT NULL,
`series_signature` varchar(50) COLLATE utf8_bin NOT NULL,
`name` varchar(50) COLLATE utf8_bin NOT NULL,
`type` varchar(50) COLLATE utf8_bin NOT NULL,
`blob` mediumblob NOT NULL,
`active_status` enum('active','onhold','deleted') COLLATE utf8_bin DEFAULT 'active',
PRIMARY KEY (`id`),
KEY `idx_job_id` (`job_id`),
KEY `idx_failure_classification_id` (`failure_classification_id`),
KEY `idx_series_signature` (`series_signature`),
KEY `idx_name` (`name`),
KEY `idx_type` (`type`),
KEY `idx_active_status` (`active_status`),
CONSTRAINT `fk_performance_artifact` FOREIGN KEY (`job_id`) REFERENCES `job` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
/*!40101 SET character_set_client = @saved_cs_client */;
DROP TABLE IF EXISTS `series_signature`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;

View file

@@ -1,36 +0,0 @@
from rest_framework import viewsets
from rest_framework.response import Response
from treeherder.webapp.api.utils import UrlQueryFilter, with_jobs
class PerformanceArtifactViewSet(viewsets.ViewSet):
"""
This viewset is responsible for the artifact endpoint.
"""
@with_jobs
def retrieve(self, request, project, jm, pk=None):
"""
retrieve a single instance of performance_artifact
"""
filter = UrlQueryFilter({"id": pk})
objs = jm.get_performance_artifact_list(0, 1, filter.conditions)
if objs:
return Response(objs[0])
else:
return Response("performance_artifact {0} not found".format(pk), 404)
@with_jobs
def list(self, request, project, jm):
"""
return a list of job artifacts
"""
filter = UrlQueryFilter(request.QUERY_PARAMS)
offset = int(filter.pop("offset", 0))
count = min(int(filter.pop("count", 10)), 1000)
objs = jm.get_performance_artifact_list(offset, count, filter.conditions)
return Response(objs)
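
For reference, the routes this deleted file served (URL shapes follow from the viewset methods above and the router registration removed from urls.py below; the project name is illustrative):

GET /api/project/mozilla-inbound/performance_artifact/?offset=0&count=10
GET /api/project/mozilla-inbound/performance_artifact/42/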

View file

@@ -1,17 +1,15 @@
 import simplejson as json
 from django.http import HttpResponse, HttpResponseNotFound
-from treeherder.model.derived import ArtifactsModel, DatasetNotFoundError, JobsModel
+from treeherder.model.derived import DatasetNotFoundError, JobsModel
 def project_info(request, project):
 try:
-with JobsModel(project) as jobs_model, ArtifactsModel(project) as artifacts_model:
+with JobsModel(project) as jobs_model:
 return HttpResponse(
 content=json.dumps(
-{'max_job_id': jobs_model.get_max_job_id(),
-'max_performance_artifact_id':
-artifacts_model.get_max_performance_artifact_id()}
+{'max_job_id': jobs_model.get_max_job_id()}
 ),
 content_type='application/json'
 )
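
The visible effect on the endpoint, with response shapes reconstructed from this code and the adjusted test above:

GET /api/project/<project>/
before: {"max_job_id": 11, "max_performance_artifact_id": 0}
after:  {"max_job_id": 11}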

View file

@@ -2,8 +2,8 @@ from django.conf.urls import include, patterns, url
from rest_framework import routers
 from treeherder.webapp.api import (artifact, bug, job_log_url, jobs, logslice,
-note, performance_artifact, performance_data,
-projects, refdata, resultset)
+note, performance_data, projects, refdata,
+resultset)
# router for views that are bound to a project
# i.e. all those views that don't involve reference data
@@ -27,12 +27,6 @@ project_bound_router.register(
base_name='artifact',
)
project_bound_router.register(
r'performance_artifact',
performance_artifact.PerformanceArtifactViewSet,
base_name='performance_artifact',
)
project_bound_router.register(
r'note',
note.NoteViewSet,