Bug 1673911 - Remove more data from treeherder-prototype2

* overwrite cycle interval in strategies (if legal)
* DRY days validation & split test coverage
* provide test coverage & refactor strategies
ionutgoldan 2020-12-14 11:20:56 +02:00 committed by GitHub
Parent 08f29dd4d9
Commit fee6da9a74
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 336 additions and 224 deletions
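In short, this commit centralizes the `--days` override rule: an explicit perf-data retention interval is only legal on the treeherder-prototype2 environment. Below is a minimal, self-contained sketch of that guard. The real decorator is `has_valid_explicit_days` in `cycle_data.py` (third file in this diff) and reads `SITE_HOSTNAME` from `treeherder.config.settings`; the module-level variable and the `cycle_perf_data` function here are hypothetical stand-ins so the snippet runs on its own.

# Stand-in for treeherder.config.settings.SITE_HOSTNAME.
SITE_HOSTNAME = 'treeherder-prototype2.herokuapp.com'

def has_valid_explicit_days(func):
    def wrapper(*args, **kwargs):
        days = kwargs.get('days')
        # An explicit days override is only legal on treeherder-prototype2.
        if (days is not None) and SITE_HOSTNAME != 'treeherder-prototype2.herokuapp.com':
            raise ValueError(
                'Cannot override perf data retention parameters on projects '
                'other than treeherder-prototype2'
            )
        # NB: the wrapper returns None; that is fine because the real code
        # decorates __init__ methods, which return None anyway.
        func(*args, **kwargs)

    return wrapper

@has_valid_explicit_days
def cycle_perf_data(days: int = None):  # hypothetical, for illustration only
    print(f'cycling data older than {days} days')

cycle_perf_data(days=30)  # passes here; raises ValueError on any other hostname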


@@ -5,20 +5,16 @@ from datetime import datetime, timedelta
import pytest
from unittest.mock import MagicMock
from django.core.management import call_command
from django.db.models import Max
from unittest.mock import patch
from tests import test_utils
from tests.autoclassify.utils import create_failure_lines, test_line
from treeherder.model.management.commands.cycle_data import PerfherderCycler
from treeherder.model.models import (
FailureLine,
Job,
JobGroup,
JobLog,
JobType,
Machine,
Push,
from treeherder.model.management.commands.cycle_data import (
PerfherderCycler,
MainRemovalStrategy,
TryDataRemoval,
IrrelevantDataRemoval,
StalledDataRemoval,
)
from treeherder.model.models import Push
from treeherder.perf.exceptions import MaxRuntimeExceeded
from treeherder.perf.models import (
PerformanceDatum,
@@ -30,182 +26,6 @@ from treeherder.perf.data_cycling.signature_remover import PublicSignatureRemover
from treeherder.perf.data_cycling.max_runtime import MaxRuntime
@pytest.mark.parametrize(
'days, expected_jobs, expected_failure_lines, expected_job_logs, cmd_args, cmd_kwargs',
[
(7, 0, 0, 0, ('cycle_data', 'from:treeherder'), {'sleep_time': 0, 'days': 1}),
# also check default '--days' param from treeherder
(119, 20, 2, 22, ('cycle_data',), {'sleep_time': 0}),
(120, 0, 0, 0, ('cycle_data',), {'sleep_time': 0}),
(150, 0, 0, 0, ('cycle_data',), {'sleep_time': 0}),
],
)
def test_cycle_all_data(
test_repository,
failure_classifications,
sample_data,
sample_push,
mock_log_parser,
failure_lines,
days,
expected_jobs,
expected_failure_lines,
expected_job_logs,
cmd_args,
cmd_kwargs,
):
"""
Test cycling the sample data
"""
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
cycle_date_ts = datetime.now() - timedelta(days=days)
for job in Job.objects.all():
job.submit_time = cycle_date_ts
job.save()
call_command(*cmd_args, **cmd_kwargs)
# There should be no jobs or failure lines after cycling
assert Job.objects.count() == expected_jobs
assert FailureLine.objects.count() == expected_failure_lines
assert JobLog.objects.count() == expected_job_logs
def test_cycle_all_but_one_job(
test_repository,
failure_classifications,
sample_data,
sample_push,
mock_log_parser,
failure_lines,
):
"""
Test cycling all but one job in a group of jobs to confirm there are no
unexpected deletions
"""
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
# one job should not be deleted, set its submit time to now
job_not_deleted = Job.objects.get(id=Job.objects.aggregate(Max("id"))["id__max"])
job_not_deleted.submit_time = datetime.now()
job_not_deleted.save()
extra_objects = {
'failure_lines': (
FailureLine,
create_failure_lines(
job_not_deleted, [(test_line, {}), (test_line, {"subtest": "subtest2"})]
),
),
}
# set the other jobs' submit time to a week ago
cycle_date_ts = datetime.now() - timedelta(weeks=1)
for job in Job.objects.all().exclude(id=job_not_deleted.id):
job.submit_time = cycle_date_ts
job.save()
num_job_logs_to_be_deleted = JobLog.objects.all().exclude(job__id=job_not_deleted.id).count()
num_job_logs_before = JobLog.objects.count()
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, debug=True, chunk_size=1)
assert Job.objects.count() == 1
assert JobLog.objects.count() == (num_job_logs_before - num_job_logs_to_be_deleted)
for (object_type, objects) in extra_objects.values():
actual = set(item.id for item in object_type.objects.all())
expected = set(item.id for item in objects)
assert actual == expected
def test_cycle_all_data_in_chunks(
test_repository, failure_classifications, sample_data, sample_push, mock_log_parser
):
"""
Test cycling the sample data in chunks.
"""
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
# build a date that will cause the data to be cycled
cycle_date_ts = datetime.now() - timedelta(weeks=1)
for job in Job.objects.all():
job.submit_time = cycle_date_ts
job.save()
create_failure_lines(Job.objects.get(id=1), [(test_line, {})] * 7)
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, chunk_size=3)
# There should be no jobs after cycling
assert Job.objects.count() == 0
assert FailureLine.objects.count() == 0
def test_cycle_job_model_reference_data(
test_repository, failure_classifications, sample_data, sample_push, mock_log_parser
):
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
# get a list of ids of original reference data
original_job_type_ids = JobType.objects.values_list('id', flat=True)
original_job_group_ids = JobGroup.objects.values_list('id', flat=True)
original_machine_ids = Machine.objects.values_list('id', flat=True)
# create a bunch of job model data that should be cycled, since they don't
# reference any current jobs
jg = JobGroup.objects.create(symbol='moo', name='moo')
jt = JobType.objects.create(symbol='mu', name='mu')
m = Machine.objects.create(name='machine_with_no_job')
(jg_id, jt_id, m_id) = (jg.id, jt.id, m.id)
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, chunk_size=3)
# assert that reference data that should have been cycled, was cycled
assert JobGroup.objects.filter(id=jg_id).count() == 0
assert JobType.objects.filter(id=jt_id).count() == 0
assert Machine.objects.filter(id=m_id).count() == 0
# assert that we still have everything that shouldn't have been cycled
assert JobType.objects.filter(id__in=original_job_type_ids).count() == len(
original_job_type_ids
)
assert JobGroup.objects.filter(id__in=original_job_group_ids).count() == len(
original_job_group_ids
)
assert Machine.objects.filter(id__in=original_machine_ids).count() == len(original_machine_ids)
def test_cycle_job_with_performance_data(
test_repository, failure_classifications, test_job, mock_log_parser, test_perf_signature
):
# build a date that will cause the data to be cycled
test_job.submit_time = datetime.now() - timedelta(weeks=1)
test_job.save()
PerformanceDatum.objects.create(
repository=test_repository,
push=test_job.push,
job=test_job,
signature=test_perf_signature,
push_timestamp=test_job.push.time,
value=1.0,
)
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, chunk_size=3)
# assert that the job got cycled
assert Job.objects.count() == 0
# assert that the perf object is still there, but the job reference is None
p = PerformanceDatum.objects.get(id=1)
assert p.job is None
@pytest.mark.parametrize(
'repository_name',
[
@@ -695,3 +515,51 @@ def test_stalled_data_removal(
assert test_perf_alert not in PerformanceAlert.objects.all()
assert test_perf_signature_2 in PerformanceSignature.objects.all()
assert seg2_data in PerformanceDatum.objects.all()
@patch('treeherder.config.settings.SITE_HOSTNAME', 'treeherder-prototype2.herokuapp.com')
@pytest.mark.parametrize('days', [None, 5, 30, 100])
def test_explicit_days_validation_on_treeherder_prototype2_environment(days):
try:
_ = PerfherderCycler(10_000, 0, days=days)
except ValueError:
pytest.fail()
try:
_ = MainRemovalStrategy(10_000, days=days)
except ValueError:
pytest.fail()
try:
_ = TryDataRemoval(10_000, days=days)
except ValueError:
pytest.fail()
try:
_ = IrrelevantDataRemoval(10_000, days=days)
except ValueError:
pytest.fail()
try:
_ = StalledDataRemoval(10_000, days=days)
except ValueError:
pytest.fail()
@patch('treeherder.config.settings.SITE_HOSTNAME', 'treeherder-production.com')
@pytest.mark.parametrize('days', [5, 30, 100, 364])
def test_explicit_days_validation_on_envs_other_than_treeherder_prototype2(days):
with pytest.raises(ValueError):
_ = PerfherderCycler(10_000, 0, days=days)
with pytest.raises(ValueError):
_ = MainRemovalStrategy(10_000, days=days)
with pytest.raises(ValueError):
_ = TryDataRemoval(10_000, days=days)
with pytest.raises(ValueError):
_ = IrrelevantDataRemoval(10_000, days=days)
with pytest.raises(ValueError):
_ = StalledDataRemoval(10_000, days=days)


@@ -0,0 +1,196 @@
from datetime import datetime, timedelta
import pytest
from django.core.management import call_command
from django.db.models import Max
from tests import test_utils
from tests.autoclassify.utils import create_failure_lines, test_line
from treeherder.model.models import Job, FailureLine, JobLog, JobType, Machine, JobGroup
from treeherder.perf.models import PerformanceDatum
@pytest.mark.parametrize(
'days, expected_jobs, expected_failure_lines, expected_job_logs, cmd_args, cmd_kwargs',
[
(7, 0, 0, 0, ('cycle_data', 'from:treeherder'), {'sleep_time': 0, 'days': 1}),
# also check default '--days' param from treeherder
(119, 20, 2, 22, ('cycle_data',), {'sleep_time': 0}),
(120, 0, 0, 0, ('cycle_data',), {'sleep_time': 0}),
(150, 0, 0, 0, ('cycle_data',), {'sleep_time': 0}),
],
)
def test_cycle_all_data(
test_repository,
failure_classifications,
sample_data,
sample_push,
mock_log_parser,
failure_lines,
days,
expected_jobs,
expected_failure_lines,
expected_job_logs,
cmd_args,
cmd_kwargs,
):
"""
Test cycling the sample data
"""
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
cycle_date_ts = datetime.now() - timedelta(days=days)
for job in Job.objects.all():
job.submit_time = cycle_date_ts
job.save()
call_command(*cmd_args, **cmd_kwargs)
# There should be no jobs or failure lines after cycling
assert Job.objects.count() == expected_jobs
assert FailureLine.objects.count() == expected_failure_lines
assert JobLog.objects.count() == expected_job_logs
def test_cycle_all_but_one_job(
test_repository,
failure_classifications,
sample_data,
sample_push,
mock_log_parser,
failure_lines,
):
"""
Test cycling all but one job in a group of jobs to confirm there are no
unexpected deletions
"""
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
# one job should not be deleted, set its submit time to now
job_not_deleted = Job.objects.get(id=Job.objects.aggregate(Max("id"))["id__max"])
job_not_deleted.submit_time = datetime.now()
job_not_deleted.save()
extra_objects = {
'failure_lines': (
FailureLine,
create_failure_lines(
job_not_deleted, [(test_line, {}), (test_line, {"subtest": "subtest2"})]
),
),
}
# set other job's submit time to be a week ago from now
cycle_date_ts = datetime.now() - timedelta(weeks=1)
for job in Job.objects.all().exclude(id=job_not_deleted.id):
job.submit_time = cycle_date_ts
job.save()
num_job_logs_to_be_deleted = JobLog.objects.all().exclude(job__id=job_not_deleted.id).count()
num_job_logs_before = JobLog.objects.count()
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, debug=True, chunk_size=1)
assert Job.objects.count() == 1
assert JobLog.objects.count() == (num_job_logs_before - num_job_logs_to_be_deleted)
for (object_type, objects) in extra_objects.values():
actual = set(item.id for item in object_type.objects.all())
expected = set(item.id for item in objects)
assert actual == expected
def test_cycle_all_data_in_chunks(
test_repository, failure_classifications, sample_data, sample_push, mock_log_parser
):
"""
Test cycling the sample data in chunks.
"""
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
# build a date that will cause the data to be cycled
cycle_date_ts = datetime.now() - timedelta(weeks=1)
for job in Job.objects.all():
job.submit_time = cycle_date_ts
job.save()
create_failure_lines(Job.objects.get(id=1), [(test_line, {})] * 7)
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, chunk_size=3)
# There should be no jobs after cycling
assert Job.objects.count() == 0
assert FailureLine.objects.count() == 0
def test_cycle_job_model_reference_data(
test_repository, failure_classifications, sample_data, sample_push, mock_log_parser
):
job_data = sample_data.job_data[:20]
test_utils.do_job_ingestion(test_repository, job_data, sample_push, False)
# get a list of ids of original reference data
original_job_type_ids = JobType.objects.values_list('id', flat=True)
original_job_group_ids = JobGroup.objects.values_list('id', flat=True)
original_machine_ids = Machine.objects.values_list('id', flat=True)
# create a bunch of job model data that should be cycled, since they don't
# reference any current jobs
jg = JobGroup.objects.create(symbol='moo', name='moo')
jt = JobType.objects.create(symbol='mu', name='mu')
m = Machine.objects.create(name='machine_with_no_job')
(jg_id, jt_id, m_id) = (jg.id, jt.id, m.id)
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, chunk_size=3)
# assert that reference data that should have been cycled, was cycled
assert JobGroup.objects.filter(id=jg_id).count() == 0
assert JobType.objects.filter(id=jt_id).count() == 0
assert Machine.objects.filter(id=m_id).count() == 0
# assert that we still have everything that shouldn't have been cycled
assert JobType.objects.filter(id__in=original_job_type_ids).count() == len(
original_job_type_ids
)
assert JobGroup.objects.filter(id__in=original_job_group_ids).count() == len(
original_job_group_ids
)
assert Machine.objects.filter(id__in=original_machine_ids).count() == len(original_machine_ids)
# Treeherder's data cycling can have some impact upon
# Perfherder data. Test cases touching this aspect
# should be defined below.
def test_cycle_job_with_performance_data(
test_repository, failure_classifications, test_job, mock_log_parser, test_perf_signature
):
"""
Ensure that removing Treeherder jobs won't CASCADE DELETE to
`performance_datum` rows, as this would have dire consequences.
Rather the perf rows remain, but with their `job` foreign key set to NULL.
"""
# build a date that will cause the data to be cycled
test_job.submit_time = datetime.now() - timedelta(weeks=1)
test_job.save()
PerformanceDatum.objects.create(
repository=test_repository,
push=test_job.push,
job=test_job,
signature=test_perf_signature,
push_timestamp=test_job.push.time,
value=1.0,
)
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, chunk_size=3)
# assert that the job got cycled
assert Job.objects.count() == 0
# assert that the perf object is still there, but the job reference is None
p = PerformanceDatum.objects.get(id=1)
assert p.job is None


@@ -9,9 +9,9 @@ from django.db import connection
from django.db.backends.utils import CursorWrapper
from django.db.models import Count
from django.db.utils import OperationalError
from django.conf import settings
from typing import List
from treeherder.config import settings
from treeherder.model.models import Job, JobGroup, JobType, Machine, Repository
from treeherder.perf.exceptions import MaxRuntimeExceeded, NoDataCyclingAtAll
from treeherder.perf.models import PerformanceDatum, PerformanceSignature, PerformanceAlertSummary
@@ -30,8 +30,22 @@ MINIMUM_PERFHERDER_EXPIRE_INTERVAL = 365
logger = logging.getLogger(__name__)
def has_valid_explicit_days(func):
def wrapper(*args, **kwargs):
days = kwargs.get('days')
if (days is not None) and settings.SITE_HOSTNAME != 'treeherder-prototype2.herokuapp.com':
raise ValueError(
'Cannot override perf data retention parameters on projects other than treeherder-prototype2'
)
func(*args, **kwargs)
return wrapper
class DataCycler(ABC):
def __init__(self, chunk_size: int, sleep_time: int, is_debug: bool = None, **kwargs):
def __init__(
self, chunk_size: int, sleep_time: int, is_debug: bool = None, days: int = None, **kwargs
):
self.chunk_size = chunk_size
self.sleep_time = sleep_time
self.is_debug = is_debug or False
@@ -89,17 +103,22 @@ class TreeherderCycler(DataCycler):
class PerfherderCycler(DataCycler):
DEFAULT_MAX_RUNTIME = timedelta(hours=23)
@has_valid_explicit_days
def __init__(
self,
chunk_size: int,
sleep_time: int,
is_debug: bool = None,
days: int = None,
strategies: List[RemovalStrategy] = None,
**kwargs,
):
super().__init__(chunk_size, sleep_time, is_debug)
self.strategies = strategies or RemovalStrategy.fabricate_all_strategies(chunk_size)
self.strategies = strategies or RemovalStrategy.fabricate_all_strategies(
chunk_size, days=days
)
self.timer = MaxRuntime()
@property
@@ -199,6 +218,22 @@ class PerfherderCycler(DataCycler):
class RemovalStrategy(ABC):
@property
@abstractmethod
def CYCLE_INTERVAL(self) -> int:
"""
expressed in days
"""
pass
@has_valid_explicit_days
def __init__(self, chunk_size: int, days: int = None):
days = days or self.CYCLE_INTERVAL
self._cycle_interval = timedelta(days=days)
self._chunk_size = chunk_size
self._max_timestamp = datetime.now() - self._cycle_interval
@abstractmethod
def remove(self, using: CursorWrapper):
pass
@@ -231,15 +266,14 @@ class MainRemovalStrategy(RemovalStrategy):
that are at least 1 year old.
"""
# WARNING!! Don't override this without proper approval!
CYCLE_INTERVAL = 365 # in days #
@property
def CYCLE_INTERVAL(self) -> int:
# WARNING!! Don't override this without proper approval!
return 365 # days #
########################################################
########################################################
def __init__(self, chunk_size: int):
self._cycle_interval = timedelta(days=self.CYCLE_INTERVAL)
self._chunk_size = chunk_size
self._max_timestamp = datetime.now() - self._cycle_interval
def __init__(self, chunk_size: int, days: int = None):
super().__init__(chunk_size, days=days)
self._manager = PerformanceDatum.objects
@property
@@ -279,11 +313,14 @@ class TryDataRemoval(RemovalStrategy):
SIGNATURE_BULK_SIZE = 10
def __init__(self, chunk_size: int):
self._cycle_interval = timedelta(weeks=6)
self._chunk_size = chunk_size
self._max_timestamp = datetime.now() - self._cycle_interval
self._manager = PerformanceDatum.objects
@property
def CYCLE_INTERVAL(self) -> int:
# WARNING!! Don't override this without proper approval!
return 42 # days #
########################################################
def __init__(self, chunk_size: int, days: int = None):
super().__init__(chunk_size, days=days)
self.__try_repo_id = None
self.__target_signatures = None
@@ -375,10 +412,15 @@ class IrrelevantDataRemoval(RemovalStrategy):
'reference-browser',
]
def __init__(self, chunk_size: int):
self._cycle_interval = timedelta(days=(6 * 30))
self._chunk_size = chunk_size
self._max_timestamp = datetime.now() - self._cycle_interval
@property
def CYCLE_INTERVAL(self) -> int:
# WARNING!! Don't override this without proper approval!
return 180 # days #
########################################################
def __init__(self, chunk_size: int, days: int = None):
super().__init__(chunk_size, days=days)
self._manager = PerformanceDatum.objects
self.__relevant_repos = None
@@ -442,10 +484,15 @@ class StalledDataRemoval(RemovalStrategy):
that haven't been updated in the last 4 months.
"""
def __init__(self, chunk_size: int):
self._cycle_interval = timedelta(days=120)
self._chunk_size = chunk_size
self._max_timestamp = datetime.now() - self._cycle_interval
@property
def CYCLE_INTERVAL(self) -> int:
# WARNING!! Don't override this without proper approval!
return 120 # days #
########################################################
def __init__(self, chunk_size: int, days: int = None):
super().__init__(chunk_size, days=days)
self._target_signature = None
self._removable_signatures = None
@@ -531,6 +578,17 @@ class Command(BaseCommand):
default=False,
help='Write debug messages to stdout',
)
parser.add_argument(
'--days',
action='store',
dest='days',
type=int,
help=(
"Data cycle interval expressed in days. "
"On Perfherder specifically, this only applies for `treeherder-prototype2` "
"environment; supplying it for other environments is illegal."
),
)
parser.add_argument(
'--chunk-size',
action='store',
@@ -552,17 +610,7 @@ class Command(BaseCommand):
subparsers = parser.add_subparsers(
description='Data producers from which to expire data', dest='data_source'
)
treeherder_subcommand = subparsers.add_parser(
TREEHERDER_SUBCOMMAND
) # default subcommand even if not provided
treeherder_subcommand.add_argument(
'--days',
action='store',
dest='days',
type=int,
help='Data cycle interval expressed in days. '
'Only relevant for Treeherder specific data.',
)
subparsers.add_parser(TREEHERDER_SUBCOMMAND) # default subcommand even if not provided
# Perfherder will have its own specifics
subparsers.add_parser(PERFHERDER_SUBCOMMAND)
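
For completeness, a hedged usage sketch of the reworked CLI: after this change `--days` is registered on the top-level parser, so it reaches both data producers, while the Perfherder-side strategies reject it anywhere other than treeherder-prototype2. The `'from:treeherder'` literal matches the tests above; `'from:perfherder'` is an assumption, since this diff only shows the PERFHERDER_SUBCOMMAND constant, not its value.

from django.core.management import call_command

# Treeherder data expiry; --days was already legal here before this commit.
call_command('cycle_data', 'from:treeherder', sleep_time=0, days=1, chunk_size=3)

# Perfherder data expiry with an explicit override: honored only on
# treeherder-prototype2.herokuapp.com, raises ValueError elsewhere.
call_command('cycle_data', 'from:perfherder', sleep_time=0, days=30)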