Publish ingestion metrics on statsd (#7875)

* Add statsd service

* Add celery task to publish stats

* Add task to prod misc worker

* Fix beat task

* Aggregate stats

* Publish on statsd

* Fix date range filter

* Allow changing the statsd host and port

* Add a prefix for stats publication
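
At a high level, these commits add a Celery beat task that counts what was ingested during each publication window and pushes the totals to statsd as counters. A minimal sketch of that flow, using the statsd Python client added to requirements (the host, port, prefix, repository and counts below are illustrative; the real values come from the STATSD_* settings and the aggregation task further down):

import statsd

# Connect to the statsd service (the docker-compose service below publishes UDP port 8125)
client = statsd.StatsClient('localhost', 8125, prefix='treeherder')

# Counters published once per CELERY_STATS_PUBLICATION_DELAY window
client.incr('push', 10)                  # pushes ingested in the window
client.incr('jobs', 11)                  # jobs ingested in total
client.incr('jobs_repo.autoland', 7)     # breakdown per repository
client.incr('jobs_state.completed', 11)  # breakdown per state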
Valentin Rigal 2023-11-21 20:37:53 +01:00 committed by GitHub
Parent c13a218373
Commit 307abe8106
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 180 additions and 25 deletions

View file

@@ -9,3 +9,6 @@ ui/partials/
# Ignore markdown
*.md
# Statsd configuration file
docker/statsd_config.js

View file

@@ -148,7 +148,7 @@ services:
      - DATABASE_URL=mysql://root@mysql:3306/treeherder
      - PROJECTS_TO_INGEST=${PROJECTS_TO_INGEST:-autoland,try}
    entrypoint: './docker/entrypoint.sh'
    command: celery -A treeherder worker --uid=nobody --gid=nogroup --without-gossip --without-mingle --without-heartbeat -Q store_pulse_pushes,store_pulse_tasks,store_pulse_tasks_classification --concurrency=1 --loglevel=INFO
    command: celery -A treeherder worker --uid=nobody --gid=nogroup --without-gossip --without-mingle --without-heartbeat -Q store_pulse_pushes,store_pulse_tasks,store_pulse_tasks_classification,statsd --concurrency=1 --loglevel=INFO
    volumes:
      - .:/app
    depends_on:
@@ -157,6 +157,15 @@ services:
      - redis
      - rabbitmq
    platform: linux/amd64
  statsd:
    container_name: statsd
    image: statsd/statsd:v0.10.2
    volumes:
      - ./docker/statsd_config.js:/usr/src/app/config.js
    ports:
      - '8125:8125'

volumes:
  # TODO: Experiment with using tmpfs when testing, to speed up database-using Python tests.
  mysql_data: {}

View file

@@ -56,7 +56,7 @@ elif [ "$1" == "worker_log_parser_fail_json_unsheriffed" ]; then
# Tasks that don't need a dedicated worker.
elif [ "$1" == "worker_misc" ]; then
    export REMAP_SIGTERM=SIGQUIT
    exec newrelic-admin run-program celery -A treeherder worker --without-gossip --without-mingle --without-heartbeat -Q default,generate_perf_alerts,pushlog --concurrency=3
    exec newrelic-admin run-program celery -A treeherder worker --without-gossip --without-mingle --without-heartbeat -Q default,generate_perf_alerts,pushlog,statsd --concurrency=3

# Cron jobs
elif [ "$1" == "run_intermittents_commenter" ]; then

docker/statsd_config.js Normal file
View file

@@ -0,0 +1,8 @@
{
  debug: true,
  healthStatus: true,
  dumpMessages: true,
  port: 8125,
  // InfluxDB backend should be added to read stats
  backends: []
}
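
With backends left empty, the statsd container only logs what it receives (because debug and dumpMessages are enabled), which is enough for a quick smoke test. A hypothetical check, assuming the docker-compose service above publishes UDP port 8125 on localhost: send a raw counter in the statsd line protocol and it should show up in the container logs.

import socket

# 'treeherder.push' matches the prefix and counter name used by the stats task;
# '|c' marks the metric as a counter in the statsd line protocol.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.sendto(b'treeherder.push:1|c', ('127.0.0.1', 8125))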

View file

@@ -44,3 +44,6 @@ moz-measure-noise==2.60.1
# Used in the intermittents commenter
jinja2==3.1.2
# Client to publish runtime statistics to statsd
statsd==4.0.1

View file

@@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.9
# To update, run:
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile --generate-hashes --output-file=requirements/common.txt requirements/common.in
#
@@ -101,10 +101,6 @@ amqp==5.1.1 \
--hash=sha256:2c1b13fecc0893e946c65cbd5f36427861cffa4ea2201d8f6fca22e2a373b5e2 \
--hash=sha256:6f0956d2c23d8fa6e7691934d8c3930eadb44972cbbd1a7ae3a520f735d43359
# via kombu
ansicon==1.89.0 \
--hash=sha256:e4d039def5768a47e4afec8e89e83ec3ae5a26bf00ad851f914d1240b444d2b1 \
--hash=sha256:f1def52d17f65c2c9682cf8370c03f541f410c1752d6a14029f97318e4b9dfec
# via jinxed
appdirs==1.4.4 \
--hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \
--hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128
@@ -353,12 +349,6 @@ click-repl==0.3.0 \
--hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \
--hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812
# via celery
colorama==0.4.6 \
--hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
--hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
# via
# click
# loguru
crashtest==0.4.1 \
--hash=sha256:80d7b1f316ebfbd429f648076d6275c877ba30ba48979de4191714a75266f0ce \
--hash=sha256:8d23eac5fa660409f57472e3851dab7ac18aba459a8d19cbbba86d3d5aecd2a5
@@ -495,10 +485,6 @@ jinja2==3.1.2 \
--hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 \
--hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61
# via -r requirements/common.in
jinxed==1.2.0 \
--hash=sha256:032acda92d5c57cd216033cbbd53de731e6ed50deb63eb4781336ca55f72cda5 \
--hash=sha256:cfc2b2e4e3b4326954d546ba6d6b9a7a796ddcb0aef8d03161d005177eb0d48b
# via blessed
jmespath==1.0.1 \
--hash=sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 \
--hash=sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe
@@ -1281,6 +1267,10 @@ sqlparse==0.4.4 \
--hash=sha256:5430a4fe2ac7d0f93e66f1efc6e1338a41884b7ddf2a350cedd20ccc4d9d28f3 \
--hash=sha256:d446183e84b8349fa3061f0fe7f06ca94ba65b426946ffebe6e3e8295332420c
# via django
statsd==4.0.1 \
--hash=sha256:99763da81bfea8daf6b3d22d11aaccb01a8d0f52ea521daab37e758a4ca7d128 \
--hash=sha256:c2676519927f7afade3723aca9ca8ea986ef5b059556a980a867721ca69df093
# via -r requirements/common.in
tabulate==0.9.0 \
--hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \
--hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f
@@ -1311,9 +1301,7 @@ typing-extensions==4.7.1 \
tzdata==2023.3 \
--hash=sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a \
--hash=sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda
# via
# celery
# django
# via celery
uritemplate==4.1.1 \
--hash=sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0 \
--hash=sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e
@@ -1387,10 +1375,6 @@ whitenoise[brotli]==6.5.0 \
--hash=sha256:15fe60546ac975b58e357ccaeb165a4ca2d0ab697e48450b8f0307ca368195a8 \
--hash=sha256:16468e9ad2189f09f4a8c635a9031cc9bb2cdbc8e5e53365407acf99f7ade9ec
# via -r requirements/common.in
win32-setctime==1.1.0 \
--hash=sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2 \
--hash=sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad
# via loguru
yarl==1.9.2 \
--hash=sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571 \
--hash=sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3 \

View file

@@ -0,0 +1,52 @@
import pytest
from unittest.mock import patch, MagicMock, call

from treeherder.workers.stats import publish_stats
from treeherder.model.models import Push, Job
from django.utils import timezone
from datetime import timedelta


@pytest.mark.django_db
@patch('treeherder.workers.stats.get_stats_client')
def test_publish_stats_nothing_to_do(get_worker_mock, django_assert_num_queries, caplog):
    statsd_client = MagicMock()
    get_worker_mock.return_value = statsd_client
    assert Push.objects.count() == 0
    assert Job.objects.count() == 0
    with django_assert_num_queries(2):
        publish_stats()
    assert [(level, message) for _, level, message in caplog.record_tuples] == [
        (20, 'Publishing runtime statistics to statsd'),
        (20, 'Ingested 0 pushes'),
        (20, 'Ingested 0 jobs in total'),
    ]
    assert statsd_client.call_args_list == []


@pytest.mark.django_db
@patch('treeherder.workers.stats.get_stats_client')
def test_publish_stats(
    get_worker_mock, eleven_jobs_stored_new_date, django_assert_num_queries, caplog, settings
):
    "Test statsd statistics publication task"
    settings.CELERY_STATS_PUBLICATION_DELAY = 10
    statsd_client = MagicMock()
    get_worker_mock.return_value = statsd_client
    assert Push.objects.count() == 10
    assert Job.objects.count() == 11
    Push.objects.update(time=timezone.now() - timedelta(minutes=10))
    Job.objects.update(end_time=timezone.now() - timedelta(minutes=10))
    with django_assert_num_queries(2):
        publish_stats()
    assert [(level, message) for _, level, message in caplog.record_tuples] == [
        (20, 'Publishing runtime statistics to statsd'),
        (20, 'Ingested 10 pushes'),
        (20, 'Ingested 11 jobs in total'),
    ]
    assert statsd_client.incr.call_args_list == [
        call('push', 10),
        call('jobs', 11),
        call('jobs_repo.mozilla-central', 11),
        call('jobs_state.completed', 11),
    ]

View file

@@ -15,3 +15,4 @@ app.config_from_object('django.conf:settings', namespace='CELERY')
# Load task modules from all registered Django app configs.
app.autodiscover_tasks()
app.autodiscover_tasks(['treeherder.workers.stats'])

View file

@@ -10,6 +10,7 @@ from kombu import Exchange, Queue
from treeherder.config.utils import connection_should_use_tls
from treeherder.middleware import add_headers_function
from celery.schedules import crontab

# TODO: Switch to pathlib once using Python 3.
SRC_DIR = dirname(dirname(dirname(abspath(__file__))))
@@ -342,6 +343,7 @@ CELERY_TASK_QUEUES = [
        routing_key='store_pulse_tasks_classification',
    ),
    Queue('store_pulse_pushes', Exchange('default'), routing_key='store_pulse_pushes'),
    Queue('statsd', Exchange('default'), routing_key='statsd'),
]
# Force all queues to be explicitly listed in `CELERY_TASK_QUEUES` to help prevent typos
@@ -378,6 +380,12 @@ CELERY_TASK_ACKS_LATE = True
CELERY_TASK_SOFT_TIME_LIMIT = 15 * 60
CELERY_TASK_TIME_LIMIT = CELERY_TASK_SOFT_TIME_LIMIT + 30
# Periodically publish runtime statistics on statsd (in minutes)
CELERY_STATS_PUBLICATION_DELAY = 5
assert (
    0 < CELERY_STATS_PUBLICATION_DELAY < 60 and 60 % CELERY_STATS_PUBLICATION_DELAY == 0
), "CELERY_STATS_PUBLICATION_DELAY must be a valid cron delay in minutes (a divisor of 60)"
CELERY_BEAT_SCHEDULE = {
    # this is just a failsafe in case the Pulse ingestion misses something
    'fetch-push-logs-every-5-minutes': {
@@ -386,6 +394,12 @@ CELERY_BEAT_SCHEDULE = {
        'relative': True,
        'options': {"queue": "pushlog"},
    },
    'publish_stats': {
        'task': 'publish-stats',
        'schedule': crontab(minute=f'*/{CELERY_STATS_PUBLICATION_DELAY}'),
        'relative': True,
        'options': {'queue': 'statsd'},
    },
}
# CORS Headers
@@ -490,5 +504,10 @@ NOTIFY_ACCESS_TOKEN = env('NOTIFY_ACCESS_TOKEN', default=None)
# https://github.com/settings/tokens
GITHUB_TOKEN = env("GITHUB_TOKEN", default=None)
# Statsd server configuration
STATSD_HOST = env('STATSD_HOST', default='statsd')
STATSD_PORT = env('STATSD_PORT', default=8125)
STATSD_PREFIX = env('STATSD_PREFIX', default='treeherder')
# For dockerflow
BASE_DIR = SRC_DIR

View file

@@ -0,0 +1,76 @@
from celery import shared_task
from django.conf import settings
from django.utils import timezone
from datetime import timedelta
from django.db.models import Count
from treeherder.model.models import Push, Job
from itertools import groupby

import statsd
import logging

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def get_stats_client():
    return statsd.StatsClient(
        settings.STATSD_HOST, settings.STATSD_PORT, prefix=settings.STATSD_PREFIX
    )


@shared_task(name='publish-stats')
def publish_stats():
    """
    Publish runtime stats on statsd
    """
    stats_client = get_stats_client()
    logger.info('Publishing runtime statistics to statsd')
    end_date = timezone.now()
    # Round the date down to the start of the current publication window.
    # Windows should not overlap, as the beat is set as a relative cron-based delay in minutes.
    # e.g. with the default 5 minute delay, a run at 12:07:42 reads the window (12:00:00, 12:05:00].
    end_date = end_date - timedelta(
        minutes=end_date.minute % settings.CELERY_STATS_PUBLICATION_DELAY,
        seconds=end_date.second,
        microseconds=end_date.microsecond,
    )
    start_date = end_date - timedelta(minutes=settings.CELERY_STATS_PUBLICATION_DELAY)
    logger.debug(f'Reading data ingested from {start_date} to {end_date}')

    # Number of pushes
    pushes_count = Push.objects.filter(time__lte=end_date, time__gt=start_date).count()
    logger.info(f'Ingested {pushes_count} pushes')
    if pushes_count:
        stats_client.incr('push', pushes_count)

    # Compute stats for jobs in a single request
    jobs_stats = (
        Job.objects.filter(end_time__lte=end_date, end_time__gt=start_date)
        .values('push__repository__name', 'state')
        .annotate(count=Count('id'))
        .values_list('push__repository__name', 'state', 'count')
    )
    # Total number of jobs
    jobs_total = sum(ct for _, _, ct in jobs_stats)
    logger.info(f'Ingested {jobs_total} jobs in total')
    if jobs_total:
        stats_client.incr('jobs', jobs_total)

    # Number of jobs per repository
    jobs_per_repo = {
        key: sum(ct for _, ct in vals)
        for key, vals in groupby(sorted((repo, ct) for repo, _, ct in jobs_stats), lambda x: x[0])
    }
    logger.debug(f'Jobs per repo: {jobs_per_repo}')
    for key, value in jobs_per_repo.items():
        stats_client.incr(f'jobs_repo.{key}', value)

    # Number of jobs per state
    jobs_per_state = {
        key: sum(ct for _, ct in vals)
        for key, vals in groupby(sorted((state, ct) for _, state, ct in jobs_stats), lambda x: x[0])
    }
    logger.debug(f'Jobs per state: {jobs_per_state}')
    for key, value in jobs_per_state.items():
        stats_client.incr(f'jobs_state.{key}', value)