# telemetry-airflow/dags/search_term_data_validation...
#
# 55 lines
# 1.7 KiB
# Python

"""
See [search-term-data-validation-v2 in the docker-etl repository](https://github.com/mozilla/docker-etl/blob/main/jobs/search-term-data-validation-v2).
This job populates a table for evaluating whether our recorded search terms
(candidate search volume for being sanitized and stored) are changing in ways
that might invalidate assumptions on which we've built our sanitization model.
This DAG is low priority.
"""
from datetime import datetime, timedelta
from airflow import DAG
from operators.gcp_container_operator import GKEPodOperator
from utils.tags import Tag
# Defaults passed to the DAG below via `default_args=default_args`.
default_args = {
    # Ownership and notification recipients.
    "owner": "ctroy@mozilla.com",
    "email": ["ctroy@mozilla.com", "wstuckey@mozilla.com"],
    # Scheduling: a run does not depend on the previous run's outcome.
    "depends_on_past": False,
    "start_date": datetime(2023, 9, 5),
    # Send mail both when a task fails and when it is retried.
    "email_on_failure": True,
    "email_on_retry": True,
    # Retry policy: two retries, 30 minutes apart.
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}
# DAG tags: impact tier 3 and "no triage", consistent with the module
# docstring's statement that this DAG is low priority.
tags = [
    Tag.ImpactTier.tier_3,
    Tag.Triage.no_triage,
]
# Cron expression: minute 0, hour 8, every day.
daily_at_8AM = "0 8 * * *"

# The operator must be created inside the indented body of the `with`
# statement: without the indentation the module does not parse at all, and
# the task would not be created within the DAG context.
with DAG(
    "search_term_data_validation_v2",
    default_args=default_args,
    schedule_interval=daily_at_8AM,
    doc_md=__doc__,  # reuse the module docstring as the DAG documentation
    tags=tags,
) as dag:
    # Single task: run the validation job's entry point inside a GKE pod.
    search_term_data_validation = GKEPodOperator(
        task_id="search_term_data_validation_v2",
        arguments=[
            "python",
            "search_term_data_validation_v2/main.py",
            # Presumably the table the job reads its metrics from -- confirm
            # against the job's argument parser in docker-etl.
            "--data_validation_origin",
            "moz-fx-data-shared-prod.search_terms.sanitization_job_data_validation_metrics",
            # Presumably the table the validation reports are written to --
            # confirm against the job's argument parser in docker-etl.
            "--data_validation_reporting_destination",
            "moz-fx-data-shared-prod.search_terms_derived.search_term_data_validation_reports_v1",
        ],
        # NOTE(review): the image is referenced by the floating `:latest`
        # tag rather than a pinned version.
        image="gcr.io/moz-fx-data-airflow-prod-88e0/search-term-data-validation-v2_docker_etl:latest",
        dag=dag,
    )