Bugfix: Don't try to create a duplicate Dag Run in Scheduler (#13920)

closes https://github.com/apache/airflow/issues/13685

When the Scheduler is restarted or killed after creating Dag Run in `Scheduler._create_dag_runs` but
before `Scheduler._update_dag_next_dagruns`, the Scheduler falls in a loop because it will try
to create the Dag Run again in the Scheduler Loop. However, as the DagRun already exists it will fail
with:

```
Traceback (most recent call last):
  File "/Users/kaxilnaik/opt/anaconda3/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1277, in _execute_context
    cursor, statement, parameters, context
  File "/Users/kaxilnaik/opt/anaconda3/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 593, in do_execute
    cursor.execute(statement, parameters)
psycopg2.errors.UniqueViolation: duplicate key value violates unique constraint "dag_run_dag_id_run_id_key"
DETAIL:  Key (dag_id, run_id)=(scenario1_case2_02, scheduled__2021-01-25T00:00:00+00:00) already exists.
```
This commit is contained in:
Kaxil Naik 2021-02-01 16:06:31 +00:00 коммит произвёл GitHub
Родитель 05fbeb16bc
Коммит 594069ee06
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файлов: 94 добавлений и 11 удалений

Просмотреть файл

@ -34,7 +34,7 @@ from multiprocessing.connection import Connection as MultiprocessingConnection
from typing import Any, Callable, DefaultDict, Dict, Iterable, List, Optional, Set, Tuple
from setproctitle import setproctitle
from sqlalchemy import and_, func, not_, or_
from sqlalchemy import and_, func, not_, or_, tuple_
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm import load_only, selectinload
from sqlalchemy.orm.session import Session, make_transient
@ -1563,6 +1563,20 @@ class SchedulerJob(BaseJob): # pylint: disable=too-many-instance-attributes
Unconditionally create a DAG run for the given DAG, and update the dag_model's fields to control
if/when the next DAGRun should be created
"""
# Bulk Fetch DagRuns with dag_id and execution_date same
# as DagModel.dag_id and DagModel.next_dagrun
# This list is used to verify if the DagRun already exist so that we don't attempt to create
# duplicate dag runs
active_dagruns = (
session.query(DagRun.dag_id, DagRun.execution_date)
.filter(
tuple_(DagRun.dag_id, DagRun.execution_date).in_(
[(dm.dag_id, dm.next_dagrun) for dm in dag_models]
)
)
.all()
)
for dag_model in dag_models:
try:
dag = self.dagbag.get_dag(dag_model.dag_id, session=session)
@ -1571,16 +1585,25 @@ class SchedulerJob(BaseJob): # pylint: disable=too-many-instance-attributes
continue
dag_hash = self.dagbag.dags_hash.get(dag.dag_id)
dag.create_dagrun(
run_type=DagRunType.SCHEDULED,
execution_date=dag_model.next_dagrun,
start_date=timezone.utcnow(),
state=State.RUNNING,
external_trigger=False,
session=session,
dag_hash=dag_hash,
creating_job_id=self.id,
)
# Explicitly check if the DagRun already exists. This is an edge case
# where a Dag Run is created but `DagModel.next_dagrun` and `DagModel.next_dagrun_create_after`
# are not updated.
# We opted to check DagRun existence instead
# of catching an Integrity error and rolling back the session i.e
# we need to run self._update_dag_next_dagruns if the Dag Run already exists or if we
# create a new one. This is so that in the next Scheduling loop we try to create new runs
# instead of falling in a loop of Integrity Error.
if (dag.dag_id, dag_model.next_dagrun) not in active_dagruns:
dag.create_dagrun(
run_type=DagRunType.SCHEDULED,
execution_date=dag_model.next_dagrun,
start_date=timezone.utcnow(),
state=State.RUNNING,
external_trigger=False,
session=session,
dag_hash=dag_hash,
creating_job_id=self.id,
)
self._update_dag_next_dagruns(dag_models, session)

Просмотреть файл

@ -3695,6 +3695,66 @@ class TestSchedulerJob(unittest.TestCase):
dag_model = session.query(DagModel).get(dag.dag_id)
assert dag_model.next_dagrun == DEFAULT_DATE + timedelta(minutes=1)
def test_scheduler_create_dag_runs_check_existing_run(self):
    """
    Test that if a dag run exists, scheduler._create_dag_runs does not raise an error.
    And if a Dag Run does not exist it creates next Dag Run. In both cases the Scheduler
    sets next execution date as DagModel.next_dagrun
    """
    dag = DAG(
        dag_id='test_scheduler_create_dag_runs_check_existing_run',
        start_date=DEFAULT_DATE,
        schedule_interval=timedelta(days=1),
    )

    DummyOperator(
        task_id='dummy',
        dag=dag,
    )

    session = settings.Session()
    # Sanity check: no run exists for this brand-new DAG yet.
    assert dag.get_last_dagrun(session) is None

    dagbag = DagBag(
        dag_folder=os.devnull,
        include_examples=False,
        read_dags_from_db=False,
    )
    dagbag.bag_dag(dag=dag, root_dag=dag)

    # Create DagModel
    DAG.bulk_write_to_db(dagbag.dags.values())
    dag_model = DagModel.get_dagmodel(dag.dag_id)

    # Assert dag_model.next_dagrun is set correctly
    assert dag_model.next_dagrun == DEFAULT_DATE

    dag = SerializedDAG.from_dict(SerializedDAG.to_dict(dag))

    # Pre-create the DagRun at DagModel.next_dagrun ourselves. This simulates a
    # scheduler that was killed after creating the run but before updating
    # DagModel.next_dagrun — the edge case this commit guards against.
    dagrun = dag.create_dagrun(
        run_type=DagRunType.SCHEDULED,
        execution_date=dag_model.next_dagrun,
        start_date=timezone.utcnow(),
        state=State.RUNNING,
        external_trigger=False,
        session=session,
        creating_job_id=2,
    )
    session.flush()

    assert dag.get_last_dagrun(session) == dagrun

    scheduler = SchedulerJob(subdir=os.devnull, executor=self.null_exec)
    scheduler.dagbag = dagbag
    # processor_agent is mocked out; this test only exercises _create_dag_runs,
    # not the DAG file processing machinery.
    scheduler.processor_agent = mock.MagicMock()

    # Test that this does not raise any error
    scheduler._create_dag_runs([dag_model], session)

    # Assert dag_model.next_dagrun is set correctly to next execution date
    # (i.e. _update_dag_next_dagruns ran even though the run already existed,
    # so the scheduler will not retry the same execution date forever).
    assert dag_model.next_dagrun == DEFAULT_DATE + timedelta(days=1)
    session.rollback()
def test_do_schedule_max_active_runs_upstream_failed(self):
"""
Test that tasks in upstream failed don't count as actively running.