From ffb472cf9e630bd70f51b74b0d0ea4ab98635572 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Bregu=C5=82a?= Date: Tue, 26 Jan 2021 13:45:49 +0100 Subject: [PATCH] Add quick start for Airflow on Docker (#13660) Co-authored-by: Felix Uellendall Co-authored-by: Jarek Potiuk Co-authored-by: Kaxil Naik --- .gitignore | 2 +- .pre-commit-config.yaml | 2 +- airflow/cli/commands/db_command.py | 1 + docs/apache-airflow/concepts.rst | 45 ++++- docs/apache-airflow/howto/index.rst | 2 +- docs/apache-airflow/index.rst | 2 +- docs/apache-airflow/redirects.txt | 3 + docs/apache-airflow/start/.gitignore | 4 + docs/apache-airflow/start/airflow.sh | 28 +++ docs/apache-airflow/start/docker-compose.yaml | 134 ++++++++++++++ docs/apache-airflow/start/docker.rst | 170 ++++++++++++++++++ docs/apache-airflow/start/index.rst | 27 +++ .../{start.rst => start/local.rst} | 41 +---- docs/conf.py | 16 +- docs/exts/docs_build/lint_checks.py | 66 ++++++- 15 files changed, 493 insertions(+), 50 deletions(-) create mode 100644 docs/apache-airflow/start/.gitignore create mode 100755 docs/apache-airflow/start/airflow.sh create mode 100644 docs/apache-airflow/start/docker-compose.yaml create mode 100644 docs/apache-airflow/start/docker.rst create mode 100644 docs/apache-airflow/start/index.rst rename docs/apache-airflow/{start.rst => start/local.rst} (74%) diff --git a/.gitignore b/.gitignore index 1ba0c9f692..67dfd80bab 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,7 @@ airflow/www/static/dist airflow/www_rbac/static/coverage/ airflow/www_rbac/static/dist/ -logs/ +/logs/ airflow-webserver.pid # Byte-compiled / optimized / DLL files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0be1bd1364..812e534590 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -522,7 +522,7 @@ repos: - https://raw.githubusercontent.com/compose-spec/compose-spec/master/schema/compose-spec.json language: python pass_filenames: true - files: scripts/ci/docker-compose/.+.yml + files: ^scripts/ci/docker-compose/.+\.ya?ml$|docker-compose.ya?ml$ require_serial: true additional_dependencies: ['jsonschema==3.2.0', 'PyYAML==5.3.1', 'requests==2.25.0'] - id: json-schema diff --git a/airflow/cli/commands/db_command.py b/airflow/cli/commands/db_command.py index 29bade8e69..315a8ec596 100644 --- a/airflow/cli/commands/db_command.py +++ b/airflow/cli/commands/db_command.py @@ -46,6 +46,7 @@ def upgradedb(args): """Upgrades the metadata database""" print("DB: " + repr(settings.engine.url)) db.upgradedb() + print("Upgrades done") def check_migrations(args): diff --git a/docs/apache-airflow/concepts.rst b/docs/apache-airflow/concepts.rst index a8c6bb42c9..346f6c008d 100644 --- a/docs/apache-airflow/concepts.rst +++ b/docs/apache-airflow/concepts.rst @@ -23,6 +23,38 @@ Concepts The Airflow platform is a tool for describing, executing, and monitoring workflows. +.. _architecture: + +Basic Airflow architecture +'''''''''''''''''''''''''' + +Primarily intended for development use, the basic Airflow architecture with the Local and Sequential executors is an +excellent starting point for understanding the architecture of Apache Airflow. + +.. image:: img/arch-diag-basic.png + + +There are a few components to note: + +* **Metadata Database**: Airflow uses a SQL database to store metadata about the data pipelines being run. In the + diagram above, this is represented as Postgres which is extremely popular with Airflow. + Alternate databases supported with Airflow include MySQL. 
+ +* **Web Server** and **Scheduler**: The Airflow web server and Scheduler are separate processes run (in this case) + on the local machine and interact with the database mentioned above. + +* The **Executor** is shown separately above, since it is commonly discussed within Airflow and in the documentation, but + in reality it is NOT a separate process, but run within the Scheduler. + +* The **Worker(s)** are separate processes which also interact with the other components of the Airflow architecture and + the metadata repository. + +* ``airflow.cfg`` is the Airflow configuration file which is accessed by the Web Server, Scheduler, and Workers. + +* **DAGs** refers to the DAG files containing Python code, representing the data pipelines to be run by Airflow. The + location of these files is specified in the Airflow configuration file, but they need to be accessible by the + Web Server, Scheduler, and Workers. + Core Ideas '''''''''' @@ -194,10 +226,10 @@ Example DAG with decorator: .. _concepts:executor_config: -executor_config -=============== +``executor_config`` +=================== -The executor_config is an argument placed into operators that allow airflow users to override tasks +The ``executor_config`` is an argument placed into operators that allow airflow users to override tasks before launch. Currently this is primarily used by the :class:`KubernetesExecutor`, but will soon be available for other overrides. @@ -1545,7 +1577,8 @@ This example illustrates some possibilities Packaged DAGs -''''''''''''' +============= + While often you will specify DAGs in a single ``.py`` file it might sometimes be required to combine a DAG and its dependencies. For example, you might want to combine several DAGs together to version them together or you might want @@ -1594,8 +1627,8 @@ do the same, but then it is more suitable to use a virtualenv and pip. pure Python modules can be packaged. -.airflowignore -'''''''''''''' +``.airflowignore`` +================== A ``.airflowignore`` file specifies the directories or files in ``DAG_FOLDER`` or ``PLUGINS_FOLDER`` that Airflow should intentionally ignore. diff --git a/docs/apache-airflow/howto/index.rst b/docs/apache-airflow/howto/index.rst index 0baec68688..b8c2912d8a 100644 --- a/docs/apache-airflow/howto/index.rst +++ b/docs/apache-airflow/howto/index.rst @@ -20,7 +20,7 @@ How-to Guides ============= -Setting up the sandbox in the :doc:`../start` section was easy; +Setting up the sandbox in the :doc:`/start/index` section was easy; building a production-grade environment requires a bit more work! These how-to guides will step you through common tasks in using and diff --git a/docs/apache-airflow/index.rst b/docs/apache-airflow/index.rst index dcc20acee5..b02deafa7b 100644 --- a/docs/apache-airflow/index.rst +++ b/docs/apache-airflow/index.rst @@ -76,7 +76,7 @@ unit of work and continuity. 
Home project license - start + start/index installation upgrading-to-2 upgrade-check diff --git a/docs/apache-airflow/redirects.txt b/docs/apache-airflow/redirects.txt index 36a72fa895..fac6825e29 100644 --- a/docs/apache-airflow/redirects.txt +++ b/docs/apache-airflow/redirects.txt @@ -37,6 +37,9 @@ howto/write-logs.rst logging-monitoring/logging-tasks.rst metrics.rst logging-monitoring/metrics.rst howto/tracking-user-activity.rst logging-monitoring/tracking-user-activity.rst +# Quick start +start.rst start/index.rst + # References cli-ref.rst cli-and-env-variables-ref.rst _api/index.rst python-api-ref.rst diff --git a/docs/apache-airflow/start/.gitignore b/docs/apache-airflow/start/.gitignore new file mode 100644 index 0000000000..69e1d56a72 --- /dev/null +++ b/docs/apache-airflow/start/.gitignore @@ -0,0 +1,4 @@ +/dags +/logs +/plugins +/.env diff --git a/docs/apache-airflow/start/airflow.sh b/docs/apache-airflow/start/airflow.sh new file mode 100755 index 0000000000..2324ba681e --- /dev/null +++ b/docs/apache-airflow/start/airflow.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# +# Run airflow command in container +# + +PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +set -euo pipefail + +export COMPOSE_FILE=${PROJECT_DIR}/docker-compose.yaml +exec docker-compose run airflow-worker "${@}" diff --git a/docs/apache-airflow/start/docker-compose.yaml b/docs/apache-airflow/start/docker-compose.yaml new file mode 100644 index 0000000000..18919dc378 --- /dev/null +++ b/docs/apache-airflow/start/docker-compose.yaml @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 
+# Default: apache/airflow:master-python3.8 +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# AIRFLOW_GID - Group ID in Airflow containers +# Default: 50000 +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. +# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. +# Default: airflow +# +# Feel free to modify this file to suit your needs. +--- +version: '3' +x-airflow-common: + &airflow-common + image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:master-python3.8} + environment: + &airflow-common-env + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 + AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'true' + volumes: + - ./dags:/opt/airflow/dags + - ./logs:/opt/airflow/logs + - ./plugins:/opt/airflow/plugins + user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + +services: + postgres: + image: postgres:13 + environment: + POSTGRES_USER: airflow + POSTGRES_PASSWORD: airflow + POSTGRES_DB: airflow + volumes: + - postgres-db-volume:/var/lib/postgresql/data + healthcheck: + test: ["CMD", "pg_isready", "-U", "airflow"] + interval: 5s + retries: 5 + restart: always + + redis: + image: redis:latest + ports: + - 6379:6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 30s + retries: 50 + restart: always + + airflow-webserver: + <<: *airflow-common + command: webserver + ports: + - 8080:8080 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + + airflow-scheduler: + <<: *airflow-common + command: scheduler + restart: always + + airflow-worker: + <<: *airflow-common + command: celery worker + restart: always + + airflow-init: + <<: *airflow-common + command: version + environment: + <<: *airflow-common-env + _AIRFLOW_DB_UPGRADE: 'true' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + + flower: + <<: *airflow-common + command: celery flower + ports: + - 5555:5555 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:5555/"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + +volumes: + postgres-db-volume: diff --git a/docs/apache-airflow/start/docker.rst b/docs/apache-airflow/start/docker.rst new file mode 100644 index 0000000000..dc07dc38cf --- /dev/null +++ b/docs/apache-airflow/start/docker.rst @@ -0,0 +1,170 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. 
See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Running Airflow in Docker
+#########################
+
+This quick-start guide shows how to quickly get Airflow up and running with :doc:`CeleryExecutor ` in Docker. This is the fastest way to start Airflow.
+
+Before you begin
+================
+
+Follow these steps to install the necessary tools.
+
+1. Install `Docker Community Edition (CE) `__ on your workstation.
+2. Install `Docker Compose `__ v1.27.0 or newer on your workstation.
+
+Older versions of ``docker-compose`` do not support all the features required by the ``docker-compose.yaml`` file, so double check that your version meets the minimum version requirement.
+
+``docker-compose.yaml``
+=======================
+
+To deploy Airflow on Docker Compose, you should fetch `docker-compose.yaml <../docker-compose.yaml>`__.
+
+.. jinja:: quick_start_ctx
+
+    .. code-block:: bash
+
+        curl -LfO '{{ doc_root_url }}docker-compose.yaml'
+
+This file contains several service definitions:
+
+- ``airflow-scheduler`` - The :doc:`scheduler ` monitors all tasks and DAGs, then triggers the
+  task instances once their dependencies are complete.
+- ``airflow-webserver`` - The webserver, available at ``http://localhost:8080``.
+- ``airflow-worker`` - The worker that executes the tasks given by the scheduler.
+- ``airflow-init`` - The initialization service.
+- ``flower`` - `The flower app `__ for monitoring the environment. It is available at ``http://localhost:5555``.
+- ``postgres`` - The database.
+- ``redis`` - `The redis `__ broker that forwards messages from the scheduler to the worker.
+
+All these services allow you to run Airflow with :doc:`CeleryExecutor `. For more information, see :ref:`architecture`.
+
+Some directories in the container are mounted, which means that their contents are synchronized between your computer and the container.
+
+- ``./dags`` - you can put your DAG files here.
+- ``./logs`` - contains logs from task execution and the scheduler.
+- ``./plugins`` - you can put your :doc:`custom plugins ` here.
+
+Initializing Environment
+========================
+
+Before starting Airflow for the first time, you need to prepare your environment, i.e. create the necessary files and directories and initialize the database.
+
+On **Linux**, the mounted volumes in the container use the native Linux filesystem user/group permissions, so you have to make sure the container and the host computer have matching file permissions.
+
+.. code-block:: bash
+
+    mkdir ./dags ./logs ./plugins
+    echo -e "AIRFLOW_UID=$(id -u)\nAIRFLOW_GID=0" > .env
+
+On **all operating systems**, you need to run database migrations and create the first user account. To do this, run:
+
+.. code-block:: bash
+
+    docker-compose up airflow-init
+
+After initialization is complete, you should see a message like the one below.
+
+.. code-block:: text
+
+    airflow-init_1       | Upgrades done
+    airflow-init_1       | Admin user airflow created
+    airflow-init_1       | 2.1.0.dev0
+    start_airflow-init_1 exited with code 0
+
+The account created has the login ``airflow`` and the password ``airflow``.
+
+Running Airflow
+===============
+
+Now you can start all services:
+
+.. code-block:: bash
+
+    docker-compose up
+
+In a second terminal you can check the condition of the containers and make sure that none of them are unhealthy:
+
+.. code-block:: bash
+
+    $ docker ps
+    CONTAINER ID   IMAGE                             COMMAND                  CREATED          STATUS                    PORTS                              NAMES
+    247ebe6cf87a   apache/airflow:master-python3.8   "/usr/bin/dumb-init …"   3 minutes ago    Up 3 minutes              8080/tcp                           compose_airflow-worker_1
+    ed9b09fc84b1   apache/airflow:master-python3.8   "/usr/bin/dumb-init …"   3 minutes ago    Up 3 minutes              8080/tcp                           compose_airflow-scheduler_1
+    65ac1da2c219   apache/airflow:master-python3.8   "/usr/bin/dumb-init …"   3 minutes ago    Up 3 minutes (healthy)    0.0.0.0:5555->5555/tcp, 8080/tcp   compose_flower_1
+    7cb1fb603a98   apache/airflow:master-python3.8   "/usr/bin/dumb-init …"   3 minutes ago    Up 3 minutes (healthy)    0.0.0.0:8080->8080/tcp             compose_airflow-webserver_1
+    74f3bbe506eb   postgres:13                       "docker-entrypoint.s…"   18 minutes ago   Up 17 minutes (healthy)   5432/tcp                           compose_postgres_1
+    0bd6576d23cb   redis:latest                      "docker-entrypoint.s…"   10 hours ago     Up 17 minutes (healthy)   0.0.0.0:6379->6379/tcp             compose_redis_1
+
+Once the cluster has started up, you can log in to the web interface and try to run some tasks. The webserver is available at ``http://localhost:8080``. The default account has the login ``airflow`` and the password ``airflow``.
+
+.. image:: /img/dags.png
+
+Accessing Command Line Interface
+================================
+
+You can also run :doc:`CLI commands `, but you have to do it in one of the defined ``airflow-*`` services. For example, to run ``airflow info``, run the following command:
+
+.. code-block:: bash
+
+    docker-compose run airflow-worker airflow info
+
+If you are on Linux or Mac OS, you can make your work easier by downloading the optional wrapper script, which lets you run commands with a much shorter invocation.
+
+.. jinja:: quick_start_ctx
+
+    .. code-block:: bash
+
+        curl -LfO '{{ doc_root_url }}airflow.sh'
+        chmod +x airflow.sh
+
+Now you can run commands more easily.
+
+.. code-block:: bash
+
+    ./airflow.sh info
+
+You can also pass ``bash`` as a parameter to enter an interactive bash shell in the container or ``python`` to enter
+a Python container.
+
+.. code-block:: bash
+
+    ./airflow.sh bash
+
+.. code-block:: bash
+
+    ./airflow.sh python
+
+Cleaning up
+===========
+
+To stop and delete containers, delete volumes with database data, and remove downloaded images, run:
+
+.. code-block:: bash
+
+    docker-compose down --volumes --rmi all
+
+Notes
+=====
+
+By default, the Docker Compose file uses the latest Airflow image (`apache/airflow `__). If you need to, you can :ref:`customize and extend it `.
+
+What's Next?
+============
+
+From this point, you can head to the :doc:`/tutorial` section for further examples or the :doc:`/howto/index` section if you're ready to get your hands dirty.
diff --git a/docs/apache-airflow/start/index.rst b/docs/apache-airflow/start/index.rst
new file mode 100644
index 0000000000..c86ef8380e
--- /dev/null
+++ b/docs/apache-airflow/start/index.rst
@@ -0,0 +1,27 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ ..   http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Quick start
+===========
+
+This section contains quick start guides to help you get up and running with Apache Airflow.
+
+.. toctree::
+    :maxdepth: 1
+
+    local.rst
+    docker.rst
diff --git a/docs/apache-airflow/start.rst b/docs/apache-airflow/start/local.rst
similarity index 74%
rename from docs/apache-airflow/start.rst
rename to docs/apache-airflow/start/local.rst
index 1865321971..291d259e6d 100644
--- a/docs/apache-airflow/start.rst
+++ b/docs/apache-airflow/start/local.rst
@@ -17,8 +17,10 @@

-Quick Start
------------
+Running Airflow locally
+-----------------------
+
+This quick start guide will help you bootstrap an Airflow standalone instance on your local machine.

 .. note::
@@ -40,7 +42,6 @@
    If you wish to install airflow using those tools you should use the constraint files and convert
    them to appropriate format and workflow that your tool requires.

-
 The installation of Airflow is painless if you are following the instructions below. Airflow uses constraint files
 to enable reproducible installation, so using ``pip`` and constraint files is recommended.

@@ -106,38 +107,6 @@ run the commands below.
        --start-date 2015-01-01 \
        --end-date 2015-01-02

-Basic Airflow architecture
---------------------------
-
-Primarily intended for development use, the basic Airflow architecture with the Local and Sequential executors is an
-excellent starting point for understanding the architecture of Apache Airflow.
-
-.. image:: img/arch-diag-basic.png
-
-
-There are a few components to note:
-
-* **Metadata Database**: Airflow uses a SQL database to store metadata about the data pipelines being run. In the
-  diagram above, this is represented as Postgres which is extremely popular with Airflow.
-  Alternate databases supported with Airflow include MySQL.
-
-* **Web Server** and **Scheduler**: The Airflow web server and Scheduler are separate processes run (in this case)
-  on the local machine and interact with the database mentioned above.
-
-* The **Executor** is shown separately above, since it is commonly discussed within Airflow and in the documentation, but
-  in reality it is NOT a separate process, but run within the Scheduler.
-
-* The **Worker(s)** are separate processes which also interact with the other components of the Airflow architecture and
-  the metadata repository.
-
-* ``airflow.cfg`` is the Airflow configuration file which is accessed by the Web Server, Scheduler, and Workers.
-
-* **DAGs** refers to the DAG files containing Python code, representing the data pipelines to be run by Airflow. The
-  location of these files is specified in the Airflow configuration file, but they need to be accessible by the
-  Web Server, Scheduler, and Workers.
-
-
-
 What's Next?
 ''''''''''''
-From this point, you can head to the :doc:`tutorial` section for further examples or the :doc:`howto/index` section if you're ready to get your hands dirty.
+From this point, you can head to the :doc:`/tutorial` section for further examples or the :doc:`/howto/index` section if you're ready to get your hands dirty.
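The renamed ``local.rst`` above keeps recommending installation with ``pip`` plus a constraint file. As a rough, hedged sketch of that local bootstrap flow (the version number, constraint URL pattern, and port below are illustrative assumptions, not values taken from this patch), the steps look roughly like this:

.. code-block:: bash

    # Illustrative local quick start: install Airflow against a constraint file,
    # then initialize the metadata database and start the core components.
    export AIRFLOW_HOME=~/airflow

    AIRFLOW_VERSION=2.0.1    # assumed release; pick the one you actually want
    PYTHON_VERSION="$(python --version | cut -d ' ' -f 2 | cut -d '.' -f 1-2)"
    CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt"
    pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}"

    airflow db init                    # run the database migrations
    airflow webserver --port 8080 &    # start the web server
    airflow scheduler                  # start the scheduler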
diff --git a/docs/conf.py b/docs/conf.py index eda3d5dc10..188a4b665a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -231,6 +231,11 @@ if PACKAGE_NAME == 'apache-airflow': html_js_files = ['jira-links.js'] else: html_js_files = [] +if PACKAGE_NAME == 'apache-airflow': + html_extra_path = [ + f"{ROOT_DIR}/docs/apache-airflow/start/docker-compose.yaml", + f"{ROOT_DIR}/docs/apache-airflow/start/airflow.sh", + ] # -- Theme configuration ------------------------------------------------------- # Custom sidebar templates, maps document names to template names. @@ -303,7 +308,16 @@ html_context = { # Jinja context if PACKAGE_NAME == 'apache-airflow': - jinja_contexts = {'config_ctx': {"configs": default_config_yaml()}} + jinja_contexts = { + 'config_ctx': {"configs": default_config_yaml()}, + 'quick_start_ctx': { + 'doc_root_url': (f'https://airflow.apache.org/docs/apache-airflow/{PACKAGE_VERSION}/') + if FOR_PRODUCTION + else ( + 'http://apache-airflow-docs.s3-website.eu-central-1.amazonaws.com/docs/apache-airflow/latest/' + ) + }, + } elif PACKAGE_NAME.startswith('apache-airflow-providers-'): def _load_config(): diff --git a/docs/exts/docs_build/lint_checks.py b/docs/exts/docs_build/lint_checks.py index 225f6f91ae..c457eda8db 100644 --- a/docs/exts/docs_build/lint_checks.py +++ b/docs/exts/docs_build/lint_checks.py @@ -22,6 +22,9 @@ from glob import glob from itertools import chain from typing import Iterable, List, Optional, Set +import yaml + +import airflow from docs.exts.docs_build.docs_builder import ALL_PROVIDER_YAMLS # pylint: disable=no-name-in-module from docs.exts.docs_build.errors import DocBuildError # pylint: disable=no-name-in-module @@ -148,7 +151,9 @@ def _check_missing_guide_references(operator_names, python_module_paths) -> List return build_errors -def assert_file_not_contains(file_path: str, pattern: str, message: str) -> Optional[DocBuildError]: +def assert_file_not_contains( + *, file_path: str, pattern: str, message: Optional[str] = None +) -> Optional[DocBuildError]: """ Asserts that file does not contain the pattern. Return message error if it does. @@ -159,7 +164,9 @@ def assert_file_not_contains(file_path: str, pattern: str, message: str) -> Opti return _extract_file_content(file_path, message, pattern, False) -def assert_file_contains(file_path: str, pattern: str, message: str) -> Optional[DocBuildError]: +def assert_file_contains( + *, file_path: str, pattern: str, message: Optional[str] = None +) -> Optional[DocBuildError]: """ Asserts that file does contain the pattern. Return message error if it does not. @@ -170,7 +177,9 @@ def assert_file_contains(file_path: str, pattern: str, message: str) -> Optional return _extract_file_content(file_path, message, pattern, True) -def _extract_file_content(file_path: str, message, pattern: str, expected_contain: bool): +def _extract_file_content(file_path: str, message: Optional[str], pattern: str, expected_contain: bool): + if not message: + message = f"Pattern '{pattern}' could not be found in '{file_path}' file." 
with open(file_path, "rb", 0) as doc_file: pattern_compiled = re.compile(pattern) found = False @@ -309,6 +318,54 @@ def check_pypi_repository_in_provider_tocs() -> List[DocBuildError]: return build_errors +def check_docker_image_tag_in_quick_start_guide() -> List[DocBuildError]: + """Check that a good docker image is used in the quick start guide for Docker.""" + build_errors = [] + + compose_file_path = f"{DOCS_DIR}/apache-airflow/start/docker-compose.yaml" + expected_tag = 'master-python3.8' if "dev" in airflow.__version__ else airflow.__version__ + # master tag is little outdated. + expected_image = f'apache/airflow:{expected_tag}' + with open(compose_file_path) as yaml_file: + content = yaml.safe_load(yaml_file) + current_image_expression = content['x-airflow-common']['image'] + if expected_image not in current_image_expression: + build_errors.append( + DocBuildError( + file_path=compose_file_path, + line_no=None, + message=( + f"Invalid image in docker - compose.yaml\n" + f"Current image expression: {current_image_expression}\n" + f"Expected image: {expected_image}\n" + f"Please check the value of x-airflow-common.image key" + ), + ) + ) + build_error = assert_file_contains( + file_path=f"{DOCS_DIR}/apache-airflow/start/docker.rst", + pattern=re.escape(f'{expected_image} "/usr/bin/dumb-init'), + ) + if build_error: + build_errors.append(build_error) + + return build_errors + + +def check_airflow_versions_in_quick_start_guide() -> List[DocBuildError]: + """Check that a airflow version is presented in example in the quick start guide for Docker.""" + build_errors = [] + + build_error = assert_file_contains( + file_path=f"{DOCS_DIR}/apache-airflow/start/docker.rst", + pattern=re.escape(f"airflow-init_1 | {airflow.__version__}"), + ) + if build_error: + build_errors.append(build_error) + + return build_errors + + def run_all_check() -> List[DocBuildError]: """Run all checks from this module""" general_errors = [] @@ -317,4 +374,7 @@ def run_all_check() -> List[DocBuildError]: general_errors.extend(check_exampleinclude_for_example_dags()) general_errors.extend(check_example_dags_in_provider_tocs()) general_errors.extend(check_pypi_repository_in_provider_tocs()) + general_errors.extend(check_docker_image_tag_in_quick_start_guide()) + general_errors.extend(check_airflow_versions_in_quick_start_guide()) + return general_errors
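As a quick illustration of what the two new lint checks assert, here is a rough shell equivalent. This is only a sketch: it hard-codes the ``master-python3.8`` tag and the ``2.1.0.dev0`` example version shown above, whereas the Python checks derive both from ``airflow.__version__``.

.. code-block:: bash

    # Approximate shell restatement of the new doc checks; run from the repo root.

    # check_docker_image_tag_in_quick_start_guide: the compose file must reference
    # the expected image tag.
    grep -q 'apache/airflow:master-python3.8' docs/apache-airflow/start/docker-compose.yaml \
        && echo "docker-compose.yaml image tag OK"

    # check_airflow_versions_in_quick_start_guide: the example output in docker.rst
    # must mention the current Airflow version.
    grep -q 'airflow-init_1 .*| 2.1.0.dev0' docs/apache-airflow/start/docker.rst \
        && echo "docker.rst version example OK"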