update airflow config for 2.3.3

Harold Woo 2022-11-22 16:04:45 -08:00 committed by Mikaël Ducharme
Parent e80472ab9a
Commit d19cc711aa
2 changed files with 147 additions and 24 deletions

View file

@@ -1,4 +1,14 @@
[core]
# Hostname by providing a path to a callable, which will resolve the hostname.
# The format is "package.function".
#
# For example, default value "socket.getfqdn" means that result from getfqdn() of "socket"
# package will be used as hostname.
#
# No argument should be required in the function specified.
# If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address``
# hostname_callable = socket.getfqdn
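# For reference, a no-argument callable satisfying this contract could look like the
# sketch below; the module path my_airflow_utils.hostname is hypothetical and not part
# of this repo. It would be referenced as
# hostname_callable = my_airflow_utils.hostname.cluster_hostname

# my_airflow_utils/hostname.py (hypothetical helper module)
import socket

def cluster_hostname() -> str:
    """No-argument callable usable as [core] hostname_callable."""
    fqdn = socket.getfqdn()
    # Prefer the IP address when the FQDN is not meaningful.
    if fqdn in ("localhost", "localhost.localdomain"):
        return socket.gethostbyname(socket.gethostname())
    return fqdn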
default_timezone = utc
hide_sensitive_var_conn_fields = True
@@ -34,12 +44,6 @@ max_active_runs_per_dag = 5
# environment
load_examples = False
# Whether to load the default connections that ship with Airflow. It's good to
# get started, but you probably want to set this to ``False`` in a production
# environment
# We have configured google_cloud_default, so hopefully this won't remove it.
load_default_connections = False
# Where your Airflow plugins are stored
plugins_folder = $AIRFLOW_HOME/plugins
@@ -88,6 +92,10 @@ unit_test_mode = False
# RCE exploits).
enable_xcom_pickling = False
# When a task is killed forcefully, this is the amount of time in seconds that
# it has to clean up after it is sent a SIGTERM, before it is SIGKILLED
killed_task_cleanup_time = 60
# Whether to override params with dag_run.conf. If you pass some key-value pairs
# through ``airflow dags backfill -c`` or
# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
@@ -96,15 +104,31 @@ dag_run_conf_overrides_params = True
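# A minimal sketch of the override behaviour; the DAG id and param name are hypothetical.
# With dag_run_conf_overrides_params = True, key-value pairs passed via
# ``airflow dags trigger -c`` replace the matching entries in ``params``.

from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    dag_id="params_demo",              # hypothetical DAG id
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    params={"target_env": "dev"},      # default value, overridable at trigger time
) as dag:
    BashOperator(
        task_id="print_env",
        bash_command="echo running against {{ params.target_env }}",
    )

# Triggering with: airflow dags trigger params_demo -c '{"target_env": "stage"}'
# renders the command with "stage" instead of "dev".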
# When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``.
dag_discovery_safe_mode = False
# The pattern syntax used in the ".airflowignore" files in the DAG directories. Valid values are
# ``regexp`` or ``glob``.
dag_ignore_file_syntax = regexp
# The number of retries each task is going to have by default. Can be overridden at dag or task level.
default_task_retries = 0
# The weighting method used for the effective total priority weight of the task
default_task_weight_rule = downstream
# The default task execution_timeout value for the operators. Expected an integer value to
# be passed into timedelta as seconds. If not specified, then the value is considered as None,
# meaning that the operators are never timed out by default.
default_task_execution_timeout =
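# These three defaults (retries, weight rule, execution timeout) apply only when a task does
# not set the corresponding argument itself. A minimal sketch of per-task overrides, using a
# hypothetical DAG:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    dag_id="task_defaults_demo",       # hypothetical DAG id
    start_date=datetime(2022, 1, 1),
    schedule_interval="@daily",
) as dag:
    BashOperator(
        task_id="extract",
        bash_command="echo extract",
        retries=3,                                 # overrides default_task_retries
        weight_rule="upstream",                    # overrides default_task_weight_rule
        execution_timeout=timedelta(seconds=600),  # overrides default_task_execution_timeout (unset above)
    )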
# We will override the next 2 intervals in prod via env vars.
# Serialized DAGs in the DB are updated no more often than this minimum interval (in seconds),
# which helps reduce the database write rate.
min_serialized_dag_update_interval = 10
# If True, serialized DAGs are compressed before writing to DB.
# Note: this will disable the DAG dependencies view
compress_serialized_dags = False
# Fetching serialized DAG can not be faster than a minimum interval to reduce database
# read rate. This config controls when your DAGs are updated in the Webserver
min_serialized_dag_fetch_interval = 5
@@ -139,8 +163,18 @@ lazy_load_plugins = True
# loaded from module.
lazy_discover_providers = True
# The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a
# length exceeding this value, the task pushing the XCom will be failed automatically to prevent the
# mapped tasks from clogging the scheduler.
max_map_length = 1024
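# "Task mapping" here refers to Airflow 2.3's dynamic task mapping. A minimal sketch of the
# XCom this limit applies to (DAG and task names are hypothetical):

from datetime import datetime

from airflow import DAG
from airflow.decorators import task

with DAG(dag_id="mapping_demo", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def list_files():
        # This return value is pushed as an XCom; if the list grew beyond
        # max_map_length (1024 here), this task would be failed instead of
        # expanding the downstream task.
        return ["a.csv", "b.csv", "c.csv"]

    @task
    def process(path: str):
        print(f"processing {path}")

    process.expand(path=list_files())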
[database]
# Whether to load the default connections that ship with Airflow. It's good to
# get started, but you probably want to set this to ``False`` in a production
# environment
# We have configured google_cloud_default, so hopefully this won't remove it.
load_default_connections = False
# The SqlAlchemy connection string to the metadata database.
# SqlAlchemy supports many different database engines; more information is
# available on their website.
@@ -160,6 +194,14 @@ sql_alchemy_pool_recycle = 3600
# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
max_db_retries = 3
# Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding.
# By default this collation is the same as the database collation. However, for ``mysql`` and ``mariadb``
# the default is ``utf8mb3_general_ci``, so that the index sizes of our index keys will not exceed
# the maximum allowed index size when collation is set to a ``utf8mb4`` variant
# (see https://github.com/apache/airflow/pull/17603#issuecomment-901121618
# and https://github.com/apache/airflow/pull/17729/).
# sql_engine_collation_for_ids =
[logging]
# The folder where airflow should store its log files. This location
@@ -170,10 +212,22 @@ base_log_folder = $AIRFLOW_HOME/logs
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
logging_level = INFO
# Logging level for celery. If not set, it uses the value of logging_level
#
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
# celery_logging_level =
# Logging level for Flask-appbuilder UI.
#
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
fab_logging_level = WARN
fab_logging_level = WARNING
# When you start an airflow worker, airflow starts a tiny web server
# subprocess to serve the worker's local log files to the airflow main
# web server, which then builds pages and sends them to users. This defines
# the port on which the logs are served. It needs to be unused, and open and
# visible from the main web server so it can connect to the workers.
worker_log_server_port = 8793
# Logging class
# Specify the class that will specify the logging configuration
@@ -204,7 +258,7 @@ log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strft
log_processor_filename_template = {{ filename }}.log
# full path of dag_processor_manager logfile
dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log
dag_processor_manager_log_location = ${AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log
# Name of handler to read task instance logs.
# Defaults to use ``task`` handler.
@@ -212,8 +266,8 @@ task_log_reader = task
# A comma-separated list of third-party logger names that will be configured to print messages to
# consoles.
# Example: extra_loggers = connexion,sqlalchemy
# extra_loggers =
# Example: extra_logger_names = connexion,sqlalchemy
# extra_logger_names =
[webserver]
@@ -255,10 +309,29 @@ auth_backend = $AIRFLOW_AUTH_BACKEND
# provided SSL will be enabled. This does not change the web server port.
# web_server_ssl_key =
# The type of backend used to store web session data, can be 'database' or 'securecookie'
# Example: session_backend = securecookie
# session_backend = database
# Number of seconds the webserver waits before killing gunicorn master that doesn't respond
web_server_master_timeout = 300
# Number of seconds the gunicorn webserver waits before timing out on a worker
web_server_worker_timeout = 300
# Number of workers to refresh at a time. When set to 0, worker refresh is
# disabled. When nonzero, airflow periodically refreshes webserver workers by
# bringing up new ones and killing old ones.
worker_refresh_batch_size = 1
# Number of seconds to wait before refreshing a batch of workers.
worker_refresh_interval = 6000
# If set to True, Airflow will track files in the plugins_folder directory. When it detects changes,
# it reloads gunicorn.
# You can toggle this during development when iterating on plugins.
reload_on_plugin_change = False
# We set this to True for local development and override it with an env var in prod;
# it is False in prod so that changes pushed to the plugins folder do not kill currently running backfills.
reload_on_plugin_change = True
# Log files for the gunicorn webserver. '-' means log to stderr.
access_logfile = -
@@ -365,6 +438,26 @@ session_lifetime_minutes = 43200
# Sets a custom page title for the DAGs overview page and site title for all pages
# instance_name =
# Whether the custom page title for the DAGs overview page contains any Markup language
instance_name_has_markup = False
# How frequently, in seconds, the DAG data will auto-refresh in graph or grid view
# when auto-refresh is turned on
auto_refresh_interval = 3
# Boolean for displaying warning for publicly viewable deployment
warn_deployment_exposure = True
# Comma separated string of view events to exclude from dag audit view.
# All other events will be added minus the ones passed here.
# The audit logs in the db will not be affected by this parameter.
audit_view_excluded_events = gantt,landing_times,tries,duration,calendar,graph,grid,tree,tree_data
# Comma separated string of view events to include in dag audit view.
# If passed, only these events will populate the dag audit view.
# The audit logs in the db will not be affected by this parameter.
# Example: audit_view_included_events = dagrun_cleared,failed
# audit_view_included_events =
[email]
email_backend = $AIRFLOW_EMAIL_BACKEND
@@ -457,18 +550,16 @@ worker_concurrency = 32
# Example: worker_prefetch_multiplier = 1
# worker_prefetch_multiplier =
# Specify if remote control of the workers is enabled.
# When using Amazon SQS as the broker, Celery creates lots of ``.*reply-celery-pidbox`` queues. You can
# prevent this by setting this to false. However, with this disabled Flower won't work.
# worker_enable_remote_control = true
# Umask that will be used when starting workers with the ``airflow celery worker``
# in daemon mode. This controls the file-creation mode mask, which determines the initial
# value of file permission bits for newly created files.
# worker_umask = 0o077
# When you start an airflow worker, airflow starts a tiny web server
# subprocess to serve the worker's local log files to the airflow main
# web server, which then builds pages and sends them to users. This defines
# the port on which the logs are served. It needs to be unused, and open and
# visible from the main web server so it can connect to the workers.
worker_log_server_port = 8793
# The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
# a sqlalchemy database. Refer to the Celery documentation for more
# information.
@@ -583,10 +674,6 @@ allow_illegal_arguments = False
# listen (in seconds).
job_heartbeat_sec = 5
# How often (in seconds) to check and tidy up 'running' TaskInstances
# that no longer have a matching DagRun
clean_tis_without_dagrun_interval = 15.0
# The scheduler constantly tries to trigger new tasks (look at the
# scheduler section in the docs for more information). This defines
# how often the scheduler should run (in seconds).
@@ -605,6 +692,10 @@ scheduler_idle_sleep_time = 1
# this interval. Keeping this number low will increase CPU usage.
min_file_process_interval = 60
# How often (in seconds) to check for stale DAGs (DAGs which are no longer present in
# the expected files) which should be deactivated.
deactivate_stale_dags_interval = 120
# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes.
# This is set via env var to 300 in prod, but 30 for local testing
dag_dir_list_interval = 30
@@ -629,6 +720,9 @@ child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler
# associated task instance as failed and will re-schedule the task.
scheduler_zombie_task_threshold = 300
# How often (in seconds) should the scheduler check for zombie tasks.
zombie_detection_interval = 60.0
# Turn off scheduler catchup by setting this to False.
# Default behavior is unchanged and
# Command Line Backfills still work, but the scheduler
@@ -637,6 +731,13 @@ scheduler_zombie_task_threshold = 300
# DAG definition (catchup)
catchup_by_default = False
# Setting this to True will make first task instance of a task
# ignore depends_on_past setting. A task instance will be considered
# as the first task instance of a task when there is no task instance
# in the DB with an execution_date earlier than it, i.e. no manual marking
# success will be needed for a newly added task to be scheduled.
ignore_first_depends_on_past_by_default = True
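# A small sketch of the behaviour this default changes (the DAG id is hypothetical):
# with depends_on_past=True, each run normally waits for the previous run of the same task,
# and this setting lets the very first run start without a manual success.

from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    dag_id="depends_on_past_demo",     # hypothetical DAG id
    start_date=datetime(2022, 1, 1),
    schedule_interval="@daily",
    default_args={"depends_on_past": True},
) as dag:
    # With ignore_first_depends_on_past_by_default = True, the first run of
    # this task is scheduled even though no earlier task instance exists;
    # subsequent runs still wait for the previous run to succeed.
    BashOperator(task_id="load", bash_command="echo load")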
# This changes the batch size of queries in the scheduling main loop.
# If this is too high, SQL query performance may be impacted by one
# or more of the following:
@@ -685,6 +786,14 @@ parsing_processes = 2
# * ``alphabetical``: Sort by filename
file_parsing_sort_mode = modified_time
# Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler
# job.
standalone_dag_processor = False
# Only applicable if `[scheduler]standalone_dag_processor` is true and callbacks are stored
# in database. Contains maximum number of callbacks that are fetched during a single loop.
max_callbacks_per_loop = 20
# Turn off scheduler use of cron intervals by setting this to False.
# DAGs submitted manually in the web UI or with trigger_dag will still run.
use_job_schedule = True
@@ -696,6 +805,12 @@ allow_trigger_in_future = False
# DAG dependency detector class to use
dependency_detector = airflow.serialization.serialized_objects.DependencyDetector
# How often to check for expired trigger requests that have not run yet.
trigger_timeout_check_interval = 15
[triggerer]
# How many triggers a single Triggerer will run at once, by default.
default_capacity = 1000
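# These slots are consumed by deferrable operators and sensors while they are deferred.
# A minimal sketch, assuming the TimeDeltaSensorAsync shipped with Airflow 2.2+ (the DAG id
# is hypothetical):

from datetime import datetime, timedelta

from airflow import DAG
from airflow.sensors.time_delta import TimeDeltaSensorAsync

with DAG(dag_id="deferrable_demo", start_date=datetime(2022, 1, 1), schedule_interval="@daily") as dag:
    # While deferred, this sensor holds one of the triggerer's
    # default_capacity slots instead of a worker slot.
    TimeDeltaSensorAsync(task_id="wait_an_hour", delta=timedelta(hours=1))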
[metrics]
# Statsd (https://github.com/etsy/statsd) integration settings
@@ -790,6 +905,9 @@ fallback_page_limit = 100
# Indicates whether the response can be shared with requesting code from the given origin.
# access_control_allow_origin =
[lineage]
# what lineage backend to use
# backend =
[mesos]
# Mesos master address which MesosExecutor will connect to.
@@ -857,3 +975,8 @@ authenticate = False
# [github_enterprise]
# api_rev = v3
[sensors]
# A sensor will immediately fail without retrying if its timeout is reached.
# Set to 3 days; the default is 7 days (604800 seconds).
default_timeout = 259200
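# Individual sensors can still override this per task. A minimal sketch using FileSensor
# (the DAG id and file path are hypothetical):

from datetime import datetime

from airflow import DAG
from airflow.sensors.filesystem import FileSensor

with DAG(dag_id="sensor_timeout_demo", start_date=datetime(2022, 1, 1), schedule_interval="@daily") as dag:
    FileSensor(
        task_id="wait_for_drop",
        filepath="/data/incoming/ready.flag",  # hypothetical path
        poke_interval=300,
        timeout=6 * 60 * 60,  # per-task override: give up after 6 hours instead of 3 days
    )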

View file

@@ -28,7 +28,7 @@ from flask_appbuilder.security.manager import AUTH_OAUTH
basedir = os.path.abspath(os.path.dirname(__file__))
# The SQLAlchemy connection string.
SQLALCHEMY_DATABASE_URI = conf.conf.get('core', 'SQL_ALCHEMY_CONN')
SQLALCHEMY_DATABASE_URI = conf.conf.get('database', 'SQL_ALCHEMY_CONN')
# Flask-WTF flag for CSRF
CSRF_ENABLED = True