update airflow config for 2.3.3

Harold Woo 2022-11-22 16:04:45 -08:00 committed by Mikaël Ducharme
Parent e80472ab9a
Commit d19cc711aa
2 changed files with 147 additions and 24 deletions

View file

@@ -1,4 +1,14 @@
[core]
# Hostname by providing a path to a callable, which will resolve the hostname.
# The format is "package.function".
#
# For example, default value "socket.getfqdn" means that result from getfqdn() of "socket"
# package will be used as hostname.
#
# No argument should be required in the function specified.
# If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address``
# hostname_callable = socket.getfqdn
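# For reference, a no-argument callable satisfying this contract could look like the
# sketch below; the module path my_airflow_utils.hostname is hypothetical and not part
# of this repo. It would be referenced as
# hostname_callable = my_airflow_utils.hostname.cluster_hostname

# my_airflow_utils/hostname.py (hypothetical helper module)
import socket

def cluster_hostname() -> str:
    """No-argument callable usable as [core] hostname_callable."""
    fqdn = socket.getfqdn()
    # Prefer the IP address when the FQDN is not meaningful.
    if fqdn in ("localhost", "localhost.localdomain"):
        return socket.gethostbyname(socket.gethostname())
    return fqdn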
default_timezone = utc
hide_sensitive_var_conn_fields = True
@@ -34,12 +44,6 @@ max_active_runs_per_dag = 5
# environment
load_examples = False
# Whether to load the default connections that ship with Airflow. It's good to
# get started, but you probably want to set this to ``False`` in a production
# environment
# We have configured google_cloud_default, so hopefully this won't remove it.
load_default_connections = False
# Where your Airflow plugins are stored
plugins_folder = $AIRFLOW_HOME/plugins
@@ -88,6 +92,10 @@ unit_test_mode = False
# RCE exploits).
enable_xcom_pickling = False
# When a task is killed forcefully, this is the amount of time in seconds that
# it has to clean up after it is sent a SIGTERM, before it is SIGKILLED
killed_task_cleanup_time = 60
# Whether to override params with dag_run.conf. If you pass some key-value pairs
# through ``airflow dags backfill -c`` or
# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
@@ -96,15 +104,31 @@ dag_run_conf_overrides_params = True
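# A minimal sketch of the override behaviour; the DAG id and param name are hypothetical.
# With dag_run_conf_overrides_params = True, key-value pairs passed via
# ``airflow dags trigger -c`` replace the matching entries in ``params``.

from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    dag_id="params_demo",              # hypothetical DAG id
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    params={"target_env": "dev"},      # default value, overridable at trigger time
) as dag:
    BashOperator(
        task_id="print_env",
        bash_command="echo running against {{ params.target_env }}",
    )

# Triggering with: airflow dags trigger params_demo -c '{"target_env": "stage"}'
# renders the command with "stage" instead of "dev".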
# When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``.
dag_discovery_safe_mode = False
# The pattern syntax used in the ".airflowignore" files in the DAG directories. Valid values are
# ``regexp`` or ``glob``.
dag_ignore_file_syntax = regexp
# The number of retries each task is going to have by default. Can be overridden at dag or task level.
default_task_retries = 0
# The weighting method used for the effective total priority weight of the task
default_task_weight_rule = downstream
# The default task execution_timeout value for the operators. Expected an integer value to
# be passed into timedelta as seconds. If not specified, then the value is considered as None,
# meaning that the operators are never timed out by default.
default_task_execution_timeout =
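# These three defaults (retries, weight rule, execution timeout) apply only when a task does
# not set the corresponding argument itself. A minimal sketch of per-task overrides, using a
# hypothetical DAG:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    dag_id="task_defaults_demo",       # hypothetical DAG id
    start_date=datetime(2022, 1, 1),
    schedule_interval="@daily",
) as dag:
    BashOperator(
        task_id="extract",
        bash_command="echo extract",
        retries=3,                                 # overrides default_task_retries
        weight_rule="upstream",                    # overrides default_task_weight_rule
        execution_timeout=timedelta(seconds=600),  # overrides default_task_execution_timeout (unset above)
    )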
# We will override the next 2 intervals in prod via env vars.
# Serialized DAGs in the DB are updated no more often than this minimum interval (in seconds),
# which helps reduce the database write rate.
min_serialized_dag_update_interval = 10
# If True, serialized DAGs are compressed before writing to DB.
# Note: this will disable the DAG dependencies view
compress_serialized_dags = False
# Fetching serialized DAG can not be faster than a minimum interval to reduce database
# read rate. This config controls when your DAGs are updated in the Webserver
min_serialized_dag_fetch_interval = 5
@@ -139,8 +163,18 @@ lazy_load_plugins = True
# loaded from module.
lazy_discover_providers = True
# The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a
# length exceeding this value, the task pushing the XCom will be failed automatically to prevent the
# mapped tasks from clogging the scheduler.
max_map_length = 1024
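# "Task mapping" here refers to Airflow 2.3's dynamic task mapping. A minimal sketch of the
# XCom this limit applies to (DAG and task names are hypothetical):

from datetime import datetime

from airflow import DAG
from airflow.decorators import task

with DAG(dag_id="mapping_demo", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:

    @task
    def list_files():
        # This return value is pushed as an XCom; if the list grew beyond
        # max_map_length (1024 here), this task would be failed instead of
        # expanding the downstream task.
        return ["a.csv", "b.csv", "c.csv"]

    @task
    def process(path: str):
        print(f"processing {path}")

    process.expand(path=list_files())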
[database]
# Whether to load the default connections that ship with Airflow. It's good to
# get started, but you probably want to set this to ``False`` in a production
# environment
# We have configured google_cloud_default, so hopefully this won't remove it.
load_default_connections = False
# The SqlAlchemy connection string to the metadata database.
# SqlAlchemy supports many different database engines; more information is
# available on their website.
@@ -160,6 +194,14 @@ sql_alchemy_pool_recycle = 3600
# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
max_db_retries = 3
# Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding.
# By default this collation is the same as the database collation. However, for ``mysql`` and ``mariadb``
# the default is ``utf8mb3_general_ci``, so that the index sizes of our index keys will not exceed
# the maximum allowed index size when collation is set to a ``utf8mb4`` variant
# (see https://github.com/apache/airflow/pull/17603#issuecomment-901121618
# and https://github.com/apache/airflow/pull/17729/).
# sql_engine_collation_for_ids =
[logging]
# The folder where airflow should store its log files. This location
@@ -170,10 +212,22 @@ base_log_folder = $AIRFLOW_HOME/logs
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
logging_level = INFO
# Logging level for celery. If not set, it uses the value of logging_level
#
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
# celery_logging_level =
# Logging level for Flask-appbuilder UI.
#
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
fab_logging_level = WARN
fab_logging_level = WARNING
# When you start an airflow worker, airflow starts a tiny web server
# subprocess to serve the worker's local log files to the airflow main
# web server, which then builds pages and sends them to users. This defines
# the port on which the logs are served. It needs to be unused, and open and
# visible from the main web server so it can connect to the workers.
worker_log_server_port = 8793
# Logging class
# Specify the class that will specify the logging configuration
@@ -204,7 +258,7 @@ log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strft
log_processor_filename_template = {{ filename }}.log
# full path of dag_processor_manager logfile
dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log
dag_processor_manager_log_location = ${AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log
# Name of handler to read task instance logs.
# Defaults to use ``task`` handler.
@@ -212,8 +266,8 @@ task_log_reader = task
# A comma-separated list of third-party logger names that will be configured to print messages to
# consoles.
# Example: extra_loggers = connexion,sqlalchemy
# extra_loggers =
# Example: extra_logger_names = connexion,sqlalchemy
# extra_logger_names =
[webserver]
@@ -255,10 +309,29 @@ auth_backend = $AIRFLOW_AUTH_BACKEND
# provided SSL will be enabled. This does not change the web server port.
# web_server_ssl_key =
# The type of backend used to store web session data, can be 'database' or 'securecookie'
# Example: session_backend = securecookie
# session_backend = database
# Number of seconds the webserver waits before killing gunicorn master that doesn't respond
web_server_master_timeout = 300
# Number of seconds the gunicorn webserver waits before timing out on a worker
web_server_worker_timeout = 300
# Number of workers to refresh at a time. When set to 0, worker refresh is
# disabled. When nonzero, airflow periodically refreshes webserver workers by
# bringing up new ones and killing old ones.
worker_refresh_batch_size = 1
# Number of seconds to wait before refreshing a batch of workers.
worker_refresh_interval = 6000
# If set to True, Airflow will track files in the plugins_folder directory. When it detects changes,
# it reloads gunicorn.
# You can toggle this during development when iterating on plugins.
reload_on_plugin_change = False
# We set this to True for local development and override it with an env var in prod;
# it is False in prod so that changes pushed to the plugins folder do not kill currently running backfills.
reload_on_plugin_change = True
# Log files for the gunicorn webserver. '-' means log to stderr.
access_logfile = -
@@ -365,6 +438,26 @@ session_lifetime_minutes = 43200
# Sets a custom page title for the DAGs overview page and site title for all pages
# instance_name =
# Whether the custom page title for the DAGs overview page contains any Markup language
instance_name_has_markup = False
# How frequently, in seconds, the DAG data will auto-refresh in graph or grid view
# when auto-refresh is turned on
auto_refresh_interval = 3
# Boolean for displaying warning for publicly viewable deployment
warn_deployment_exposure = True
# Comma separated string of view events to exclude from dag audit view.
# All other events will be added minus the ones passed here.
# The audit logs in the db will not be affected by this parameter.
audit_view_excluded_events = gantt,landing_times,tries,duration,calendar,graph,grid,tree,tree_data
# Comma separated string of view events to include in dag audit view.
# If passed, only these events will populate the dag audit view.
# The audit logs in the db will not be affected by this parameter.
# Example: audit_view_included_events = dagrun_cleared,failed
# audit_view_included_events =
[email]
email_backend = $AIRFLOW_EMAIL_BACKEND
@@ -457,18 +550,16 @@ worker_concurrency = 32
# Example: worker_prefetch_multiplier = 1
# worker_prefetch_multiplier =
# Specify if remote control of the workers is enabled.
# When using Amazon SQS as the broker, Celery creates lots of ``.*reply-celery-pidbox`` queues. You can
# prevent this by setting this to false. However, with this disabled Flower won't work.
# worker_enable_remote_control = true
# Umask that will be used when starting workers with the ``airflow celery worker``
# in daemon mode. This controls the file-creation mode mask, which determines the initial
# value of file permission bits for newly created files.
# worker_umask = 0o077
# When you start an airflow worker, airflow starts a tiny web server
# subprocess to serve the worker's local log files to the airflow main
# web server, which then builds pages and sends them to users. This defines
# the port on which the logs are served. It needs to be unused, and open and
# visible from the main web server so it can connect to the workers.
worker_log_server_port = 8793
# The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
# a sqlalchemy database. Refer to the Celery documentation for more
# information.
@@ -583,10 +674,6 @@ allow_illegal_arguments = False
# listen (in seconds).
job_heartbeat_sec = 5
# How often (in seconds) to check and tidy up 'running' TaskInstances
# that no longer have a matching DagRun
clean_tis_without_dagrun_interval = 15.0
# The scheduler constantly tries to trigger new tasks (look at the
# scheduler section in the docs for more information). This defines
# how often the scheduler should run (in seconds).
@@ -605,6 +692,10 @@ scheduler_idle_sleep_time = 1
# this interval. Keeping this number low will increase CPU usage.
min_file_process_interval = 60
# How often (in seconds) to check for stale DAGs (DAGs which are no longer present in
# the expected files) which should be deactivated.
deactivate_stale_dags_interval = 120
# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes.
# This is set via env var to 300 in prod, but 30 for local testing
dag_dir_list_interval = 30
@@ -629,6 +720,9 @@ child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler
# associated task instance as failed and will re-schedule the task.
scheduler_zombie_task_threshold = 300
# How often (in seconds) should the scheduler check for zombie tasks.
zombie_detection_interval = 60.0
# Turn off scheduler catchup by setting this to False.
# Default behavior is unchanged and
# Command Line Backfills still work, but the scheduler
@@ -637,6 +731,13 @@ scheduler_zombie_task_threshold = 300
# DAG definition (catchup)
catchup_by_default = False
# Setting this to True will make first task instance of a task
# ignore depends_on_past setting. A task instance will be considered
# as the first task instance of a task when there is no task instance
# in the DB with an execution_date earlier than it, i.e. no manual marking
# success will be needed for a newly added task to be scheduled.
ignore_first_depends_on_past_by_default = True
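# A small sketch of the behaviour this default changes (the DAG id is hypothetical):
# with depends_on_past=True, each run normally waits for the previous run of the same task,
# and this setting lets the very first run start without a manual success.

from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    dag_id="depends_on_past_demo",     # hypothetical DAG id
    start_date=datetime(2022, 1, 1),
    schedule_interval="@daily",
    default_args={"depends_on_past": True},
) as dag:
    # With ignore_first_depends_on_past_by_default = True, the first run of
    # this task is scheduled even though no earlier task instance exists;
    # subsequent runs still wait for the previous run to succeed.
    BashOperator(task_id="load", bash_command="echo load")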
# This changes the batch size of queries in the scheduling main loop.
# If this is too high, SQL query performance may be impacted by one
# or more of the following:
@@ -685,6 +786,14 @@ parsing_processes = 2
# * ``alphabetical``: Sort by filename
file_parsing_sort_mode = modified_time
# Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler
# job.
standalone_dag_processor = False
# Only applicable if `[scheduler]standalone_dag_processor` is true and callbacks are stored
# in database. Contains maximum number of callbacks that are fetched during a single loop.
max_callbacks_per_loop = 20
# Turn off scheduler use of cron intervals by setting this to False.
# DAGs submitted manually in the web UI or with trigger_dag will still run.
use_job_schedule = True
@@ -696,6 +805,12 @@ allow_trigger_in_future = False
# DAG dependency detector class to use
dependency_detector = airflow.serialization.serialized_objects.DependencyDetector
# How often to check for expired trigger requests that have not run yet.
trigger_timeout_check_interval = 15
[triggerer]
# How many triggers a single Triggerer will run at once, by default.
default_capacity = 1000
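# These slots are consumed by deferrable operators and sensors while they are deferred.
# A minimal sketch, assuming the TimeDeltaSensorAsync shipped with Airflow 2.2+ (the DAG id
# is hypothetical):

from datetime import datetime, timedelta

from airflow import DAG
from airflow.sensors.time_delta import TimeDeltaSensorAsync

with DAG(dag_id="deferrable_demo", start_date=datetime(2022, 1, 1), schedule_interval="@daily") as dag:
    # While deferred, this sensor holds one of the triggerer's
    # default_capacity slots instead of a worker slot.
    TimeDeltaSensorAsync(task_id="wait_an_hour", delta=timedelta(hours=1))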
[metrics]
# Statsd (https://github.com/etsy/statsd) integration settings
@@ -790,6 +905,9 @@ fallback_page_limit = 100
# Indicates whether the response can be shared with requesting code from the given origin.
# access_control_allow_origin =
[lineage]
# what lineage backend to use
# backend =
[mesos]
# Mesos master address which MesosExecutor will connect to.
@@ -857,3 +975,8 @@ authenticate = False
# [github_enterprise]
# api_rev = v3
[sensors]
# A sensor will immediately fail without retrying if its timeout is reached.
# Set to 3 days; the default is 7 days (604800 seconds).
default_timeout = 259200
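# Individual sensors can still override this per task. A minimal sketch using FileSensor
# (the DAG id and file path are hypothetical):

from datetime import datetime

from airflow import DAG
from airflow.sensors.filesystem import FileSensor

with DAG(dag_id="sensor_timeout_demo", start_date=datetime(2022, 1, 1), schedule_interval="@daily") as dag:
    FileSensor(
        task_id="wait_for_drop",
        filepath="/data/incoming/ready.flag",  # hypothetical path
        poke_interval=300,
        timeout=6 * 60 * 60,  # per-task override: give up after 6 hours instead of 3 days
    )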

View file

@@ -28,7 +28,7 @@ from flask_appbuilder.security.manager import AUTH_OAUTH
basedir = os.path.abspath(os.path.dirname(__file__))
# The SQLAlchemy connection string.
SQLALCHEMY_DATABASE_URI = conf.conf.get('core', 'SQL_ALCHEMY_CONN')
SQLALCHEMY_DATABASE_URI = conf.conf.get('database', 'SQL_ALCHEMY_CONN')
# Flask-WTF flag for CSRF
CSRF_ENABLED = True