[DSRE-6] Upgrade Airflow (wtmo) to 2.1.1

This commit is contained in:
Harold Woo 2021-10-12 17:29:13 -07:00 committed by haroldwoo
Parent 2e56562945
Commit e1518a5ff5
60 changed files: 4492 additions and 1809 deletions

1
.gitignore vendored
View file

@ -7,6 +7,7 @@ venv
logs
unittests.cfg
airflow-webserver.pid
airflow-worker.pid
.config
.credentials

View file

@ -3,7 +3,7 @@
# https://forums.docker.com/t/multiple-projects-stopped-building-on-docker-hub-operation-not-permitted/92570/6
# and https://forums.docker.com/t/multiple-projects-stopped-building-on-docker-hub-operation-not-permitted/92570/11
FROM python:3.7-slim-buster
MAINTAINER Jannis Leidel <jezdez@mozilla.com>
MAINTAINER Harold Woo <hwoo@mozilla.com>
# Due to AIRFLOW-6854, Python 3.7 is chosen as the base python version.

View file

@ -1,15 +1,16 @@
[core]
# 1.10 additions
default_timezone = utc
log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strftime("%%Y-%%m-%%dT%%H:%%M:%%S") }}/{{ try_number }}.log
hide_sensitive_var_conn_fields = True
sensitive_var_conn_names = 'cred,CRED,secret,SECRET,pass,PASS,password,PASSWORD,private,PRIVATE,key,KEY,cert,CERT,token,TOKEN,AKIA'
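The two settings above drive Airflow 2.1's automatic secret masking: any connection or variable whose name contains one of the listed substrings is redacted in task logs and in the UI. Values can also be registered for masking explicitly; a minimal sketch, assuming Airflow 2.1's secrets_masker utility and a hypothetical token value:

# Minimal sketch: register a runtime value with the secrets masker so the
# task-log filter redacts it, in addition to the name-based masking above.
import logging
from airflow.utils.log.secrets_masker import mask_secret

log = logging.getLogger(__name__)

def use_api_token(**context):
    token = "super-secret-token"  # hypothetical value fetched at runtime
    mask_secret(token)            # subsequent log lines show '***' instead
    log.info("calling API with %s", token)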
# This setting would not have any effect in an existing deployment where the default_pool already exists.
# default_pool_task_slot_count = 50
# The folder where your airflow pipelines live, most likely a
# subfolder in a code repository
dags_folder = $AIRFLOW_HOME/dags
# The folder where airflow should store its log files. This location
base_log_folder = $AIRFLOW_HOME/logs
# The executor class that airflow should use. Choices include
# SequentialExecutor, LocalExecutor, CeleryExecutor
executor = CeleryExecutor
@ -34,7 +35,7 @@ sql_alchemy_pool_recycle = 3600
parallelism = 16
# The number of task instances allowed to run concurrently by the scheduler
dag_concurrency = 16
max_active_tasks_per_dag = 16
# Are DAGs paused by default at creation
dags_are_paused_at_creation = True
@ -47,9 +48,20 @@ max_active_runs_per_dag = 5
# environment
load_examples = False
# Whether to load the default connections that ship with Airflow. It's good to
# get started, but you probably want to set this to ``False`` in a production
# environment
# We have configured google_cloud_default, so hopefully this won't remove it.
load_default_connections = False
# Where your Airflow plugins are stored
plugins_folder = $AIRFLOW_HOME/plugins
# Should tasks be executed via forking of the parent process ("False",
# the speedier option) or by spawning a new python process ("True" slow,
# but means plugin changes picked up by tasks straight away)
execute_tasks_new_python_interpreter = False
# Secret key to save connection passwords in the db
# Setting this to $AIRFLOW_FERNET_KEY is broken in 1.9 for initdb. Set $AIRFLOW__CORE__FERNET_KEY instead
# fernet_key =
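The Fernet key encrypts connection passwords and variable values stored in the metadata database. A minimal sketch of generating one to export as AIRFLOW__CORE__FERNET_KEY, using the cryptography package Airflow already depends on:

# Minimal sketch: generate a key suitable for AIRFLOW__CORE__FERNET_KEY.
from cryptography.fernet import Fernet

print(Fernet.generate_key().decode())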
@ -58,15 +70,162 @@ plugins_folder = $AIRFLOW_HOME/plugins
donot_pickle = False
# How long before timing out a python file import while filling the DagBag
dagbag_import_timeout = 30
dagbag_import_timeout = 30.0
# Should a traceback be shown in the UI for dagbag import errors,
# instead of just the exception message
dagbag_import_error_tracebacks = True
# If tracebacks are shown, how many entries from the traceback should be shown
dagbag_import_error_traceback_depth = 2
# How long before timing out a DagFileProcessor, which processes a dag file
dag_file_processor_timeout = 50
# The class to use for running task instances in a subprocess.
# Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class
# when using a custom task runner.
task_runner = StandardTaskRunner
# If set, tasks without a ``run_as_user`` argument will be run with this user
# Can be used to de-elevate a sudo user running Airflow when executing tasks
# default_impersonation =
# What security module to use (for example kerberos)
# security =
# Turn unit test mode on (overwrites many configuration options with test
# values at runtime)
unit_test_mode = False
# Whether to enable pickling for xcom (note that this is insecure and allows for
# RCE exploits).
enable_xcom_pickling = False
# Whether to override params with dag_run.conf. If you pass some key-value pairs
# through ``airflow dags backfill -c`` or
# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
dag_run_conf_overrides_params = True
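With this flag enabled, key-value pairs passed via airflow dags trigger -c '{"greeting": "hi"}' replace matching entries in a DAG's params. A minimal sketch under that assumption; the DAG id and param name are hypothetical:

# Minimal sketch: a default param that `airflow dags trigger -c` can override
# while dag_run_conf_overrides_params is True.
from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    "params_override_example",       # hypothetical DAG id
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    params={"greeting": "hello"},    # default, overridden by dag_run.conf
) as dag:
    BashOperator(
        task_id="echo_greeting",
        bash_command="echo {{ params.greeting }}",
    )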
# When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``.
dag_discovery_safe_mode = False
# The number of retries each task is going to have by default. Can be overridden at dag or task level.
default_task_retries = 0
# We will override the next 2 intervals in prod via env vars.
# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate.
# This flag sets the minimum interval (in seconds) after which the serialized DAGs in the DB should be updated.
# This helps in reducing database write rate.
min_serialized_dag_update_interval = 10
# Fetching serialized DAG can not be faster than a minimum interval to reduce database
# read rate. This config controls when your DAGs are updated in the Webserver
min_serialized_dag_fetch_interval = 5
# Whether to persist DAG files code in DB.
# If set to True, Webserver reads file contents from DB instead of
# trying to access files in a DAG folder.
# (Default is ``True``)
# Example: store_dag_code = True
# store_dag_code =
# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store
# in the Database.
# All the template_fields for each of Task Instance are stored in the Database.
# Keeping this number small may cause an error when you try to view ``Rendered`` tab in
# TaskInstance view for older tasks.
max_num_rendered_ti_fields_per_task = 30
# On each dagrun check against defined SLAs
check_slas = True
# Path to custom XCom class that will be used to store and resolve operators results
# Example: xcom_backend = path.to.CustomXCom
xcom_backend = airflow.models.xcom.BaseXCom
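This deployment keeps the default BaseXCom; a custom backend would subclass it and typically override serialize_value/deserialize_value. A minimal sketch under that assumption, with hypothetical class and module names:

# Minimal sketch of a custom XCom backend (hypothetical; not used here).
# It would be enabled with e.g. xcom_backend = plugins.json_xcom.JsonXCom
import json
from typing import Any

from airflow.models.xcom import BaseXCom

class JsonXCom(BaseXCom):
    @staticmethod
    def serialize_value(value: Any):
        # store values as JSON text instead of pickles
        return json.dumps(value).encode("utf-8")

    @staticmethod
    def deserialize_value(result) -> Any:
        # `result` is the stored XCom row; its value holds the serialized bytes
        return json.loads(result.value)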
# By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``,
# if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module.
lazy_load_plugins = True
# By default Airflow providers are lazily-discovered (discovery and imports happen only when required).
# Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or
# loaded from module.
lazy_discover_providers = True
# Number of times the code should be retried in case of DB Operational Errors.
# Not all transactions will be retried as it can cause undesired state.
# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
max_db_retries = 3
[logging]
# The folder where airflow should store its log files. This location
base_log_folder = $AIRFLOW_HOME/logs
# Logging level.
#
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
logging_level = INFO
# Logging level for Flask-appbuilder UI.
#
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
fab_logging_level = WARN
# Logging class
# Specify the class that will specify the logging configuration
# This class has to be on the python classpath
# Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG
# logging_config_class =
# Flag to enable/disable Colored logs in Console
# Colour the logs when the controlling terminal is a TTY.
colored_console_log = True
# Log format for when Colored logs is enabled
colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter
# Format of Log line
log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s
simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
# Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter
# Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{try_number}}
# task_log_prefix_template =
# Formatting for how airflow generates file names/paths for each task run.
log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strftime("%%Y-%%m-%%dT%%H:%%M:%%S") }}/{{ try_number }}.log
# Formatting for how airflow generates file names for log
log_processor_filename_template = {{ filename }}.log
# full path of dag_processor_manager logfile
dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log
# Name of handler to read task instance logs.
# Defaults to use ``task`` handler.
task_log_reader = task
# A comma-separated list of third-party logger names that will be configured to print messages to
# consoles.
# Example: extra_loggers = connexion,sqlalchemy
# extra_loggers =
[webserver]
rbac = $WEBSERVER_USE_RBAC
# The base url of your website as airflow cannot guess what domain or
# cname you are using. This is used in automated emails that
# airflow sends to point links to the right web server
base_url = $URL
# Default timezone to display all dates in the UI, can be UTC, system, or
# any IANA timezone string (e.g. Europe/Amsterdam). If left empty the
# default value of core/default_timezone will be used
# Example: default_ui_timezone = America/New_York
default_ui_timezone = UTC
# The ip specified when starting the web server
web_server_host = 0.0.0.0
@ -83,19 +242,151 @@ workers = 4
# sync (default), eventlet, gevent
worker_class = gevent
# Expose the configuration file in the web server
expose_config = true
# Set to true to turn on authentication : http://pythonhosted.org/airflow/installation.html#web-authentication
authenticate = $AIRFLOW_AUTHENTICATE
auth_backend = $AIRFLOW_AUTH_BACKEND
# Filter the list of dags by owner name (requires authentication to be enabled)
filter_by_owner = False
# Paths to the SSL certificate and key for the web server. When both are
# provided SSL will be enabled. This does not change the web server port.
# web_server_ssl_cert =
# Paths to the SSL certificate and key for the web server. When both are
# provided SSL will be enabled. This does not change the web server port.
# web_server_ssl_key =
# If set to True, Airflow will track files in plugins_folder directory. When it detects changes,
# then reload the gunicorn.
# You can toggle this for Development when iterating on plugins
reload_on_plugin_change = False
# Log files for the gunicorn webserver. '-' means log to stderr.
access_logfile = -
# Log files for the gunicorn webserver. '-' means log to stderr.
error_logfile = -
# Access log format for gunicorn webserver.
# default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"
# documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format
# access_logformat =
# Expose the configuration file in the web server
expose_config = True
# Expose hostname in the web server
expose_hostname = True
# Expose stacktrace in the web server
expose_stacktrace = True
# Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times``
dag_default_view = tree
# Default DAG orientation. Valid values are:
# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
dag_orientation = LR
# The amount of time (in secs) webserver will wait for initial handshake
# while fetching logs from other worker machine
log_fetch_timeout_sec = 5
# Time interval (in secs) to wait before next log fetching.
log_fetch_delay_sec = 2
# Distance away from page bottom to enable auto tailing.
log_auto_tailing_offset = 30
# Animation speed for auto tailing log display.
log_animation_speed = 1000
# By default, the webserver shows paused DAGs. Flip this to hide paused
# DAGs by default
hide_paused_dags_by_default = False
# Consistent page size across all listing views in the UI
page_size = 100
# Define the color of navigation bar
navbar_color = #fff
# Default dagrun to show in UI
default_dag_run_display_number = 25
# Enable werkzeug ``ProxyFix`` middleware for reverse proxy
enable_proxy_fix = False
# Number of values to trust for ``X-Forwarded-For``.
# More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/
proxy_fix_x_for = 1
# Number of values to trust for ``X-Forwarded-Proto``
proxy_fix_x_proto = 1
# Number of values to trust for ``X-Forwarded-Host``
proxy_fix_x_host = 1
# Number of values to trust for ``X-Forwarded-Port``
proxy_fix_x_port = 1
# Number of values to trust for ``X-Forwarded-Prefix``
proxy_fix_x_prefix = 1
# Set secure flag on session cookie
cookie_secure = False
# Set samesite policy on session cookie
cookie_samesite = Lax
# Default setting for wrap toggle on DAG code and TI log views.
default_wrap = False
# Allow the UI to be rendered in a frame
x_frame_enabled = True
# Send anonymous user activity to your analytics tool
# choose from google_analytics, segment, or metarouter
# analytics_tool =
# Unique ID of your account in the analytics tool
# analytics_id =
# 'Recent Tasks' stats will show for old DagRuns if set
show_recent_stats_for_completed_runs = True
# Update FAB permissions and sync security manager roles
# on webserver startup
update_fab_perms = True
# The UI cookie lifetime in minutes. User will be logged out from UI after
# ``session_lifetime_minutes`` of non-activity
session_lifetime_minutes = 43200
# Sets a custom page title for the DAGs overview page and site title for all pages
# instance_name =
[email]
email_backend = $AIRFLOW_EMAIL_BACKEND
# Email connection to use
# email_conn_id = smtp_default
# Whether email alerts should be sent when a task is retried
default_email_on_retry = True
# Whether email alerts should be sent when a task failed
default_email_on_failure = True
# File that will be used as the template for Email subject (which will be rendered using Jinja2).
# If not set, Airflow uses a base template.
# Example: subject_template = /path/to/my_subject_template_file
# subject_template =
# File that will be used as the template for Email content (which will be rendered using Jinja2).
# If not set, Airflow uses a base template.
# Example: html_content_template = /path/to/my_html_content_template_file
# html_content_template =
[smtp]
# If you want airflow to send emails on retries, failure, and you want to use
# the airflow.utils.send_email function, you have to configure an smtp
@ -107,6 +398,30 @@ smtp_port = 587
smtp_user = $AIRFLOW_SMTP_USER
smtp_password = $AIRFLOW_SMTP_PASSWORD
smtp_mail_from = $AIRFLOW_SMTP_FROM
# smtp_timeout = 30
# smtp_retry_limit = 5
[sentry]
# Sentry (https://docs.sentry.io) integration. Here you can supply
# additional configuration options based on the Python platform. See:
# https://docs.sentry.io/error-reporting/configuration/?platform=python.
# Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``,
# ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``.
# Enable error reporting to Sentry
# sentry_on = false
# sentry_dsn =
[celery_kubernetes_executor]
# This section only applies if you are using the ``CeleryKubernetesExecutor`` in
# ``[core]`` section above
# Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``.
# When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``),
# the task is executed via ``KubernetesExecutor``,
# otherwise via ``CeleryExecutor``
# kubernetes_queue = kubernetes
[celery]
# This section only applies if you are using the CeleryExecutor in
@ -121,6 +436,30 @@ celery_app_name = airflow.executors.celery_executor
# your worker box and the nature of your tasks
worker_concurrency = 32
# The maximum and minimum concurrency that will be used when starting workers with the
# ``airflow celery worker`` command (always keep minimum processes, but grow
# to maximum if necessary). Note the value should be max_concurrency,min_concurrency
# Pick these numbers based on resources on worker box and the nature of the task.
# If autoscale option is available, worker_concurrency will be ignored.
# http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale
# Example: worker_autoscale = 16,12
# worker_autoscale =
# Used to increase the number of tasks that a worker prefetches which can improve performance.
# The number of processes multiplied by worker_prefetch_multiplier is the number of tasks
# that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily
# blocked if there are multiple workers and one worker prefetches tasks that sit behind long
# running tasks while another worker has unutilized processes that are unable to process the already
# claimed blocked tasks.
# https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits
# Example: worker_prefetch_multiplier = 1
# worker_prefetch_multiplier =
# Umask that will be used when starting workers with the ``airflow celery worker``
# in daemon mode. This controls the file-creation mode mask which determines the initial
# value of file permission bits for newly created files.
# worker_umask = 0o077
# When you start an airflow worker, airflow starts a tiny web server
# subprocess to serve the workers local log files to the airflow main
# web server, who then builds pages and sends them to users. This defines
@ -136,36 +475,151 @@ broker_url = $AIRFLOW_BROKER_URL
# Another key Celery setting
result_backend = $AIRFLOW_RESULT_URL
# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
# it ``airflow celery flower``. This defines the IP that Celery Flower runs on
flower_host = 0.0.0.0
# The root URL for Flower
# Example: flower_url_prefix = /flower
# flower_url_prefix =
# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
# it `airflow flower`. This defines the port that Celery Flower runs on
flower_port = $AIRFLOW_FLOWER_PORT
# Securing Flower with Basic Authentication
# Accepts user:password pairs separated by a comma
# Example: flower_basic_auth = user1:password1,user2:password2
# flower_basic_auth =
# How many processes CeleryExecutor uses to sync task state.
# 0 means to use max(1, number of cores - 1) processes.
sync_parallelism = 0
# Import path for celery configuration options
celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG
ssl_active = False
# ssl_key =
# ssl_cert =
# ssl_cacert =
# Celery Pool implementation.
# Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``.
# See:
# https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency
# https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html
pool = prefork
# The number of seconds to wait before timing out ``send_task_to_executor`` or
# ``fetch_celery_task_state`` operations.
operation_timeout = 3.0
# Celery task will report its status as 'started' when the task is executed by a worker.
# This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted
# or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob.
task_track_started = True
# Time in seconds after which Adopted tasks are cleared by CeleryExecutor. This is helpful to clear
# stalled tasks.
task_adoption_timeout = 600
# The Maximum number of retries for publishing task messages to the broker when failing
# due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed.
task_publish_max_retries = 3
# Worker initialisation check to validate Metadata Database connection
worker_precheck = False
# [dask]
# This section only applies if you are using the DaskExecutor in
# [core] section above
# The IP address and port of the Dask cluster's scheduler.
# cluster_address = 127.0.0.1:8786
# TLS/ SSL settings to access a secured Dask scheduler.
# tls_ca =
# tls_cert =
# tls_key =
[celery_broker_transport_options]
# This section is for specifying options which can be passed to the
# underlying celery broker transport. See:
# http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options
# The visibility timeout defines the number of seconds to wait for the worker
# to acknowledge the task before the message is redelivered to another worker.
# Make sure to increase the visibility timeout to match the time of the longest
# ETA you're planning to use.
# visibility_timeout is only supported for Redis and SQS celery brokers.
# See:
# http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options
# Example: visibility_timeout = 21600
# visibility_timeout =
[operators]
# Default queue that tasks get assigned to and that workers listen on.
default_queue = default
# The default owner assigned to each new operator, unless
# provided explicitly or passed via ``default_args``
# default_owner = airflow
# default_cpus = 1
# default_ram = 512
# default_disk = 512
# default_gpus = 0
# Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator.
# If set to False, an exception will be thrown, otherwise only the console message will be displayed.
allow_illegal_arguments = False
[scheduler]
# Task instances listen for external kill signal (when you clear tasks
# from the CLI or the UI), this defines the frequency at which they should
# listen (in seconds).
job_heartbeat_sec = 5
# How often (in seconds) to check and tidy up 'running' TaskInstances
# that no longer have a matching DagRun
clean_tis_without_dagrun_interval = 15.0
# The scheduler constantly tries to trigger new tasks (look at the
# scheduler section in the docs for more information). This defines
# how often the scheduler should run (in seconds).
scheduler_heartbeat_sec = 5
# after how much time (in seconds) the scheduler should terminate
# -1 indicates to run continuously (see also num_runs)
run_duration = -1
# The number of times to try to schedule each DAG file
# -1 indicates unlimited number
num_runs = -1
# after how much time new DAGs should be picked up from the filesystem
min_file_process_interval = 0
# The number of seconds to wait between consecutive DAG file processing
# Deprecated since version 2.2.0: The option has been moved to scheduler.scheduler_idle_sleep_time
processor_poll_interval = 1
dag_dir_list_interval = 300
# Number of seconds after which a DAG file is parsed. The DAG file is parsed every
# ``min_file_process_interval`` number of seconds. Updates to DAGs are reflected after
# this interval. Keeping this number low will increase CPU usage.
min_file_process_interval = 60
# How often should stats be printed to the logs
# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes.
# This is set via env var to 300 in prod, but 30 for local testing
dag_dir_list_interval = 30
# How often should stats be printed to the logs. Setting to 0 will disable printing stats
print_stats_interval = 30
# How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled)
pool_metrics_interval = 20.0
# If the last scheduler heartbeat happened more than scheduler_health_check_threshold
# ago (in seconds), scheduler is considered unhealthy.
# This is used by the health check in the "/health" endpoint
scheduler_health_check_threshold = 30
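The same threshold feeds the webserver's /health endpoint, which reports metadatabase and scheduler status. A minimal sketch of polling it, assuming the webserver is reachable at http://localhost:8080:

# Minimal sketch: check scheduler health via the webserver /health endpoint.
import json
from urllib.request import urlopen

with urlopen("http://localhost:8080/health") as resp:
    health = json.load(resp)

print(health["metadatabase"]["status"], health["scheduler"]["status"])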
# How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs
orphaned_tasks_check_interval = 300.0
child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler
# Local task jobs periodically heartbeat to the DB. If the job has
@ -173,22 +627,185 @@ child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler
# associated task instance as failed and will re-schedule the task.
scheduler_zombie_task_threshold = 300
# Turn off scheduler catchup by setting this to False.
# Default behavior is unchanged and
# Command Line Backfills still work, but the scheduler
# will not do scheduler catchup if this is False,
# however it can be set on a per DAG basis in the
# DAG definition (catchup)
catchup_by_default = True
catchup_by_default = False
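Catchup can still be set per DAG regardless of this default. A minimal sketch, with a hypothetical DAG id:

# Minimal sketch: override catchup_by_default for a single DAG.
from datetime import datetime
from airflow import DAG

with DAG(
    "no_backfill_example",           # hypothetical DAG id
    start_date=datetime(2021, 1, 1),
    schedule_interval="@daily",
    catchup=False,                   # schedule only the latest interval
) as dag:
    pass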
# This changes the batch size of queries in the scheduling main loop.
# If this is too high, SQL query performance may be impacted by one
# or more of the following:
# - reversion to full table scan
# - complexity of query predicate
# - excessive locking
# Additionally, you may hit the maximum allowable query length for your db.
# Set this to 0 for no limit (not advised)
max_tis_per_query = 512
# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
# If this is set to False then you should not run more than a single
# scheduler at once
use_row_level_locking = True
# Max number of DAGs to create DagRuns for per scheduler loop
#
# Default: 10
# max_dagruns_to_create_per_loop =
# How many DagRuns should a scheduler examine (and lock) when scheduling
# and queuing tasks.
#
# Default: 20
# max_dagruns_per_loop_to_schedule =
# Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the
# same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other
# dags in some circumstances
#
# Default: True
# schedule_after_task_execution =
# The scheduler can run multiple processes in parallel to parse dags.
# This defines how many processes will run.
parsing_processes = 2
# One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``.
# The scheduler will list and sort the dag files to decide the parsing order.
#
# * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the
# recently modified DAGs first.
# * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the
# same host. This is useful when running with Scheduler in HA mode where each scheduler can
# parse different DAG files.
# * ``alphabetical``: Sort by filename
file_parsing_sort_mode = modified_time
# Turn off scheduler use of cron intervals by setting this to False.
# DAGs submitted manually in the web UI or with trigger_dag will still run.
use_job_schedule = True
# Allow externally triggered DagRuns for Execution Dates in the future
# Only has effect if schedule_interval is set to None in DAG
allow_trigger_in_future = False
# DAG dependency detector class to use
dependency_detector = airflow.serialization.serialized_objects.DependencyDetector
[metrics]
# Statsd (https://github.com/etsy/statsd) integration settings
# statsd_on = False
# statsd_host = localhost
# statsd_port = 8125
# statsd_prefix = airflow
# To enable datadog integration to send airflow metrics.
statsd_datadog_enabled = False
# List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2)
# statsd_datadog_tags =
# [secrets]
# Full class name of secrets backend to enable (will precede env vars and metastore in search path)
# Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend
# backend =
# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class.
# See documentation for the secrets backend you are using. JSON is expected.
# Example for AWS Systems Manager ParameterStore:
# ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}``
# backend_kwargs =
# [cli]
# In what way should the cli access the API. The LocalClient will use the
# database directly, while the json_client will use the api running on the
# webserver
# api_client = airflow.api.client.local_client
# If you set web_server_url_prefix, do NOT forget to append it here, ex:
# ``endpoint_url = http://localhost:8080/myroot``
# So api will look like: ``http://localhost:8080/myroot/api/experimental/...``
# endpoint_url = http://localhost:8080
[debug]
# Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first
# failed task. Helpful for debugging purposes.
fail_fast = False
[api]
# Enables the deprecated experimental API. Please note that these APIs do not have access control.
# The authenticated user has full access.
#
# .. warning::
#
# This `Experimental REST API <https://airflow.readthedocs.io/en/latest/rest-api-ref.html>`__ is
# deprecated since version 2.0. Please consider using
# `the Stable REST API <https://airflow.readthedocs.io/en/latest/stable-rest-api-ref.html>`__.
# For more information on migration, see
# `UPDATING.md <https://github.com/apache/airflow/blob/master/UPDATING.md>`_
enable_experimental_api = False
# How to authenticate users of the API. See
# https://airflow.apache.org/docs/apache-airflow/stable/security.html for possible values.
# ("airflow.api.auth.backend.default" allows all requests for historic reasons)
auth_backend = airflow.api.auth.backend.deny_all
# Used to set the maximum page limit for API requests
maximum_page_limit = 100
# Used to set the default page limit when limit is zero. A default limit
# of 100 is set on OpenApi spec. However, this particular default limit
# only works when the limit is set equal to zero (0) in API requests.
# If no limit is supplied, the OpenApi spec default is used.
fallback_page_limit = 100
# The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested.
# Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com
# google_oauth2_audience =
# Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on
# `the Application Default Credentials
# <https://cloud.google.com/docs/authentication/production#finding_credentials_automatically>`__ will
# be used.
# Example: google_key_path = /files/service-account-json
# google_key_path =
# Used in response to a preflight request to indicate which HTTP
# headers can be used when making the actual request. This header is
# the server side response to the browser's
# Access-Control-Request-Headers header.
# access_control_allow_headers =
# Specifies the method or methods allowed when accessing the resource.
# access_control_allow_methods =
# Indicates whether the response can be shared with requesting code from the given origin.
# access_control_allow_origin =
# [smart_sensor]
# TODO(hwoo) - Test smart sensors and enable this if the need arises.
# When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to
# smart sensor task.
# use_smart_sensor = False
# `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated
# by `hashcode % shard_code_upper_limit`.
# shard_code_upper_limit = 10000
# The number of running smart sensor processes for each service.
# shards = 5
# comma separated sensor classes support in smart_sensor.
# sensors_enabled = NamedHivePartitionSensor
[mesos]
# Mesos master address which MesosExecutor will connect to.
master = localhost:5050
@ -223,3 +840,35 @@ authenticate = False
# Mesos credentials, if authentication is enabled
# default_principal = admin
# default_secret = admin
# [lineage]
# what lineage backend to use
# backend =
# [atlas]
# sasl_enabled = False
# host =
# port = 21000
# username =
# password =
# [hive]
# Default mapreduce queue for HiveOperator tasks
# default_hive_mapred_queue =
# Template for mapred_job_name in HiveOperator, supports the following named parameters
# hostname, dag_id, task_id, execution_date
# mapred_job_name_template =
# [kerberos]
# ccache = /tmp/airflow_krb5_ccache
# gets augmented with fqdn
# principal = airflow
# reinit_frequency = 3600
# kinit_path = kinit
# keytab = airflow.keytab
# [github_enterprise]
# api_rev = v3

View file

@ -33,13 +33,12 @@ function update_gcp() {
container_id=$(docker ps | grep telemetry-airflow_web | cut -d' ' -f1)
docker exec $container_id \
airflow connections -d --conn_id $conn_id
airflow connections delete $conn_id
docker exec $container_id \
airflow connections -a \
--conn_id $conn_id \
--conn_type google_cloud_platform \
--conn_extra "$(format_gcp $keyfile)"
airflow connections add $conn_id \
--conn-type google_cloud_platform \
--conn-extra "$(format_gcp $keyfile)"
}
update_gcp $connection $keyfile_path

85
bin/run
View file

@ -68,13 +68,12 @@ init_connections() {
export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-dummy_access_key_id}
export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-dummy_secret_access_key}
airflow connections --delete --conn_id databricks_default
airflow connections delete databricks_default
airflow connections --add \
--conn_id databricks_default \
--conn_type databricks \
--conn_host https://dbc-caf9527b-e073.cloud.databricks.com \
--conn_extra "{\"token\":\"${DB_TOKEN}\", \"host\": \"\"}"
airflow connections add databricks_default \
--conn-type databricks \
--conn-host https://dbc-caf9527b-e073.cloud.databricks.com \
--conn-extra "{\"token\":\"${DB_TOKEN}\", \"host\": \"\"}"
gcp_conn=(
"google_cloud_airflow_dataproc"
@ -87,11 +86,11 @@ init_connections() {
"google_cloud_shared_prod"
)
for conn_id in "${gcp_conn[@]}"; do
airflow connections --delete --conn_id "${conn_id}"
airflow connections --add \
--conn_id "${conn_id}" \
--conn_type google_cloud_platform \
--conn_extra "$(gcp_default_extras)"
airflow connections delete "${conn_id}"
airflow connections add "${conn_id}" \
--conn-type google_cloud_platform \
--conn-extra "$(gcp_default_extras)"
done
aws_conn=(
@ -107,46 +106,46 @@ init_connections() {
"aws_socorro_readonly_s3"
)
for conn_id in "${aws_conn[@]}"; do
airflow connections --delete --conn_id "${conn_id}"
airflow connections --add \
--conn_id "${conn_id}" \
--conn_type s3 \
--conn_extra "$(aws_default_extras)"
airflow connections delete "${conn_id}"
airflow connections add "${conn_id}" \
--conn-type s3 \
--conn-extra "$(aws_default_extras)"
done
airflow connections --delete --conn_id "http_netlify_build_webhook"
airflow connections --add \
--conn_id "http_netlify_build_webhook" \
--conn_type http \
--conn_host "https://httpbin.org/"
airflow connections delete "http_netlify_build_webhook"
airflow connections add "http_netlify_build_webhook" \
--conn-type http \
--conn-host "https://httpbin.org/"
}
init_variables() {
airflow variables -s "bugzilla_probe_expiry_bot_api_key" "bugzilla-api-key"
airflow variables -s "app_store_connect_username" "username"
airflow variables -s "app_store_connect_password" "password"
airflow variables -s "surveygizmo_daily_attitudes_survey_id" "12345"
airflow variables -s "surveygizmo_api_token" "tokentokentoken"
airflow variables -s "surveygizmo_api_secret" "tapsekret"
airflow variables -s "jetstream_cluster_ip" "127.0.0.1"
airflow variables -s "jetstream_cluster_cert" "cert"
airflow variables set "bugzilla_probe_expiry_bot_api_key" "bugzilla-api-key"
airflow variables set "app_store_connect_username" "username"
airflow variables set "app_store_connect_password" "password"
airflow variables set "surveygizmo_daily_attitudes_survey_id" "12345"
airflow variables set "surveygizmo_api_token" "tokentokentoken"
airflow variables set "surveygizmo_api_secret" "tapsekret"
airflow variables set "jetstream_cluster_ip" "127.0.0.1"
airflow variables set "jetstream_cluster_cert" "cert"
airflow variables -s "taar_bigtable_instance_id" "taar_bigtable_instance_id"
airflow variables -s "taar_etl_storage_bucket" "taar_etl_storage_bucket"
airflow variables -s "taar_etl_model_storage_bucket" "taar_etl_model_storage_bucket"
airflow variables -s "taar_gcp_project_id" "taar_gcp_project_id"
airflow variables -s "taar_dataflow_subnetwork" "taar_dataflow_subnetwork"
airflow variables -s "taar_dataflow_service_account_email" "taar_dataflow_service_account_email"
airflow variables set "taar_bigtable_instance_id" "taar_bigtable_instance_id"
airflow variables set "taar_etl_storage_bucket" "taar_etl_storage_bucket"
airflow variables set "taar_etl_model_storage_bucket" "taar_etl_model_storage_bucket"
airflow variables set "taar_gcp_project_id" "taar_gcp_project_id"
airflow variables set "taar_dataflow_subnetwork" "taar_dataflow_subnetwork"
airflow variables set "taar_dataflow_service_account_email" "taar_dataflow_service_account_email"
airflow variables -s "looker_repos_secret_git_ssh_key_b64" "looker_repos_secret_git_ssh_key_b64"
airflow variables -s "looker_api_client_id_staging" "looker_api_client_id_staging"
airflow variables -s "looker_api_client_secret_staging" "looker_api_client_secret_staging"
airflow variables -s "looker_api_client_id_prod" "looker_api_client_id_prod"
airflow variables -s "looker_api_client_secret_prod" "looker_api_client_secret_prod"
airflow variables -s "dataops_looker_github_secret_access_token" "dataops_looker_github_secret_access_token"
airflow variables set "looker_repos_secret_git_ssh_key_b64" "looker_repos_secret_git_ssh_key_b64"
airflow variables set "looker_api_client_id_staging" "looker_api_client_id_staging"
airflow variables set "looker_api_client_secret_staging" "looker_api_client_secret_staging"
airflow variables set "looker_api_client_id_prod" "looker_api_client_id_prod"
airflow variables set "looker_api_client_secret_prod" "looker_api_client_secret_prod"
airflow variables set "dataops_looker_github_secret_access_token" "dataops_looker_github_secret_access_token"
airflow variables -s "glean_dictionary_netlify_build_webhook_id" "status/200"
airflow variables -s "lookml_generator_release_str" "v0.0.0"
airflow variables set "glean_dictionary_netlify_build_webhook_id" "status/200"
airflow variables set "lookml_generator_release_str" "v0.0.0"
}
[ $# -lt 1 ] && usage
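The variables seeded above with airflow variables set are read inside DAGs through the Variable model; a minimal sketch reusing one of the keys from this script, with a fallback default:

# Minimal sketch: read a variable seeded by `airflow variables set`.
from airflow.models import Variable

release = Variable.get("lookml_generator_release_str", default_var="v0.0.0")
print(release)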

View file

@ -44,10 +44,10 @@ fi
CONTAINER_ID=$(docker ps | grep _web | cut -d' ' -f1)
echo "Web container id is $CONTAINER_ID. Adding gcp connection..."
docker exec $CONTAINER_ID airflow connections -d --conn_id $GCP_CONN_ID
docker exec $CONTAINER_ID airflow connections delete $GCP_CONN_ID
docker exec $CONTAINER_ID airflow connections -a --conn_id $GCP_CONN_ID \
--conn_type google_cloud_platform \
--conn_extra "$JSON_CREDS"
docker exec $CONTAINER_ID airflow connections add $GCP_CONN_ID \
--conn-type google_cloud_platform \
--conn-extra "$JSON_CREDS"
echo "visit https://go.corp.mozilla.com/wtmodev for more info"

View file

@ -44,7 +44,7 @@ function get_errors_in_listing {
# Parse the logs for ERROR messages, these typically correspond to python
# exceptions in the DAG. In general, there should NOT be any errors when
# running the local environment.
docker-compose exec web airflow dags list | grep "ERROR"
docker-compose exec web airflow dags list -v | grep "ERROR"
}
@ -77,7 +77,7 @@ function main() {
if [[ $num_errors -ne 0 && $TESTING -eq 0 ]]; then
# Print full error output
docker-compose exec web airflow list_dags
docker-compose exec web airflow dags list -v
echo "Failure!"
exit 1
elif [[ $TESTING -eq 1 ]]; then

View file

@ -1,3 +1,8 @@
import gevent
from gevent import monkey, pool
monkey.patch_all()
STATE_COLORS = {
"queued": 'gray',
"running": 'lime',

0
dags/.airflowignore Normal file
View file

View file

@ -3,7 +3,7 @@ import os
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.sensors import ExternalTaskSensor
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.operators.subdag_operator import SubDagOperator
from utils.dataproc import (
moz_dataproc_pyspark_runner,

View file

@ -1,7 +1,7 @@
import datetime
from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from operators.task_sensor import ExternalTaskCompletedSensor
from airflow.operators.subdag_operator import SubDagOperator
@ -29,7 +29,7 @@ with DAG(
) as dag:
# Jobs read from/write to s3://telemetry-public-analysis-2/bhr/data/hang_aggregates/
write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw'
aws_access_key, aws_secret_key, _ = AwsHook(write_aws_conn_id).get_credentials()
aws_access_key, aws_secret_key, _ = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
wait_for_bhr_ping = ExternalTaskCompletedSensor(
task_id="wait_for_bhr_ping",

View file

@ -10,13 +10,12 @@ import uuid
import time
from airflow import DAG
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.operators import PythonOperator
from airflow.operators.python import PythonOperator
from operators.bq_sensor import BigQuerySQLSensorOperator
from operators.gcp_container_operator import GKEPodOperator
DOCS = """\
# burnham 👩‍🚀📈🤖
# burnham
The burnham project is an end-to-end test suite that aims to automatically
verify that Glean-based products correctly measure, collect, and submit
@ -359,6 +358,7 @@ WHERE
# GCP and GKE default values
DEFAULT_GCP_CONN_ID = "google_cloud_derived_datasets"
DEFAULT_GCP_PROJECT_ID = "moz-fx-data-derived-datasets"
DEFAULT_GKE_LOCATION = "us-central1-a"
DEFAULT_GKE_CLUSTER_NAME = "bq-load-gke-1"
DEFAULT_GKE_NAMESPACE = "default"
@ -420,7 +420,7 @@ def burnham_run(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=DEFAULT_GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,
@ -446,7 +446,7 @@ def burnham_sensor(task_id, sql, gcp_conn_id=DEFAULT_GCP_CONN_ID, **kwargs):
return BigQuerySQLSensorOperator(
task_id=task_id,
sql=sql,
bigquery_conn_id=gcp_conn_id,
gcp_conn_id=gcp_conn_id,
use_legacy_sql=False,
**kwargs,
)
@ -483,7 +483,7 @@ def burnham_bigquery_run(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=DEFAULT_GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

View file

@ -2,8 +2,6 @@ from airflow import DAG
from datetime import timedelta, datetime
from operators.gcp_container_operator import GKEPodOperator
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
docs = """
### Clean GKE Pods

View file

@ -1,7 +1,7 @@
import datetime
from airflow import models
from airflow.operators.sensors import ExternalTaskSensor
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.operators.subdag_operator import SubDagOperator
from utils.gcp import (
bigquery_etl_copy_deduplicate,
@ -10,7 +10,6 @@ from utils.gcp import (
bigquery_xcom_query,
)
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from utils.gcp import gke_command
DOCS = """\

View file

@ -1,7 +1,7 @@
import datetime
from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from operators.task_sensor import ExternalTaskCompletedSensor
from airflow.operators.subdag_operator import SubDagOperator
@ -35,13 +35,15 @@ with DAG(
) as dag:
# top_signatures_correlations uploads results to public analysis bucket
write_aws_conn_id = "aws_dev_telemetry_public_analysis_2_rw"
analysis_access_key, analysis_secret_key, _ = AwsHook(
write_aws_conn_id
analysis_access_key, analysis_secret_key, _ = AwsBaseHook(
aws_conn_id=write_aws_conn_id,
client_type='s3'
).get_credentials()
# modules_with_missing_symbols sends results as email
ses_aws_conn_id = "aws_data_iam_ses"
ses_access_key, ses_secret_key, _ = AwsHook(ses_aws_conn_id).get_credentials()
ses_access_key, ses_secret_key, _ = AwsBaseHook(
aws_conn_id=ses_aws_conn_id, client_type='s3').get_credentials()
wait_for_socorro_import = ExternalTaskCompletedSensor(
task_id="wait_for_socorro_import",

View file

@ -3,8 +3,7 @@ from datetime import datetime, timedelta
from utils.gcp import bigquery_etl_query, gke_command
from airflow.operators.sensors import ExternalTaskSensor
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.sensors.external_task import ExternalTaskSensor
from operators.gcp_container_operator import GKEPodOperator
default_args = {
@ -18,6 +17,8 @@ default_args = {
# We rely on max_active_runs=1 at the DAG level to manage the dependency on past runs.
with DAG('experiments_live',
default_args=default_args,
# Will be renamed to max_active_tasks in a later release, as the upstream main branch indicates
# max_active_tasks=4,
concurrency=4,
max_active_runs=1,
schedule_interval="*/5 * * * *") as dag:

View file

@ -1,5 +1,5 @@
from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from operators.task_sensor import ExternalTaskCompletedSensor
from airflow.operators.subdag_operator import SubDagOperator
from datetime import datetime, timedelta
@ -35,7 +35,7 @@ dag = DAG("firefox_public_data_report", default_args=default_args, schedule_inte
# Required to write json output to s3://telemetry-public-analysis-2/public-data-report/hardware/
write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw'
aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials()
aws_access_key, aws_secret_key, session = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
# hardware_report's execution date will be {now}-7days. It will read last week's main pings,
# therefore we need to wait for yesterday's Main Ping deduplication task to finish

View file

@ -1,8 +1,6 @@
from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.executors import get_default_executor
from operators.task_sensor import ExternalTaskCompletedSensor
from airflow.operators.subdag_operator import SubDagOperator
@ -38,12 +36,9 @@ PERCENT_RELEASE_WINDOWS_SAMPLING = "10"
dag = DAG(GLAM_DAG, default_args=default_args, schedule_interval="0 2 * * *")
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
# Make sure all the data for the given day has arrived before running.
wait_for_main_ping = ExternalTaskCompletedSensor(
task_id="wait_for_main_ping",
project_id=project_id,
external_dag_id="copy_deduplicate",
external_task_id="copy_deduplicate_main_ping",
execution_delta=timedelta(hours=1),
@ -181,7 +176,6 @@ clients_histogram_aggregates = SubDagOperator(
dataset_id,
),
task_id=GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
executor=get_default_executor(),
dag=dag,
)
@ -236,6 +230,10 @@ client_scalar_probe_counts = gke_command(
# SubdagOperator uses a SequentialExecutor by default
# so its tasks will run sequentially.
# Note: In 2.0, SubDagOperator was changed to use the airflow scheduler instead of
# backfill to schedule tasks in the subdag. Users no longer need to specify
# the executor in SubDagOperator. (We don't, but the assumption that a
# SequentialExecutor is used is now wrong.)
clients_histogram_bucket_counts = SubDagOperator(
subdag=repeated_subdag(
GLAM_DAG,
@ -273,7 +271,6 @@ extract_counts = SubDagOperator(
"counts"
),
task_id="extract_user_counts",
executor=get_default_executor(),
dag=dag
)
@ -288,7 +285,6 @@ extract_sample_counts = SubDagOperator(
"sample-counts"
),
task_id="extract_sample_counts",
executor=get_default_executor(),
dag=dag
)
@ -301,7 +297,6 @@ extracts_per_channel = SubDagOperator(
dataset_id
),
task_id="extracts",
executor=get_default_executor(),
dag=dag,
)

View file

@ -1,15 +1,11 @@
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.gcs_delete_operator import (
GoogleCloudStorageDeleteOperator,
)
from airflow.executors import get_default_executor
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.models import DAG
from utils.gcp import bigquery_etl_query
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
gcp_conn_id = "google_cloud_airflow_dataproc"
project_id = "moz-fx-data-shared-prod"
glam_bucket = "moz-fx-data-glam-prod-fca7-etl-data"
@ -33,7 +29,6 @@ def extracts_subdag(
channel,
),
task_id="extract_{}".format(channel),
executor=get_default_executor(),
dag=dag,
)
@ -75,24 +70,24 @@ def extract_channel_subdag(
dag=dag,
)
gcs_delete = GoogleCloudStorageDeleteOperator(
gcs_delete = GCSDeleteObjectsOperator(
task_id="glam_gcs_delete_old_{}_extracts".format(channel),
bucket_name=glam_bucket,
prefix="aggs-desktop-{}".format(channel),
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
dag=dag,
)
gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format(
bucket=glam_bucket, channel=channel
)
bq2gcs = BigQueryToCloudStorageOperator(
bq2gcs = BigQueryToGCSOperator(
task_id="glam_extract_{}_to_csv".format(channel),
source_project_dataset_table="{}.{}.{}".format(
project_id, dataset_id, bq_extract_table
),
destination_cloud_storage_uris=gcs_destination,
bigquery_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
export_format="CSV",
print_header=False,
dag=dag,
@ -135,11 +130,13 @@ def extract_user_counts(
dag=dag,
)
gcs_delete = GoogleCloudStorageDeleteOperator(
gcs_delete = GCSDeleteObjectsOperator(
task_id="glam_gcs_delete_{}_extracts".format(task_prefix),
bucket_name=glam_bucket,
prefix="glam-extract-firefox-{}".format(file_prefix),
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
dag=dag,
)
@ -151,13 +148,14 @@ def extract_user_counts(
gcs_destination = "gs://{}/glam-extract-firefox-{}.csv".format(
glam_bucket, file_prefix
)
bq2gcs = BigQueryToCloudStorageOperator(
bq2gcs = BigQueryToGCSOperator(
task_id="glam_extract_{}_to_csv".format(task_prefix),
source_project_dataset_table="{}.{}.{}".format(
project_id, dataset_id, bq_extract_table
),
destination_cloud_storage_uris=gcs_destination,
bigquery_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
export_format="CSV",
print_header=False,
dag=dag,

View file

@ -1,6 +1,5 @@
from airflow.models import DAG
from airflow.operators.subdag_operator import SubDagOperator
from airflow.executors import get_default_executor
from glam_subdags.general import repeated_subdag
from utils.gcp import bigquery_etl_query
@ -42,7 +41,6 @@ def histogram_aggregates_subdag(
dataset_id,
),
task_id=GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
executor=get_default_executor(),
dag=dag,
)

View file

@ -1,7 +1,8 @@
import datetime
import os
from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from operators.task_sensor import ExternalTaskCompletedSensor
from airflow.operators.subdag_operator import SubDagOperator
@ -39,7 +40,11 @@ with DAG(
) as dag:
# Jobs read from/write to s3://telemetry-public-analysis-2/gfx/telemetry-data/
write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw'
aws_access_key, aws_secret_key, _ = AwsHook(write_aws_conn_id).get_credentials()
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
if is_dev:
aws_access_key, aws_secret_key = ('replace_me', 'replace_me')
else:
aws_access_key, aws_secret_key, _ = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
wait_for_main_ping = ExternalTaskCompletedSensor(
task_id="wait_for_main_ping",

View file

@ -1,88 +0,0 @@
from airflow import DAG
from datetime import datetime, timedelta
from utils.gcp import bigquery_etl_query
from operators.task_sensor import ExternalTaskCompletedSensor
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from operators.gcp_container_operator import GKEPodOperator
default_args = {
'owner': 'frank@mozilla.com',
'depends_on_past': False,
'start_date': datetime(2020, 1, 1),
'email_on_failure': True,
'email_on_retry': True,
'retries': 2,
'retry_delay': timedelta(minutes=30),
}
with DAG('incline_dashboard',
default_args=default_args,
schedule_interval="0 4 * * *") as dag:
wait_for_baseline_clients_last_seen = ExternalTaskCompletedSensor(
task_id="wait_for_baseline_clients_last_seen",
external_dag_id="copy_deduplicate",
external_task_id="baseline_clients_last_seen",
execution_delta=timedelta(hours=3),
mode="reschedule",
pool="DATA_ENG_EXTERNALTASKSENSOR",
email_on_retry=False,
)
wait_for_core_clients_last_seen = ExternalTaskCompletedSensor(
task_id="wait_for_core_clients_last_seen",
external_dag_id="bqetl_core",
external_task_id="telemetry_derived__core_clients_last_seen__v1",
execution_delta=timedelta(hours=2),
mode="reschedule",
pool="DATA_ENG_EXTERNALTASKSENSOR",
email_on_retry=False,
)
project = "moz-fx-data-shared-prod"
dataset = "org_mozilla_firefox_derived"
migrated_clients = bigquery_etl_query(
task_id="generate_migrated_clients",
project_id=project,
dataset_id=dataset,
# We recreate this entire table from scratch every day because we are
# taking the last seen migration ping over all time for each client.
destination_table=None,
date_partition_parameter=None,
sql_file_path="sql/moz-fx-data-shared-prod/org_mozilla_firefox_derived/migrated_clients_v1/init.sql",
owner="frank@mozilla.com",
email=["telemetry-alerts@mozilla.com", "frank@mozilla.com"]
)
exec_dash = bigquery_etl_query(
task_id="generate_incline_exec_dash",
destination_table="incline_executive_v1",
project_id=project,
dataset_id=dataset,
owner="frank@mozilla.com",
email=["telemetry-alerts@mozilla.com", "frank@mozilla.com"],
)
gcp_conn_id = 'google_cloud_derived_datasets'
export_incline_dash = GKEPodOperator(
task_id="export_incline_dash",
name="export-incline-dash",
arguments=["script/export_incline_dash", "{{ ds }}"],
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
location="us-central1-a",
cluster_name="bq-load-gke-1",
namespace="default",
image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
)
(
[wait_for_baseline_clients_last_seen, wait_for_core_clients_last_seen] >>
migrated_clients >>
exec_dash >>
export_incline_dash
)

View file

@ -5,7 +5,10 @@ from airflow import DAG
from operators.task_sensor import ExternalTaskCompletedSensor
from airflow.operators.subdag_operator import SubDagOperator
from datetime import datetime, timedelta
from operators.backport.bigquery_operator_1_10_2 import BigQueryOperator
from airflow.providers.google.cloud.operators.bigquery import (
BigQueryExecuteQueryOperator
)
from six.moves.urllib.request import urlopen
from utils.dataproc import (
moz_dataproc_pyspark_runner,
@ -109,8 +112,7 @@ response = urlopen('/'.join([
'https://raw.githubusercontent.com/mozilla/bigquery-etl/main/sql',
'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_v1', 'query.sql']))
BigQueryOperator.template_fields += ('query_params',)
ltv_revenue_join=BigQueryOperator(
ltv_revenue_join=BigQueryExecuteQueryOperator(
task_id='ltv_revenue_join',
sql=response.read().decode('utf-8'),
query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}],
@ -129,7 +131,7 @@ response = urlopen('/'.join([
'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_normalized', 'query.sql']))
# Normalized LTV View is for general-use and doesn't contain any revenue data
ltv_normalized_view=BigQueryOperator(
ltv_normalized_view=BigQueryExecuteQueryOperator(
task_id='ltv_normalized_view',
sql=response.read().decode('utf-8'),
query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}],
@ -147,7 +149,7 @@ response = urlopen('/'.join([
'https://raw.githubusercontent.com/mozilla/bigquery-etl/main/sql',
'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_normalized_v1', 'query.sql']))
client_ltv_normalized_v1=BigQueryOperator(
client_ltv_normalized_v1=BigQueryExecuteQueryOperator(
task_id='client_ltv_normalized_v1',
sql=response.read().decode('utf-8'),
query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}],

View file

@ -1,7 +1,8 @@
import os
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from utils.gcp import gke_command
@ -17,17 +18,22 @@ default_args = {
}
with DAG("mad_server", default_args=default_args, schedule_interval="@weekly") as dag:
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
aws_conn_id="aws_dev_mad_resources_training"
# mad-server expects AWS creds in some custom env vars.
s3_env_vars = {
key: value
for key, value in zip(
("S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_SESSION_TOKEN"),
AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (),
)
if value is not None}
if is_dev:
aws_conn_id = None
s3_env_vars = {}
else:
aws_conn_id="aws_dev_mad_resources_training"
s3_env_vars = {
key: value
for key, value in zip(
("S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_SESSION_TOKEN"),
AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials() if aws_conn_id else (),
)
if value is not None
}
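An illustrative sketch (not part of the DAG): the provider hook's `get_credentials()` returns a read-only `(access_key, secret_key, token)` namedtuple, which is why the positional zip above lines up with the three `S3_*` variable names. The attribute names below are the standard botocore ones; the DAG itself keeps its None-filtering comprehension.

```python
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

creds = AwsBaseHook(aws_conn_id="aws_dev_mad_resources_training", client_type="s3").get_credentials()
s3_env = {
    "S3_ACCESS_KEY_ID": creds.access_key,
    "S3_SECRET_ACCESS_KEY": creds.secret_key,
    "S3_SESSION_TOKEN": creds.token,  # may be None outside STS sessions, hence the None filter above
}
```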
mad_server_pull = gke_command(
task_id="mad_server_pull",

View file

@@ -3,8 +3,7 @@ import os
from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
from airflow.operators.subdag_operator import SubDagOperator
@@ -34,13 +33,13 @@ subdag_args = default_args.copy()
subdag_args["retries"] = 0
task_id = "mobile_aggregate_view_dataproc"
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
project_id = keyfile["project_id"]
gcp_conn_id = "google_cloud_airflow_dataproc"
project_id = "airflow-dataproc"
dev_test_service_account = "replace_me"
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
client_email = (
keyfile["client_email"]
dev_test_service_account
if is_dev
else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com"
)
@@ -100,7 +99,7 @@ mobile_aggregate_view_dataproc = SubDagOperator(
"gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/mobile/moz-fx-data-shared-prod",
]
),
gcp_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
service_account=client_email,
artifact_bucket=artifact_bucket,
storage_bucket=storage_bucket,
@@ -126,11 +125,11 @@ if EXPORT_TO_AVRO:
dag=dag,
).set_downstream(mobile_aggregate_view_dataproc)
GoogleCloudStorageDeleteOperator(
GCSDeleteObjectsOperator(
task_id="delete_mobile_metrics_avro",
bucket_name="moz-fx-data-derived-datasets-parquet-tmp",
prefix="avro/mozaggregator/mobile/moz-fx-data-shared-prod/{{ ds_nodash }}/mobile_metrics_v1",
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
dag=dag
).set_upstream(mobile_aggregate_view_dataproc)

View file

@@ -3,10 +3,7 @@ import os
from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.operators.gcs_delete_operator import (
GoogleCloudStorageDeleteOperator,
)
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
from airflow.operators.subdag_operator import SubDagOperator
from utils.dataproc import moz_dataproc_pyspark_runner, copy_artifacts_dev
from utils.gcp import gke_command
@@ -39,13 +36,13 @@ subdag_args = default_args.copy()
subdag_args["retries"] = 0
task_id = "prerelease_telemetry_aggregate_view_dataproc"
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
project_id = keyfile["project_id"]
gcp_conn_id = "google_cloud_airflow_dataproc"
project_id = "airflow-dataproc"
dev_test_service_account = "replace_me"
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
client_email = (
keyfile["client_email"]
dev_test_service_account
if is_dev
else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com"
)
@@ -114,7 +111,7 @@ prerelease_telemetry_aggregate_view_dataproc = SubDagOperator(
"gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
]
),
gcp_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
service_account=client_email,
artifact_bucket=artifact_bucket,
storage_bucket=storage_bucket,
@@ -207,11 +204,11 @@ if EXPORT_TO_AVRO:
).set_downstream(prerelease_telemetry_aggregate_view_dataproc)
# Delete the GCS data
GoogleCloudStorageDeleteOperator(
GCSDeleteObjectsOperator(
task_id="delete_main_avro",
bucket_name="moz-fx-data-derived-datasets-parquet-tmp",
prefix="avro/mozaggregator/prerelease/moz-fx-data-shared-prod/{{ ds_nodash }}/main_v4",
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
gcp_conn_id=gcp_conn_id,
dag=dag,
).set_upstream(prerelease_telemetry_aggregate_view_dataproc)

View file

@@ -1,30 +0,0 @@
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.dummy_operator import DummyOperator
default_args = {
"owner": "frank@mozilla.com",
"depends_on_past": True,
"start_date": datetime(2018, 12, 17),
"email": ["telemetry-alerts@mozilla.com", "frank@mozilla.com"],
"email_on_failure": True,
"email_on_retry": True,
"retries": 3,
"retry_delay": timedelta(minutes=30),
}
dag = DAG(
"release_telemetry_aggregates",
default_args=default_args,
schedule_interval="@daily",
)
# See mozaggregator_prerelease and mozaggregator_mobile for functional
# implementations using dataproc operator. This is not implemented due to the
# migration to GCP and https://bugzilla.mozilla.org/show_bug.cgi?id=1517018
release_telemetry_aggregate_view = DummyOperator(
task_id="release_telemetry_aggregate_view",
job_name="Release Telemetry Aggregate View",
dag=dag,
)

View file

@@ -7,7 +7,11 @@ the upstream GkePodOperator works fine.
### As of 1.10.12 I've removed the backported 1.10.7 gcp_container_operator,
kubernetes_pod_operator, and the 1.10.2 kube_client
### Fivetran operator backported from 2.0+
Fivetran provides an [operator, sensor and hook](https://github.com/fivetran/airflow-provider-fivetran)
for integrating with the Fivetran API for Airflow version 2.0+. Backported to
make it usable in Airflow 1.10.15.
### For 2.1.0 I've removed bigquery_operator_1_10_2.py, in favor of the new
google provider code.
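A minimal usage sketch of the provider operator that replaces the backported one (the DAG name, SQL and connection id below are placeholders for illustration, not values from this repo):

```python
from datetime import datetime
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryExecuteQueryOperator

with DAG("bigquery_provider_example", start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    example_query = BigQueryExecuteQueryOperator(
        task_id="example_query",
        sql="SELECT 1",  # a str, a list of str, or a path to a templated .sql file
        gcp_conn_id="google_cloud_derived_datasets",
        use_legacy_sql=False,
    )
```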

View file

@@ -1,612 +0,0 @@
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import json
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook, _parse_gcs_url
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
class BigQueryOperator(BaseOperator):
"""
Executes BigQuery SQL queries in a specific BigQuery database
:param bql: (Deprecated. Use `sql` parameter instead) the sql code to be
executed (templated)
:type bql: Can receive a str representing a sql statement,
a list of str (sql statements), or reference to a template file.
Template reference are recognized by str ending in '.sql'.
:param sql: the sql code to be executed (templated)
:type sql: Can receive a str representing a sql statement,
a list of str (sql statements), or reference to a template file.
Template reference are recognized by str ending in '.sql'.
:param destination_dataset_table: A dotted
(<project>.|<project>:)<dataset>.<table> that, if set, will store the results
of the query. (templated)
:type destination_dataset_table: string
:param write_disposition: Specifies the action that occurs if the destination table
already exists. (default: 'WRITE_EMPTY')
:type write_disposition: string
:param create_disposition: Specifies whether the job is allowed to create new tables.
(default: 'CREATE_IF_NEEDED')
:type create_disposition: string
:param allow_large_results: Whether to allow large results.
:type allow_large_results: boolean
:param flatten_results: If true and query uses legacy SQL dialect, flattens
all nested and repeated fields in the query results. ``allow_large_results``
must be ``true`` if this is set to ``false``. For standard SQL queries, this
flag is ignored and results are never flattened.
:type flatten_results: boolean
:param bigquery_conn_id: reference to a specific BigQuery hook.
:type bigquery_conn_id: string
:param delegate_to: The account to impersonate, if any.
For this to work, the service account making the request must have domain-wide
delegation enabled.
:type delegate_to: string
:param udf_config: The User Defined Function configuration for the query.
See https://cloud.google.com/bigquery/user-defined-functions for details.
:type udf_config: list
:param use_legacy_sql: Whether to use legacy SQL (true) or standard SQL (false).
:type use_legacy_sql: boolean
:param maximum_billing_tier: Positive integer that serves as a multiplier
of the basic price.
Defaults to None, in which case it uses the value set in the project.
:type maximum_billing_tier: integer
:param maximum_bytes_billed: Limits the bytes billed for this job.
Queries that will have bytes billed beyond this limit will fail
(without incurring a charge). If unspecified, this will be
set to your project default.
:type maximum_bytes_billed: float
:param api_resource_configs: a dictionary that contain params
'configuration' applied for Google BigQuery Jobs API:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs
for example, {'query': {'useQueryCache': False}}. You could use it
if you need to provide some params that are not supported by BigQueryOperator
like args.
:type api_resource_configs: dict
:param schema_update_options: Allows the schema of the destination
table to be updated as a side effect of the load job.
:type schema_update_options: tuple
:param query_params: a dictionary containing query parameter types and
values, passed to BigQuery.
:type query_params: dict
:param labels: a dictionary containing labels for the job/query,
passed to BigQuery
:type labels: dict
:param priority: Specifies a priority for the query.
Possible values include INTERACTIVE and BATCH.
The default value is INTERACTIVE.
:type priority: string
:param time_partitioning: configure optional time partitioning fields i.e.
partition by field, type and expiration as per API specifications.
:type time_partitioning: dict
:param cluster_fields: Request that the result of this query be stored sorted
by one or more columns. This is only available in conjunction with
time_partitioning. The order of columns given determines the sort order.
:type cluster_fields: list of str
:param location: The geographic location of the job. Required except for
US and EU. See details at
https://cloud.google.com/bigquery/docs/locations#specifying_your_location
:type location: str
"""
template_fields = ('bql', 'sql', 'destination_dataset_table', 'labels')
template_ext = ('.sql', )
ui_color = '#e4f0e8'
@apply_defaults
def __init__(self,
bql=None,
sql=None,
destination_dataset_table=None,
write_disposition='WRITE_EMPTY',
allow_large_results=False,
flatten_results=None,
bigquery_conn_id='bigquery_default',
delegate_to=None,
udf_config=None,
use_legacy_sql=True,
maximum_billing_tier=None,
maximum_bytes_billed=None,
create_disposition='CREATE_IF_NEEDED',
schema_update_options=(),
query_params=None,
labels=None,
priority='INTERACTIVE',
time_partitioning=None,
api_resource_configs=None,
cluster_fields=None,
location=None,
*args,
**kwargs):
super(BigQueryOperator, self).__init__(*args, **kwargs)
self.bql = bql
self.sql = sql if sql else bql
self.destination_dataset_table = destination_dataset_table
self.write_disposition = write_disposition
self.create_disposition = create_disposition
self.allow_large_results = allow_large_results
self.flatten_results = flatten_results
self.bigquery_conn_id = bigquery_conn_id
self.delegate_to = delegate_to
self.udf_config = udf_config
self.use_legacy_sql = use_legacy_sql
self.maximum_billing_tier = maximum_billing_tier
self.maximum_bytes_billed = maximum_bytes_billed
self.schema_update_options = schema_update_options
self.query_params = query_params
self.labels = labels
self.bq_cursor = None
self.priority = priority
self.time_partitioning = time_partitioning
self.api_resource_configs = api_resource_configs
self.cluster_fields = cluster_fields
self.location = location
# TODO remove `bql` in Airflow 2.0
if self.bql:
import warnings
warnings.warn('Deprecated parameter `bql` used in Task id: {}. '
'Use `sql` parameter instead to pass the sql to be '
'executed. `bql` parameter is deprecated and '
'will be removed in a future version of '
'Airflow.'.format(self.task_id),
category=DeprecationWarning)
if self.sql is None:
raise TypeError('{} missing 1 required positional '
'argument: `sql`'.format(self.task_id))
def execute(self, context):
if self.bq_cursor is None:
self.log.info('Executing: %s', self.sql)
hook = BigQueryHook(
bigquery_conn_id=self.bigquery_conn_id,
use_legacy_sql=self.use_legacy_sql,
delegate_to=self.delegate_to,
location=self.location,
)
conn = hook.get_conn()
self.bq_cursor = conn.cursor()
self.bq_cursor.run_query(
sql=self.sql,
destination_dataset_table=self.destination_dataset_table,
write_disposition=self.write_disposition,
allow_large_results=self.allow_large_results,
flatten_results=self.flatten_results,
udf_config=self.udf_config,
maximum_billing_tier=self.maximum_billing_tier,
maximum_bytes_billed=self.maximum_bytes_billed,
create_disposition=self.create_disposition,
query_params=self.query_params,
labels=self.labels,
schema_update_options=self.schema_update_options,
priority=self.priority,
time_partitioning=self.time_partitioning,
api_resource_configs=self.api_resource_configs,
cluster_fields=self.cluster_fields,
)
def on_kill(self):
super(BigQueryOperator, self).on_kill()
if self.bq_cursor is not None:
self.log.info('Cancelling running query')
self.bq_cursor.cancel_query()
class BigQueryCreateEmptyTableOperator(BaseOperator):
"""
Creates a new, empty table in the specified BigQuery dataset,
optionally with schema.
The schema to be used for the BigQuery table may be specified in one of
two ways. You may either directly pass the schema fields in, or you may
point the operator to a Google cloud storage object name. The object in
Google cloud storage must be a JSON file with the schema fields in it.
You can also create a table without schema.
:param project_id: The project to create the table into. (templated)
:type project_id: string
:param dataset_id: The dataset to create the table into. (templated)
:type dataset_id: string
:param table_id: The Name of the table to be created. (templated)
:type table_id: string
:param schema_fields: If set, the schema field list as defined here:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
**Example**: ::
schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
{"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}]
:type schema_fields: list
:param gcs_schema_object: Full path to the JSON file containing
schema (templated). For
example: ``gs://test-bucket/dir1/dir2/employee_schema.json``
:type gcs_schema_object: string
:param time_partitioning: configure optional time partitioning fields i.e.
partition by field, type and expiration as per API specifications.
.. seealso::
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning
:type time_partitioning: dict
:param bigquery_conn_id: Reference to a specific BigQuery hook.
:type bigquery_conn_id: string
:param google_cloud_storage_conn_id: Reference to a specific Google
cloud storage hook.
:type google_cloud_storage_conn_id: string
:param delegate_to: The account to impersonate, if any. For this to
work, the service account making the request must have domain-wide
delegation enabled.
:type delegate_to: string
:param labels: a dictionary containing labels for the table, passed to BigQuery
**Example (with schema JSON in GCS)**: ::
CreateTable = BigQueryCreateEmptyTableOperator(
task_id='BigQueryCreateEmptyTableOperator_task',
dataset_id='ODS',
table_id='Employees',
project_id='internal-gcp-project',
gcs_schema_object='gs://schema-bucket/employee_schema.json',
bigquery_conn_id='airflow-service-account',
google_cloud_storage_conn_id='airflow-service-account'
)
**Corresponding Schema file** (``employee_schema.json``): ::
[
{
"mode": "NULLABLE",
"name": "emp_name",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "salary",
"type": "INTEGER"
}
]
**Example (with schema in the DAG)**: ::
CreateTable = BigQueryCreateEmptyTableOperator(
task_id='BigQueryCreateEmptyTableOperator_task',
dataset_id='ODS',
table_id='Employees',
project_id='internal-gcp-project',
schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
{"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}],
bigquery_conn_id='airflow-service-account',
google_cloud_storage_conn_id='airflow-service-account'
)
:type labels: dict
"""
template_fields = ('dataset_id', 'table_id', 'project_id',
'gcs_schema_object', 'labels')
ui_color = '#f0eee4'
@apply_defaults
def __init__(self,
dataset_id,
table_id,
project_id=None,
schema_fields=None,
gcs_schema_object=None,
time_partitioning=None,
bigquery_conn_id='bigquery_default',
google_cloud_storage_conn_id='google_cloud_default',
delegate_to=None,
labels=None,
*args, **kwargs):
super(BigQueryCreateEmptyTableOperator, self).__init__(*args, **kwargs)
self.project_id = project_id
self.dataset_id = dataset_id
self.table_id = table_id
self.schema_fields = schema_fields
self.gcs_schema_object = gcs_schema_object
self.bigquery_conn_id = bigquery_conn_id
self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
self.delegate_to = delegate_to
self.time_partitioning = {} if time_partitioning is None else time_partitioning
self.labels = labels
def execute(self, context):
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
delegate_to=self.delegate_to)
if not self.schema_fields and self.gcs_schema_object:
gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
gcs_hook = GoogleCloudStorageHook(
google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
delegate_to=self.delegate_to)
schema_fields = json.loads(gcs_hook.download(
gcs_bucket,
gcs_object).decode("utf-8"))
else:
schema_fields = self.schema_fields
conn = bq_hook.get_conn()
cursor = conn.cursor()
cursor.create_empty_table(
project_id=self.project_id,
dataset_id=self.dataset_id,
table_id=self.table_id,
schema_fields=schema_fields,
time_partitioning=self.time_partitioning,
labels=self.labels
)
class BigQueryCreateExternalTableOperator(BaseOperator):
"""
Creates a new external table in the dataset with the data in Google Cloud
Storage.
The schema to be used for the BigQuery table may be specified in one of
two ways. You may either directly pass the schema fields in, or you may
point the operator to a Google cloud storage object name. The object in
Google cloud storage must be a JSON file with the schema fields in it.
:param bucket: The bucket to point the external table to. (templated)
:type bucket: string
:param source_objects: List of Google cloud storage URIs to point
table to. (templated)
If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI.
:type source_objects: list
:param destination_project_dataset_table: The dotted (<project>.)<dataset>.<table>
BigQuery table to load data into (templated). If <project> is not included,
project will be the project defined in the connection json.
:type destination_project_dataset_table: string
:param schema_fields: If set, the schema field list as defined here:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
**Example**: ::
schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
{"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}]
Should not be set when source_format is 'DATASTORE_BACKUP'.
:type schema_fields: list
:param schema_object: If set, a GCS object path pointing to a .json file that
contains the schema for the table. (templated)
:type schema_object: string
:param source_format: File format of the data.
:type source_format: string
:param compression: [Optional] The compression type of the data source.
Possible values include GZIP and NONE.
The default value is NONE.
This setting is ignored for Google Cloud Bigtable,
Google Cloud Datastore backups and Avro formats.
:type compression: string
:param skip_leading_rows: Number of rows to skip when loading from a CSV.
:type skip_leading_rows: int
:param field_delimiter: The delimiter to use for the CSV.
:type field_delimiter: string
:param max_bad_records: The maximum number of bad records that BigQuery can
ignore when running the job.
:type max_bad_records: int
:param quote_character: The value that is used to quote data sections in a CSV file.
:type quote_character: string
:param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false).
:type allow_quoted_newlines: boolean
:param allow_jagged_rows: Accept rows that are missing trailing optional columns.
The missing values are treated as nulls. If false, records with missing trailing
columns are treated as bad records, and if there are too many bad records, an
invalid error is returned in the job result. Only applicable to CSV, ignored
for other formats.
:type allow_jagged_rows: bool
:param bigquery_conn_id: Reference to a specific BigQuery hook.
:type bigquery_conn_id: string
:param google_cloud_storage_conn_id: Reference to a specific Google
cloud storage hook.
:type google_cloud_storage_conn_id: string
:param delegate_to: The account to impersonate, if any. For this to
work, the service account making the request must have domain-wide
delegation enabled.
:type delegate_to: string
:param src_fmt_configs: configure optional fields specific to the source format
:type src_fmt_configs: dict
:param labels a dictionary containing labels for the table, passed to BigQuery
:type labels: dict
"""
template_fields = ('bucket', 'source_objects',
'schema_object', 'destination_project_dataset_table', 'labels')
ui_color = '#f0eee4'
@apply_defaults
def __init__(self,
bucket,
source_objects,
destination_project_dataset_table,
schema_fields=None,
schema_object=None,
source_format='CSV',
compression='NONE',
skip_leading_rows=0,
field_delimiter=',',
max_bad_records=0,
quote_character=None,
allow_quoted_newlines=False,
allow_jagged_rows=False,
bigquery_conn_id='bigquery_default',
google_cloud_storage_conn_id='google_cloud_default',
delegate_to=None,
src_fmt_configs={},
labels=None,
*args, **kwargs):
super(BigQueryCreateExternalTableOperator, self).__init__(*args, **kwargs)
# GCS config
self.bucket = bucket
self.source_objects = source_objects
self.schema_object = schema_object
# BQ config
self.destination_project_dataset_table = destination_project_dataset_table
self.schema_fields = schema_fields
self.source_format = source_format
self.compression = compression
self.skip_leading_rows = skip_leading_rows
self.field_delimiter = field_delimiter
self.max_bad_records = max_bad_records
self.quote_character = quote_character
self.allow_quoted_newlines = allow_quoted_newlines
self.allow_jagged_rows = allow_jagged_rows
self.bigquery_conn_id = bigquery_conn_id
self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
self.delegate_to = delegate_to
self.src_fmt_configs = src_fmt_configs
self.labels = labels
def execute(self, context):
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
delegate_to=self.delegate_to)
if not self.schema_fields and self.schema_object \
and self.source_format != 'DATASTORE_BACKUP':
gcs_hook = GoogleCloudStorageHook(
google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
delegate_to=self.delegate_to)
schema_fields = json.loads(gcs_hook.download(
self.bucket,
self.schema_object).decode("utf-8"))
else:
schema_fields = self.schema_fields
source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
for source_object in self.source_objects]
conn = bq_hook.get_conn()
cursor = conn.cursor()
cursor.create_external_table(
external_project_dataset_table=self.destination_project_dataset_table,
schema_fields=schema_fields,
source_uris=source_uris,
source_format=self.source_format,
compression=self.compression,
skip_leading_rows=self.skip_leading_rows,
field_delimiter=self.field_delimiter,
max_bad_records=self.max_bad_records,
quote_character=self.quote_character,
allow_quoted_newlines=self.allow_quoted_newlines,
allow_jagged_rows=self.allow_jagged_rows,
src_fmt_configs=self.src_fmt_configs,
labels=self.labels
)
class BigQueryDeleteDatasetOperator(BaseOperator):
""""
This operator deletes an existing dataset from your Project in Big query.
https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/delete
:param project_id: The project id of the dataset.
:type project_id: string
:param dataset_id: The dataset to be deleted.
:type dataset_id: string
**Example**: ::
delete_temp_data = BigQueryDeleteDatasetOperator(dataset_id = 'temp-dataset',
project_id = 'temp-project',
bigquery_conn_id='_my_gcp_conn_',
task_id='Deletetemp',
dag=dag)
"""
template_fields = ('dataset_id', 'project_id')
ui_color = '#f00004'
@apply_defaults
def __init__(self,
dataset_id,
project_id=None,
bigquery_conn_id='bigquery_default',
delegate_to=None,
*args, **kwargs):
self.dataset_id = dataset_id
self.project_id = project_id
self.bigquery_conn_id = bigquery_conn_id
self.delegate_to = delegate_to
self.log.info('Dataset id: %s', self.dataset_id)
self.log.info('Project id: %s', self.project_id)
super(BigQueryDeleteDatasetOperator, self).__init__(*args, **kwargs)
def execute(self, context):
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
delegate_to=self.delegate_to)
conn = bq_hook.get_conn()
cursor = conn.cursor()
cursor.delete_dataset(
project_id=self.project_id,
dataset_id=self.dataset_id
)
class BigQueryCreateEmptyDatasetOperator(BaseOperator):
""""
This operator is used to create new dataset for your Project in Big query.
https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource
:param project_id: The name of the project where we want to create the dataset.
Don't need to provide, if projectId in dataset_reference.
:type project_id: str
:param dataset_id: The id of dataset. Don't need to provide,
if datasetId in dataset_reference.
:type dataset_id: str
:param dataset_reference: Dataset reference that could be provided with request body.
More info:
https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource
:type dataset_reference: dict
**Example**: ::
create_new_dataset = BigQueryCreateEmptyDatasetOperator(
dataset_id = 'new-dataset',
project_id = 'my-project',
dataset_reference = {"friendlyName": "New Dataset"}
bigquery_conn_id='_my_gcp_conn_',
task_id='newDatasetCreator',
dag=dag)
"""
template_fields = ('dataset_id', 'project_id')
ui_color = '#f0eee4'
@apply_defaults
def __init__(self,
dataset_id,
project_id=None,
dataset_reference=None,
bigquery_conn_id='bigquery_default',
delegate_to=None,
*args, **kwargs):
self.dataset_id = dataset_id
self.project_id = project_id
self.bigquery_conn_id = bigquery_conn_id
self.dataset_reference = dataset_reference if dataset_reference else {}
self.delegate_to = delegate_to
self.log.info('Dataset id: %s', self.dataset_id)
self.log.info('Project id: %s', self.project_id)
super(BigQueryCreateEmptyDatasetOperator, self).__init__(*args, **kwargs)
def execute(self, context):
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
delegate_to=self.delegate_to)
conn = bq_hook.get_conn()
cursor = conn.cursor()
cursor.create_empty_dataset(
project_id=self.project_id,
dataset_id=self.dataset_id,
dataset_reference=self.dataset_reference)

View file

@@ -18,9 +18,8 @@
# under the License.
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.utils.decorators import apply_defaults
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
class BigQuerySQLSensorOperator(BaseSensorOperator):
"""
@@ -30,9 +29,9 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
single value. If that value is coerced to false in some way,
the sensor continues to wait.
:type sql: str
:param bigquery_conn_id: The connection ID to use when connecting to
:param gcp_conn_id: The connection ID to use when connecting to
Google BigQuery.
:type bigquery_conn_id: str
:type gcp_conn_id: str
:param use_legacy_sql: Whether to use BQ legacy SQL
:type use_legacy_sql: bool
:param timeout: Time in seconds to wait for the sensor,
@@ -40,14 +39,13 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
:type timeout: int
"""
template_fields = BaseSensorOperator.template_fields + [
template_fields = BaseSensorOperator.template_fields + (
'sql',
]
)
@apply_defaults
def __init__(self,
sql,
bigquery_conn_id='bigquery_default_conn',
gcp_conn_id='bigquery_default_conn',
use_legacy_sql=False,
timeout=60*60*24,
*args,
@@ -58,7 +56,7 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
*args,
**kwargs)
self.sql = sql
self.bigquery_conn_id = bigquery_conn_id
self.gcp_conn_id = gcp_conn_id
self.use_legacy_sql = use_legacy_sql
self.poke_interval = 120
self.mode = 'reschedule'
@@ -78,5 +76,5 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
return True
def get_db_hook(self):
return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
return BigQueryHook(gcp_conn_id=self.gcp_conn_id,
use_legacy_sql=self.use_legacy_sql)
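A minimal usage sketch of the sensor after the rename from bigquery_conn_id to gcp_conn_id (the import path, table name and connection id are assumptions for illustration):

```python
from datetime import datetime
from airflow import DAG
from operators.bq_sensor import BigQuerySQLSensorOperator  # assumed plugin path

with DAG("bq_sensor_example", start_date=datetime(2021, 1, 1), schedule_interval="@daily") as dag:
    wait_for_data = BigQuerySQLSensorOperator(
        task_id="wait_for_data",
        # The query should return a single value; a falsy result keeps the sensor waiting.
        sql="SELECT COUNT(*) > 0 FROM `my-project.my_dataset.my_table` WHERE ds = '{{ ds }}'",
        gcp_conn_id="google_cloud_derived_datasets",
        use_legacy_sql=False,
    )
```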

View file

@@ -1,38 +1,17 @@
import os
import subprocess
import tempfile
from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator as UpstreamGKEPodOperator
from google.auth.environment_vars import CREDENTIALS
from airflow import AirflowException
from airflow.contrib.hooks.gcp_container_hook import GKEClusterHook
from airflow.contrib.operators.gcp_container_operator import GKEPodOperator as UpstreamGKEPodOperator
KUBE_CONFIG_ENV_VAR = "KUBECONFIG"
GCLOUD_APP_CRED = "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE"
# Note: In the next version of airflow this will change.
# This module is deprecated. Please use `airflow.providers.google.cloud.operators.kubernetes_engine`.
class GKEPodOperator(UpstreamGKEPodOperator):
"""
We override execute and _set_env_from_extras methods to support:
- `CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE` environment variable that is
set to the path of the Service Account JSON key file. This is necessary
for gcloud to operate.
- Adjust when NamedTemporaryFile file descriptor is closed.
- Preserve XCOM result when do_xcom_push is True.
- Override init to default image_pull_policy=Always, in_cluster=False, do_xcom_push=False and GKE params
- In 1.10.x this inherited from upstream GKEPodOperator, rather than GKEStartPodOperator(v2)
- In 1.10.x we needed to override the execute and helper methods to set an environment
variable for authentication to work (CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE). Fixed in v2
- We will keep this class and call the upstream GkeStartPodOperator now, because
numerous places in our code still reference it
- Overrides init to default image_pull_policy=Always, in_cluster=False,
do_xcom_push=False and GKE params
- Defaults reattach_on_restart=False to address a 1.10.12 regression where GkePodOperators
reruns will simply attach to an existing pod and not perform any new work.
- Hard sets reattach_on_restart=False when do_xcom_push=True to address an error:
retrying a failed task with do_xcom_push=True causes airflow to reattach to the pod,
eventually causing a 'Handshake status 500 Internal Server Error'. Logs will indicate
@@ -75,75 +54,3 @@ class GKEPodOperator(UpstreamGKEPodOperator):
namespace=namespace,
*args,
**kwargs)
def execute(self, context):
# We can remove this override once upgraded to 2.0. https://issues.apache.org/jira/browse/AIRFLOW-4072
# Moz specific - Commented out key_file references (Jason fixed auth behaviour with 1.10.2)
# key_file = None
# If gcp_conn_id is not specified gcloud will use the default
# service account credentials.
if self.gcp_conn_id:
from airflow.hooks.base_hook import BaseHook
# extras is a deserialized json object
extras = BaseHook.get_connection(self.gcp_conn_id).extra_dejson
self._set_env_from_extras(extras=extras) # Moz specific since func no longer returns value
# Write config to a temp file and set the environment variable to point to it.
# This is to avoid race conditions of reading/writing a single file
with tempfile.NamedTemporaryFile() as conf_file:
os.environ[KUBE_CONFIG_ENV_VAR] = conf_file.name
# Attempt to get/update credentials
# We call gcloud directly instead of using google-cloud-python api
# because there is no way to write kubernetes config to a file, which is
# required by KubernetesPodOperator.
# The gcloud command looks at the env variable `KUBECONFIG` for where to save
# the kubernetes config file.
subprocess.check_call(
["gcloud", "container", "clusters", "get-credentials",
self.cluster_name,
"--zone", self.location,
"--project", self.project_id])
# if key_file: # Moz specific commented out
# key_file.close() # Moz specific commented out
# Tell `KubernetesPodOperator` where the config file is located
self.config_file = os.environ[KUBE_CONFIG_ENV_VAR]
result = super(UpstreamGKEPodOperator, self).execute(context) # Moz specific
if self.do_xcom_push: # Moz specific
return result # Moz specific
def _set_env_from_extras(self, extras):
"""
Sets the environment variable `GOOGLE_APPLICATION_CREDENTIALS` and
`CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE`with either:
- The path to the keyfile from the specified connection id
- A generated file's path if the user specified JSON in the connection id. The
file is assumed to be deleted after the process dies due to how mkstemp()
works.
The environment variable is used inside the gcloud command to determine correct
service account to use.
"""
key_path = self._get_field(extras, 'key_path', False)
keyfile_json_str = self._get_field(extras, 'keyfile_dict', False)
if not key_path and not keyfile_json_str:
self.log.info('Using gcloud with application default credentials.')
elif key_path:
os.environ[CREDENTIALS] = key_path
os.environ[GCLOUD_APP_CRED] = key_path
return None
else:
# Write service account JSON to secure file for gcloud to reference
service_key = tempfile.NamedTemporaryFile(delete=False)
service_key.write(keyfile_json_str.encode('utf-8'))
os.environ[CREDENTIALS] = service_key.name
os.environ[GCLOUD_APP_CRED] = service_key.name
# Return file object to have a pointer to close after use,
# thus deleting from file system.
service_key.close() # Moz specific instead of return service_key
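The wrapper keeps its name and defaults, so DAG-side usage in this commit stays unchanged; a minimal sketch mirroring the calls elsewhere in the repo (project, image and argument values are placeholders):

```python
from datetime import datetime
from airflow import DAG
from operators.gcp_container_operator import GKEPodOperator

with DAG("gke_pod_example", start_date=datetime(2021, 1, 1), schedule_interval="@daily") as dag:
    run_job = GKEPodOperator(
        task_id="run_job",
        name="run-job",
        gcp_conn_id="google_cloud_derived_datasets",
        project_id="my-gcp-project",                    # placeholder
        location="us-central1-a",
        cluster_name="bq-load-gke-1",
        namespace="default",
        image="gcr.io/my-gcp-project/my-image:latest",  # placeholder
        arguments=["script/run_job", "{{ ds }}"],       # placeholder
    )
```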

View file

@@ -1,459 +0,0 @@
import os
import re
import time
import uuid
from datetime import timedelta
from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
from airflow.contrib.operators.dataproc_operator import DataprocOperationBaseOperator
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.utils import timezone
from airflow.version import version
"""
We overwrite DataprocClusterCreateOperator here to create clusters with an option to
install component gateway, which we install by default. We also add labels to the gce
cluster config.
Previously on 1.10.2, we had to include DataprocOperationBaseOperator from master
which used the v1beta2 rest api for creating clusters allowing us to install optional
components and component gateway, but this class has been updated since 1.10.4.
"""
# pylint: disable=too-many-instance-attributes
class DataprocClusterCreateOperator(DataprocOperationBaseOperator):
"""
--
Pulled from 1.10.7
We modify the _build_gce_cluster_config method to install component gateway.
--
Create a new cluster on Google Cloud Dataproc. The operator will wait until the
creation is successful or an error occurs in the creation process.
The parameters allow to configure the cluster. Please refer to
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters
for a detailed explanation on the different parameters. Most of the configuration
parameters detailed in the link are available as a parameter to this operator.
:param cluster_name: The name of the DataProc cluster to create. (templated)
:type cluster_name: str
:param project_id: The ID of the google cloud project in which
to create the cluster. (templated)
:type project_id: str
:param num_workers: The # of workers to spin up. If set to zero will
spin up cluster in a single node mode
:type num_workers: int
:param storage_bucket: The storage bucket to use, setting to None lets dataproc
generate a custom one for you
:type storage_bucket: str
:param init_actions_uris: List of GCS uri's containing
dataproc initialization scripts
:type init_actions_uris: list[str]
:param init_action_timeout: Amount of time executable scripts in
init_actions_uris has to complete
:type init_action_timeout: str
:param metadata: dict of key-value google compute engine metadata entries
to add to all instances
:type metadata: dict
:param image_version: the version of software inside the Dataproc cluster
:type image_version: str
:param custom_image: custom Dataproc image for more info see
https://cloud.google.com/dataproc/docs/guides/dataproc-images
:type custom_image: str
:param custom_image_project_id: project id for the custom Dataproc image, for more info see
https://cloud.google.com/dataproc/docs/guides/dataproc-images
:type custom_image_project_id: str
:param autoscaling_policy: The autoscaling policy used by the cluster. Only resource names
including projectid and location (region) are valid. Example:
``projects/[projectId]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]``
:type autoscaling_policy: str
:param properties: dict of properties to set on
config files (e.g. spark-defaults.conf), see
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#SoftwareConfig
:type properties: dict
:param optional_components: List of optional cluster components, for more info see
https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig#Component
:type optional_components: list[str]
:param num_masters: The # of master nodes to spin up
:type num_masters: int
:param master_machine_type: Compute engine machine type to use for the master node
:type master_machine_type: str
:param master_disk_type: Type of the boot disk for the master node
(default is ``pd-standard``).
Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
``pd-standard`` (Persistent Disk Hard Disk Drive).
:type master_disk_type: str
:param master_disk_size: Disk size for the master node
:type master_disk_size: int
:param master_num_local_ssds : Number of local SSDs to mount. Local SSDs are used for writing and reading Apache Hadoop and Apache Spark scratch files, such as shuffle outputs. Adding SSDs will improve Spark runtime performance.
(default is 0)
:type master_num_local_ssds : int
:param worker_machine_type: Compute engine machine type to use for the worker nodes
:type worker_machine_type: str
:param worker_disk_type: Type of the boot disk for the worker node
(default is ``pd-standard``).
Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
``pd-standard`` (Persistent Disk Hard Disk Drive).
:type worker_disk_type: str
:param worker_disk_size: Disk size for the worker nodes
:type worker_disk_size: int
:param worker_num_local_ssds : Number of local SSDs to mount. Local SSDs are used for writing and reading Apache Hadoop and Apache Spark scratch files, such as shuffle outputs. Adding SSDs will improve Spark runtime performance.
(default is 0)
:type worker_num_local_ssds : int
:param num_preemptible_workers: The # of preemptible worker nodes to spin up
:type num_preemptible_workers: int
:param labels: dict of labels to add to the cluster
:type labels: dict
:param zone: The zone where the cluster will be located. Set to None to auto-zone. (templated)
:type zone: str
:param network_uri: The network uri to be used for machine communication, cannot be
specified with subnetwork_uri
:type network_uri: str
:param subnetwork_uri: The subnetwork uri to be used for machine communication,
cannot be specified with network_uri
:type subnetwork_uri: str
:param internal_ip_only: If true, all instances in the cluster will only
have internal IP addresses. This can only be enabled for subnetwork
enabled networks
:type internal_ip_only: bool
:param tags: The GCE tags to add to all instances
:type tags: list[str]
:param region: leave as 'global', might become relevant in the future. (templated)
:type region: str
:param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform.
:type gcp_conn_id: str
:param delegate_to: The account to impersonate, if any.
For this to work, the service account making the request must have domain-wide
delegation enabled.
:type delegate_to: str
:param service_account: The service account of the dataproc instances.
:type service_account: str
:param service_account_scopes: The URIs of service account scopes to be included.
:type service_account_scopes: list[str]
:param idle_delete_ttl: The longest duration that cluster would keep alive while
staying idle. Passing this threshold will cause cluster to be auto-deleted.
A duration in seconds.
:type idle_delete_ttl: int
:param auto_delete_time: The time when cluster will be auto-deleted.
:type auto_delete_time: datetime.datetime
:param auto_delete_ttl: The life duration of cluster, the cluster will be
auto-deleted at the end of this duration.
A duration in seconds. (If auto_delete_time is set this parameter will be ignored)
:type auto_delete_ttl: int
:param customer_managed_key: The customer-managed key used for disk encryption
``projects/[PROJECT_STORING_KEYS]/locations/[LOCATION]/keyRings/[KEY_RING_NAME]/cryptoKeys/[KEY_NAME]`` # noqa # pylint: disable=line-too-long
:type customer_managed_key: str
Moz specific
:param install_component_gateway: Install alpha feature component gateway.
:type install_component_gateway: boolean
"""
template_fields = ['cluster_name', 'project_id', 'zone', 'region']
# pylint: disable=too-many-arguments,too-many-locals
@apply_defaults
def __init__(self,
project_id,
cluster_name,
num_workers,
job_name=None, # Moz specific
zone=None,
network_uri=None,
subnetwork_uri=None,
internal_ip_only=None,
tags=None,
storage_bucket=None,
init_actions_uris=None,
init_action_timeout="10m",
metadata=None,
custom_image=None,
custom_image_project_id=None,
image_version=None,
autoscaling_policy=None,
properties=None,
optional_components=['ANACONDA'], # Moz specific
num_masters=1,
master_machine_type='n1-standard-4',
master_disk_type='pd-standard',
master_disk_size=500,
master_num_local_ssds=0,
worker_machine_type='n1-standard-4',
worker_disk_type='pd-standard',
worker_disk_size=500,
worker_num_local_ssds=0,
num_preemptible_workers=0,
labels=None,
region='global',
service_account=None,
service_account_scopes=None,
idle_delete_ttl=None,
auto_delete_time=None,
auto_delete_ttl=None,
customer_managed_key=None,
install_component_gateway=True, # Moz specific
*args,
**kwargs):
super(DataprocClusterCreateOperator, self).__init__(
project_id=project_id, region=region, *args, **kwargs)
self.cluster_name = cluster_name
self.job_name = job_name
self.num_masters = num_masters
self.num_workers = num_workers
self.num_preemptible_workers = num_preemptible_workers
self.storage_bucket = storage_bucket
self.init_actions_uris = init_actions_uris
self.init_action_timeout = init_action_timeout
self.metadata = metadata
self.custom_image = custom_image
self.custom_image_project_id = custom_image_project_id
self.image_version = image_version
self.properties = properties or dict()
self.optional_components = optional_components
self.master_machine_type = master_machine_type
self.master_disk_type = master_disk_type
self.master_disk_size = master_disk_size
self.master_num_local_ssds = master_num_local_ssds
self.autoscaling_policy = autoscaling_policy
self.worker_machine_type = worker_machine_type
self.worker_disk_type = worker_disk_type
self.worker_disk_size = worker_disk_size
self.worker_num_local_ssds = worker_num_local_ssds
self.labels = labels
self.zone = zone
self.network_uri = network_uri
self.subnetwork_uri = subnetwork_uri
self.internal_ip_only = internal_ip_only
self.tags = tags
self.service_account = service_account
self.service_account_scopes = service_account_scopes
self.idle_delete_ttl = idle_delete_ttl
self.auto_delete_time = auto_delete_time
self.auto_delete_ttl = auto_delete_ttl
self.customer_managed_key = customer_managed_key
self.single_node = num_workers == 0
self.install_component_gateway = install_component_gateway # Moz specific
assert not (self.custom_image and self.image_version), \
"custom_image and image_version can't be both set"
assert (
not self.single_node or (
self.single_node and self.num_preemptible_workers == 0
)
), "num_workers == 0 means single node mode - no preemptibles allowed"
def _get_init_action_timeout(self):
match = re.match(r"^(\d+)(s|m)$", self.init_action_timeout)
if match:
if match.group(2) == "s":
return self.init_action_timeout
elif match.group(2) == "m":
val = float(match.group(1))
return "{}s".format(timedelta(minutes=val).seconds)
raise AirflowException(
"DataprocClusterCreateOperator init_action_timeout"
" should be expressed in minutes or seconds. i.e. 10m, 30s")
def _build_gce_cluster_config(self, cluster_data):
"""
We optionally add alpha feature 'enable component gateway'
"""
if self.install_component_gateway: # Moz specific start
# Fetch current nested dict and add nested keys
cluster_config_new = cluster_data['config']
cluster_config_new.update({'endpointConfig' : {'enableHttpPortAccess' : True}})
# Overwrite the config key with newly created
cluster_data.update({'config' : cluster_config_new}) # Moz specific end
if self.zone:
zone_uri = \
'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
self.project_id, self.zone
)
cluster_data['config']['gceClusterConfig']['zoneUri'] = zone_uri
if self.metadata:
cluster_data['config']['gceClusterConfig']['metadata'] = self.metadata
if self.network_uri:
cluster_data['config']['gceClusterConfig']['networkUri'] = self.network_uri
if self.subnetwork_uri:
cluster_data['config']['gceClusterConfig']['subnetworkUri'] = \
self.subnetwork_uri
if self.internal_ip_only:
if not self.subnetwork_uri:
raise AirflowException("Set internal_ip_only to true only when"
" you pass a subnetwork_uri.")
cluster_data['config']['gceClusterConfig']['internalIpOnly'] = True
if self.tags:
cluster_data['config']['gceClusterConfig']['tags'] = self.tags
if self.service_account:
cluster_data['config']['gceClusterConfig']['serviceAccount'] = \
self.service_account
if self.service_account_scopes:
cluster_data['config']['gceClusterConfig']['serviceAccountScopes'] = \
self.service_account_scopes
return cluster_data
def _build_lifecycle_config(self, cluster_data):
if self.idle_delete_ttl:
cluster_data['config']['lifecycleConfig']['idleDeleteTtl'] = \
"{}s".format(self.idle_delete_ttl)
if self.auto_delete_time:
utc_auto_delete_time = timezone.convert_to_utc(self.auto_delete_time)
cluster_data['config']['lifecycleConfig']['autoDeleteTime'] = \
utc_auto_delete_time.format('%Y-%m-%dT%H:%M:%S.%fZ', formatter='classic')
elif self.auto_delete_ttl:
cluster_data['config']['lifecycleConfig']['autoDeleteTtl'] = \
"{}s".format(self.auto_delete_ttl)
return cluster_data
def _build_cluster_data(self):
if self.zone:
master_type_uri = \
"https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}"\
.format(self.project_id, self.zone, self.master_machine_type)
worker_type_uri = \
"https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}"\
.format(self.project_id, self.zone, self.worker_machine_type)
else:
master_type_uri = self.master_machine_type
worker_type_uri = self.worker_machine_type
cluster_data = {
'projectId': self.project_id,
'clusterName': self.cluster_name,
'labels': {},
'config': {
'gceClusterConfig': {
},
'masterConfig': {
'numInstances': self.num_masters,
'machineTypeUri': master_type_uri,
'diskConfig': {
'bootDiskType': self.master_disk_type,
'bootDiskSizeGb': self.master_disk_size,
'numLocalSsds': self.master_num_local_ssds,
}
},
'workerConfig': {
'numInstances': self.num_workers,
'machineTypeUri': worker_type_uri,
'diskConfig': {
'bootDiskType': self.worker_disk_type,
'bootDiskSizeGb': self.worker_disk_size,
'numLocalSsds': self.worker_num_local_ssds,
}
},
'secondaryWorkerConfig': {},
'softwareConfig': {},
'lifecycleConfig': {},
'encryptionConfig': {},
'autoscalingConfig': {},
}
}
if self.num_preemptible_workers > 0:
cluster_data['config']['secondaryWorkerConfig'] = {
'numInstances': self.num_preemptible_workers,
'machineTypeUri': worker_type_uri,
'diskConfig': {
'bootDiskType': self.worker_disk_type,
'bootDiskSizeGb': self.worker_disk_size
},
'isPreemptible': True
}
cluster_data['labels'] = self.labels or {}
# Dataproc labels must conform to the following regex:
# [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version string follows
# semantic versioning spec: x.y.z).
cluster_data['labels'].update({'airflow-version':
'v' + version.replace('.', '-').replace('+', '-')})
# Moz specific
cluster_data['labels'].update({'owner': self.owner.lower().replace('@mozilla.com', '').replace('.', '-'),
'env': os.getenv('DEPLOY_ENVIRONMENT', 'env_not_set'),
'jobname': self.job_name.lower().replace('_', '-')})
if self.storage_bucket:
cluster_data['config']['configBucket'] = self.storage_bucket
if self.image_version:
cluster_data['config']['softwareConfig']['imageVersion'] = self.image_version
elif self.custom_image:
project_id = self.custom_image_project_id if (self.custom_image_project_id) else self.project_id
custom_image_url = 'https://www.googleapis.com/compute/beta/projects/' \
'{}/global/images/{}'.format(project_id,
self.custom_image)
cluster_data['config']['masterConfig']['imageUri'] = custom_image_url
if not self.single_node:
cluster_data['config']['workerConfig']['imageUri'] = custom_image_url
cluster_data = self._build_gce_cluster_config(cluster_data)
if self.single_node:
self.properties["dataproc:dataproc.allow.zero.workers"] = "true"
if self.properties:
cluster_data['config']['softwareConfig']['properties'] = self.properties
if self.optional_components:
cluster_data['config']['softwareConfig']['optionalComponents'] = self.optional_components
cluster_data = self._build_lifecycle_config(cluster_data)
if self.init_actions_uris:
init_actions_dict = [
{
'executableFile': uri,
'executionTimeout': self._get_init_action_timeout()
} for uri in self.init_actions_uris
]
cluster_data['config']['initializationActions'] = init_actions_dict
if self.customer_managed_key:
cluster_data['config']['encryptionConfig'] =\
{'gcePdKmsKeyName': self.customer_managed_key}
if self.autoscaling_policy:
cluster_data['config']['autoscalingConfig'] = {'policyUri': self.autoscaling_policy}
return cluster_data
def start(self):
"""
Create a new cluster on Google Cloud Dataproc.
"""
self.log.info('Creating cluster: %s', self.cluster_name)
cluster_data = self._build_cluster_data()
return (
self.hook.get_conn().projects().regions().clusters().create( # pylint: disable=no-member
projectId=self.project_id,
region=self.region,
body=cluster_data,
requestId=str(uuid.uuid4()),
).execute())

View file

@@ -1,10 +1,8 @@
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
import logging
import time
class SleepOperator(BaseOperator):
@apply_defaults
def __init__(self, sleep_time=30, *args, **kwargs):
super(SleepOperator, self).__init__(*args, **kwargs)
self.sleep_time=sleep_time

View file

@@ -5,9 +5,8 @@ from sqlalchemy import func
from airflow.exceptions import AirflowException
from airflow.models import DagBag, DagModel, DagRun, TaskInstance
from airflow.operators.sensors import ExternalTaskSensor
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.utils.db import provide_session
from airflow.utils.decorators import apply_defaults
from airflow.utils.state import State
@@ -28,7 +27,6 @@ class ExternalTaskCompletedSensor(ExternalTaskSensor):
"""
@apply_defaults
def __init__(self, failed_states = None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.failed_states = failed_states or [State.FAILED, State.UPSTREAM_FAILED, State.SKIPPED]
@@ -41,7 +39,7 @@ class ExternalTaskCompletedSensor(ExternalTaskSensor):
dttm = context['execution_date'] - self.execution_delta
elif self.execution_date_fn:
# Moz specific - _handle_execution_date_fn may not be defined in this context
raise AirflowException("execution_date_fn is not supported by this sensor.")
raise AirflowException("execution_date_fn is not supported by this custom mozilla sensor.")
else:
dttm = context['execution_date']
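Since execution_date_fn is rejected here, offsets between DAG schedules are expressed with execution_delta; a minimal sketch (the upstream DAG and task ids, and the schedules implied in the comment, are placeholders):

```python
from datetime import datetime, timedelta
from airflow import DAG
from operators.task_sensor import ExternalTaskCompletedSensor

with DAG("downstream_example", start_date=datetime(2021, 1, 1), schedule_interval="0 2 * * *") as dag:
    wait_for_upstream = ExternalTaskCompletedSensor(
        task_id="wait_for_upstream",
        external_dag_id="upstream_dag",       # placeholder
        external_task_id="upstream_task",     # placeholder
        execution_delta=timedelta(hours=1),   # upstream runs at 01:00, this DAG at 02:00
    )
```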

View file

@@ -1,14 +1,8 @@
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.executors import get_default_executor
from airflow.operators.subdag_operator import SubDagOperator
from operators.task_sensor import ExternalTaskCompletedSensor
from utils.dataproc import (
moz_dataproc_pyspark_runner,
moz_dataproc_jar_runner,
get_dataproc_parameters,
)
from utils.gcp import (
bigquery_etl_query,
bigquery_etl_copy_deduplicate,
@@ -91,7 +85,6 @@ main_summary_export = SubDagOperator(
default_args=default_args,
num_workers=40),
task_id="main_summary_export",
executor=get_default_executor(),
dag=dag)
clients_daily_export = SubDagOperator(
@@ -141,7 +134,6 @@ clients_daily_export = SubDagOperator(
default_args=default_args,
num_preemptible_workers=10),
task_id="clients_daily_export",
executor=get_default_executor(),
dag=dag)
wait_for_clients_daily = ExternalTaskCompletedSensor(

View file

@@ -1,17 +1,16 @@
from airflow import DAG
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.dataproc_operator import (
DataprocClusterCreateOperator,
DataprocClusterDeleteOperator,
DataProcPySparkOperator,
from airflow.providers.google.cloud.operators.dataproc import (
DataprocCreateClusterOperator,
DataprocDeleteClusterOperator,
DataprocSubmitPySparkJobOperator,
)
def spark_subdag(
parent_dag_name,
child_dag_name,
default_args,
gcp_conn_id,
project_id,
service_account,
main,
pyfiles,
@@ -27,6 +26,7 @@ def spark_subdag(
:param str child_dag_name: Name of the child DAG.
:param Dict[str, Any] default_args: Default arguments for the child DAG.
:param str gcp_conn_id: Name of the connection string.
:param str project_id: GCP project id corresponding to the gcp_conn_id.
:param str service_account: The address of the service account.
:param str dataproc_region: The region of the DataProc cluster.
:param str main:
@@ -36,12 +36,10 @@ def spark_subdag(
:return: DAG
"""
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
shared_config = {
"cluster_name": "prio-staging-{{ds_nodash}}",
"gcp_conn_id": gcp_conn_id,
"project_id": connection.project_id,
"project_id": project_id,
# From an error when not specifying the region:
# - Dataproc images 2.0 and higher do not support the to-be
# deprecated global region. Please use any non-global Dataproc
@@ -54,7 +52,7 @@ def spark_subdag(
}
with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag:
create_dataproc_cluster = DataprocClusterCreateOperator(
create_dataproc_cluster = DataprocCreateClusterOperator(
task_id="create_dataproc_cluster",
image_version="preview-ubuntu18",
service_account=service_account,
@@ -68,10 +66,10 @@ def spark_subdag(
**shared_config,
)
run_dataproc_spark = DataProcPySparkOperator(
run_dataproc_spark = DataprocSubmitPySparkJobOperator(
task_id="run_dataproc_spark",
main=main,
dataproc_pyspark_jars=[
dataproc_jars=[
"gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
],
pyfiles=pyfiles,
@@ -80,7 +78,7 @@ def spark_subdag(
**shared_config,
)
delete_dataproc_cluster = DataprocClusterDeleteOperator(
delete_dataproc_cluster = DataprocDeleteClusterOperator(
task_id="delete_dataproc_cluster",
trigger_rule="all_done",
dag=dag,

View file

@@ -2,10 +2,9 @@ from datetime import timedelta
from os import environ
from airflow import DAG
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.gcp_container_operator import (
GKEClusterCreateOperator,
GKEClusterDeleteOperator,
from airflow.providers.google.cloud.operators.kubernetes_engine import (
GKECreateClusterOperator,
GKEDeleteClusterOperator,
)
from airflow.operators.bash_operator import BashOperator
from operators.gcp_container_operator import GKEPodOperator
@@ -17,6 +16,7 @@ def container_subdag(
child_dag_name,
default_args,
gcp_conn_id,
project_id,
service_account,
server_id,
env_vars={},
@@ -35,6 +35,7 @@ def container_subdag(
:param str child_dag_name: Name of the child DAG.
:param Dict[str, Any] default_args: Default arguments for the child DAG.
:param str gcp_conn_id: Name of the connection string.
:param str project_id: GCP project id associated with the gcp_conn_id.
:param str service_account: The address of the service account.
:param str server_id: The identifier for the Prio processor
:param Dict[str, str] env_vars: Environment variables for configuring
@ -50,12 +51,10 @@ def container_subdag(
"""
assert server_id in ["a", "b", "admin"]
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
cluster_name = f"gke-prio-{server_id}"
shared_config = {
"project_id": connection.project_id,
"project_id": project_id,
"gcp_conn_id": gcp_conn_id,
"location": location,
}
@ -67,7 +66,7 @@ def container_subdag(
# https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
# https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator
# https://airflow.apache.org/docs/stable/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html
create_gke_cluster = GKEClusterCreateOperator(
create_gke_cluster = GKECreateClusterOperator(
task_id="create_gke_cluster",
body=create_gke_config(
name=cluster_name,
@ -141,7 +140,7 @@ def container_subdag(
**kwargs,
)
delete_gke_cluster = GKEClusterDeleteOperator(
delete_gke_cluster = GKEDeleteClusterOperator(
task_id="delete_gke_cluster",
name=cluster_name,
trigger_rule="all_done",

View file

@ -37,10 +37,9 @@ the environment.
from functools import partial
from airflow import DAG
from airflow.contrib.operators.gcs_to_gcs import (
GoogleCloudStorageToGoogleCloudStorageOperator,
)
from airflow.operators import DummyOperator, PythonOperator
from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator
from prio import dataproc, kubernetes
@ -56,7 +55,7 @@ def transfer_data_subdag(
submission_date,
server_id,
public_key_hex_external,
google_cloud_storage_conn_id,
gcp_conn_id,
):
"""Copy the partitioned data from the staging bucket into the corresponding
receiving buckets in each processor. The job then submits a `_SUCCESS` file
@ -78,22 +77,22 @@ def transfer_data_subdag(
"raw/shares",
]
)
transfer_dataset = GoogleCloudStorageToGoogleCloudStorageOperator(
transfer_dataset = GCSToGCSOperator(
task_id="transfer_dataset",
source_bucket=source_bucket,
source_object=f"staging/submission_date={submission_date}/server_id={server_id}/*",
destination_bucket=destination_bucket,
destination_object=f"{prefix}/",
google_cloud_storage_conn_id=google_cloud_storage_conn_id,
gcp_conn_id=gcp_conn_id,
dag=dag,
)
mark_dataset_success = GoogleCloudStorageToGoogleCloudStorageOperator(
mark_dataset_success = GCSToGCSOperator(
task_id="mark_dataset_success",
source_bucket=source_bucket,
source_object="staging/_SUCCESS",
destination_bucket=destination_bucket,
destination_object=f"{prefix}/_SUCCESS",
google_cloud_storage_conn_id=google_cloud_storage_conn_id,
gcp_conn_id=gcp_conn_id,
dag=dag,
)
transfer_dataset >> mark_dataset_success
@ -104,6 +103,7 @@ def ingestion_subdag(
dag,
default_args,
gcp_conn_id,
project_id,
service_account,
bucket_bootstrap_admin,
bucket_data_admin,
@ -125,13 +125,14 @@ def ingestion_subdag(
default_args=default_args,
server_id="admin",
gcp_conn_id=gcp_conn_id,
project_id=project_id,
service_account=service_account,
arguments=[
"bash",
"-xc",
f"source bin/dataproc; bootstrap gs://{bucket_bootstrap_admin}",
],
env_var=dict(SUBMODULE="origin"),
env_vars=dict(SUBMODULE="origin"),
),
task_id="bootstrap",
dag=dag,
@ -144,6 +145,7 @@ def ingestion_subdag(
child_dag_name="staging",
default_args=default_args,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
service_account=service_account,
main=f"gs://{bucket_bootstrap_admin}/processor-origin.py",
pyfiles=[f"gs://{bucket_bootstrap_admin}/prio_processor.egg"],
@ -175,7 +177,7 @@ def ingestion_subdag(
destination_bucket_prefix=bucket_prefix,
app_name=app_name,
submission_date="{{ ds }}",
google_cloud_storage_conn_id=gcp_conn_id,
gcp_conn_id=gcp_conn_id,
)
transfer_a = SubDagOperator(
@ -213,7 +215,7 @@ def ingestion_subdag(
def prio_processor_subdag(
dag, default_args, gcp_conn_id, service_account, server_id, env_vars
dag, default_args, gcp_conn_id, project_id, service_account, server_id, env_vars
):
return SubDagOperator(
subdag=kubernetes.container_subdag(
@ -221,6 +223,7 @@ def prio_processor_subdag(
child_dag_name=f"processor_{server_id}",
default_args=default_args,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
service_account=service_account,
server_id=server_id,
arguments=["bin/process"],
@ -231,7 +234,7 @@ def prio_processor_subdag(
)
def load_bigquery_subdag(dag, default_args, gcp_conn_id, service_account, env_vars):
def load_bigquery_subdag(dag, default_args, gcp_conn_id, project_id, service_account, env_vars):
# Take the resulting aggregates and insert them into a BigQuery table. This
# table is effectively append-only, so rerunning the dag will cause duplicate
# results. In practice, rerunning the DAG is problematic when operation is
@ -243,6 +246,7 @@ def load_bigquery_subdag(dag, default_args, gcp_conn_id, service_account, env_va
default_args=default_args,
server_id="admin",
gcp_conn_id=gcp_conn_id,
project_id=project_id,
service_account=service_account,
arguments=["bash", "-c", "bin/insert"],
env_vars=env_vars,
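Editor's note: the transfer tasks above switch from GoogleCloudStorageToGoogleCloudStorageOperator to GCSToGCSOperator, whose connection argument is gcp_conn_id rather than google_cloud_storage_conn_id. A minimal sketch with placeholder buckets and connection id, shown outside a DAG for brevity (values are not taken from the commit):

from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator

transfer_dataset = GCSToGCSOperator(
    task_id="transfer_dataset",
    source_bucket="example-source-bucket",                   # placeholder
    source_object="staging/submission_date=2021-10-01/*",    # placeholder
    destination_bucket="example-destination-bucket",         # placeholder
    destination_object="raw/shares/",
    gcp_conn_id="google_cloud_example",  # renamed from google_cloud_storage_conn_id
)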

View file

@ -2,7 +2,6 @@ from datetime import datetime, timedelta
from os import environ
from airflow import DAG
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from prio.processor import ingestion_subdag, load_bigquery_subdag, prio_processor_subdag
DEFAULT_ARGS = {
@ -26,13 +25,12 @@ IS_DEV = environ.get("DEPLOY_ENVIRONMENT") != "prod"
ENVIRONMENT = "dev" if IS_DEV else "prod"
PRIO_ADMIN_CONN = "google_cloud_prio_admin"
PRIO_ADMIN_PROJECT_ID = "moz-fx-prio-admin-prod-098j"
PRIO_A_CONN = "google_cloud_prio_a"
PRIO_A_PROJECT_ID = "moz-fx-prio-a-prod-kju7"
PROJECT_ADMIN = GoogleCloudStorageHook(PRIO_ADMIN_CONN).project_id
PROJECT_A = GoogleCloudStorageHook(PRIO_A_CONN).project_id
SERVICE_ACCOUNT_ADMIN = f"prio-admin-runner@{PROJECT_ADMIN}.iam.gserviceaccount.com"
SERVICE_ACCOUNT_A = f"prio-runner-{ENVIRONMENT}-a@{PROJECT_A}.iam.gserviceaccount.com"
SERVICE_ACCOUNT_ADMIN = f"prio-admin-runner@{PRIO_ADMIN_PROJECT_ID}.iam.gserviceaccount.com"
SERVICE_ACCOUNT_A = f"prio-runner-{ENVIRONMENT}-a@{PRIO_A_PROJECT_ID}.iam.gserviceaccount.com"
# Private bucket of server B is necessary for transfer
BUCKET_PRIVATE_A = f"moz-fx-prio-{ENVIRONMENT}-a-private"
@ -58,6 +56,7 @@ ingest = ingestion_subdag(
dag,
DEFAULT_ARGS,
PRIO_ADMIN_CONN,
PRIO_ADMIN_PROJECT_ID,
SERVICE_ACCOUNT_ADMIN,
BUCKET_BOOTSTRAP_ADMIN,
BUCKET_DATA_ADMIN,
@ -73,6 +72,7 @@ processor_a = prio_processor_subdag(
dag,
DEFAULT_ARGS,
PRIO_A_CONN,
PRIO_A_PROJECT_ID,
SERVICE_ACCOUNT_A,
"a",
{
@ -99,6 +99,7 @@ load_bigquery = load_bigquery_subdag(
dag,
DEFAULT_ARGS,
PRIO_ADMIN_CONN,
PRIO_ADMIN_PROJECT_ID,
SERVICE_ACCOUNT_ADMIN,
env_vars={
"APP_NAME": APP_NAME,

View file

@ -2,7 +2,6 @@ from datetime import datetime, timedelta
from os import environ
from airflow import DAG
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from prio.processor import prio_processor_subdag
DEFAULT_ARGS = {
@ -25,7 +24,7 @@ DEFAULT_ARGS = {
IS_DEV = environ.get("DEPLOY_ENVIRONMENT") != "prod"
ENVIRONMENT = "dev" if IS_DEV else "prod"
PRIO_B_CONN = "google_cloud_prio_b"
PROJECT_B = GoogleCloudStorageHook(PRIO_B_CONN).project_id
PROJECT_B = "moz-fx-prio-b-prod-a67n"
SERVICE_ACCOUNT_B = f"prio-runner-{ENVIRONMENT}-b@{PROJECT_B}.iam.gserviceaccount.com"
BUCKET_PRIVATE_B = f"moz-fx-prio-{ENVIRONMENT}-b-private"
BUCKET_SHARED_A = f"moz-fx-prio-{ENVIRONMENT}-a-shared"
@ -48,6 +47,7 @@ processor_b = prio_processor_subdag(
dag,
DEFAULT_ARGS,
PRIO_B_CONN,
PROJECT_B,
SERVICE_ACCOUNT_B,
"b",
{

View file

@ -2,13 +2,13 @@ import time
from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from airflow.models import Variable
from airflow.operators.http_operator import SimpleHttpOperator
from airflow.operators.python_operator import PythonOperator
from operators.gcp_container_operator import GKEPodOperator
DOCS = """\
# Probe Scraper
@ -36,6 +36,9 @@ resource.labels.pod_name="POD_NAME_FROM_AIRFLOW_LOGS" severity>=DEFAULT
Adjust the time window as needed and you should be able to see logs associated with the failure.
"""
DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION = "v1.17.0"
default_args = {
'owner': 'dthorn@mozilla.com',
'depends_on_past': False,
@ -52,7 +55,7 @@ with DAG('probe_scraper',
schedule_interval='0 0 * * 1-5') as dag:
aws_conn_id='aws_prod_probe_scraper'
aws_access_key, aws_secret_key, session = AwsHook(aws_conn_id).get_credentials()
aws_access_key, aws_secret_key, session = AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials()
# Built from repo https://github.com/mozilla/probe-scraper
probe_scraper_image='gcr.io/moz-fx-data-airflow-prod-88e0/probe-scraper:latest'
@ -132,15 +135,20 @@ with DAG('probe_scraper',
probe_scraper >> delay_python_task
gcp_gke_conn_id = "google_cloud_airflow_gke"
project_id = "moz-fx-data-airflow-gke-prod"
image_tag = Variable.get("lookml_generator_release_str")
if image_tag is None:
image_tag = DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION
lookml_generator_prod = GKEPodOperator(
owner="ascholtz@mozilla.com",
email=["ascholtz@mozilla.com", "dataops+alerts@mozilla.com"],
task_id="lookml_generator",
name="lookml-generator-1",
image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:" + Variable.get("lookml_generator_release_str"),
image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:" + image_tag,
startup_timeout_seconds=500,
gcp_conn_id=gcp_gke_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_gke_conn_id).project_id,
project_id=project_id,
cluster_name="workloads-prod-v1",
location="us-west1",
dag=dag,
@ -168,7 +176,7 @@ with DAG('probe_scraper',
name="lookml-generator-staging-1",
image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:latest",
gcp_conn_id=gcp_gke_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_gke_conn_id).project_id,
project_id=project_id,
cluster_name="workloads-prod-v1",
location="us-west1",
dag=dag,
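Editor's note, not part of the commit: Variable.get raises a KeyError when the key is missing unless a default_var is given, so the is None check above only handles a variable that exists with a null value. If the variable may be absent entirely, a sketch of the usual fallback:

from airflow.models import Variable

# Same default as the DAG above; supplying default_var covers both an unset
# variable and one explicitly set to null.
DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION = "v1.17.0"
image_tag = Variable.get(
    "lookml_generator_release_str",
    default_var=DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION,
)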

View file

@ -1,12 +1,11 @@
from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.bigquery_table_delete_operator import (
BigQueryTableDeleteOperator,
from airflow.providers.google.cloud.operators.bigquery import (
BigQueryDeleteTableOperator,
)
from airflow.contrib.operators.gcp_transfer_operator import (
S3ToGoogleCloudStorageTransferOperator,
from airflow.providers.google.cloud.operators.cloud_storage_transfer_service import (
CloudDataTransferServiceS3ToGCSOperator
)
from datetime import datetime, timedelta
@ -53,7 +52,7 @@ cluster_name = "socorro-import-dataproc-cluster"
# Defined in Airflow's UI -> Admin -> Connections
gcp_conn_id = "google_cloud_airflow_dataproc"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
project_id = "airflow-dataproc"
# Required to copy socorro json data from aws prod s3 to gcs
read_aws_conn_id = "aws_socorro_readonly_s3"
@ -73,14 +72,14 @@ objects_prefix = "{}/{}/{}={}".format(
)
# copy json crashstats from s3 to gcs
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
s3_to_gcs = CloudDataTransferServiceS3ToGCSOperator(
task_id="s3_to_gcs",
s3_bucket="crashstats-telemetry-crashes-prod-us-west-2",
project_id=project_id,
gcs_bucket=gcs_data_bucket,
description="socorro crash report copy from s3 to gcs",
aws_conn_id=read_aws_conn_id,
gcp_conn_id=gcp_conn_id,
project_id=connection.project_id,
object_conditions={"includePrefixes": "v1/crash_report/{{ ds_nodash }}"},
transfer_options={"deleteObjectsUniqueInSink": True},
timeout=3600,
@ -116,7 +115,7 @@ crash_report_parquet = SubDagOperator(
bq_gcp_conn_id = "google_cloud_derived_datasets"
bq_connection = GoogleCloudBaseHook(gcp_conn_id=bq_gcp_conn_id)
bq_project_id = "moz-fx-data-derived-datasets"
dest_s3_key = "s3://telemetry-parquet"
@ -142,9 +141,9 @@ gke_args = [
]
# We remove the current date partition for idempotency.
remove_bq_table_partition = BigQueryTableDeleteOperator(
remove_bq_table_partition = BigQueryDeleteTableOperator(
task_id="remove_bq_table_partition",
bigquery_conn_id=bq_gcp_conn_id,
gcp_conn_id=bq_gcp_conn_id,
deletion_dataset_table="{}.{}${{{{ds_nodash}}}}".format(bq_dataset, bq_table_name),
ignore_if_missing=True,
dag=dag,
@ -153,7 +152,7 @@ remove_bq_table_partition = BigQueryTableDeleteOperator(
bq_load = GKEPodOperator(
task_id="bigquery_load",
gcp_conn_id=bq_gcp_conn_id,
project_id=bq_connection.project_id,
project_id=bq_project_id,
name="load-socorro-crash-parquet-to-bq",
image=docker_image,
arguments=gke_args,

View file

@ -1,8 +1,8 @@
from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.operators.sensors import ExternalTaskSensor
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.operators.subdag_operator import SubDagOperator
from airflow.models import Variable
from itertools import chain
@ -21,14 +21,16 @@ TAAR_ETL_CONTAINER_IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.
# Dataproc connection to GCP
gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
taar_gcpdataproc_project_id = "airflow-dataproc"
taar_aws_conn_id = "airflow_taar_rw_s3"
taar_aws_access_key, taar_aws_secret_key, session = AwsHook(taar_aws_conn_id).get_credentials()
taar_aws_access_key, taar_aws_secret_key, session = AwsBaseHook(
aws_conn_id=taar_aws_conn_id, client_type='s3').get_credentials()
taarlite_cluster_name = "dataproc-taarlite-guidguid"
taar_locale_cluster_name = "dataproc-taar-locale"
taar_similarity_cluster_name = "dataproc-taar-similarity"
taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
default_args = {
"owner": "epavlov@mozilla.com",
@ -107,7 +109,8 @@ taar_locale = SubDagOperator(
"--prefix",
"taar/locale",
],
gcp_conn_id=taar_gcpdataproc_conn_id
gcp_conn_id=taar_gcpdataproc_conn_id,
project_id=taar_gcpdataproc_project_id
),
dag=dag
)
@ -136,6 +139,7 @@ taar_similarity = SubDagOperator(
"--prefix", "taar/similarity"
],
gcp_conn_id=taar_gcpdataproc_conn_id,
project_id=taar_gcpdataproc_project_id,
master_disk_type="pd-ssd",
worker_disk_type="pd-ssd",
master_disk_size=1024,
@ -176,6 +180,7 @@ taar_collaborative_recommender = SubDagOperator(
init_actions_uris=[],
aws_conn_id=taar_aws_conn_id,
gcp_conn_id=taar_gcpdataproc_conn_id,
project_id=taar_gcpdataproc_project_id,
default_args=default_args
),
dag=dag,
@ -204,11 +209,13 @@ taar_lite = SubDagOperator(
"--prefix", "taar/lite"
],
gcp_conn_id=taar_gcpdataproc_conn_id,
project_id=taar_gcpdataproc_project_id,
),
dag=dag,
)
taar_lite_guidranking = GKEPodOperator(
task_id="taar_lite_guidranking",
name="taar_lite_guidranking",
@ -230,3 +237,4 @@ wait_for_clients_daily_export >> taar_locale
wait_for_clients_daily_export >> taar_collaborative_recommender
wait_for_clients_daily_export >> taar_lite
wait_for_clients_daily_export >> taar_lite_guidranking
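Editor's note: several DAGs in this commit swap AwsHook for AwsBaseHook, which takes a keyword aws_conn_id plus a client_type. A minimal sketch of the credential lookup with a placeholder connection id:

from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

# get_credentials() returns a read-only (access_key, secret_key, token) tuple,
# unpacked here the same way the DAGs above do.
aws_access_key, aws_secret_key, session_token = AwsBaseHook(
    aws_conn_id="aws_example_conn",  # placeholder connection id
    client_type="s3",
).get_credentials()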

View file

@ -12,6 +12,7 @@ from utils.dataproc import moz_dataproc_pyspark_runner
taar_ensemble_cluster_name = "dataproc-taar-ensemble"
taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
taar_gcpdataproc_project_id = "airflow-dataproc"
TAAR_BIGTABLE_INSTANCE_ID = Variable.get("taar_bigtable_instance_id")
TAAR_ETL_STORAGE_BUCKET = Variable.get("taar_etl_storage_bucket")
@ -181,6 +182,7 @@ taar_ensemble = SubDagOperator(
"0.005",
],
gcp_conn_id=taar_gcpdataproc_conn_id,
project_id=taar_gcpdataproc_project_id,
master_disk_type="pd-ssd",
worker_disk_type="pd-ssd",
master_disk_size=1024,

View file

@ -1,6 +1,5 @@
from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from airflow.operators.subdag_operator import SubDagOperator
from datetime import datetime, timedelta
@ -33,11 +32,11 @@ cluster_name = 'app-update-out-of-date-dataproc-cluster'
# Defined in Airflow's UI -> Admin -> Connections
gcp_conn_id = 'google_cloud_airflow_dataproc'
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
# Required to write json output back to s3://telemetry-public-analysis-2/app-update/data/out-of-date/
write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw'
aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials()
aws_access_key, aws_secret_key, session = AwsBaseHook(
aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
crash_report_parquet = SubDagOperator(
task_id="update_orphaning_dashboard_etl",

View file

@ -3,21 +3,21 @@ import os
from collections import namedtuple
from airflow import models
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.dataproc_operator import DataprocClusterDeleteOperator, DataProcSparkOperator, DataProcPySparkOperator
from airflow.exceptions import AirflowException
from airflow.utils.trigger_rule import TriggerRule
# Our own dataproc operator used to install component gateway
from operators.moz_dataproc_operator import DataprocClusterCreateOperator
"""
Note: We are currently on 1.10.7 and when we upgrade, the spark operators will move.
This module is deprecated. Please use `airflow.providers.google.cloud.operators.dataproc
"""
from airflow.operators.bash_operator import BashOperator
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
# When google deprecates dataproc_v1beta2 in DataprocHook/Operator classes
# We can import these from our patched code, rather than upgrading/deploying
# apache-airflow-providers-google > 6.0.0, and google-cloud-dataproc > 2.5.0
# from utils.patched.dataproc_operator import (
from airflow.providers.google.cloud.operators.dataproc import (
ClusterGenerator,
DataprocCreateClusterOperator,
DataprocDeleteClusterOperator,
DataprocSubmitPySparkJobOperator,
DataprocSubmitSparkJobOperator,
)
class DataProcHelper:
"""
@ -32,8 +32,8 @@ class DataProcHelper:
region='us-west1',
subnetwork_uri=None,
internal_ip_only=None,
idle_delete_ttl='14400',
auto_delete_ttl='28800',
idle_delete_ttl=14400,
auto_delete_ttl=28800,
master_machine_type='n1-standard-8',
worker_machine_type='n1-standard-4',
num_preemptible_workers=0,
@ -45,6 +45,7 @@ class DataProcHelper:
install_component_gateway=True,
aws_conn_id=None,
gcp_conn_id='google_cloud_airflow_dataproc',
project_id='airflow-dataproc',
artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
storage_bucket='moz-fx-data-prod-dataproc-scratch',
master_disk_type='pd-standard',
@ -99,12 +100,11 @@ class DataProcHelper:
self.install_component_gateway = install_component_gateway
self.aws_conn_id = aws_conn_id
self.gcp_conn_id = gcp_conn_id
self.connection = GoogleCloudBaseHook(gcp_conn_id=self.gcp_conn_id)
self.project_id = project_id
def create_cluster(self):
"""
Returns a DataprocClusterCreateOperator
Returns a DataprocCreateClusterOperator
"""
properties = {}
@ -115,7 +115,7 @@ class DataProcHelper:
if self.aws_conn_id:
for key, value in zip(
("access.key", "secret.key", "session.token"),
AwsHook(self.aws_conn_id).get_credentials(),
AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type='s3').get_credentials(),
):
if value is not None:
properties["core:fs.s3a." + key] = value
@ -133,48 +133,71 @@ class DataProcHelper:
}
metadata.update(self.additional_metadata)
return DataprocClusterCreateOperator(
task_id='create_dataproc_cluster',
cluster_name=self.cluster_name,
job_name=self.job_name,
gcp_conn_id=self.gcp_conn_id,
service_account=self.service_account,
project_id=self.connection.project_id,
cluster_generator = ClusterGenerator(
project_id = self.project_id,
num_workers = self.num_workers,
subnetwork_uri = self.subnetwork_uri,
internal_ip_only = self.internal_ip_only,
storage_bucket=self.storage_bucket,
num_workers=self.num_workers,
image_version=self.image_version,
properties=properties,
region=self.region,
subnetwork_uri=self.subnetwork_uri,
internal_ip_only=self.internal_ip_only,
idle_delete_ttl=self.idle_delete_ttl,
auto_delete_ttl=self.auto_delete_ttl,
master_machine_type=self.master_machine_type,
worker_machine_type=self.worker_machine_type,
num_preemptible_workers=self.num_preemptible_workers,
optional_components = self.optional_components,
install_component_gateway = self.install_component_gateway,
init_actions_uris=self.init_actions_uris,
metadata = metadata,
image_version=self.image_version,
properties = properties,
optional_components = self.optional_components,
master_machine_type=self.master_machine_type,
master_disk_type=self.master_disk_type,
master_disk_size=self.master_disk_size,
worker_machine_type=self.worker_machine_type,
worker_disk_type=self.worker_disk_type,
worker_disk_size=self.worker_disk_size,
master_num_local_ssds=self.master_num_local_ssds,
worker_num_local_ssds=self.worker_num_local_ssds,
metadata=metadata,
num_preemptible_workers=self.num_preemptible_workers,
service_account=self.service_account,
idle_delete_ttl=self.idle_delete_ttl,
auto_delete_ttl=self.auto_delete_ttl
)
cluster_config = cluster_generator.make()
# The DataprocCreateClusterOperator and ClusterGenerator don't support component gateway or local SSDs
# ClusterConfig format is
# https://cloud.google.com/dataproc/docs/reference/rpc/google.cloud.dataproc.v1#google.cloud.dataproc.v1.ClusterConfig
if self.install_component_gateway:
cluster_config.update({'endpoint_config' : {'enable_http_port_access' : True}})
if self.master_num_local_ssds > 0:
master_instance_group_config = cluster_config['master_config']
master_instance_group_config['disk_config']['num_local_ssds'] = self.master_num_local_ssds
cluster_config.update({'master_config' : master_instance_group_config})
if self.worker_num_local_ssds > 0:
worker_instance_group_config = cluster_config['worker_config']
worker_instance_group_config['disk_config']['num_local_ssds'] = self.worker_num_local_ssds
cluster_config.update({'worker_config' : worker_instance_group_config})
return DataprocCreateClusterOperator(
task_id='create_dataproc_cluster',
cluster_name=self.cluster_name,
project_id = self.project_id,
use_if_exists=True,
delete_on_error=True,
labels={ 'env': os.getenv('DEPLOY_ENVIRONMENT', 'env_not_set'),
'owner': os.getenv('AIRFLOW_CTX_DAG_OWNER', 'owner_not_set'),
'jobname': self.job_name.lower().replace('_', '-') },
gcp_conn_id=self.gcp_conn_id,
region=self.region,
cluster_config = cluster_config
)
def delete_cluster(self):
"""
Returns a DataprocClusterDeleteOperator
Returns a DataprocDeleteClusterOperator
"""
return DataprocClusterDeleteOperator(
return DataprocDeleteClusterOperator(
task_id='delete_dataproc_cluster',
trigger_rule=TriggerRule.ALL_DONE,
cluster_name=self.cluster_name,
region=self.region,
gcp_conn_id=self.gcp_conn_id,
project_id=self.connection.project_id)
project_id=self.project_id)
# End DataProcHelper
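Editor's note: a trimmed sketch of the pattern create_cluster() now follows, with placeholder values. ClusterGenerator does not expose the component gateway flag, so the generated ClusterConfig dict is patched before being handed to DataprocCreateClusterOperator:

from airflow.providers.google.cloud.operators.dataproc import (
    ClusterGenerator,
    DataprocCreateClusterOperator,
)

PROJECT_ID = "example-project"        # placeholder
GCP_CONN_ID = "google_cloud_example"  # placeholder

cluster_config = ClusterGenerator(
    project_id=PROJECT_ID,
    num_workers=2,
    image_version="1.5",
    master_machine_type="n1-standard-8",
    worker_machine_type="n1-standard-4",
    idle_delete_ttl=14400,
).make()

# ClusterGenerator has no flag for the component gateway, so enable it
# directly on the generated ClusterConfig dict.
cluster_config["endpoint_config"] = {"enable_http_port_access": True}

create_dataproc_cluster = DataprocCreateClusterOperator(
    task_id="create_dataproc_cluster",
    cluster_name="example-cluster",
    project_id=PROJECT_ID,
    region="us-west1",
    gcp_conn_id=GCP_CONN_ID,
    cluster_config=cluster_config,
)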
@ -187,8 +210,8 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
region='us-west1',
subnetwork_uri=None,
internal_ip_only=None,
idle_delete_ttl='10800',
auto_delete_ttl='21600',
idle_delete_ttl=10800,
auto_delete_ttl=21600,
master_machine_type='n1-standard-8',
worker_machine_type='n1-standard-4',
num_preemptible_workers=0,
@ -203,6 +226,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
job_name=None,
aws_conn_id=None,
gcp_conn_id='google_cloud_airflow_dataproc',
project_id='airflow-dataproc',
artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
storage_bucket='moz-fx-data-prod-dataproc-scratch',
master_disk_type='pd-standard',
@ -215,7 +239,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
"""
This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
Then we call DataProcPySparkOperator to execute the pyspark script defined by the argument
Then we call DataprocSubmitPySparkJobOperator to execute the pyspark script defined by the argument
python_driver_code. Once that succeeds, we teardown the cluster.
**Example**: ::
@ -281,6 +305,9 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
:param str aws_conn_id: Airflow connection id for S3 access (if needed).
:param str gcp_conn_id: The connection ID to use connecting to GCP.
:param str project_id: The project ID corresponding to the gcp_conn_id. We
add this because the dev environment doesn't parse it out
correctly from the dummy connections.
:param str artifact_bucket: Path to resources for bootstrapping the dataproc cluster
:param str storage_bucket: Path to scratch bucket for intermediate cluster results
:param list optional_components: List of optional components to install on cluster
@ -338,6 +365,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
install_component_gateway=install_component_gateway,
aws_conn_id=aws_conn_id,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
artifact_bucket=artifact_bucket,
storage_bucket=storage_bucket,
master_disk_type=master_disk_type,
@ -353,7 +381,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
with models.DAG(_dag_name, default_args=default_args) as dag:
create_dataproc_cluster = dataproc_helper.create_cluster()
run_pyspark_on_dataproc = DataProcPySparkOperator(
run_pyspark_on_dataproc = DataprocSubmitPySparkJobOperator(
task_id='run_dataproc_pyspark',
job_name=job_name,
cluster_name=cluster_name,
@ -361,6 +389,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
main=python_driver_code,
arguments=py_args,
gcp_conn_id=gcp_conn_id,
project_id=project_id
)
delete_dataproc_cluster = dataproc_helper.delete_cluster()
@ -379,8 +408,8 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
region='us-west1',
subnetwork_uri=None,
internal_ip_only=None,
idle_delete_ttl='14400',
auto_delete_ttl='28800',
idle_delete_ttl=14400,
auto_delete_ttl=28800,
master_machine_type='n1-standard-8',
worker_machine_type='n1-standard-4',
num_preemptible_workers=0,
@ -394,6 +423,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
job_name=None,
aws_conn_id=None,
gcp_conn_id='google_cloud_airflow_dataproc',
project_id='airflow-dataproc',
master_disk_type='pd-standard',
worker_disk_type='pd-standard',
master_disk_size=1024,
@ -404,7 +434,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
"""
This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
Then we call DataProcSparkOperator to execute the jar defined by the arguments
Then we call DataprocSubmitSparkJobOperator to execute the jar defined by the arguments
jar_urls and main_class. Once that succeeds, we teardown the cluster.
**Example**: ::
@ -468,6 +498,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
install_component_gateway=install_component_gateway,
aws_conn_id=aws_conn_id,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
master_disk_type=master_disk_type,
master_disk_size=master_disk_size,
worker_disk_type=worker_disk_type,
@ -481,15 +512,17 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
with models.DAG(_dag_name, default_args=default_args) as dag:
create_dataproc_cluster = dataproc_helper.create_cluster()
run_jar_on_dataproc = DataProcSparkOperator(
run_jar_on_dataproc = DataprocSubmitSparkJobOperator(
cluster_name=cluster_name,
region=region,
task_id='run_jar_on_dataproc',
job_name=job_name,
dataproc_spark_jars=jar_urls,
dataproc_jars=jar_urls,
main_class=main_class,
arguments=jar_args,
gcp_conn_id=gcp_conn_id)
gcp_conn_id=gcp_conn_id,
project_id=project_id
)
delete_dataproc_cluster = dataproc_helper.delete_cluster()
@ -512,8 +545,8 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
region='us-west1',
subnetwork_uri=None,
internal_ip_only=None,
idle_delete_ttl='14400',
auto_delete_ttl='28800',
idle_delete_ttl=14400,
auto_delete_ttl=28800,
master_machine_type='n1-standard-8',
worker_machine_type='n1-standard-4',
num_preemptible_workers=0,
@ -527,6 +560,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
job_name=None,
aws_conn_id=None,
gcp_conn_id='google_cloud_airflow_dataproc',
project_id='airflow-dataproc',
master_disk_type='pd-standard',
worker_disk_type='pd-standard',
master_disk_size=1024,
@ -538,7 +572,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
"""
This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
Then we execute a script uri (either https or gcs) similar to how we use our custom AWS
EmrSparkOperator. This will call DataProcSparkOperator using EMR's script-runner.jar, which
EmrSparkOperator. This will call DataprocSubmitSparkJobOperator using EMR's script-runner.jar, which
then executes the airflow_gcp.sh entrypoint script. The entrypoint script expects another
script uri, along with its arguments, as parameters. Once that succeeds, we tear down the
cluster.
@ -609,6 +643,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
install_component_gateway=install_component_gateway,
aws_conn_id=aws_conn_id,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
master_disk_type=master_disk_type,
master_disk_size=master_disk_size,
worker_disk_type=worker_disk_type,
@ -636,17 +671,19 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
with models.DAG(_dag_name, default_args=default_args) as dag:
create_dataproc_cluster = dataproc_helper.create_cluster()
# Run DataprocSparkOperator with script-runner.jar pointing to airflow_gcp.sh.
# Run DataprocSubmitSparkJobOperator with script-runner.jar pointing to airflow_gcp.sh.
run_script_on_dataproc = DataProcSparkOperator(
run_script_on_dataproc = DataprocSubmitSparkJobOperator(
cluster_name=cluster_name,
region=region,
task_id='run_script_on_dataproc',
job_name=job_name,
dataproc_spark_jars=[jar_url],
dataproc_jars=[jar_url],
main_class='com.amazon.elasticmapreduce.scriptrunner.ScriptRunner',
arguments=args,
gcp_conn_id=gcp_conn_id)
gcp_conn_id=gcp_conn_id,
project_id=project_id
)
delete_dataproc_cluster = dataproc_helper.delete_cluster()
@ -715,13 +752,13 @@ def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"):
and should either be the production default ("dataproc-runner-prod"), or a
service key associated with a sandbox account.
"""
gcp_conn = GoogleCloudBaseHook(conn_id)
keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
dev_project_id = "replace_me"
dev_client_email = "replace_me"
project_id = keyfile["project_id"]
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
project_id = "airflow-dataproc" if is_dev else dev_project_id
client_email = (
keyfile["client_email"]
dev_client_email
if is_dev
else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com"
)

View file

@ -1,14 +1,12 @@
from operators.gcp_container_operator import GKEPodOperator
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
def simpleprophet_forecast(
task_id,
datasource,
project_id,
dataset_id,
table_id,
gcp_conn_id="google_cloud_derived_datasets",
project_id='moz-fx-data-derived-datasets',
gke_location="us-central1-a",
gke_cluster_name="bq-load-gke-1",
gke_namespace="default",
@ -25,6 +23,7 @@ def simpleprophet_forecast(
:param str table_id: [Required] ID of target table
:param str gcp_conn_id: Airflow connection id for GCP access
:param str project_id: GCP project id associated with gcp_conn_id
:param str gke_location: GKE cluster location
:param str gke_cluster_name: GKE cluster name
:param str gke_namespace: GKE cluster namespace
@ -40,7 +39,7 @@ def simpleprophet_forecast(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=project_id,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

View file

@ -1,20 +1,25 @@
from airflow import models
from airflow.utils import trigger_rule
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.dataproc_operator import DataprocClusterCreateOperator, DataprocClusterDeleteOperator, DataProcSparkOperator, DataProcPySparkOperator # noqa
from operators.gcp_container_operator import GKEPodOperator
from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator # noqa:E501
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.gcp_transfer_operator import S3ToGoogleCloudStorageTransferOperator # noqa:E501
from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from airflow.providers.google.cloud.operators.dataproc import (
DataprocCreateClusterOperator,
DataprocDeleteClusterOperator,
DataprocSubmitPySparkJobOperator,
)
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
import json
import re
GCP_PROJECT_ID = "moz-fx-data-derived-datasets"
def export_to_parquet(
table,
@ -67,7 +72,7 @@ def export_to_parquet(
cluster_name += "-export-{{ ds_nodash }}"
dag_prefix = parent_dag_name + "." if parent_dag_name else ""
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
project_id = GCP_PROJECT_ID
if destination_table is None:
destination_table = unqualified_table
@ -82,11 +87,12 @@ def export_to_parquet(
with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:
create_dataproc_cluster = DataprocClusterCreateOperator(
create_dataproc_cluster = DataprocCreateClusterOperator(
task_id="create_dataproc_cluster",
cluster_name=cluster_name,
gcp_conn_id=gcp_conn_id,
project_id=connection.project_id,
region="us-west1",
project_id=project_id,
num_workers=num_workers,
image_version="1.4",
storage_bucket=dataproc_storage_bucket,
@ -100,13 +106,13 @@ def export_to_parquet(
metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
)
run_dataproc_pyspark = DataProcPySparkOperator(
run_dataproc_pyspark = DataprocSubmitPySparkJobOperator(
task_id="run_dataproc_pyspark",
cluster_name=cluster_name,
dataproc_pyspark_jars=[
dataproc_jars=[
"gs://spark-lib/bigquery/spark-bigquery-latest.jar"
],
dataproc_pyspark_properties={
dataproc_properties={
"spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
},
main="https://raw.githubusercontent.com/mozilla/bigquery-etl/main"
@ -125,31 +131,33 @@ def export_to_parquet(
+ [static_partitions]
+ arguments,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
)
delete_dataproc_cluster = DataprocClusterDeleteOperator(
delete_dataproc_cluster = DataprocDeleteClusterOperator(
task_id="delete_dataproc_cluster",
cluster_name=cluster_name,
gcp_conn_id=gcp_conn_id,
project_id=connection.project_id,
trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
project_id=project_id,
trigger_rule="all_done",
region="us-west1",
)
if not use_storage_api:
avro_export = BigQueryToCloudStorageOperator(
avro_export = BigQueryToGCSOperator(
task_id="avro_export",
source_project_dataset_table=table,
destination_cloud_storage_uris=avro_path,
compression=None,
export_format="AVRO",
bigquery_conn_id=gcp_conn_id,
gcp_conn_id=gcp_conn_id,
)
avro_delete = GoogleCloudStorageDeleteOperator(
avro_delete = GCSDeleteObjectsOperator(
task_id="avro_delete",
bucket_name=gcs_output_bucket,
prefix=avro_prefix,
google_cloud_storage_conn_id=gcp_conn_id,
trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
gcp_conn_id=gcp_conn_id,
trigger_rule="all_done",
)
avro_export >> run_dataproc_pyspark >> avro_delete
@ -210,7 +218,7 @@ def bigquery_etl_query(
parameters += (date_partition_parameter + ":DATE:{{ds}}",)
return GKEPodOperator(
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=project_id,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,
@ -278,7 +286,7 @@ def bigquery_etl_copy_deduplicate(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,
@ -341,7 +349,7 @@ def bigquery_xcom_query(
query = "{{ " + "task_instance.xcom_pull({!r})".format(xcom_task_id) + " }}"
return GKEPodOperator(
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,
@ -407,7 +415,7 @@ def gke_command(
key: value
for key, value in zip(
("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"),
AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (),
AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials() if aws_conn_id else (),
)
if value is not None}
context_env_vars["XCOM_PUSH"] = json.dumps(xcom_push)
@ -416,7 +424,7 @@ def gke_command(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

View file

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -126,5 +126,3 @@ AUTH_ROLE_PUBLIC = 'Admin'
# APP_THEME = "superhero.css"
# APP_THEME = "united.css"
# APP_THEME = "yeti.css"

View file

@ -5,6 +5,7 @@ services:
image: mysql:5.7
ports:
- '3306:3306'
# command: ['--explicit_defaults_for_timestamp=1', '--character-set-server=utf8mb4']
command: ['--explicit_defaults_for_timestamp=1']
environment:
MYSQL_ROOT_PASSWORD: secret
@ -51,10 +52,11 @@ services:
- AIRFLOW_EMAIL_BACKEND=airflow.macros.log_email_backend.log_email_backend
- AIRFLOW__KUBERNETES__IN_CLUSTER=False
- URL=http://localhost:8000
- WEBSERVER_USE_RBAC=False
# URL-encoded dummy connections; note that we define some other connections
# in the bin/run script
- AIRFLOW_CONN_ADM_SFTP=ftp://myname:mypassword@myhost.com:8000?known_hosts=myhost.com+AAAABBBBB
# TODO(hwoo) - improve developer workflow by not loading all dags
# - AIRFLOW__CORE__DAGS_FOLDER=$AIRFLOW_HOME/devdags
web:
extends:

View file

@ -8,14 +8,8 @@ from airflow import configuration
# Backfill Plugin Imports
from backfill.main import Backfill
# Get RBAC config.
rbac_authentication_enabled = configuration.getboolean("webserver", "RBAC")
# Init the plugin in Webserver's "Admin" Menu with Menu Item as "Backfill"
if rbac_authentication_enabled == True:
backfill_admin_view = {"category" : "Admin", "name" : "Backfill (Alpha)", "view": Backfill()}
else:
backfill_admin_view = Backfill(category="Admin", name="Backfill (Alpha)")
backfill_admin_view = {"category" : "Admin", "name" : "Backfill (Alpha)", "view": Backfill()}
# Creating a flask blueprint to integrate the templates folder
backfill_blueprint = Blueprint(

View file

@ -33,8 +33,6 @@ else:
# Local file where history will be stored
FILE = airflow_home_path + '/logs/backfill_history.txt'
rbac_authentication_enabled = configuration.getboolean("webserver", "RBAC")
# RE for remove ansi escape characters
ansi_escape = re.compile(r'\x1B[@-_][0-?]*[ -/]*[@-~]')
@ -58,26 +56,15 @@ def file_ops(mode, data=None):
return 1
def get_baseview():
if rbac_authentication_enabled == True:
return AppBuilderBaseView
else:
return BaseView
return AppBuilderBaseView
class Backfill(get_baseview()):
route_base = "/admin/backfill/"
if rbac_authentication_enabled == True:
@app_builder_expose('/')
def list(self):
""" Render the backfill page to client with RBAC"""
return self.render_template("backfill_page.html",
rbac_authentication_enabled=rbac_authentication_enabled)
else:
@expose('/')
def base(self):
""" Render the backfill page to client """
return self.render("backfill_page.html")
@app_builder_expose('/')
def list(self):
return self.render_template("backfill_page.html")
@expose('/stream')
@app_builder_expose('/stream')
@ -106,9 +93,10 @@ class Backfill(get_baseview()):
if use_task_regex == 'true':
cmd.extend(['-t', str(task_regex)])
elif clear == 'false':
cmd.append('dags')
cmd.append('backfill')
if dry_run == 'true':
cmd.append('--dry_run')
cmd.append('--dry-run')
if use_task_regex == 'true':
cmd.extend(['-t', str(task_regex)])
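Editor's note: the plugin change above reflects the Airflow 2 CLI, where backfill lives under the dags group and the dry-run flag is spelled with a dash. A sketch of the equivalent command for a hypothetical DAG (dates, task regex and DAG id are placeholders):

import subprocess

cmd = [
    "airflow", "dags", "backfill",
    "--dry-run",
    "-t", "my_task_.*",  # optional task regex, placeholder
    "-s", "2021-10-01",  # start date, placeholder
    "-e", "2021-10-02",  # end date, placeholder
    "example_dag",       # DAG id, placeholder
]
subprocess.run(cmd, check=True)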

View file

@ -4,21 +4,20 @@ Plugin that adds a "Mozilla" entry to the top bar with some useful links.
Based on an example at
https://github.com/airflow-plugins/Getting-Started/blob/master/Tutorial/creating-ui-modification.md
"""
from airflow.plugins_manager import AirflowPlugin
from flask_admin.base import MenuLink
telemetry_airflow = MenuLink(
category="Mozilla",
name="telemetry-airflow on GitHub",
url="https://github.com/mozilla/telemetry-airflow")
telemetry_airflow = {
"name": "telemetry-airflow on GitHub",
"category": "Mozilla",
"href": "https://github.com/mozilla/telemetry-airflow"
}
wtmo_dev = MenuLink(
category="Mozilla",
name="WTMO Developer Guide",
url="https://mana.mozilla.org/wiki/display/DOPS/WTMO+Developer+Guide")
wtmo_dev = {
"name": "WTMO Developer Guide",
"category": "Mozilla",
"href": "https://mana.mozilla.org/wiki/display/DOPS/WTMO+Developer+Guide"
}
class MozMenuPlugin(AirflowPlugin):
name = "Mozilla"
@ -26,5 +25,5 @@ class MozMenuPlugin(AirflowPlugin):
flask_blueprints = []
hooks = []
executors = []
admin_views = []
menu_links = [telemetry_airflow, wtmo_dev]
appbuilder_views = []
appbuilder_menu_items = [telemetry_airflow, wtmo_dev]

View file

@ -1,4 +1,4 @@
{% extends 'airflow/master.html' %}
{% extends base_template %}
{% block title %}Airflow - Backfill Plugin{% endblock %}
@ -320,13 +320,12 @@
</style>
{% endblock %}
{% block body %}
{% if rbac_authentication_enabled %}
{% block navbar %}
<header class="top" role="header">
{% include 'appbuilder/navbar.html' %}
</header>
{% endblock %}
{%endif%}
<div class="container">
<h1>Backfill (Alpha)</h1>

View file

@ -1,16 +1,11 @@
boto3==1.15.18
botocore<1.19.0,>=1.18.0
kombu==4.6.10 # CeleryExecutor issues with 1.10.2 supposedly fixed in 1.10.5 airflow, but still observed issues on 1.10.7
importlib-metadata==2.1.0
importlib-metadata>=1.7
argcomplete==1.12.2
pandas-gbq==0.14.1
# removed hdfs
apache-airflow[celery,postgres,hive,jdbc,async,password,crypto,github_enterprise,datadog,statsd,s3,mysql,google_auth,gcp_api,kubernetes]==1.10.15
apache-airflow-upgrade-check
# Airflow 2.0 backported providers
apache-airflow-backport-providers-google
apache-airflow-backport-providers-amazon
apache-airflow-backport-providers-http
apache-airflow[amazon,celery,postgres,apache.hive,jdbc,async,password,crypto,github_enterprise,datadog,statsd,mysql,google_auth,cncf.kubernetes]==2.1.1
cryptography>=3.2
mozlogging
retrying
@ -19,11 +14,14 @@ redis
hiredis
requests
jsonschema
flask-admin
Flask-OAuthlib
Authlib~=0.15.3
Flask-AppBuilder>=3.3.0
pytz
werkzeug==0.16.0
werkzeug>=1.0.1,~=1.0
# The next requirements are for kubernetes-client/python
urllib3>=1.24.2 # MIT
urllib3>=1.24.2 # MIT
ipaddress>=1.0.17;python_version=="2.7" # PSF
websocket-client>=0.32.0,!=0.40.0,!=0.41.*,!=0.42.* # LGPLv2+
# Pin to older version, newer version has issues
@ -31,4 +29,13 @@ JPype1==0.7.1
shelljob==0.5.6
# Fix no inspection available issue
# https://github.com/apache/airflow/issues/8211
SQLAlchemy==1.3.15
SQLAlchemy>=1.3.18
# Airflow 2 no longer installs http provider by default, until chardet becomes an optional dependency of requests
apache-airflow-providers-http
airflow-provider-fivetran
# Upgrade google dataproc provider to fix beta client clusterConfig and mismatch issues
apache-airflow-providers-google==5.0.0
# 2.4.0 is broken for dataproc cluster create/delete
# 2.6.0 and 3.0.0 are newer but not compatible with apache-airflow-providers-google
# yet until maybe v7.0.0 bc 'google.cloud.dataproc_v1beta2' is deprecated
google-cloud-dataproc==2.5.0

View file

@ -4,156 +4,184 @@
#
# pip-compile
#
airflow-provider-fivetran==1.0.1 # via -r requirements.in
alembic==1.6.5 # via apache-airflow
amqp==2.6.1 # via kombu
apache-airflow-backport-providers-amazon==2021.3.3 # via -r requirements.in
apache-airflow-backport-providers-google==2021.3.3 # via -r requirements.in
apache-airflow-backport-providers-http==2021.4.10 # via -r requirements.in
apache-airflow-upgrade-check==1.4.0 # via -r requirements.in
apache-airflow[async,celery,crypto,datadog,gcp_api,github_enterprise,google_auth,hive,jdbc,kubernetes,mysql,password,postgres,s3,statsd]==1.10.15 # via -r requirements.in, apache-airflow-backport-providers-amazon, apache-airflow-backport-providers-google, apache-airflow-upgrade-check
apispec[yaml]==1.3.3 # via flask-appbuilder
argcomplete==1.12.2 # via -r requirements.in, apache-airflow
anyio==3.3.0 # via httpcore
apache-airflow-providers-amazon==2.1.0 # via apache-airflow
apache-airflow-providers-apache-hive==2.0.1 # via apache-airflow
apache-airflow-providers-celery==2.0.0 # via apache-airflow
apache-airflow-providers-cncf-kubernetes==2.0.2 # via apache-airflow
apache-airflow-providers-datadog==2.0.0 # via apache-airflow
apache-airflow-providers-ftp==2.0.0 # via apache-airflow
apache-airflow-providers-google==5.0.0 # via -r requirements.in
apache-airflow-providers-http==2.0.0 # via -r requirements.in
apache-airflow-providers-imap==2.0.0 # via apache-airflow
apache-airflow-providers-jdbc==2.0.0 # via apache-airflow
apache-airflow-providers-mysql==2.1.0 # via apache-airflow
apache-airflow-providers-postgres==2.0.0 # via apache-airflow
apache-airflow-providers-sqlite==2.0.0 # via apache-airflow
apache-airflow[amazon,apache.hive,async,celery,cncf.kubernetes,crypto,datadog,github_enterprise,google_auth,jdbc,mysql,password,postgres,statsd]==2.1.1 # via -r requirements.in, airflow-provider-fivetran, apache-airflow-providers-amazon, apache-airflow-providers-apache-hive, apache-airflow-providers-celery, apache-airflow-providers-cncf-kubernetes, apache-airflow-providers-datadog, apache-airflow-providers-google, apache-airflow-providers-http, apache-airflow-providers-jdbc, apache-airflow-providers-mysql, apache-airflow-providers-postgres
apispec[yaml]==3.3.2 # via flask-appbuilder
argcomplete==1.12.2 # via -r requirements.in, apache-airflow, nox
attrs==20.3.0 # via apache-airflow, cattrs, jsonschema
authlib==0.15.4 # via -r requirements.in
babel==2.9.1 # via flask-babel
backports.entry-points-selectable==1.1.0 # via virtualenv
bcrypt==3.2.0 # via apache-airflow, flask-bcrypt
billiard==3.6.4.0 # via celery
boto3==1.15.18 # via -r requirements.in, apache-airflow, apache-airflow-backport-providers-amazon, watchtower
botocore==1.18.18 # via -r requirements.in, apache-airflow-backport-providers-amazon, boto3, s3transfer
blinker==1.4 # via apache-airflow
boto3==1.15.18 # via -r requirements.in, apache-airflow-providers-amazon, watchtower
botocore==1.18.18 # via -r requirements.in, boto3, s3transfer
cached-property==1.5.2 # via apache-airflow
cachetools==4.2.2 # via google-auth
cattrs==1.7.1 # via apache-airflow
celery==4.4.7 # via apache-airflow, flower
certifi==2021.5.30 # via kubernetes, requests
cattrs==1.5.0 # via apache-airflow
celery==4.4.7 # via apache-airflow-providers-celery, flower
certifi==2021.5.30 # via httpx, kubernetes, requests
cffi==1.14.6 # via bcrypt, cryptography, google-crc32c
chardet==3.0.4 # via requests
click==7.1.2 # via flask, flask-appbuilder, hmsclient
colorama==0.4.4 # via flask-appbuilder
colorlog==4.0.2 # via apache-airflow
configparser==3.5.3 # via apache-airflow
charset-normalizer==2.0.4 # via httpx
click==7.1.2 # via clickclick, flask, flask-appbuilder, hmsclient
clickclick==20.10.2 # via apache-airflow
colorama==0.4.4 # via flask-appbuilder, rich
colorlog==4.0.2 # via apache-airflow, nox
commonmark==0.9.1 # via rich
croniter==0.3.37 # via apache-airflow
cryptography==3.4.7 # via -r requirements.in, apache-airflow, pyopenssl
datadog==0.42.0 # via apache-airflow
cryptography==3.4.7 # via -r requirements.in, apache-airflow, apache-airflow-providers-cncf-kubernetes, authlib, pyopenssl
datadog==0.42.0 # via apache-airflow-providers-datadog
defusedxml==0.7.1 # via python3-openid
dill==0.3.4 # via apache-airflow
distlib==0.3.2 # via virtualenv
dnspython==1.16.0 # via email-validator, eventlet
docutils==0.17.1 # via python-daemon
email-validator==1.1.3 # via apache-airflow, flask-appbuilder
docutils==0.16 # via apache-airflow, python-daemon
email-validator==1.1.3 # via flask-appbuilder
eventlet==0.31.1 # via apache-airflow
flask-admin==1.5.4 # via apache-airflow
flask-appbuilder==2.3.4 # via apache-airflow
filelock==3.0.12 # via virtualenv
flask-admin==1.5.8 # via -r requirements.in
flask-appbuilder==3.3.2 # via -r requirements.in, apache-airflow
flask-babel==1.0.0 # via flask-appbuilder
flask-bcrypt==0.7.1 # via apache-airflow
flask-caching==1.3.3 # via apache-airflow
flask-caching==1.10.1 # via apache-airflow
flask-jwt-extended==3.25.1 # via flask-appbuilder
flask-login==0.4.1 # via apache-airflow, flask-appbuilder
flask-oauthlib==0.9.5 # via -r requirements.in, apache-airflow
flask-openid==1.2.5 # via flask-appbuilder
flask-sqlalchemy==2.5.1 # via flask-appbuilder
flask-swagger==0.2.14 # via apache-airflow
flask-wtf==0.14.3 # via apache-airflow, flask-appbuilder
flask==1.1.4 # via apache-airflow, flask-admin, flask-appbuilder, flask-babel, flask-bcrypt, flask-caching, flask-jwt-extended, flask-login, flask-oauthlib, flask-openid, flask-sqlalchemy, flask-swagger, flask-wtf
flower==0.9.7 # via apache-airflow
funcsigs==1.0.2 # via apache-airflow
future==0.18.2 # via apache-airflow, pyhive
flask==1.1.4 # via apache-airflow, flask-admin, flask-appbuilder, flask-babel, flask-bcrypt, flask-caching, flask-jwt-extended, flask-login, flask-oauthlib, flask-openid, flask-sqlalchemy, flask-wtf
flower==0.9.7 # via apache-airflow-providers-celery
future==0.18.2 # via pyhive
gevent==21.1.2 # via apache-airflow
google-ads==7.0.0 # via apache-airflow-backport-providers-google
google-api-core[grpc,grpcgcp]==1.31.0 # via apache-airflow-backport-providers-google, google-ads, google-api-python-client, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-bigtable, google-cloud-container, google-cloud-core, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-dlp, google-cloud-kms, google-cloud-language, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-secret-manager, google-cloud-spanner, google-cloud-speech, google-cloud-tasks, google-cloud-texttospeech, google-cloud-translate, google-cloud-videointelligence, google-cloud-vision, google-cloud-workflows
google-api-python-client==1.12.8 # via apache-airflow, apache-airflow-backport-providers-google
google-auth-httplib2==0.1.0 # via apache-airflow, apache-airflow-backport-providers-google, google-api-python-client
google-ads==13.0.0 # via apache-airflow-providers-google
google-api-core[grpc,grpcgcp]==1.31.0 # via apache-airflow-providers-google, google-ads, google-api-python-client, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-bigtable, google-cloud-container, google-cloud-core, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-dlp, google-cloud-kms, google-cloud-language, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-secret-manager, google-cloud-spanner, google-cloud-speech, google-cloud-tasks, google-cloud-texttospeech, google-cloud-translate, google-cloud-videointelligence, google-cloud-vision, google-cloud-workflows
google-api-python-client==1.12.8 # via apache-airflow-providers-google
google-auth-httplib2==0.1.0 # via apache-airflow-providers-google, google-api-python-client
google-auth-oauthlib==0.4.4 # via google-ads, pandas-gbq, pydata-google-auth
google-auth==1.32.1 # via apache-airflow, apache-airflow-backport-providers-google, google-api-core, google-api-python-client, google-auth-httplib2, google-auth-oauthlib, google-cloud-core, google-cloud-storage, kubernetes, pandas-gbq, pydata-google-auth
google-auth==1.32.1 # via apache-airflow-providers-google, google-api-core, google-api-python-client, google-auth-httplib2, google-auth-oauthlib, google-cloud-core, google-cloud-storage, kubernetes, pandas-gbq, pydata-google-auth
google-cloud-appengine-logging==0.1.1 # via google-cloud-logging
google-cloud-audit-log==0.1.0 # via google-cloud-logging
google-cloud-automl==2.4.0 # via apache-airflow-backport-providers-google
google-cloud-bigquery-datatransfer==3.3.0 # via apache-airflow-backport-providers-google
google-cloud-automl==2.4.0 # via apache-airflow-providers-google
google-cloud-bigquery-datatransfer==3.3.0 # via apache-airflow-providers-google
google-cloud-bigquery-storage==2.6.0 # via google-cloud-bigquery
google-cloud-bigquery[bqstorage,pandas]==2.20.0 # via pandas-gbq
google-cloud-bigtable==1.7.0 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-container==1.0.1 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-bigtable==1.7.0 # via apache-airflow-providers-google
google-cloud-container==1.0.1 # via apache-airflow-providers-google
google-cloud-core==1.7.1 # via google-cloud-bigquery, google-cloud-bigtable, google-cloud-logging, google-cloud-spanner, google-cloud-storage, google-cloud-translate
google-cloud-datacatalog==3.3.0 # via apache-airflow-backport-providers-google
google-cloud-dataproc==2.4.0 # via apache-airflow-backport-providers-google
google-cloud-dlp==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-kms==2.4.0 # via apache-airflow-backport-providers-google
google-cloud-language==1.3.0 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-logging==2.5.0 # via apache-airflow-backport-providers-google
google-cloud-memcache==1.1.0 # via apache-airflow-backport-providers-google
google-cloud-monitoring==2.4.0 # via apache-airflow-backport-providers-google
google-cloud-os-login==2.2.1 # via apache-airflow-backport-providers-google
google-cloud-pubsub==2.6.1 # via apache-airflow-backport-providers-google
google-cloud-redis==2.2.0 # via apache-airflow-backport-providers-google
google-cloud-secret-manager==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-spanner==1.19.1 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-speech==1.3.2 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-storage==1.40.0 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-tasks==2.4.0 # via apache-airflow-backport-providers-google
google-cloud-texttospeech==1.0.1 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-translate==1.7.0 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-videointelligence==1.16.1 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-vision==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
google-cloud-workflows==1.1.0 # via apache-airflow-backport-providers-google
google-cloud-datacatalog==3.3.0 # via apache-airflow-providers-google
google-cloud-dataproc==2.5.0 # via -r requirements.in, apache-airflow-providers-google
google-cloud-dlp==1.0.0 # via apache-airflow-providers-google
google-cloud-kms==2.4.0 # via apache-airflow-providers-google
google-cloud-language==1.3.0 # via apache-airflow-providers-google
google-cloud-logging==2.5.0 # via apache-airflow-providers-google
google-cloud-memcache==1.0.0 # via apache-airflow-providers-google
google-cloud-monitoring==2.4.0 # via apache-airflow-providers-google
google-cloud-os-login==2.2.1 # via apache-airflow-providers-google
google-cloud-pubsub==2.6.1 # via apache-airflow-providers-google
google-cloud-redis==2.2.0 # via apache-airflow-providers-google
google-cloud-secret-manager==1.0.0 # via apache-airflow-providers-google
google-cloud-spanner==1.19.1 # via apache-airflow-providers-google
google-cloud-speech==1.3.2 # via apache-airflow-providers-google
google-cloud-storage==1.40.0 # via apache-airflow-providers-google
google-cloud-tasks==2.4.0 # via apache-airflow-providers-google
google-cloud-texttospeech==1.0.1 # via apache-airflow-providers-google
google-cloud-translate==1.7.0 # via apache-airflow-providers-google
google-cloud-videointelligence==1.16.1 # via apache-airflow-providers-google
google-cloud-vision==1.0.0 # via apache-airflow-providers-google
google-cloud-workflows==1.1.0 # via apache-airflow-providers-google
google-crc32c==1.1.2 # via google-resumable-media
google-resumable-media==1.3.1 # via google-cloud-bigquery, google-cloud-storage
googleapis-common-protos[grpc]==1.53.0 # via google-ads, google-api-core, google-cloud-audit-log, grpc-google-iam-v1
graphviz==0.16 # via apache-airflow
greenlet==1.1.0 # via apache-airflow, eventlet, gevent
grpc-google-iam-v1==0.12.3 # via google-cloud-bigtable, google-cloud-container, google-cloud-datacatalog, google-cloud-kms, google-cloud-pubsub, google-cloud-secret-manager, google-cloud-spanner, google-cloud-tasks
grpcio-gcp==0.2.2 # via apache-airflow, apache-airflow-backport-providers-google, google-api-core
grpcio-gcp==0.2.2 # via apache-airflow-providers-google, google-api-core
grpcio==1.38.1 # via google-ads, google-api-core, google-cloud-bigquery, google-cloud-pubsub, googleapis-common-protos, grpc-google-iam-v1, grpcio-gcp
gunicorn==20.1.0 # via apache-airflow
h11==0.12.0 # via httpcore
hiredis==2.0.0 # via -r requirements.in
hmsclient==0.1.1 # via apache-airflow
hmsclient==0.1.1 # via apache-airflow-providers-apache-hive
httpcore==0.13.6 # via httpx
httplib2==0.19.1 # via google-api-python-client, google-auth-httplib2
httpx==0.19.0 # via apache-airflow, apache-airflow-providers-google
humanize==3.10.0 # via flower
idna==2.10 # via email-validator, requests
importlib-metadata==2.1.0 # via -r requirements.in, apache-airflow, apache-airflow-upgrade-check, argcomplete, importlib-resources, jsonschema, kombu
idna==2.10 # via anyio, email-validator, requests, rfc3986
importlib-metadata==1.7.0 # via -r requirements.in, apache-airflow, argcomplete, importlib-resources, jsonschema, kombu, nox, virtualenv
importlib-resources==1.5.0 # via apache-airflow
inflection==0.5.1 # via apache-airflow
iso8601==0.1.14 # via apache-airflow
itsdangerous==1.1.0 # via flask, flask-wtf
jaydebeapi==1.2.3 # via apache-airflow
jinja2==2.11.3 # via apache-airflow, flask, flask-babel, python-nvd3
isodate==0.6.0 # via openapi-schema-validator
itsdangerous==1.1.0 # via apache-airflow, flask, flask-wtf
jaydebeapi==1.2.3 # via apache-airflow-providers-jdbc
jinja2==2.11.3 # via apache-airflow, flask, flask-babel, python-nvd3, swagger-ui-bundle
jmespath==0.10.0 # via boto3, botocore
jpype1==0.7.1 # via -r requirements.in, apache-airflow, jaydebeapi
json-merge-patch==0.2 # via apache-airflow, apache-airflow-backport-providers-google
jsonschema==3.2.0 # via -r requirements.in, apache-airflow, flask-appbuilder
jpype1==0.7.1 # via -r requirements.in, jaydebeapi
json-merge-patch==0.2 # via apache-airflow-providers-google
jsonschema==3.2.0 # via -r requirements.in, apache-airflow, flask-appbuilder, openapi-schema-validator, openapi-spec-validator
kombu==4.6.10 # via -r requirements.in, celery
kubernetes==11.0.0 # via apache-airflow
kubernetes==11.0.0 # via apache-airflow-providers-cncf-kubernetes
lazy-object-proxy==1.4.3 # via apache-airflow
libcst==0.3.19 # via google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-os-login, google-cloud-pubsub, google-cloud-workflows
lockfile==0.12.2 # via python-daemon
lockfile==0.12.2 # via apache-airflow, python-daemon
mako==1.1.4 # via alembic
markdown==2.6.11 # via apache-airflow
markupsafe==2.0.1 # via jinja2, mako, wtforms
markupsafe==1.1.1 # via apache-airflow, jinja2, mako, wtforms
marshmallow-enum==1.5.1 # via flask-appbuilder
marshmallow-sqlalchemy==0.23.1 # via apache-airflow, flask-appbuilder
marshmallow==2.21.0 # via flask-appbuilder, marshmallow-enum, marshmallow-sqlalchemy
marshmallow-oneofschema==3.0.1 # via apache-airflow
marshmallow-sqlalchemy==0.23.1 # via flask-appbuilder
marshmallow==3.13.0 # via flask-appbuilder, marshmallow-enum, marshmallow-oneofschema, marshmallow-sqlalchemy
mozlogging==0.1.0 # via -r requirements.in
mypy-extensions==0.4.3 # via typing-inspect
mysqlclient==1.3.14 # via apache-airflow
mysql-connector-python==8.0.22 # via apache-airflow-providers-mysql
mysqlclient==1.3.14 # via apache-airflow-providers-mysql
natsort==7.1.1 # via croniter
newrelic==6.4.4.161 # via -r requirements.in
numpy==1.21.0 # via pandas, pyarrow
nox==2020.12.31 # via google-ads
numpy==1.21.0 # via apache-airflow, pandas, pyarrow
oauthlib==2.1.0 # via apache-airflow, flask-oauthlib, requests-oauthlib
packaging==21.0 # via apache-airflow, apache-airflow-upgrade-check, google-api-core, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
pandas-gbq==0.14.1 # via -r requirements.in, apache-airflow, apache-airflow-backport-providers-google
openapi-schema-validator==0.1.5 # via openapi-spec-validator
openapi-spec-validator==0.3.1 # via apache-airflow
packaging==21.0 # via google-api-core, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-monitoring, google-cloud-os-login, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
pandas-gbq==0.14.1 # via -r requirements.in, apache-airflow-providers-google
pandas==1.3.0 # via apache-airflow, google-cloud-bigquery, pandas-gbq
pendulum==1.4.4 # via apache-airflow
pendulum==2.1.2 # via apache-airflow
platformdirs==2.2.0 # via virtualenv
prison==0.1.3 # via flask-appbuilder
prometheus-client==0.8.0 # via flower
proto-plus==1.19.0 # via google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
protobuf==3.17.3 # via google-ads, google-api-core, google-cloud-audit-log, google-cloud-bigquery, googleapis-common-protos, proto-plus
proto-plus==1.19.0 # via google-ads, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
protobuf==3.17.3 # via google-api-core, google-cloud-audit-log, google-cloud-bigquery, googleapis-common-protos, mysql-connector-python, proto-plus
psutil==5.8.0 # via apache-airflow
psycopg2-binary==2.9.1 # via apache-airflow
psycopg2-binary==2.9.1 # via apache-airflow-providers-postgres
pure-sasl==0.6.2 # via thrift-sasl
py==1.10.0 # via nox
pyarrow==4.0.1 # via google-cloud-bigquery
pyasn1-modules==0.2.8 # via google-auth
pyasn1==0.4.8 # via pyasn1-modules, rsa
pycparser==2.20 # via cffi
pydata-google-auth==1.2.0 # via pandas-gbq
pygments==2.9.0 # via apache-airflow
pyhive[hive]==0.6.4 # via apache-airflow
pyjwt==1.7.1 # via flask-appbuilder, flask-jwt-extended
pyopenssl==20.0.1 # via apache-airflow, apache-airflow-backport-providers-google
pygments==2.9.0 # via apache-airflow, rich
pyhive[hive]==0.6.4 # via apache-airflow-providers-apache-hive
pyjwt==1.7.1 # via apache-airflow, flask-appbuilder, flask-jwt-extended
pyopenssl==20.0.1 # via apache-airflow-providers-google
pyparsing==2.4.7 # via httplib2, packaging
pyrsistent==0.18.0 # via jsonschema
python-daemon==2.3.0 # via apache-airflow
@ -161,43 +189,47 @@ python-dateutil==2.8.1 # via alembic, apache-airflow, botocore, croniter, fla
python-editor==1.0.4 # via alembic
python-nvd3==0.15.0 # via apache-airflow
python-slugify==4.0.1 # via apache-airflow, python-nvd3
python3-openid==3.2.0 # via flask-openid
pytz==2021.1 # via -r requirements.in, babel, celery, flask-babel, flower, google-api-core, pandas, tzlocal
python3-openid==3.2.0 # via apache-airflow, flask-openid
pytz==2021.1 # via -r requirements.in, babel, celery, flask-babel, flower, google-api-core, pandas
pytzdata==2020.1 # via pendulum
pyyaml==5.4.1 # via apispec, flask-swagger, google-ads, kubernetes, libcst
pyyaml==5.4.1 # via apache-airflow, apispec, clickclick, google-ads, kubernetes, libcst, openapi-spec-validator
redis==3.5.3 # via -r requirements.in
requests-oauthlib==1.1.0 # via apache-airflow, flask-oauthlib, google-auth-oauthlib, kubernetes
requests==2.23.0 # via -r requirements.in, apache-airflow, datadog, google-api-core, google-cloud-bigquery, google-cloud-storage, kubernetes, requests-oauthlib
requests==2.23.0 # via -r requirements.in, airflow-provider-fivetran, apache-airflow-providers-http, datadog, google-api-core, google-cloud-bigquery, google-cloud-storage, kubernetes, requests-oauthlib
retrying==1.3.3 # via -r requirements.in
rfc3986[idna2008]==1.5.0 # via httpx
rich==10.9.0 # via apache-airflow
rsa==4.7.2 # via google-auth
s3transfer==0.3.7 # via boto3
sasl==0.3.1 # via pyhive
setproctitle==1.2.2 # via apache-airflow
shelljob==0.5.6 # via -r requirements.in
six==1.16.0 # via bcrypt, eventlet, flask-jwt-extended, google-api-core, google-api-python-client, google-auth, google-auth-httplib2, google-cloud-core, google-resumable-media, grpcio, jsonschema, kubernetes, prison, protobuf, pyopenssl, python-dateutil, retrying, sasl, sqlalchemy-utils, tenacity, thrift, thrift-sasl
sqlalchemy-jsonfield==0.9.0 # via apache-airflow
six==1.16.0 # via bcrypt, eventlet, flask-jwt-extended, google-api-core, google-api-python-client, google-auth, google-auth-httplib2, google-cloud-core, google-resumable-media, grpcio, isodate, jsonschema, kubernetes, openapi-schema-validator, openapi-spec-validator, prison, protobuf, pyopenssl, python-dateutil, retrying, sasl, sqlalchemy-utils, tenacity, thrift, thrift-sasl, virtualenv
sniffio==1.2.0 # via anyio, httpcore, httpx
sqlalchemy-jsonfield==1.0.0 # via apache-airflow
sqlalchemy-utils==0.37.8 # via flask-appbuilder
sqlalchemy==1.3.15 # via -r requirements.in, alembic, apache-airflow, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-jsonfield, sqlalchemy-utils
sqlalchemy==1.3.24 # via -r requirements.in, alembic, apache-airflow, flask-appbuilder, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-jsonfield, sqlalchemy-utils
statsd==3.3.0 # via apache-airflow
swagger-ui-bundle==0.0.8 # via apache-airflow
tabulate==0.8.9 # via apache-airflow
tenacity==4.12.0 # via apache-airflow
tenacity==6.2.0 # via apache-airflow
termcolor==1.1.0 # via apache-airflow
text-unidecode==1.3 # via python-slugify
thrift-sasl==0.4.3 # via pyhive
thrift==0.13.0 # via apache-airflow, hmsclient, pyhive, thrift-sasl
tornado==5.1.1 # via apache-airflow, flower
typing-extensions==3.10.0.0 # via apache-airflow, libcst, typing-inspect
thrift==0.13.0 # via apache-airflow-providers-apache-hive, hmsclient, pyhive, thrift-sasl
tornado==5.1.1 # via flower
typing-extensions==3.10.0.0 # via anyio, apache-airflow, libcst, rich, typing-inspect
typing-inspect==0.7.1 # via libcst
tzlocal==1.5.1 # via apache-airflow, pendulum
unicodecsv==0.14.1 # via apache-airflow
uritemplate==3.0.1 # via google-api-python-client
urllib3==1.25.11 # via -r requirements.in, botocore, kubernetes, requests
vine==1.3.0 # via amqp, apache-airflow, celery, flower
watchtower==0.7.3 # via apache-airflow-backport-providers-amazon
vine==1.3.0 # via amqp, apache-airflow-providers-celery, celery, flower
virtualenv==20.7.2 # via nox
watchtower==1.0.6 # via apache-airflow-providers-amazon
websocket-client==1.1.0 # via -r requirements.in, kubernetes
werkzeug==0.16.0 # via -r requirements.in, apache-airflow, flask, flask-caching, flask-jwt-extended
werkzeug==1.0.1 # via -r requirements.in, apache-airflow, flask, flask-jwt-extended
wtforms==2.3.3 # via flask-admin, flask-wtf
zipp==3.5.0 # via importlib-metadata, importlib-resources
zope.deprecation==4.4.0 # via apache-airflow
zope.event==4.5.0 # via gevent
zope.interface==5.4.0 # via gevent
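The pins above swap the 1.10-era apache-airflow-backport-providers-* packages for the Airflow 2.x apache-airflow-providers-* distributions, so hooks and operators now resolve under the airflow.providers.* namespace. One way to sanity-check the migration inside the upgraded image is an import smoke test like the sketch below; the module paths shown are assumptions drawn from the providers pinned here (Google, CNCF Kubernetes, Apache Hive) and should be swapped for whatever the deployment's DAGs actually import.

# Minimal post-upgrade smoke test (assumed Airflow 2.1.x provider module paths;
# adjust the imports to the providers this deployment actually uses).
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook

# If these imports succeed, the provider distributions pinned above are installed
# and importable under the airflow.providers.* namespace.
print("provider imports OK:", BigQueryHook, KubernetesPodOperator, HiveMetastoreHook)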


@ -70,14 +70,14 @@ OAUTH_PROVIDERS = [{
'token_key':'access_token',
'icon':'fa-google',
'remote_app': {
'base_url':'https://www.googleapis.com/oauth2/v2/',
'request_token_params':{
'api_base_url':'https://www.googleapis.com/oauth2/v2/',
'client_kwargs':{
'scope': 'email profile'
},
'access_token_url':'https://accounts.google.com/o/oauth2/token',
'authorize_url':'https://accounts.google.com/o/oauth2/auth',
'request_token_url': None,
'consumer_key': GOOGLE_KEY,
'consumer_secret': GOOGLE_SECRET,
'client_id': GOOGLE_KEY,
'client_secret': GOOGLE_SECRET,
}
}]
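For reference, Flask-AppBuilder in Airflow 2.x handles OAuth through Authlib rather than flask-oauthlib, which is why base_url/request_token_params/consumer_key/consumer_secret become api_base_url/client_kwargs/client_id/client_secret in this hunk. Below is a consolidated sketch of the new-style entry as it would read in webserver_config.py after the upgrade; it assumes GOOGLE_KEY and GOOGLE_SECRET are defined earlier in the file and that keys not shown in the hunk (such as 'name') keep their existing values.

OAUTH_PROVIDERS = [{
    'name': 'google',                  # assumed unchanged; not shown in the hunk above
    'token_key': 'access_token',
    'icon': 'fa-google',
    'remote_app': {
        # Authlib-style keys replacing the flask-oauthlib ones removed above
        'api_base_url': 'https://www.googleapis.com/oauth2/v2/',
        'client_kwargs': {'scope': 'email profile'},
        'access_token_url': 'https://accounts.google.com/o/oauth2/token',
        'authorize_url': 'https://accounts.google.com/o/oauth2/auth',
        'request_token_url': None,
        'client_id': GOOGLE_KEY,       # defined elsewhere in webserver_config.py (not shown)
        'client_secret': GOOGLE_SECRET,
    },
}]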