[DSRE-6] Upgrade Airflow (wtmo) to 2.1.1
This commit is contained in:
Parent
2e56562945
Commit
e1518a5ff5
@ -7,6 +7,7 @@ venv
|
|||
logs
|
||||
unittests.cfg
|
||||
airflow-webserver.pid
|
||||
airflow-worker.pid
|
||||
.config
|
||||
.credentials
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# https://forums.docker.com/t/multiple-projects-stopped-building-on-docker-hub-operation-not-permitted/92570/6
|
||||
# and https://forums.docker.com/t/multiple-projects-stopped-building-on-docker-hub-operation-not-permitted/92570/11
|
||||
FROM python:3.7-slim-buster
|
||||
MAINTAINER Jannis Leidel <jezdez@mozilla.com>
|
||||
MAINTAINER Harold Woo <hwoo@mozilla.com>
|
||||
|
||||
# Due to AIRFLOW-6854, Python 3.7 is chosen as the base python version.
|
||||
|
||||
|
|
695
airflow.cfg
@ -1,15 +1,16 @@
|
|||
[core]
|
||||
# 1.10 additions
|
||||
default_timezone = utc
|
||||
log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strftime("%%Y-%%m-%%dT%%H:%%M:%%S") }}/{{ try_number }}.log
|
||||
|
||||
hide_sensitive_var_conn_fields = True
|
||||
sensitive_var_conn_names = 'cred,CRED,secret,SECRET,pass,PASS,password,PASSWORD,private,PRIVATE,key,KEY,cert,CERT,token,TOKEN,AKIA'
|
||||
|
||||
# This setting would not have any effect in an existing deployment where the default_pool already exists.
|
||||
# default_pool_task_slot_count = 50
|
||||
|
||||
# The folder where your airflow pipelines live, most likely a
|
||||
# subfolder in a code repository
|
||||
dags_folder = $AIRFLOW_HOME/dags
|
||||
|
||||
# The folder where airflow should store its log files. This location
|
||||
base_log_folder = $AIRFLOW_HOME/logs
|
||||
|
||||
# The executor class that airflow should use. Choices include
|
||||
# SequentialExecutor, LocalExecutor, CeleryExecutor
|
||||
executor = CeleryExecutor
|
||||
|
@ -34,7 +35,7 @@ sql_alchemy_pool_recycle = 3600
|
|||
parallelism = 16
|
||||
|
||||
# The number of task instances allowed to run concurrently by the scheduler
|
||||
dag_concurrency = 16
|
||||
max_active_tasks_per_dag = 16
|
||||
|
||||
# Are DAGs paused by default at creation
|
||||
dags_are_paused_at_creation = True
|
||||
|
@ -47,9 +48,20 @@ max_active_runs_per_dag = 5
|
|||
# environment
|
||||
load_examples = False
|
||||
|
||||
# Whether to load the default connections that ship with Airflow. It's good to
|
||||
# get started, but you probably want to set this to ``False`` in a production
|
||||
# environment
|
||||
# We have configured google_cloud_default, so hopefully this won't remove it.
|
||||
load_default_connections = False
|
||||
|
||||
# Where your Airflow plugins are stored
|
||||
plugins_folder = $AIRFLOW_HOME/plugins
|
||||
|
||||
# Should tasks be executed via forking of the parent process ("False",
|
||||
# the speedier option) or by spawning a new python process ("True" slow,
|
||||
# but means plugin changes picked up by tasks straight away)
|
||||
execute_tasks_new_python_interpreter = False
|
||||
|
||||
# Secret key to save connection passwords in the db
|
||||
# Setting this to $AIRFLOW_FERNET_KEY is broken in 1.9 for initdb. Set $AIRFLOW__CORE__FERNET_KEY instead
|
||||
# fernet_key =
|
||||
|
@ -58,15 +70,162 @@ plugins_folder = $AIRFLOW_HOME/plugins
|
|||
donot_pickle = False
|
||||
|
||||
# How long before timing out a python file import while filling the DagBag
|
||||
dagbag_import_timeout = 30
|
||||
dagbag_import_timeout = 30.0
|
||||
|
||||
# Should a traceback be shown in the UI for dagbag import errors,
|
||||
# instead of just the exception message
|
||||
dagbag_import_error_tracebacks = True
|
||||
|
||||
# If tracebacks are shown, how many entries from the traceback should be shown
|
||||
dagbag_import_error_traceback_depth = 2
|
||||
|
||||
# How long before timing out a DagFileProcessor, which processes a dag file
|
||||
dag_file_processor_timeout = 50
|
||||
|
||||
# The class to use for running task instances in a subprocess.
|
||||
# Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class
|
||||
# when using a custom task runner.
|
||||
task_runner = StandardTaskRunner
|
||||
|
||||
# If set, tasks without a ``run_as_user`` argument will be run with this user
|
||||
# Can be used to de-elevate a sudo user running Airflow when executing tasks
|
||||
# default_impersonation =
|
||||
|
||||
# What security module to use (for example kerberos)
|
||||
# security =
|
||||
|
||||
# Turn unit test mode on (overwrites many configuration options with test
|
||||
# values at runtime)
|
||||
unit_test_mode = False
|
||||
|
||||
# Whether to enable pickling for xcom (note that this is insecure and allows for
|
||||
# RCE exploits).
|
||||
enable_xcom_pickling = False
|
||||
|
||||
# Whether to override params with dag_run.conf. If you pass some key-value pairs
|
||||
# through ``airflow dags backfill -c`` or
|
||||
# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
|
||||
dag_run_conf_overrides_params = True
|
||||
|
||||
# When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``.
|
||||
dag_discovery_safe_mode = False
|
||||
|
||||
# The number of retries each task is going to have by default. Can be overridden at dag or task level.
|
||||
default_task_retries = 0
|
||||
|
||||
# We will override the next 2 intervals in prod via env vars.
|
||||
# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate.
|
||||
# This flag sets the minimum interval (in seconds) after which the serialized DAGs in the DB should be updated.
|
||||
# This helps in reducing database write rate.
|
||||
min_serialized_dag_update_interval = 10
|
||||
|
||||
# Fetching serialized DAG can not be faster than a minimum interval to reduce database
|
||||
# read rate. This config controls when your DAGs are updated in the Webserver
|
||||
min_serialized_dag_fetch_interval = 5
|
||||
|
||||
# Whether to persist DAG files code in DB.
|
||||
# If set to True, Webserver reads file contents from DB instead of
|
||||
# trying to access files in a DAG folder.
|
||||
# (Default is ``True``)
|
||||
# Example: store_dag_code = True
|
||||
# store_dag_code =
|
||||
|
||||
# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store
|
||||
# in the Database.
|
||||
# All the template_fields for each of Task Instance are stored in the Database.
|
||||
# Keeping this number small may cause an error when you try to view ``Rendered`` tab in
|
||||
# TaskInstance view for older tasks.
|
||||
max_num_rendered_ti_fields_per_task = 30
|
||||
|
||||
# On each dagrun check against defined SLAs
|
||||
check_slas = True
|
||||
|
||||
# Path to custom XCom class that will be used to store and resolve operators results
|
||||
# Example: xcom_backend = path.to.CustomXCom
|
||||
xcom_backend = airflow.models.xcom.BaseXCom
|
||||
|
||||
# By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``,
|
||||
# if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module.
|
||||
lazy_load_plugins = True
|
||||
|
||||
# By default Airflow providers are lazily-discovered (discovery and imports happen only when required).
|
||||
# Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or
|
||||
# loaded from module.
|
||||
lazy_discover_providers = True
|
||||
|
||||
# Number of times the code should be retried in case of DB Operational Errors.
|
||||
# Not all transactions will be retried as it can cause undesired state.
|
||||
# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
|
||||
max_db_retries = 3
|
||||
|
||||
|
||||
[logging]
|
||||
# The folder where airflow should store its log files. This location
|
||||
base_log_folder = $AIRFLOW_HOME/logs
|
||||
|
||||
# Logging level.
|
||||
#
|
||||
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
||||
logging_level = INFO
|
||||
|
||||
# Logging level for Flask-appbuilder UI.
|
||||
#
|
||||
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
||||
fab_logging_level = WARN
|
||||
|
||||
# Logging class
|
||||
# Specify the class that will specify the logging configuration
|
||||
# This class has to be on the python classpath
|
||||
# Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG
|
||||
# logging_config_class =
|
||||
|
||||
# Flag to enable/disable Colored logs in Console
|
||||
# Colour the logs when the controlling terminal is a TTY.
|
||||
colored_console_log = True
|
||||
|
||||
# Log format for when Colored logs is enabled
|
||||
colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
|
||||
colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter
|
||||
|
||||
# Format of Log line
|
||||
log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s
|
||||
simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
|
||||
|
||||
# Specify a prefix pattern, as in the example below, when using the stream handler TaskHandlerWithCustomFormatter
|
||||
# Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{try_number}}
|
||||
# task_log_prefix_template =
|
||||
|
||||
# Formatting for how airflow generates file names/paths for each task run.
|
||||
log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strftime("%%Y-%%m-%%dT%%H:%%M:%%S") }}/{{ try_number }}.log
|
||||
|
||||
# Formatting for how airflow generates file names for log
|
||||
log_processor_filename_template = {{ filename }}.log
|
||||
|
||||
# full path of dag_processor_manager logfile
|
||||
dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log
|
||||
|
||||
# Name of handler to read task instance logs.
|
||||
# Defaults to use ``task`` handler.
|
||||
task_log_reader = task
|
||||
|
||||
# A comma-separated list of third-party logger names that will be configured to print messages to
# consoles.
|
||||
# Example: extra_loggers = connexion,sqlalchemy
|
||||
# extra_loggers =
|
||||
|
||||
|
||||
[webserver]
|
||||
rbac = $WEBSERVER_USE_RBAC
|
||||
# The base url of your website as airflow cannot guess what domain or
|
||||
# cname you are using. This is used in automated emails that
|
||||
# airflow sends to point links to the right web server
|
||||
base_url = $URL
|
||||
|
||||
# Default timezone to display all dates in the UI, can be UTC, system, or
|
||||
# any IANA timezone string (e.g. Europe/Amsterdam). If left empty the
|
||||
# default value of core/default_timezone will be used
|
||||
# Example: default_ui_timezone = America/New_York
|
||||
default_ui_timezone = UTC
|
||||
|
||||
# The ip specified when starting the web server
|
||||
web_server_host = 0.0.0.0
|
||||
|
||||
|
@ -83,19 +242,151 @@ workers = 4
|
|||
# sync (default), eventlet, gevent
|
||||
worker_class = gevent
|
||||
|
||||
# Expose the configuration file in the web server
|
||||
expose_config = true
|
||||
|
||||
# Set to true to turn on authentication : http://pythonhosted.org/airflow/installation.html#web-authentication
|
||||
authenticate = $AIRFLOW_AUTHENTICATE
|
||||
auth_backend = $AIRFLOW_AUTH_BACKEND
|
||||
|
||||
# Filter the list of dags by owner name (requires authentication to be enabled)
|
||||
filter_by_owner = False
|
||||
# Paths to the SSL certificate and key for the web server. When both are
|
||||
# provided SSL will be enabled. This does not change the web server port.
|
||||
# web_server_ssl_cert =
|
||||
|
||||
# Paths to the SSL certificate and key for the web server. When both are
|
||||
# provided SSL will be enabled. This does not change the web server port.
|
||||
# web_server_ssl_key =
|
||||
|
||||
# If set to True, Airflow will track files in plugins_folder directory. When it detects changes,
|
||||
# then reload the gunicorn.
|
||||
# You can toggle this for Development when iterating on plugins
|
||||
reload_on_plugin_change = False
|
||||
|
||||
# Log files for the gunicorn webserver. '-' means log to stderr.
|
||||
access_logfile = -
|
||||
|
||||
# Log files for the gunicorn webserver. '-' means log to stderr.
|
||||
error_logfile = -
|
||||
|
||||
# Access log format for gunicorn webserver.
|
||||
# default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"
|
||||
# documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format
|
||||
# access_logformat =
|
||||
|
||||
# Expose the configuration file in the web server
|
||||
expose_config = True
|
||||
|
||||
# Expose hostname in the web server
|
||||
expose_hostname = True
|
||||
|
||||
# Expose stacktrace in the web server
|
||||
expose_stacktrace = True
|
||||
|
||||
# Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times``
|
||||
dag_default_view = tree
|
||||
|
||||
# Default DAG orientation. Valid values are:
|
||||
# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
|
||||
dag_orientation = LR
|
||||
|
||||
# The amount of time (in secs) webserver will wait for initial handshake
|
||||
# while fetching logs from other worker machine
|
||||
log_fetch_timeout_sec = 5
|
||||
|
||||
# Time interval (in secs) to wait before next log fetching.
|
||||
log_fetch_delay_sec = 2
|
||||
|
||||
# Distance away from page bottom to enable auto tailing.
|
||||
log_auto_tailing_offset = 30
|
||||
|
||||
# Animation speed for auto tailing log display.
|
||||
log_animation_speed = 1000
|
||||
|
||||
# By default, the webserver shows paused DAGs. Flip this to hide paused
|
||||
# DAGs by default
|
||||
hide_paused_dags_by_default = False
|
||||
|
||||
# Consistent page size across all listing views in the UI
|
||||
page_size = 100
|
||||
|
||||
# Define the color of navigation bar
|
||||
navbar_color = #fff
|
||||
|
||||
# Default dagrun to show in UI
|
||||
default_dag_run_display_number = 25
|
||||
|
||||
# Enable werkzeug ``ProxyFix`` middleware for reverse proxy
|
||||
enable_proxy_fix = False
|
||||
|
||||
# Number of values to trust for ``X-Forwarded-For``.
|
||||
# More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/
|
||||
proxy_fix_x_for = 1
|
||||
|
||||
# Number of values to trust for ``X-Forwarded-Proto``
|
||||
proxy_fix_x_proto = 1
|
||||
|
||||
# Number of values to trust for ``X-Forwarded-Host``
|
||||
proxy_fix_x_host = 1
|
||||
|
||||
# Number of values to trust for ``X-Forwarded-Port``
|
||||
proxy_fix_x_port = 1
|
||||
|
||||
# Number of values to trust for ``X-Forwarded-Prefix``
|
||||
proxy_fix_x_prefix = 1
|
||||
|
||||
# Set secure flag on session cookie
|
||||
cookie_secure = False
|
||||
|
||||
# Set samesite policy on session cookie
|
||||
cookie_samesite = Lax
|
||||
|
||||
# Default setting for wrap toggle on DAG code and TI log views.
|
||||
default_wrap = False
|
||||
|
||||
# Allow the UI to be rendered in a frame
|
||||
x_frame_enabled = True
|
||||
|
||||
# Send anonymous user activity to your analytics tool
|
||||
# choose from google_analytics, segment, or metarouter
|
||||
# analytics_tool =
|
||||
|
||||
# Unique ID of your account in the analytics tool
|
||||
# analytics_id =
|
||||
|
||||
# 'Recent Tasks' stats will show for old DagRuns if set
|
||||
show_recent_stats_for_completed_runs = True
|
||||
|
||||
# Update FAB permissions and sync security manager roles
|
||||
# on webserver startup
|
||||
update_fab_perms = True
|
||||
|
||||
# The UI cookie lifetime in minutes. User will be logged out from UI after
|
||||
# ``session_lifetime_minutes`` of non-activity
|
||||
session_lifetime_minutes = 43200
|
||||
|
||||
# Sets a custom page title for the DAGs overview page and site title for all pages
|
||||
# instance_name =
|
||||
|
||||
|
||||
[email]
|
||||
email_backend = $AIRFLOW_EMAIL_BACKEND
|
||||
|
||||
# Email connection to use
|
||||
# email_conn_id = smtp_default
|
||||
|
||||
# Whether email alerts should be sent when a task is retried
|
||||
default_email_on_retry = True
|
||||
|
||||
# Whether email alerts should be sent when a task failed
|
||||
default_email_on_failure = True
|
||||
|
||||
# File that will be used as the template for Email subject (which will be rendered using Jinja2).
|
||||
# If not set, Airflow uses a base template.
|
||||
# Example: subject_template = /path/to/my_subject_template_file
|
||||
# subject_template =
|
||||
|
||||
# File that will be used as the template for Email content (which will be rendered using Jinja2).
|
||||
# If not set, Airflow uses a base template.
|
||||
# Example: html_content_template = /path/to/my_html_content_template_file
|
||||
# html_content_template =
|
||||
|
||||
|
||||
[smtp]
|
||||
# If you want airflow to send emails on retries and failures, and you want to use
# the airflow.utils.send_email function, you have to configure an smtp
|
||||
|
@ -107,6 +398,30 @@ smtp_port = 587
|
|||
smtp_user = $AIRFLOW_SMTP_USER
|
||||
smtp_password = $AIRFLOW_SMTP_PASSWORD
|
||||
smtp_mail_from = $AIRFLOW_SMTP_FROM
|
||||
# smtp_timeout = 30
|
||||
# smtp_retry_limit = 5
|
||||
|
||||
|
||||
[sentry]
|
||||
# Sentry (https://docs.sentry.io) integration. Here you can supply
|
||||
# additional configuration options based on the Python platform. See:
|
||||
# https://docs.sentry.io/error-reporting/configuration/?platform=python.
|
||||
# Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``,
|
||||
# ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``.
|
||||
# Enable error reporting to Sentry
|
||||
# sentry_on = false
|
||||
# sentry_dsn =
|
||||
|
||||
|
||||
[celery_kubernetes_executor]
|
||||
# This section only applies if you are using the ``CeleryKubernetesExecutor`` in
|
||||
# ``[core]`` section above
|
||||
# Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``.
|
||||
# When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``),
|
||||
# the task is executed via ``KubernetesExecutor``,
|
||||
# otherwise via ``CeleryExecutor``
|
||||
# kubernetes_queue = kubernetes
|
||||
|
||||
|
||||
[celery]
|
||||
# This section only applies if you are using the CeleryExecutor in
|
||||
|
@ -121,6 +436,30 @@ celery_app_name = airflow.executors.celery_executor
|
|||
# your worker box and the nature of your tasks
|
||||
worker_concurrency = 32
|
||||
|
||||
# The maximum and minimum concurrency that will be used when starting workers with the
|
||||
# ``airflow celery worker`` command (always keep minimum processes, but grow
|
||||
# to maximum if necessary). Note the value should be max_concurrency,min_concurrency
|
||||
# Pick these numbers based on resources on worker box and the nature of the task.
|
||||
# If autoscale option is available, worker_concurrency will be ignored.
|
||||
# http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale
|
||||
# Example: worker_autoscale = 16,12
|
||||
# worker_autoscale =
|
||||
|
||||
# Used to increase the number of tasks that a worker prefetches which can improve performance.
|
||||
# The number of processes multiplied by worker_prefetch_multiplier is the number of tasks
|
||||
# that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily
|
||||
# blocked if there are multiple workers and one worker prefetches tasks that sit behind long
|
||||
# running tasks while another worker has unutilized processes that are unable to process the already
|
||||
# claimed blocked tasks.
|
||||
# https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits
|
||||
# Example: worker_prefetch_multiplier = 1
|
||||
# worker_prefetch_multiplier =
|
||||
|
||||
# Umask that will be used when starting workers with the ``airflow celery worker``
|
||||
# in daemon mode. This controls the file-creation mode mask which determines the initial
|
||||
# value of file permission bits for newly created files.
|
||||
# worker_umask = 0o077
|
||||
|
||||
# When you start an airflow worker, airflow starts a tiny web server
|
||||
# subprocess to serve the workers local log files to the airflow main
|
||||
# web server, who then builds pages and sends them to users. This defines
|
||||
|
@ -136,36 +475,151 @@ broker_url = $AIRFLOW_BROKER_URL
|
|||
# Another key Celery setting
|
||||
result_backend = $AIRFLOW_RESULT_URL
|
||||
|
||||
# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
|
||||
# it ``airflow celery flower``. This defines the IP that Celery Flower runs on
|
||||
flower_host = 0.0.0.0
|
||||
|
||||
# The root URL for Flower
|
||||
# Example: flower_url_prefix = /flower
|
||||
# flower_url_prefix =
|
||||
|
||||
# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
|
||||
# it `airflow flower`. This defines the port that Celery Flower runs on
|
||||
flower_port = $AIRFLOW_FLOWER_PORT
|
||||
|
||||
# Securing Flower with Basic Authentication
|
||||
# Accepts user:password pairs separated by a comma
|
||||
# Example: flower_basic_auth = user1:password1,user2:password2
|
||||
# flower_basic_auth =
|
||||
|
||||
# How many processes CeleryExecutor uses to sync task state.
|
||||
# 0 means to use max(1, number of cores - 1) processes.
|
||||
sync_parallelism = 0
|
||||
|
||||
# Import path for celery configuration options
|
||||
celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG
|
||||
ssl_active = False
|
||||
# ssl_key =
|
||||
# ssl_cert =
|
||||
# ssl_cacert =
|
||||
|
||||
# Celery Pool implementation.
|
||||
# Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``.
|
||||
# See:
|
||||
# https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency
|
||||
# https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html
|
||||
pool = prefork
|
||||
|
||||
# The number of seconds to wait before timing out ``send_task_to_executor`` or
|
||||
# ``fetch_celery_task_state`` operations.
|
||||
operation_timeout = 3.0
|
||||
|
||||
# Celery task will report its status as 'started' when the task is executed by a worker.
|
||||
# This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted
|
||||
# or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob.
|
||||
task_track_started = True
|
||||
|
||||
# Time in seconds after which Adopted tasks are cleared by CeleryExecutor. This is helpful to clear
|
||||
# stalled tasks.
|
||||
task_adoption_timeout = 600
|
||||
|
||||
# The Maximum number of retries for publishing task messages to the broker when failing
|
||||
# due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed.
|
||||
task_publish_max_retries = 3
|
||||
|
||||
# Worker initialisation check to validate Metadata Database connection
|
||||
worker_precheck = False
|
||||
|
||||
|
||||
# [dask]
|
||||
# This section only applies if you are using the DaskExecutor in
|
||||
# [core] section above
|
||||
# The IP address and port of the Dask cluster's scheduler.
|
||||
# cluster_address = 127.0.0.1:8786
|
||||
|
||||
# TLS/ SSL settings to access a secured Dask scheduler.
|
||||
# tls_ca =
|
||||
# tls_cert =
|
||||
# tls_key =
|
||||
|
||||
|
||||
[celery_broker_transport_options]
|
||||
# This section is for specifying options which can be passed to the
|
||||
# underlying celery broker transport. See:
|
||||
# http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options
|
||||
# The visibility timeout defines the number of seconds to wait for the worker
|
||||
# to acknowledge the task before the message is redelivered to another worker.
|
||||
# Make sure to increase the visibility timeout to match the time of the longest
|
||||
# ETA you're planning to use.
|
||||
# visibility_timeout is only supported for Redis and SQS celery brokers.
|
||||
# See:
|
||||
# http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options
|
||||
# Example: visibility_timeout = 21600
|
||||
# visibility_timeout =
|
||||
|
||||
|
||||
[operators]
|
||||
# Default queue that tasks get assigned to and that workers listen on.
|
||||
default_queue = default
|
||||
|
||||
# The default owner assigned to each new operator, unless
|
||||
# provided explicitly or passed via ``default_args``
|
||||
# default_owner = airflow
|
||||
# default_cpus = 1
|
||||
# default_ram = 512
|
||||
# default_disk = 512
|
||||
# default_gpus = 0
|
||||
|
||||
# Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator.
|
||||
# If set to False, an exception will be thrown, otherwise only the console message will be displayed.
|
||||
allow_illegal_arguments = False
|
||||
|
||||
|
||||
[scheduler]
|
||||
# Task instances listen for external kill signal (when you clear tasks
|
||||
# from the CLI or the UI), this defines the frequency at which they should
|
||||
# listen (in seconds).
|
||||
job_heartbeat_sec = 5
|
||||
|
||||
# How often (in seconds) to check and tidy up 'running' TaskInstances
|
||||
# that no longer have a matching DagRun
|
||||
clean_tis_without_dagrun_interval = 15.0
|
||||
|
||||
# The scheduler constantly tries to trigger new tasks (look at the
|
||||
# scheduler section in the docs for more information). This defines
|
||||
# how often the scheduler should run (in seconds).
|
||||
scheduler_heartbeat_sec = 5
|
||||
|
||||
# after how much time should the scheduler terminate in seconds
|
||||
# -1 indicates to run continuously (see also num_runs)
|
||||
run_duration = -1
|
||||
# The number of times to try to schedule each DAG file
|
||||
# -1 indicates unlimited number
|
||||
num_runs = -1
|
||||
|
||||
# after how much time new DAGs should be picked up from the filesystem
|
||||
min_file_process_interval = 0
|
||||
# The number of seconds to wait between consecutive DAG file processing
|
||||
# Deprecated since version 2.2.0: The option has been moved to scheduler.scheduler_idle_sleep_time
|
||||
processor_poll_interval = 1
|
||||
|
||||
dag_dir_list_interval = 300
|
||||
# Number of seconds after which a DAG file is parsed. The DAG file is parsed every
|
||||
# ``min_file_process_interval`` number of seconds. Updates to DAGs are reflected after
|
||||
# this interval. Keeping this number low will increase CPU usage.
|
||||
min_file_process_interval = 60
|
||||
|
||||
# How often should stats be printed to the logs
|
||||
# How often (in seconds) to scan the DAGs directory for new files. Defaults to 5 minutes.
|
||||
# This is set via env var to 300 in prod, but 30 for local testing
|
||||
dag_dir_list_interval = 30
|
||||
|
||||
# How often should stats be printed to the logs. Setting to 0 will disable printing stats
|
||||
print_stats_interval = 30
|
||||
|
||||
# How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled)
|
||||
pool_metrics_interval = 20.0
|
||||
|
||||
# If the last scheduler heartbeat happened more than scheduler_health_check_threshold
|
||||
# ago (in seconds), scheduler is considered unhealthy.
|
||||
# This is used by the health check in the "/health" endpoint
|
||||
scheduler_health_check_threshold = 30
|
||||
|
||||
# How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs
|
||||
orphaned_tasks_check_interval = 300.0
|
||||
child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler
|
||||
|
||||
# Local task jobs periodically heartbeat to the DB. If the job has
|
||||
|
@ -173,22 +627,185 @@ child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler
|
|||
# associated task instance as failed and will re-schedule the task.
|
||||
scheduler_zombie_task_threshold = 300
|
||||
|
||||
|
||||
# Turn off scheduler catchup by setting this to False.
|
||||
# Default behavior is unchanged and
|
||||
# Command Line Backfills still work, but the scheduler
|
||||
# will not do scheduler catchup if this is False,
|
||||
# however it can be set on a per DAG basis in the
|
||||
# DAG definition (catchup)
|
||||
catchup_by_default = True
|
||||
catchup_by_default = False
|
||||
|
||||
# This changes the batch size of queries in the scheduling main loop.
|
||||
# If this is too high, SQL query performance may be impacted by one
|
||||
# or more of the following:
|
||||
# - reversion to full table scan
|
||||
# - complexity of query predicate
|
||||
# - excessive locking
|
||||
# Additionally, you may hit the maximum allowable query length for your db.
|
||||
# Set this to 0 for no limit (not advised)
|
||||
max_tis_per_query = 512
|
||||
|
||||
# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
|
||||
# If this is set to False then you should not run more than a single
|
||||
# scheduler at once
|
||||
use_row_level_locking = True
|
||||
|
||||
# Max number of DAGs to create DagRuns for per scheduler loop
|
||||
#
|
||||
# Default: 10
|
||||
# max_dagruns_to_create_per_loop =
|
||||
|
||||
# How many DagRuns should a scheduler examine (and lock) when scheduling
|
||||
# and queuing tasks.
|
||||
#
|
||||
# Default: 20
|
||||
# max_dagruns_per_loop_to_schedule =
|
||||
|
||||
# Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the
|
||||
# same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other
|
||||
# dags in some circumstances
|
||||
#
|
||||
# Default: True
|
||||
# schedule_after_task_execution =
|
||||
|
||||
# The scheduler can run multiple processes in parallel to parse dags.
|
||||
# This defines how many processes will run.
|
||||
parsing_processes = 2
|
||||
|
||||
# One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``.
|
||||
# The scheduler will list and sort the dag files to decide the parsing order.
|
||||
#
|
||||
# * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the
|
||||
# recently modified DAGs first.
|
||||
# * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the
|
||||
# same host. This is useful when running with Scheduler in HA mode where each scheduler can
|
||||
# parse different DAG files.
|
||||
# * ``alphabetical``: Sort by filename
|
||||
file_parsing_sort_mode = modified_time
|
||||
|
||||
# Turn off scheduler use of cron intervals by setting this to False.
|
||||
# DAGs submitted manually in the web UI or with trigger_dag will still run.
|
||||
use_job_schedule = True
|
||||
|
||||
# Allow externally triggered DagRuns for Execution Dates in the future
|
||||
# Only has effect if schedule_interval is set to None in DAG
|
||||
allow_trigger_in_future = False
|
||||
|
||||
# DAG dependency detector class to use
|
||||
dependency_detector = airflow.serialization.serialized_objects.DependencyDetector
|
||||
|
||||
|
||||
[metrics]
|
||||
# Statsd (https://github.com/etsy/statsd) integration settings
|
||||
# statsd_on = False
|
||||
# statsd_host = localhost
|
||||
# statsd_port = 8125
|
||||
# statsd_prefix = airflow
|
||||
|
||||
# To enable datadog integration to send airflow metrics.
|
||||
statsd_datadog_enabled = False
|
||||
|
||||
# List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2)
|
||||
# statsd_datadog_tags =
|
||||
|
||||
|
||||
# [secrets]
|
||||
# Full class name of secrets backend to enable (will precede env vars and metastore in search path)
|
||||
# Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend
|
||||
# backend =
|
||||
|
||||
# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class.
|
||||
# See documentation for the secrets backend you are using. JSON is expected.
|
||||
# Example for AWS Systems Manager ParameterStore:
|
||||
# ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}``
|
||||
# backend_kwargs =
|
||||
|
||||
# [cli]
|
||||
# In what way should the cli access the API. The LocalClient will use the
|
||||
# database directly, while the json_client will use the api running on the
|
||||
# webserver
|
||||
# api_client = airflow.api.client.local_client
|
||||
|
||||
# If you set web_server_url_prefix, do NOT forget to append it here, ex:
|
||||
# ``endpoint_url = http://localhost:8080/myroot``
|
||||
# So api will look like: ``http://localhost:8080/myroot/api/experimental/...``
|
||||
# endpoint_url = http://localhost:8080
|
||||
|
||||
|
||||
[debug]
|
||||
# Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first
|
||||
# failed task. Helpful for debugging purposes.
|
||||
fail_fast = False
|
||||
|
||||
|
||||
[api]
|
||||
# Enables the deprecated experimental API. Please note that these APIs do not have access control.
|
||||
# The authenticated user has full access.
|
||||
#
|
||||
# .. warning::
|
||||
#
|
||||
# This `Experimental REST API <https://airflow.readthedocs.io/en/latest/rest-api-ref.html>`__ is
|
||||
# deprecated since version 2.0. Please consider using
|
||||
# `the Stable REST API <https://airflow.readthedocs.io/en/latest/stable-rest-api-ref.html>`__.
|
||||
# For more information on migration, see
|
||||
# `UPDATING.md <https://github.com/apache/airflow/blob/master/UPDATING.md>`_
|
||||
enable_experimental_api = False
|
||||
|
||||
# How to authenticate users of the API. See
|
||||
# https://airflow.apache.org/docs/apache-airflow/stable/security.html for possible values.
|
||||
# ("airflow.api.auth.backend.default" allows all requests for historic reasons)
|
||||
auth_backend = airflow.api.auth.backend.deny_all
|
||||
|
||||
# Used to set the maximum page limit for API requests
|
||||
maximum_page_limit = 100
|
||||
|
||||
# Used to set the default page limit when limit is zero. A default limit
|
||||
# of 100 is set in the OpenAPI spec. However, this particular default limit
# only works when limit is set equal to zero (0) in API requests.
|
||||
# If no limit is supplied, the OpenApi spec default is used.
|
||||
fallback_page_limit = 100
|
||||
|
||||
# The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested.
|
||||
# Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com
|
||||
# google_oauth2_audience =
|
||||
|
||||
# Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on
|
||||
# `the Application Default Credentials
|
||||
# <https://cloud.google.com/docs/authentication/production#finding_credentials_automatically>`__ will
|
||||
# be used.
|
||||
# Example: google_key_path = /files/service-account-json
|
||||
# google_key_path =
|
||||
|
||||
# Used in response to a preflight request to indicate which HTTP
|
||||
# headers can be used when making the actual request. This header is
|
||||
# the server side response to the browser's
|
||||
# Access-Control-Request-Headers header.
|
||||
# access_control_allow_headers =
|
||||
|
||||
# Specifies the method or methods allowed when accessing the resource.
|
||||
# access_control_allow_methods =
|
||||
|
||||
# Indicates whether the response can be shared with requesting code from the given origin.
|
||||
# access_control_allow_origin =
|
||||
|
||||
|
||||
# [smart_sensor]
|
||||
# TODO(hwoo) - Test smart sensors and enable this if the need arises.
|
||||
# When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to
|
||||
# smart sensor task.
|
||||
# use_smart_sensor = False
|
||||
|
||||
# `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated
|
||||
# by `hashcode % shard_code_upper_limit`.
|
||||
# shard_code_upper_limit = 10000
|
||||
|
||||
# The number of running smart sensor processes for each service.
|
||||
# shards = 5
|
||||
|
||||
# comma separated sensor classes support in smart_sensor.
|
||||
# sensors_enabled = NamedHivePartitionSensor
|
||||
|
||||
|
||||
[mesos]
|
||||
# Mesos master address which MesosExecutor will connect to.
|
||||
master = localhost:5050
|
||||
|
@ -223,3 +840,35 @@ authenticate = False
|
|||
# Mesos credentials, if authentication is enabled
|
||||
# default_principal = admin
|
||||
# default_secret = admin
|
||||
|
||||
|
||||
# [lineage]
|
||||
# what lineage backend to use
|
||||
# backend =
|
||||
|
||||
# [atlas]
|
||||
# sasl_enabled = False
|
||||
# host =
|
||||
# port = 21000
|
||||
# username =
|
||||
# password =
|
||||
|
||||
# [hive]
|
||||
# Default mapreduce queue for HiveOperator tasks
|
||||
# default_hive_mapred_queue =
|
||||
|
||||
# Template for mapred_job_name in HiveOperator, supports the following named parameters
|
||||
# hostname, dag_id, task_id, execution_date
|
||||
# mapred_job_name_template =
|
||||
|
||||
# [kerberos]
|
||||
# ccache = /tmp/airflow_krb5_ccache
|
||||
|
||||
# gets augmented with fqdn
|
||||
# principal = airflow
|
||||
# reinit_frequency = 3600
|
||||
# kinit_path = kinit
|
||||
# keytab = airflow.keytab
|
||||
|
||||
# [github_enterprise]
|
||||
# api_rev = v3
|
||||
|
|
|
@ -33,13 +33,12 @@ function update_gcp() {
|
|||
container_id=$(docker ps | grep telemetry-airflow_web | cut -d' ' -f1)
|
||||
|
||||
docker exec $container_id \
|
||||
airflow connections -d --conn_id $conn_id
|
||||
airflow connections delete $conn_id
|
||||
|
||||
docker exec $container_id \
|
||||
airflow connections -a \
|
||||
--conn_id $conn_id \
|
||||
--conn_type google_cloud_platform \
|
||||
--conn_extra "$(format_gcp $keyfile)"
|
||||
airflow connections add $conn_id \
|
||||
--conn-type google_cloud_platform \
|
||||
--conn-extra "$(format_gcp $keyfile)"
|
||||
}
|
||||
|
||||
update_gcp $connection $keyfile_path
|
||||
|
|
85
bin/run
@ -68,13 +68,12 @@ init_connections() {
|
|||
export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-dummy_access_key_id}
|
||||
export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-dummy_secret_access_key}
|
||||
|
||||
airflow connections --delete --conn_id databricks_default
|
||||
airflow connections delete databricks_default
|
||||
|
||||
airflow connections --add \
|
||||
--conn_id databricks_default \
|
||||
--conn_type databricks \
|
||||
--conn_host https://dbc-caf9527b-e073.cloud.databricks.com \
|
||||
--conn_extra "{\"token\":\"${DB_TOKEN}\", \"host\": \"\"}"
|
||||
airflow connections add databricks_default \
|
||||
--conn-type databricks \
|
||||
--conn-host https://dbc-caf9527b-e073.cloud.databricks.com \
|
||||
--conn-extra "{\"token\":\"${DB_TOKEN}\", \"host\": \"\"}"
|
||||
|
||||
gcp_conn=(
|
||||
"google_cloud_airflow_dataproc"
|
||||
|
@ -87,11 +86,11 @@ init_connections() {
|
|||
"google_cloud_shared_prod"
|
||||
)
|
||||
for conn_id in "${gcp_conn[@]}"; do
|
||||
airflow connections --delete --conn_id "${conn_id}"
|
||||
airflow connections --add \
|
||||
--conn_id "${conn_id}" \
|
||||
--conn_type google_cloud_platform \
|
||||
--conn_extra "$(gcp_default_extras)"
|
||||
airflow connections delete "${conn_id}"
|
||||
|
||||
airflow connections add "${conn_id}" \
|
||||
--conn-type google_cloud_platform \
|
||||
--conn-extra "$(gcp_default_extras)"
|
||||
done
|
||||
|
||||
aws_conn=(
|
||||
|
@ -107,46 +106,46 @@ init_connections() {
|
|||
"aws_socorro_readonly_s3"
|
||||
)
|
||||
for conn_id in "${aws_conn[@]}"; do
|
||||
airflow connections --delete --conn_id "${conn_id}"
|
||||
airflow connections --add \
|
||||
--conn_id "${conn_id}" \
|
||||
--conn_type s3 \
|
||||
--conn_extra "$(aws_default_extras)"
|
||||
airflow connections delete "${conn_id}"
|
||||
|
||||
airflow connections add "${conn_id}" \
|
||||
--conn-type s3 \
|
||||
--conn-extra "$(aws_default_extras)"
|
||||
done
|
||||
|
||||
airflow connections --delete --conn_id "http_netlify_build_webhook"
|
||||
airflow connections --add \
|
||||
--conn_id "http_netlify_build_webhook" \
|
||||
--conn_type http \
|
||||
--conn_host "https://httpbin.org/"
|
||||
airflow connections delete "http_netlify_build_webhook"
|
||||
|
||||
airflow connections add "http_netlify_build_webhook" \
|
||||
--conn-type http \
|
||||
--conn-host "https://httpbin.org/"
|
||||
}
|
||||
|
||||
init_variables() {
|
||||
airflow variables -s "bugzilla_probe_expiry_bot_api_key" "bugzilla-api-key"
|
||||
airflow variables -s "app_store_connect_username" "username"
|
||||
airflow variables -s "app_store_connect_password" "password"
|
||||
airflow variables -s "surveygizmo_daily_attitudes_survey_id" "12345"
|
||||
airflow variables -s "surveygizmo_api_token" "tokentokentoken"
|
||||
airflow variables -s "surveygizmo_api_secret" "tapsekret"
|
||||
airflow variables -s "jetstream_cluster_ip" "127.0.0.1"
|
||||
airflow variables -s "jetstream_cluster_cert" "cert"
|
||||
airflow variables set "bugzilla_probe_expiry_bot_api_key" "bugzilla-api-key"
|
||||
airflow variables set "app_store_connect_username" "username"
|
||||
airflow variables set "app_store_connect_password" "password"
|
||||
airflow variables set "surveygizmo_daily_attitudes_survey_id" "12345"
|
||||
airflow variables set "surveygizmo_api_token" "tokentokentoken"
|
||||
airflow variables set "surveygizmo_api_secret" "tapsekret"
|
||||
airflow variables set "jetstream_cluster_ip" "127.0.0.1"
|
||||
airflow variables set "jetstream_cluster_cert" "cert"
|
||||
|
||||
airflow variables -s "taar_bigtable_instance_id" "taar_bigtable_instance_id"
|
||||
airflow variables -s "taar_etl_storage_bucket" "taar_etl_storage_bucket"
|
||||
airflow variables -s "taar_etl_model_storage_bucket" "taar_etl_model_storage_bucket"
|
||||
airflow variables -s "taar_gcp_project_id" "taar_gcp_project_id"
|
||||
airflow variables -s "taar_dataflow_subnetwork" "taar_dataflow_subnetwork"
|
||||
airflow variables -s "taar_dataflow_service_account_email" "taar_dataflow_service_account_email"
|
||||
airflow variables set "taar_bigtable_instance_id" "taar_bigtable_instance_id"
|
||||
airflow variables set "taar_etl_storage_bucket" "taar_etl_storage_bucket"
|
||||
airflow variables set "taar_etl_model_storage_bucket" "taar_etl_model_storage_bucket"
|
||||
airflow variables set "taar_gcp_project_id" "taar_gcp_project_id"
|
||||
airflow variables set "taar_dataflow_subnetwork" "taar_dataflow_subnetwork"
|
||||
airflow variables set "taar_dataflow_service_account_email" "taar_dataflow_service_account_email"
|
||||
|
||||
airflow variables -s "looker_repos_secret_git_ssh_key_b64" "looker_repos_secret_git_ssh_key_b64"
|
||||
airflow variables -s "looker_api_client_id_staging" "looker_api_client_id_staging"
|
||||
airflow variables -s "looker_api_client_secret_staging" "looker_api_client_secret_staging"
|
||||
airflow variables -s "looker_api_client_id_prod" "looker_api_client_id_prod"
|
||||
airflow variables -s "looker_api_client_secret_prod" "looker_api_client_secret_prod"
|
||||
airflow variables -s "dataops_looker_github_secret_access_token" "dataops_looker_github_secret_access_token"
|
||||
airflow variables set "looker_repos_secret_git_ssh_key_b64" "looker_repos_secret_git_ssh_key_b64"
|
||||
airflow variables set "looker_api_client_id_staging" "looker_api_client_id_staging"
|
||||
airflow variables set "looker_api_client_secret_staging" "looker_api_client_secret_staging"
|
||||
airflow variables set "looker_api_client_id_prod" "looker_api_client_id_prod"
|
||||
airflow variables set "looker_api_client_secret_prod" "looker_api_client_secret_prod"
|
||||
airflow variables set "dataops_looker_github_secret_access_token" "dataops_looker_github_secret_access_token"
|
||||
|
||||
airflow variables -s "glean_dictionary_netlify_build_webhook_id" "status/200"
|
||||
airflow variables -s "lookml_generator_release_str" "v0.0.0"
|
||||
airflow variables set "glean_dictionary_netlify_build_webhook_id" "status/200"
|
||||
airflow variables set "lookml_generator_release_str" "v0.0.0"
|
||||
}
|
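The "airflow variables -s" calls above become "airflow variables set" in Airflow 2. For local bootstrapping, the same values can also be set from Python through the Variable model; this is a minimal sketch for reference only (not part of this change), reusing two of the placeholder values above:

from airflow.models import Variable

# Programmatic equivalent of "airflow variables set <key> <value>";
# requires an initialized metadata database, just like the CLI.
Variable.set("lookml_generator_release_str", "v0.0.0")
Variable.set("glean_dictionary_netlify_build_webhook_id", "status/200")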
||||
|
||||
[ $# -lt 1 ] && usage
|
||||
|
|
|
@ -44,10 +44,10 @@ fi
|
|||
CONTAINER_ID=$(docker ps | grep _web | cut -d' ' -f1)
|
||||
|
||||
echo "Web container id is $CONTAINER_ID. Adding gcp connection..."
|
||||
docker exec $CONTAINER_ID airflow connections -d --conn_id $GCP_CONN_ID
|
||||
docker exec $CONTAINER_ID airflow connections delete $GCP_CONN_ID
|
||||
|
||||
docker exec $CONTAINER_ID airflow connections -a --conn_id $GCP_CONN_ID \
|
||||
--conn_type google_cloud_platform \
|
||||
--conn_extra "$JSON_CREDS"
|
||||
docker exec $CONTAINER_ID airflow connections add $GCP_CONN_ID \
|
||||
--conn-type google_cloud_platform \
|
||||
--conn-extra "$JSON_CREDS"
|
||||
|
||||
echo "visit https://go.corp.mozilla.com/wtmodev for more info"
|
||||
|
|
|
@ -44,7 +44,7 @@ function get_errors_in_listing {
|
|||
# Parse the logs for ERROR messages, these typically correspond to python
|
||||
# exceptions in the DAG. In general, there should NOT be any errors when
|
||||
# running the local environment.
|
||||
docker-compose exec web airflow dags list | grep "ERROR"
|
||||
docker-compose exec web airflow dags list -v | grep "ERROR"
|
||||
}
|
||||
|
||||
|
||||
|
@ -77,7 +77,7 @@ function main() {
|
|||
|
||||
if [[ $num_errors -ne 0 && $TESTING -eq 0 ]]; then
|
||||
# Print full error output
|
||||
docker-compose exec web airflow list_dags
|
||||
docker-compose exec web airflow dags list -v
|
||||
echo "Failure!"
|
||||
exit 1
|
||||
elif [[ $TESTING -eq 1 ]]; then
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
import gevent
|
||||
from gevent import monkey, pool
|
||||
|
||||
monkey.patch_all()
|
||||
|
||||
STATE_COLORS = {
|
||||
"queued": 'gray',
|
||||
"running": 'lime',
|
||||
|
|
|
@ -3,7 +3,7 @@ import os
|
|||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.operators.sensors import ExternalTaskSensor
|
||||
from airflow.sensors.external_task import ExternalTaskSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from utils.dataproc import (
|
||||
moz_dataproc_pyspark_runner,
|
||||
|
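The sensor import above moves from airflow.operators.sensors to airflow.sensors.external_task in Airflow 2. A minimal sketch of the new import in use, assuming a DAG object named dag is already in scope; the external dag and task ids are placeholders:

from airflow.sensors.external_task import ExternalTaskSensor

# Wait for a task in another DAG to complete before continuing (ids are placeholders).
wait_for_upstream = ExternalTaskSensor(
    task_id="wait_for_upstream",
    external_dag_id="upstream_dag",
    external_task_id="upstream_task",
    dag=dag,
)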
|
|
@ -1,7 +1,7 @@
|
|||
import datetime
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
|
||||
|
@ -29,7 +29,7 @@ with DAG(
|
|||
) as dag:
|
||||
# Jobs read from/write to s3://telemetry-public-analysis-2/bhr/data/hang_aggregates/
|
||||
write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw'
|
||||
aws_access_key, aws_secret_key, _ = AwsHook(write_aws_conn_id).get_credentials()
|
||||
aws_access_key, aws_secret_key, _ = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
|
||||
|
||||
wait_for_bhr_ping = ExternalTaskCompletedSensor(
|
||||
task_id="wait_for_bhr_ping",
|
||||
|
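AwsHook from airflow.contrib is replaced by AwsBaseHook from the Amazon provider package, with the connection id passed as a keyword argument. A minimal sketch of the new credential lookup, using a placeholder connection id:

from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

# get_credentials() returns a read-only (access_key, secret_key, token) tuple.
hook = AwsBaseHook(aws_conn_id="aws_example_rw", client_type="s3")
aws_access_key, aws_secret_key, session_token = hook.get_credentials()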
|
|
@ -10,13 +10,12 @@ import uuid
|
|||
import time
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.operators import PythonOperator
|
||||
from airflow.operators.python import PythonOperator
|
||||
from operators.bq_sensor import BigQuerySQLSensorOperator
|
||||
from operators.gcp_container_operator import GKEPodOperator
|
||||
|
||||
DOCS = """\
|
||||
# burnham 👩🚀📈🤖
|
||||
# burnham
|
||||
|
||||
The burnham project is an end-to-end test suite that aims to automatically
|
||||
verify that Glean-based products correctly measure, collect, and submit
|
||||
|
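This file also moves PythonOperator to its Airflow 2 location, airflow.operators.python. A self-contained sketch of the new import in use; the dag id and callable are placeholders, not taken from burnham:

import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

def _print_ds(**context):
    # Print the logical date passed in by the scheduler.
    print(context["ds"])

with DAG(
    "example_python_operator",
    start_date=datetime.datetime(2021, 7, 1),
    schedule_interval=None,
) as dag:
    print_ds = PythonOperator(task_id="print_ds", python_callable=_print_ds)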
@ -359,6 +358,7 @@ WHERE
|
|||
|
||||
# GCP and GKE default values
|
||||
DEFAULT_GCP_CONN_ID = "google_cloud_derived_datasets"
|
||||
DEFAULT_GCP_PROJECT_ID = "moz-fx-data-derived-datasets"
|
||||
DEFAULT_GKE_LOCATION = "us-central1-a"
|
||||
DEFAULT_GKE_CLUSTER_NAME = "bq-load-gke-1"
|
||||
DEFAULT_GKE_NAMESPACE = "default"
|
||||
|
@ -420,7 +420,7 @@ def burnham_run(
|
|||
return GKEPodOperator(
|
||||
task_id=task_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
|
||||
project_id=DEFAULT_GCP_PROJECT_ID,
|
||||
location=gke_location,
|
||||
cluster_name=gke_cluster_name,
|
||||
namespace=gke_namespace,
|
||||
|
@ -446,7 +446,7 @@ def burnham_sensor(task_id, sql, gcp_conn_id=DEFAULT_GCP_CONN_ID, **kwargs):
|
|||
return BigQuerySQLSensorOperator(
|
||||
task_id=task_id,
|
||||
sql=sql,
|
||||
bigquery_conn_id=gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
use_legacy_sql=False,
|
||||
**kwargs,
|
||||
)
|
||||
|
@ -483,7 +483,7 @@ def burnham_bigquery_run(
|
|||
return GKEPodOperator(
|
||||
task_id=task_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
|
||||
project_id=DEFAULT_GCP_PROJECT_ID,
|
||||
location=gke_location,
|
||||
cluster_name=gke_cluster_name,
|
||||
namespace=gke_namespace,
|
||||
|
|
|
@ -2,8 +2,6 @@ from airflow import DAG
|
|||
from datetime import timedelta, datetime
|
||||
from operators.gcp_container_operator import GKEPodOperator
|
||||
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
|
||||
docs = """
|
||||
### Clean GKE Pods
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import datetime
|
||||
|
||||
from airflow import models
|
||||
from airflow.operators.sensors import ExternalTaskSensor
|
||||
from airflow.sensors.external_task import ExternalTaskSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from utils.gcp import (
|
||||
bigquery_etl_copy_deduplicate,
|
||||
|
@ -10,7 +10,6 @@ from utils.gcp import (
|
|||
bigquery_xcom_query,
|
||||
)
|
||||
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from utils.gcp import gke_command
|
||||
|
||||
DOCS = """\
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import datetime
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
|
||||
|
@ -35,13 +35,15 @@ with DAG(
|
|||
) as dag:
|
||||
# top_signatures_correlations uploads results to public analysis bucket
|
||||
write_aws_conn_id = "aws_dev_telemetry_public_analysis_2_rw"
|
||||
analysis_access_key, analysis_secret_key, _ = AwsHook(
|
||||
write_aws_conn_id
|
||||
analysis_access_key, analysis_secret_key, _ = AwsBaseHook(
|
||||
aws_conn_id=write_aws_conn_id,
|
||||
client_type='s3'
|
||||
).get_credentials()
|
||||
|
||||
# modules_with_missing_symbols sends results as email
|
||||
ses_aws_conn_id = "aws_data_iam_ses"
|
||||
ses_access_key, ses_secret_key, _ = AwsHook(ses_aws_conn_id).get_credentials()
|
||||
ses_access_key, ses_secret_key, _ = AwsBaseHook(
|
||||
aws_conn_id=ses_aws_conn_id, client_type='s3').get_credentials()
|
||||
|
||||
wait_for_socorro_import = ExternalTaskCompletedSensor(
|
||||
task_id="wait_for_socorro_import",
|
||||
|
|
|
@ -3,8 +3,7 @@ from datetime import datetime, timedelta
|
|||
|
||||
from utils.gcp import bigquery_etl_query, gke_command
|
||||
|
||||
from airflow.operators.sensors import ExternalTaskSensor
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.sensors.external_task import ExternalTaskSensor
|
||||
from operators.gcp_container_operator import GKEPodOperator
|
||||
|
||||
default_args = {
|
||||
|
@ -18,6 +17,8 @@ default_args = {
|
|||
# We rely on max_active_runs=1 at the DAG level to manage the dependency on past runs.
|
||||
with DAG('experiments_live',
|
||||
default_args=default_args,
|
||||
# Will be renamed to max_active_tasks later, as the main upstream branch states
|
||||
# max_active_tasks=4,
|
||||
concurrency=4,
|
||||
max_active_runs=1,
|
||||
schedule_interval="*/5 * * * *") as dag:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from airflow import DAG
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from datetime import datetime, timedelta
|
||||
|
@ -35,7 +35,7 @@ dag = DAG("firefox_public_data_report", default_args=default_args, schedule_inte
|
|||
|
||||
# Required to write json output to s3://telemetry-public-analysis-2/public-data-report/hardware/
|
||||
write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw'
|
||||
aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials()
|
||||
aws_access_key, aws_secret_key, session = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
|
||||
|
||||
# hardware_report's execution date will be {now}-7days. It will read last week's main pings,
|
||||
# therefore we need to wait for yesterday's Main Ping deduplication task to finish
|
||||
|
|
13
dags/glam.py
@ -1,8 +1,6 @@
|
|||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.executors import get_default_executor
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
|
||||
|
@ -38,12 +36,9 @@ PERCENT_RELEASE_WINDOWS_SAMPLING = "10"
|
|||
|
||||
dag = DAG(GLAM_DAG, default_args=default_args, schedule_interval="0 2 * * *")
|
||||
|
||||
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
|
||||
|
||||
# Make sure all the data for the given day has arrived before running.
|
||||
wait_for_main_ping = ExternalTaskCompletedSensor(
|
||||
task_id="wait_for_main_ping",
|
||||
project_id=project_id,
|
||||
external_dag_id="copy_deduplicate",
|
||||
external_task_id="copy_deduplicate_main_ping",
|
||||
execution_delta=timedelta(hours=1),
|
||||
|
@ -181,7 +176,6 @@ clients_histogram_aggregates = SubDagOperator(
|
|||
dataset_id,
|
||||
),
|
||||
task_id=GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
|
||||
executor=get_default_executor(),
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
@ -236,6 +230,10 @@ client_scalar_probe_counts = gke_command(
|
|||
|
||||
# SubdagOperator uses a SequentialExecutor by default
|
||||
# so its tasks will run sequentially.
|
||||
# Note: In 2.0, SubDagOperator is changed to use airflow scheduler instead of
|
||||
# backfill to schedule tasks in the subdag. Users no longer need to specify
# the executor in SubDagOperator. (We don't, but the assumption that the
# SequentialExecutor is used is now wrong.)
|
||||
clients_histogram_bucket_counts = SubDagOperator(
|
||||
subdag=repeated_subdag(
|
||||
GLAM_DAG,
|
||||
|
@ -273,7 +271,6 @@ extract_counts = SubDagOperator(
|
|||
"counts"
|
||||
),
|
||||
task_id="extract_user_counts",
|
||||
executor=get_default_executor(),
|
||||
dag=dag
|
||||
)
|
||||
|
||||
|
@ -288,7 +285,6 @@ extract_sample_counts = SubDagOperator(
|
|||
"sample-counts"
|
||||
),
|
||||
task_id="extract_sample_counts",
|
||||
executor=get_default_executor(),
|
||||
dag=dag
|
||||
)
|
||||
|
||||
|
@ -301,7 +297,6 @@ extracts_per_channel = SubDagOperator(
|
|||
dataset_id
|
||||
),
|
||||
task_id="extracts",
|
||||
executor=get_default_executor(),
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
|
|
@ -1,15 +1,11 @@
|
|||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
|
||||
from airflow.contrib.operators.gcs_delete_operator import (
|
||||
GoogleCloudStorageDeleteOperator,
|
||||
)
|
||||
from airflow.executors import get_default_executor
|
||||
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator
|
||||
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from airflow.models import DAG
|
||||
from utils.gcp import bigquery_etl_query
|
||||
|
||||
|
||||
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
|
||||
gcp_conn_id = "google_cloud_airflow_dataproc"
|
||||
project_id = "moz-fx-data-shared-prod"
|
||||
glam_bucket = "moz-fx-data-glam-prod-fca7-etl-data"
|
||||
|
||||
|
@ -33,7 +29,6 @@ def extracts_subdag(
|
|||
channel,
|
||||
),
|
||||
task_id="extract_{}".format(channel),
|
||||
executor=get_default_executor(),
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
@ -75,24 +70,24 @@ def extract_channel_subdag(
|
|||
dag=dag,
|
||||
)
|
||||
|
||||
gcs_delete = GoogleCloudStorageDeleteOperator(
|
||||
gcs_delete = GCSDeleteObjectsOperator(
|
||||
task_id="glam_gcs_delete_old_{}_extracts".format(channel),
|
||||
bucket_name=glam_bucket,
|
||||
prefix="aggs-desktop-{}".format(channel),
|
||||
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format(
|
||||
bucket=glam_bucket, channel=channel
|
||||
)
|
||||
bq2gcs = BigQueryToCloudStorageOperator(
|
||||
bq2gcs = BigQueryToGCSOperator(
|
||||
task_id="glam_extract_{}_to_csv".format(channel),
|
||||
source_project_dataset_table="{}.{}.{}".format(
|
||||
project_id, dataset_id, bq_extract_table
|
||||
),
|
||||
destination_cloud_storage_uris=gcs_destination,
|
||||
bigquery_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
export_format="CSV",
|
||||
print_header=False,
|
||||
dag=dag,
|
||||
|
@ -135,11 +130,13 @@ def extract_user_counts(
|
|||
dag=dag,
|
||||
)
|
||||
|
||||
gcs_delete = GoogleCloudStorageDeleteOperator(
|
||||
|
||||
gcs_delete = GCSDeleteObjectsOperator(
|
||||
task_id="glam_gcs_delete_{}_extracts".format(task_prefix),
|
||||
bucket_name=glam_bucket,
|
||||
|
||||
prefix="glam-extract-firefox-{}".format(file_prefix),
|
||||
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
@ -151,13 +148,14 @@ def extract_user_counts(
|
|||
gcs_destination = "gs://{}/glam-extract-firefox-{}.csv".format(
|
||||
glam_bucket, file_prefix
|
||||
)
|
||||
bq2gcs = BigQueryToCloudStorageOperator(
|
||||
|
||||
bq2gcs = BigQueryToGCSOperator(
|
||||
task_id="glam_extract_{}_to_csv".format(task_prefix),
|
||||
source_project_dataset_table="{}.{}.{}".format(
|
||||
project_id, dataset_id, bq_extract_table
|
||||
),
|
||||
destination_cloud_storage_uris=gcs_destination,
|
||||
bigquery_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
export_format="CSV",
|
||||
print_header=False,
|
||||
dag=dag,
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from airflow.models import DAG
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from airflow.executors import get_default_executor
|
||||
|
||||
from glam_subdags.general import repeated_subdag
|
||||
from utils.gcp import bigquery_etl_query
|
||||
|
@ -42,7 +41,6 @@ def histogram_aggregates_subdag(
|
|||
dataset_id,
|
||||
),
|
||||
task_id=GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
|
||||
executor=get_default_executor(),
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import datetime
|
||||
import os
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
|
||||
|
@ -39,7 +40,11 @@ with DAG(
|
|||
) as dag:
|
||||
# Jobs read from/write to s3://telemetry-public-analysis-2/gfx/telemetry-data/
|
||||
write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw'
|
||||
aws_access_key, aws_secret_key, _ = AwsHook(write_aws_conn_id).get_credentials()
|
||||
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
|
||||
if is_dev:
|
||||
aws_access_key, aws_secret_key = ('replace_me', 'replace_me')
|
||||
else:
|
||||
aws_access_key, aws_secret_key, _ = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
|
||||
|
||||
wait_for_main_ping = ExternalTaskCompletedSensor(
|
||||
task_id="wait_for_main_ping",
|
||||
|
|
|
@ -1,88 +0,0 @@
|
|||
from airflow import DAG
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from utils.gcp import bigquery_etl_query
|
||||
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from operators.gcp_container_operator import GKEPodOperator
|
||||
|
||||
default_args = {
|
||||
'owner': 'frank@mozilla.com',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime(2020, 1, 1),
|
||||
'email_on_failure': True,
|
||||
'email_on_retry': True,
|
||||
'retries': 2,
|
||||
'retry_delay': timedelta(minutes=30),
|
||||
}
|
||||
|
||||
|
||||
with DAG('incline_dashboard',
|
||||
default_args=default_args,
|
||||
schedule_interval="0 4 * * *") as dag:
|
||||
|
||||
wait_for_baseline_clients_last_seen = ExternalTaskCompletedSensor(
|
||||
task_id="wait_for_baseline_clients_last_seen",
|
||||
external_dag_id="copy_deduplicate",
|
||||
external_task_id="baseline_clients_last_seen",
|
||||
execution_delta=timedelta(hours=3),
|
||||
mode="reschedule",
|
||||
pool="DATA_ENG_EXTERNALTASKSENSOR",
|
||||
email_on_retry=False,
|
||||
)
|
||||
|
||||
wait_for_core_clients_last_seen = ExternalTaskCompletedSensor(
|
||||
task_id="wait_for_core_clients_last_seen",
|
||||
external_dag_id="bqetl_core",
|
||||
external_task_id="telemetry_derived__core_clients_last_seen__v1",
|
||||
execution_delta=timedelta(hours=2),
|
||||
mode="reschedule",
|
||||
pool="DATA_ENG_EXTERNALTASKSENSOR",
|
||||
email_on_retry=False,
|
||||
)
|
||||
|
||||
project = "moz-fx-data-shared-prod"
|
||||
dataset = "org_mozilla_firefox_derived"
|
||||
|
||||
migrated_clients = bigquery_etl_query(
|
||||
task_id="generate_migrated_clients",
|
||||
project_id=project,
|
||||
dataset_id=dataset,
|
||||
# We recreate this entire table from scratch every day because we are
|
||||
# taking the last seen migration ping over all time for each client.
|
||||
destination_table=None,
|
||||
date_partition_parameter=None,
|
||||
sql_file_path="sql/moz-fx-data-shared-prod/org_mozilla_firefox_derived/migrated_clients_v1/init.sql",
|
||||
owner="frank@mozilla.com",
|
||||
email=["telemetry-alerts@mozilla.com", "frank@mozilla.com"]
|
||||
)
|
||||
|
||||
exec_dash = bigquery_etl_query(
|
||||
task_id="generate_incline_exec_dash",
|
||||
destination_table="incline_executive_v1",
|
||||
project_id=project,
|
||||
dataset_id=dataset,
|
||||
owner="frank@mozilla.com",
|
||||
email=["telemetry-alerts@mozilla.com", "frank@mozilla.com"],
|
||||
)
|
||||
|
||||
gcp_conn_id = 'google_cloud_derived_datasets'
|
||||
export_incline_dash = GKEPodOperator(
|
||||
task_id="export_incline_dash",
|
||||
name="export-incline-dash",
|
||||
arguments=["script/export_incline_dash", "{{ ds }}"],
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
|
||||
location="us-central1-a",
|
||||
cluster_name="bq-load-gke-1",
|
||||
namespace="default",
|
||||
image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
|
||||
)
|
||||
|
||||
(
|
||||
[wait_for_baseline_clients_last_seen, wait_for_core_clients_last_seen] >>
|
||||
migrated_clients >>
|
||||
exec_dash >>
|
||||
export_incline_dash
|
||||
)
|
12
dags/ltv.py
|
@ -5,7 +5,10 @@ from airflow import DAG
|
|||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from datetime import datetime, timedelta
|
||||
from operators.backport.bigquery_operator_1_10_2 import BigQueryOperator
|
||||
|
||||
from airflow.providers.google.cloud.operators.bigquery import (
|
||||
BigQueryExecuteQueryOperator
|
||||
)
|
||||
from six.moves.urllib.request import urlopen
|
||||
from utils.dataproc import (
|
||||
moz_dataproc_pyspark_runner,
|
||||
|
@ -109,8 +112,7 @@ response = urlopen('/'.join([
|
|||
'https://raw.githubusercontent.com/mozilla/bigquery-etl/main/sql',
|
||||
'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_v1', 'query.sql']))
|
||||
|
||||
BigQueryOperator.template_fields += ('query_params',)
|
||||
ltv_revenue_join=BigQueryOperator(
|
||||
ltv_revenue_join=BigQueryExecuteQueryOperator(
|
||||
task_id='ltv_revenue_join',
|
||||
sql=response.read().decode('utf-8'),
|
||||
query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}],
|
||||
|
@ -129,7 +131,7 @@ response = urlopen('/'.join([
|
|||
'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_normalized', 'query.sql']))
|
||||
|
||||
# Normalized LTV View is for general-use and doesn't contain any revenue data
|
||||
ltv_normalized_view=BigQueryOperator(
|
||||
ltv_normalized_view=BigQueryExecuteQueryOperator(
|
||||
task_id='ltv_normalized_view',
|
||||
sql=response.read().decode('utf-8'),
|
||||
query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}],
|
||||
|
@ -147,7 +149,7 @@ response = urlopen('/'.join([
|
|||
'https://raw.githubusercontent.com/mozilla/bigquery-etl/main/sql',
|
||||
'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_normalized_v1', 'query.sql']))
|
||||
|
||||
client_ltv_normalized_v1=BigQueryOperator(
|
||||
client_ltv_normalized_v1=BigQueryExecuteQueryOperator(
|
||||
task_id='client_ltv_normalized_v1',
|
||||
sql=response.read().decode('utf-8'),
|
||||
query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}],
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import os
|
||||
from airflow import DAG
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
|
||||
from utils.gcp import gke_command
|
||||
|
||||
|
@ -17,17 +18,22 @@ default_args = {
|
|||
}
|
||||
|
||||
with DAG("mad_server", default_args=default_args, schedule_interval="@weekly") as dag:
|
||||
|
||||
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
|
||||
aws_conn_id="aws_dev_mad_resources_training"
|
||||
|
||||
# mad-server expects AWS creds in some custom env vars.
|
||||
s3_env_vars = {
|
||||
key: value
|
||||
for key, value in zip(
|
||||
("S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_SESSION_TOKEN"),
|
||||
AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (),
|
||||
)
|
||||
if value is not None}
|
||||
if is_dev:
|
||||
aws_conn_id = None
|
||||
s3_env_vars = {}
|
||||
else:
|
||||
aws_conn_id="aws_dev_mad_resources_training"
|
||||
s3_env_vars = {
|
||||
key: value
|
||||
for key, value in zip(
|
||||
("S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_SESSION_TOKEN"),
|
||||
AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials() if aws_conn_id else (),
|
||||
)
|
||||
if value is not None
|
||||
}
|
||||
|
||||
mad_server_pull = gke_command(
|
||||
task_id="mad_server_pull",
|
||||
|
|
|
@ -3,8 +3,7 @@ import os
|
|||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator
|
||||
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
|
||||
|
||||
|
@ -34,13 +33,13 @@ subdag_args = default_args.copy()
|
|||
subdag_args["retries"] = 0
|
||||
|
||||
task_id = "mobile_aggregate_view_dataproc"
|
||||
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
|
||||
keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
|
||||
project_id = keyfile["project_id"]
|
||||
gcp_conn_id = "google_cloud_airflow_dataproc"
|
||||
project_id = "airflow-dataproc"
|
||||
dev_test_service_account = "replace_me"
|
||||
|
||||
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
|
||||
client_email = (
|
||||
keyfile["client_email"]
|
||||
dev_test_service_account
|
||||
if is_dev
|
||||
else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com"
|
||||
)
|
||||
|
@ -100,7 +99,7 @@ mobile_aggregate_view_dataproc = SubDagOperator(
|
|||
"gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/mobile/moz-fx-data-shared-prod",
|
||||
]
|
||||
),
|
||||
gcp_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
service_account=client_email,
|
||||
artifact_bucket=artifact_bucket,
|
||||
storage_bucket=storage_bucket,
|
||||
|
@ -126,11 +125,11 @@ if EXPORT_TO_AVRO:
|
|||
dag=dag,
|
||||
).set_downstream(mobile_aggregate_view_dataproc)
|
||||
|
||||
GoogleCloudStorageDeleteOperator(
|
||||
GCSDeleteObjectsOperator(
|
||||
task_id="delete_mobile_metrics_avro",
|
||||
bucket_name="moz-fx-data-derived-datasets-parquet-tmp",
|
||||
prefix="avro/mozaggregator/mobile/moz-fx-data-shared-prod/{{ ds_nodash }}/mobile_metrics_v1",
|
||||
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
dag=dag
|
||||
).set_upstream(mobile_aggregate_view_dataproc)
|
||||
|
||||
|
|
|
@ -3,10 +3,7 @@ import os
|
|||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.operators.gcs_delete_operator import (
|
||||
GoogleCloudStorageDeleteOperator,
|
||||
)
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from utils.dataproc import moz_dataproc_pyspark_runner, copy_artifacts_dev
|
||||
from utils.gcp import gke_command
|
||||
|
@ -39,13 +36,13 @@ subdag_args = default_args.copy()
|
|||
subdag_args["retries"] = 0
|
||||
|
||||
task_id = "prerelease_telemetry_aggregate_view_dataproc"
|
||||
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
|
||||
keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
|
||||
project_id = keyfile["project_id"]
|
||||
gcp_conn_id = "google_cloud_airflow_dataproc"
|
||||
project_id = "airflow-dataproc"
|
||||
dev_test_service_account = "replace_me"
|
||||
|
||||
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
|
||||
client_email = (
|
||||
keyfile["client_email"]
|
||||
dev_test_service_account
|
||||
if is_dev
|
||||
else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com"
|
||||
)
|
||||
|
@ -114,7 +111,7 @@ prerelease_telemetry_aggregate_view_dataproc = SubDagOperator(
|
|||
"gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
|
||||
]
|
||||
),
|
||||
gcp_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
service_account=client_email,
|
||||
artifact_bucket=artifact_bucket,
|
||||
storage_bucket=storage_bucket,
|
||||
|
@ -207,11 +204,11 @@ if EXPORT_TO_AVRO:
|
|||
).set_downstream(prerelease_telemetry_aggregate_view_dataproc)
|
||||
|
||||
# Delete the GCS data
|
||||
GoogleCloudStorageDeleteOperator(
|
||||
GCSDeleteObjectsOperator(
|
||||
task_id="delete_main_avro",
|
||||
bucket_name="moz-fx-data-derived-datasets-parquet-tmp",
|
||||
prefix="avro/mozaggregator/prerelease/moz-fx-data-shared-prod/{{ ds_nodash }}/main_v4",
|
||||
google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
dag=dag,
|
||||
).set_upstream(prerelease_telemetry_aggregate_view_dataproc)
|
||||
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
from airflow import DAG
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow.operators.dummy_operator import DummyOperator
|
||||
|
||||
default_args = {
|
||||
"owner": "frank@mozilla.com",
|
||||
"depends_on_past": True,
|
||||
"start_date": datetime(2018, 12, 17),
|
||||
"email": ["telemetry-alerts@mozilla.com", "frank@mozilla.com"],
|
||||
"email_on_failure": True,
|
||||
"email_on_retry": True,
|
||||
"retries": 3,
|
||||
"retry_delay": timedelta(minutes=30),
|
||||
}
|
||||
|
||||
dag = DAG(
|
||||
"release_telemetry_aggregates",
|
||||
default_args=default_args,
|
||||
schedule_interval="@daily",
|
||||
)
|
||||
|
||||
# See mozaggregator_prerelease and mozaggregator_mobile for functional
|
||||
# implementations using dataproc operator. This is not implemented due to the
|
||||
# migration to GCP and https://bugzilla.mozilla.org/show_bug.cgi?id=1517018
|
||||
release_telemetry_aggregate_view = DummyOperator(
|
||||
task_id="release_telemetry_aggregate_view",
|
||||
job_name="Release Telemetry Aggregate View",
|
||||
dag=dag,
|
||||
)
|
|
@ -7,7 +7,11 @@ the upstream GkePodOperator works fine.
|
|||
### As of 1.10.12 I've removed the backported 1.10.7 gcp_container_operator,
|
||||
kubernetes_pod_operator, and the 1.10.2 kube_client
|
||||
|
||||
|
||||
### Fivetran operator backported from 2.0+
|
||||
Fivetran provides an [operator, sensor and hook](https://github.com/fivetran/airflow-provider-fivetran)
|
||||
for integrating with the Fivetran API for Airflow version 2.0+. Backported to
|
||||
make it usable in Airflow 1.10.15.
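A minimal usage sketch, assuming the backport keeps the upstream package's interface; the import paths, connection id and `connector_id` below are placeholders, not taken from this repo:

```python
from fivetran_provider.operators.fivetran import FivetranOperator
from fivetran_provider.sensors.fivetran import FivetranSensor

# Inside a `with DAG(...)` block: trigger a connector sync, then wait for it.
trigger_sync = FivetranOperator(
    task_id="trigger_fivetran_sync",
    fivetran_conn_id="fivetran_default",  # placeholder connection id
    connector_id="replace_me",            # placeholder connector id
)

wait_for_sync = FivetranSensor(
    task_id="wait_for_fivetran_sync",
    fivetran_conn_id="fivetran_default",  # placeholder connection id
    connector_id="replace_me",            # placeholder connector id
    poke_interval=300,
)

trigger_sync >> wait_for_sync
```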
|
||||
|
||||
### For 2.1.0 I've removed bigquery_operator_1_10_2.py, in favor of the new google provider code.
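Callers that previously imported the backported `BigQueryOperator` should now use the provider operator directly, as the `dags/ltv.py` change in this commit does. A minimal sketch; the task id, SQL and connection id below are placeholders:

```python
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryExecuteQueryOperator,
)

# Inside a `with DAG(...)` block.
example_query = BigQueryExecuteQueryOperator(
    task_id="example_query",
    sql="SELECT 1",                      # placeholder query
    use_legacy_sql=False,
    gcp_conn_id="google_cloud_default",  # placeholder connection id
)
```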
|
||||
|
|
|
@ -1,612 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import json
|
||||
|
||||
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
|
||||
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook, _parse_gcs_url
|
||||
from airflow.models import BaseOperator
|
||||
from airflow.utils.decorators import apply_defaults
|
||||
|
||||
|
||||
class BigQueryOperator(BaseOperator):
|
||||
"""
|
||||
Executes BigQuery SQL queries in a specific BigQuery database
|
||||
:param bql: (Deprecated. Use `sql` parameter instead) the sql code to be
|
||||
executed (templated)
|
||||
:type bql: Can receive a str representing a sql statement,
|
||||
a list of str (sql statements), or reference to a template file.
|
||||
Template reference are recognized by str ending in '.sql'.
|
||||
:param sql: the sql code to be executed (templated)
|
||||
:type sql: Can receive a str representing a sql statement,
|
||||
a list of str (sql statements), or reference to a template file.
|
||||
Template reference are recognized by str ending in '.sql'.
|
||||
:param destination_dataset_table: A dotted
|
||||
(<project>.|<project>:)<dataset>.<table> that, if set, will store the results
|
||||
of the query. (templated)
|
||||
:type destination_dataset_table: string
|
||||
:param write_disposition: Specifies the action that occurs if the destination table
|
||||
already exists. (default: 'WRITE_EMPTY')
|
||||
:type write_disposition: string
|
||||
:param create_disposition: Specifies whether the job is allowed to create new tables.
|
||||
(default: 'CREATE_IF_NEEDED')
|
||||
:type create_disposition: string
|
||||
:param allow_large_results: Whether to allow large results.
|
||||
:type allow_large_results: boolean
|
||||
:param flatten_results: If true and query uses legacy SQL dialect, flattens
|
||||
all nested and repeated fields in the query results. ``allow_large_results``
|
||||
must be ``true`` if this is set to ``false``. For standard SQL queries, this
|
||||
flag is ignored and results are never flattened.
|
||||
:type flatten_results: boolean
|
||||
:param bigquery_conn_id: reference to a specific BigQuery hook.
|
||||
:type bigquery_conn_id: string
|
||||
:param delegate_to: The account to impersonate, if any.
|
||||
For this to work, the service account making the request must have domain-wide
|
||||
delegation enabled.
|
||||
:type delegate_to: string
|
||||
:param udf_config: The User Defined Function configuration for the query.
|
||||
See https://cloud.google.com/bigquery/user-defined-functions for details.
|
||||
:type udf_config: list
|
||||
:param use_legacy_sql: Whether to use legacy SQL (true) or standard SQL (false).
|
||||
:type use_legacy_sql: boolean
|
||||
:param maximum_billing_tier: Positive integer that serves as a multiplier
|
||||
of the basic price.
|
||||
Defaults to None, in which case it uses the value set in the project.
|
||||
:type maximum_billing_tier: integer
|
||||
:param maximum_bytes_billed: Limits the bytes billed for this job.
|
||||
Queries that will have bytes billed beyond this limit will fail
|
||||
(without incurring a charge). If unspecified, this will be
|
||||
set to your project default.
|
||||
:type maximum_bytes_billed: float
|
||||
:param api_resource_configs: a dictionary that contain params
|
||||
'configuration' applied for Google BigQuery Jobs API:
|
||||
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs
|
||||
for example, {'query': {'useQueryCache': False}}. You could use it
|
||||
if you need to provide some params that are not supported by BigQueryOperator
|
||||
like args.
|
||||
:type api_resource_configs: dict
|
||||
:param schema_update_options: Allows the schema of the destination
|
||||
table to be updated as a side effect of the load job.
|
||||
:type schema_update_options: tuple
|
||||
:param query_params: a dictionary containing query parameter types and
|
||||
values, passed to BigQuery.
|
||||
:type query_params: dict
|
||||
:param labels: a dictionary containing labels for the job/query,
|
||||
passed to BigQuery
|
||||
:type labels: dict
|
||||
:param priority: Specifies a priority for the query.
|
||||
Possible values include INTERACTIVE and BATCH.
|
||||
The default value is INTERACTIVE.
|
||||
:type priority: string
|
||||
:param time_partitioning: configure optional time partitioning fields i.e.
|
||||
partition by field, type and expiration as per API specifications.
|
||||
:type time_partitioning: dict
|
||||
:param cluster_fields: Request that the result of this query be stored sorted
|
||||
by one or more columns. This is only available in conjunction with
|
||||
time_partitioning. The order of columns given determines the sort order.
|
||||
:type cluster_fields: list of str
|
||||
:param location: The geographic location of the job. Required except for
|
||||
US and EU. See details at
|
||||
https://cloud.google.com/bigquery/docs/locations#specifying_your_location
|
||||
:type location: str
|
||||
"""
|
||||
|
||||
template_fields = ('bql', 'sql', 'destination_dataset_table', 'labels')
|
||||
template_ext = ('.sql', )
|
||||
ui_color = '#e4f0e8'
|
||||
|
||||
@apply_defaults
|
||||
def __init__(self,
|
||||
bql=None,
|
||||
sql=None,
|
||||
destination_dataset_table=None,
|
||||
write_disposition='WRITE_EMPTY',
|
||||
allow_large_results=False,
|
||||
flatten_results=None,
|
||||
bigquery_conn_id='bigquery_default',
|
||||
delegate_to=None,
|
||||
udf_config=None,
|
||||
use_legacy_sql=True,
|
||||
maximum_billing_tier=None,
|
||||
maximum_bytes_billed=None,
|
||||
create_disposition='CREATE_IF_NEEDED',
|
||||
schema_update_options=(),
|
||||
query_params=None,
|
||||
labels=None,
|
||||
priority='INTERACTIVE',
|
||||
time_partitioning=None,
|
||||
api_resource_configs=None,
|
||||
cluster_fields=None,
|
||||
location=None,
|
||||
*args,
|
||||
**kwargs):
|
||||
super(BigQueryOperator, self).__init__(*args, **kwargs)
|
||||
self.bql = bql
|
||||
self.sql = sql if sql else bql
|
||||
self.destination_dataset_table = destination_dataset_table
|
||||
self.write_disposition = write_disposition
|
||||
self.create_disposition = create_disposition
|
||||
self.allow_large_results = allow_large_results
|
||||
self.flatten_results = flatten_results
|
||||
self.bigquery_conn_id = bigquery_conn_id
|
||||
self.delegate_to = delegate_to
|
||||
self.udf_config = udf_config
|
||||
self.use_legacy_sql = use_legacy_sql
|
||||
self.maximum_billing_tier = maximum_billing_tier
|
||||
self.maximum_bytes_billed = maximum_bytes_billed
|
||||
self.schema_update_options = schema_update_options
|
||||
self.query_params = query_params
|
||||
self.labels = labels
|
||||
self.bq_cursor = None
|
||||
self.priority = priority
|
||||
self.time_partitioning = time_partitioning
|
||||
self.api_resource_configs = api_resource_configs
|
||||
self.cluster_fields = cluster_fields
|
||||
self.location = location
|
||||
|
||||
# TODO remove `bql` in Airflow 2.0
|
||||
if self.bql:
|
||||
import warnings
|
||||
warnings.warn('Deprecated parameter `bql` used in Task id: {}. '
|
||||
'Use `sql` parameter instead to pass the sql to be '
|
||||
'executed. `bql` parameter is deprecated and '
|
||||
'will be removed in a future version of '
|
||||
'Airflow.'.format(self.task_id),
|
||||
category=DeprecationWarning)
|
||||
|
||||
if self.sql is None:
|
||||
raise TypeError('{} missing 1 required positional '
|
||||
'argument: `sql`'.format(self.task_id))
|
||||
|
||||
def execute(self, context):
|
||||
if self.bq_cursor is None:
|
||||
self.log.info('Executing: %s', self.sql)
|
||||
hook = BigQueryHook(
|
||||
bigquery_conn_id=self.bigquery_conn_id,
|
||||
use_legacy_sql=self.use_legacy_sql,
|
||||
delegate_to=self.delegate_to,
|
||||
location=self.location,
|
||||
)
|
||||
conn = hook.get_conn()
|
||||
self.bq_cursor = conn.cursor()
|
||||
self.bq_cursor.run_query(
|
||||
sql=self.sql,
|
||||
destination_dataset_table=self.destination_dataset_table,
|
||||
write_disposition=self.write_disposition,
|
||||
allow_large_results=self.allow_large_results,
|
||||
flatten_results=self.flatten_results,
|
||||
udf_config=self.udf_config,
|
||||
maximum_billing_tier=self.maximum_billing_tier,
|
||||
maximum_bytes_billed=self.maximum_bytes_billed,
|
||||
create_disposition=self.create_disposition,
|
||||
query_params=self.query_params,
|
||||
labels=self.labels,
|
||||
schema_update_options=self.schema_update_options,
|
||||
priority=self.priority,
|
||||
time_partitioning=self.time_partitioning,
|
||||
api_resource_configs=self.api_resource_configs,
|
||||
cluster_fields=self.cluster_fields,
|
||||
)
|
||||
|
||||
def on_kill(self):
|
||||
super(BigQueryOperator, self).on_kill()
|
||||
if self.bq_cursor is not None:
|
||||
self.log.info('Cancelling running query')
|
||||
self.bq_cursor.cancel_query()
|
||||
|
||||
|
||||
class BigQueryCreateEmptyTableOperator(BaseOperator):
|
||||
"""
|
||||
Creates a new, empty table in the specified BigQuery dataset,
|
||||
optionally with schema.
|
||||
The schema to be used for the BigQuery table may be specified in one of
|
||||
two ways. You may either directly pass the schema fields in, or you may
|
||||
point the operator to a Google cloud storage object name. The object in
|
||||
Google cloud storage must be a JSON file with the schema fields in it.
|
||||
You can also create a table without schema.
|
||||
:param project_id: The project to create the table into. (templated)
|
||||
:type project_id: string
|
||||
:param dataset_id: The dataset to create the table into. (templated)
|
||||
:type dataset_id: string
|
||||
:param table_id: The Name of the table to be created. (templated)
|
||||
:type table_id: string
|
||||
:param schema_fields: If set, the schema field list as defined here:
|
||||
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
|
||||
**Example**: ::
|
||||
schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
|
||||
{"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}]
|
||||
:type schema_fields: list
|
||||
:param gcs_schema_object: Full path to the JSON file containing
|
||||
schema (templated). For
|
||||
example: ``gs://test-bucket/dir1/dir2/employee_schema.json``
|
||||
:type gcs_schema_object: string
|
||||
:param time_partitioning: configure optional time partitioning fields i.e.
|
||||
partition by field, type and expiration as per API specifications.
|
||||
.. seealso::
|
||||
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning
|
||||
:type time_partitioning: dict
|
||||
:param bigquery_conn_id: Reference to a specific BigQuery hook.
|
||||
:type bigquery_conn_id: string
|
||||
:param google_cloud_storage_conn_id: Reference to a specific Google
|
||||
cloud storage hook.
|
||||
:type google_cloud_storage_conn_id: string
|
||||
:param delegate_to: The account to impersonate, if any. For this to
|
||||
work, the service account making the request must have domain-wide
|
||||
delegation enabled.
|
||||
:type delegate_to: string
|
||||
:param labels: a dictionary containing labels for the table, passed to BigQuery
|
||||
**Example (with schema JSON in GCS)**: ::
|
||||
CreateTable = BigQueryCreateEmptyTableOperator(
|
||||
task_id='BigQueryCreateEmptyTableOperator_task',
|
||||
dataset_id='ODS',
|
||||
table_id='Employees',
|
||||
project_id='internal-gcp-project',
|
||||
gcs_schema_object='gs://schema-bucket/employee_schema.json',
|
||||
bigquery_conn_id='airflow-service-account',
|
||||
google_cloud_storage_conn_id='airflow-service-account'
|
||||
)
|
||||
**Corresponding Schema file** (``employee_schema.json``): ::
|
||||
[
|
||||
{
|
||||
"mode": "NULLABLE",
|
||||
"name": "emp_name",
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"mode": "REQUIRED",
|
||||
"name": "salary",
|
||||
"type": "INTEGER"
|
||||
}
|
||||
]
|
||||
**Example (with schema in the DAG)**: ::
|
||||
CreateTable = BigQueryCreateEmptyTableOperator(
|
||||
task_id='BigQueryCreateEmptyTableOperator_task',
|
||||
dataset_id='ODS',
|
||||
table_id='Employees',
|
||||
project_id='internal-gcp-project',
|
||||
schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
|
||||
{"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}],
|
||||
bigquery_conn_id='airflow-service-account',
|
||||
google_cloud_storage_conn_id='airflow-service-account'
|
||||
)
|
||||
:type labels: dict
|
||||
"""
|
||||
template_fields = ('dataset_id', 'table_id', 'project_id',
|
||||
'gcs_schema_object', 'labels')
|
||||
ui_color = '#f0eee4'
|
||||
|
||||
@apply_defaults
|
||||
def __init__(self,
|
||||
dataset_id,
|
||||
table_id,
|
||||
project_id=None,
|
||||
schema_fields=None,
|
||||
gcs_schema_object=None,
|
||||
time_partitioning=None,
|
||||
bigquery_conn_id='bigquery_default',
|
||||
google_cloud_storage_conn_id='google_cloud_default',
|
||||
delegate_to=None,
|
||||
labels=None,
|
||||
*args, **kwargs):
|
||||
|
||||
super(BigQueryCreateEmptyTableOperator, self).__init__(*args, **kwargs)
|
||||
|
||||
self.project_id = project_id
|
||||
self.dataset_id = dataset_id
|
||||
self.table_id = table_id
|
||||
self.schema_fields = schema_fields
|
||||
self.gcs_schema_object = gcs_schema_object
|
||||
self.bigquery_conn_id = bigquery_conn_id
|
||||
self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
|
||||
self.delegate_to = delegate_to
|
||||
self.time_partitioning = {} if time_partitioning is None else time_partitioning
|
||||
self.labels = labels
|
||||
|
||||
def execute(self, context):
|
||||
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
|
||||
delegate_to=self.delegate_to)
|
||||
|
||||
if not self.schema_fields and self.gcs_schema_object:
|
||||
|
||||
gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
|
||||
|
||||
gcs_hook = GoogleCloudStorageHook(
|
||||
google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
|
||||
delegate_to=self.delegate_to)
|
||||
schema_fields = json.loads(gcs_hook.download(
|
||||
gcs_bucket,
|
||||
gcs_object).decode("utf-8"))
|
||||
else:
|
||||
schema_fields = self.schema_fields
|
||||
|
||||
conn = bq_hook.get_conn()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.create_empty_table(
|
||||
project_id=self.project_id,
|
||||
dataset_id=self.dataset_id,
|
||||
table_id=self.table_id,
|
||||
schema_fields=schema_fields,
|
||||
time_partitioning=self.time_partitioning,
|
||||
labels=self.labels
|
||||
)
|
||||
|
||||
|
||||
class BigQueryCreateExternalTableOperator(BaseOperator):
|
||||
"""
|
||||
Creates a new external table in the dataset with the data in Google Cloud
|
||||
Storage.
|
||||
The schema to be used for the BigQuery table may be specified in one of
|
||||
two ways. You may either directly pass the schema fields in, or you may
|
||||
point the operator to a Google cloud storage object name. The object in
|
||||
Google cloud storage must be a JSON file with the schema fields in it.
|
||||
:param bucket: The bucket to point the external table to. (templated)
|
||||
:type bucket: string
|
||||
:param source_objects: List of Google cloud storage URIs to point
|
||||
table to. (templated)
|
||||
If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI.
|
||||
:type source_objects: list
|
||||
:param destination_project_dataset_table: The dotted (<project>.)<dataset>.<table>
|
||||
BigQuery table to load data into (templated). If <project> is not included,
|
||||
project will be the project defined in the connection json.
|
||||
:type destination_project_dataset_table: string
|
||||
:param schema_fields: If set, the schema field list as defined here:
|
||||
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
|
||||
**Example**: ::
|
||||
schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
|
||||
{"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}]
|
||||
Should not be set when source_format is 'DATASTORE_BACKUP'.
|
||||
:type schema_fields: list
|
||||
:param schema_object: If set, a GCS object path pointing to a .json file that
|
||||
contains the schema for the table. (templated)
|
||||
:type schema_object: string
|
||||
:param source_format: File format of the data.
|
||||
:type source_format: string
|
||||
:param compression: [Optional] The compression type of the data source.
|
||||
Possible values include GZIP and NONE.
|
||||
The default value is NONE.
|
||||
This setting is ignored for Google Cloud Bigtable,
|
||||
Google Cloud Datastore backups and Avro formats.
|
||||
:type compression: string
|
||||
:param skip_leading_rows: Number of rows to skip when loading from a CSV.
|
||||
:type skip_leading_rows: int
|
||||
:param field_delimiter: The delimiter to use for the CSV.
|
||||
:type field_delimiter: string
|
||||
:param max_bad_records: The maximum number of bad records that BigQuery can
|
||||
ignore when running the job.
|
||||
:type max_bad_records: int
|
||||
:param quote_character: The value that is used to quote data sections in a CSV file.
|
||||
:type quote_character: string
|
||||
:param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false).
|
||||
:type allow_quoted_newlines: boolean
|
||||
:param allow_jagged_rows: Accept rows that are missing trailing optional columns.
|
||||
The missing values are treated as nulls. If false, records with missing trailing
|
||||
columns are treated as bad records, and if there are too many bad records, an
|
||||
invalid error is returned in the job result. Only applicable to CSV, ignored
|
||||
for other formats.
|
||||
:type allow_jagged_rows: bool
|
||||
:param bigquery_conn_id: Reference to a specific BigQuery hook.
|
||||
:type bigquery_conn_id: string
|
||||
:param google_cloud_storage_conn_id: Reference to a specific Google
|
||||
cloud storage hook.
|
||||
:type google_cloud_storage_conn_id: string
|
||||
:param delegate_to: The account to impersonate, if any. For this to
|
||||
work, the service account making the request must have domain-wide
|
||||
delegation enabled.
|
||||
:type delegate_to: string
|
||||
:param src_fmt_configs: configure optional fields specific to the source format
|
||||
:type src_fmt_configs: dict
|
||||
:param labels a dictionary containing labels for the table, passed to BigQuery
|
||||
:type labels: dict
|
||||
"""
|
||||
template_fields = ('bucket', 'source_objects',
|
||||
'schema_object', 'destination_project_dataset_table', 'labels')
|
||||
ui_color = '#f0eee4'
|
||||
|
||||
@apply_defaults
|
||||
def __init__(self,
|
||||
bucket,
|
||||
source_objects,
|
||||
destination_project_dataset_table,
|
||||
schema_fields=None,
|
||||
schema_object=None,
|
||||
source_format='CSV',
|
||||
compression='NONE',
|
||||
skip_leading_rows=0,
|
||||
field_delimiter=',',
|
||||
max_bad_records=0,
|
||||
quote_character=None,
|
||||
allow_quoted_newlines=False,
|
||||
allow_jagged_rows=False,
|
||||
bigquery_conn_id='bigquery_default',
|
||||
google_cloud_storage_conn_id='google_cloud_default',
|
||||
delegate_to=None,
|
||||
src_fmt_configs={},
|
||||
labels=None,
|
||||
*args, **kwargs):
|
||||
|
||||
super(BigQueryCreateExternalTableOperator, self).__init__(*args, **kwargs)
|
||||
|
||||
# GCS config
|
||||
self.bucket = bucket
|
||||
self.source_objects = source_objects
|
||||
self.schema_object = schema_object
|
||||
|
||||
# BQ config
|
||||
self.destination_project_dataset_table = destination_project_dataset_table
|
||||
self.schema_fields = schema_fields
|
||||
self.source_format = source_format
|
||||
self.compression = compression
|
||||
self.skip_leading_rows = skip_leading_rows
|
||||
self.field_delimiter = field_delimiter
|
||||
self.max_bad_records = max_bad_records
|
||||
self.quote_character = quote_character
|
||||
self.allow_quoted_newlines = allow_quoted_newlines
|
||||
self.allow_jagged_rows = allow_jagged_rows
|
||||
|
||||
self.bigquery_conn_id = bigquery_conn_id
|
||||
self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
|
||||
self.delegate_to = delegate_to
|
||||
|
||||
self.src_fmt_configs = src_fmt_configs
|
||||
self.labels = labels
|
||||
|
||||
def execute(self, context):
|
||||
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
|
||||
delegate_to=self.delegate_to)
|
||||
|
||||
if not self.schema_fields and self.schema_object \
|
||||
and self.source_format != 'DATASTORE_BACKUP':
|
||||
gcs_hook = GoogleCloudStorageHook(
|
||||
google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
|
||||
delegate_to=self.delegate_to)
|
||||
schema_fields = json.loads(gcs_hook.download(
|
||||
self.bucket,
|
||||
self.schema_object).decode("utf-8"))
|
||||
else:
|
||||
schema_fields = self.schema_fields
|
||||
|
||||
source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
|
||||
for source_object in self.source_objects]
|
||||
conn = bq_hook.get_conn()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.create_external_table(
|
||||
external_project_dataset_table=self.destination_project_dataset_table,
|
||||
schema_fields=schema_fields,
|
||||
source_uris=source_uris,
|
||||
source_format=self.source_format,
|
||||
compression=self.compression,
|
||||
skip_leading_rows=self.skip_leading_rows,
|
||||
field_delimiter=self.field_delimiter,
|
||||
max_bad_records=self.max_bad_records,
|
||||
quote_character=self.quote_character,
|
||||
allow_quoted_newlines=self.allow_quoted_newlines,
|
||||
allow_jagged_rows=self.allow_jagged_rows,
|
||||
src_fmt_configs=self.src_fmt_configs,
|
||||
labels=self.labels
|
||||
)
|
||||
|
||||
|
||||
class BigQueryDeleteDatasetOperator(BaseOperator):
|
||||
""""
|
||||
This operator deletes an existing dataset from your Project in Big query.
|
||||
https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/delete
|
||||
:param project_id: The project id of the dataset.
|
||||
:type project_id: string
|
||||
:param dataset_id: The dataset to be deleted.
|
||||
:type dataset_id: string
|
||||
**Example**: ::
|
||||
delete_temp_data = BigQueryDeleteDatasetOperator(dataset_id = 'temp-dataset',
|
||||
project_id = 'temp-project',
|
||||
bigquery_conn_id='_my_gcp_conn_',
|
||||
task_id='Deletetemp',
|
||||
dag=dag)
|
||||
"""
|
||||
|
||||
template_fields = ('dataset_id', 'project_id')
|
||||
ui_color = '#f00004'
|
||||
|
||||
@apply_defaults
|
||||
def __init__(self,
|
||||
dataset_id,
|
||||
project_id=None,
|
||||
bigquery_conn_id='bigquery_default',
|
||||
delegate_to=None,
|
||||
*args, **kwargs):
|
||||
self.dataset_id = dataset_id
|
||||
self.project_id = project_id
|
||||
self.bigquery_conn_id = bigquery_conn_id
|
||||
self.delegate_to = delegate_to
|
||||
|
||||
self.log.info('Dataset id: %s', self.dataset_id)
|
||||
self.log.info('Project id: %s', self.project_id)
|
||||
|
||||
super(BigQueryDeleteDatasetOperator, self).__init__(*args, **kwargs)
|
||||
|
||||
def execute(self, context):
|
||||
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
|
||||
delegate_to=self.delegate_to)
|
||||
|
||||
conn = bq_hook.get_conn()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.delete_dataset(
|
||||
project_id=self.project_id,
|
||||
dataset_id=self.dataset_id
|
||||
)
|
||||
|
||||
|
||||
class BigQueryCreateEmptyDatasetOperator(BaseOperator):
|
||||
""""
|
||||
This operator is used to create new dataset for your Project in Big query.
|
||||
https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource
|
||||
:param project_id: The name of the project where we want to create the dataset.
|
||||
Don't need to provide, if projectId in dataset_reference.
|
||||
:type project_id: str
|
||||
:param dataset_id: The id of dataset. Don't need to provide,
|
||||
if datasetId in dataset_reference.
|
||||
:type dataset_id: str
|
||||
:param dataset_reference: Dataset reference that could be provided with request body.
|
||||
More info:
|
||||
https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource
|
||||
:type dataset_reference: dict
|
||||
**Example**: ::
|
||||
create_new_dataset = BigQueryCreateEmptyDatasetOperator(
|
||||
dataset_id = 'new-dataset',
|
||||
project_id = 'my-project',
|
||||
dataset_reference = {"friendlyName": "New Dataset"}
|
||||
bigquery_conn_id='_my_gcp_conn_',
|
||||
task_id='newDatasetCreator',
|
||||
dag=dag)
|
||||
"""
|
||||
|
||||
template_fields = ('dataset_id', 'project_id')
|
||||
ui_color = '#f0eee4'
|
||||
|
||||
@apply_defaults
|
||||
def __init__(self,
|
||||
dataset_id,
|
||||
project_id=None,
|
||||
dataset_reference=None,
|
||||
bigquery_conn_id='bigquery_default',
|
||||
delegate_to=None,
|
||||
*args, **kwargs):
|
||||
self.dataset_id = dataset_id
|
||||
self.project_id = project_id
|
||||
self.bigquery_conn_id = bigquery_conn_id
|
||||
self.dataset_reference = dataset_reference if dataset_reference else {}
|
||||
self.delegate_to = delegate_to
|
||||
|
||||
self.log.info('Dataset id: %s', self.dataset_id)
|
||||
self.log.info('Project id: %s', self.project_id)
|
||||
|
||||
super(BigQueryCreateEmptyDatasetOperator, self).__init__(*args, **kwargs)
|
||||
|
||||
def execute(self, context):
|
||||
bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
|
||||
delegate_to=self.delegate_to)
|
||||
|
||||
conn = bq_hook.get_conn()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.create_empty_dataset(
|
||||
project_id=self.project_id,
|
||||
dataset_id=self.dataset_id,
|
||||
dataset_reference=self.dataset_reference)
|
|
@ -18,9 +18,8 @@
|
|||
# under the License.
|
||||
|
||||
from airflow.sensors.base_sensor_operator import BaseSensorOperator
|
||||
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
|
||||
from airflow.utils.decorators import apply_defaults
|
||||
|
||||
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
|
||||
|
||||
class BigQuerySQLSensorOperator(BaseSensorOperator):
|
||||
"""
|
||||
|
@ -30,9 +29,9 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
|
|||
single value. If that value is coerced to false in some way,
|
||||
the sensor continues to wait.
|
||||
:type sql: str
|
||||
:param bigquery_conn_id: The connection ID to use when connecting to
|
||||
:param gcp_conn_id: The connection ID to use when connecting to
|
||||
Google BigQuery.
|
||||
:type bigquery_conn_id: str
|
||||
:type gcp_conn_id: str
|
||||
:param use_legacy_sql: Whether to use BQ legacy SQL
|
||||
:type use_legacy_sql: bool
|
||||
:param timeout: Time in seconds to wait for the sensor,
|
||||
|
@ -40,14 +39,13 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
|
|||
:type timeout: int
|
||||
"""
|
||||
|
||||
template_fields = BaseSensorOperator.template_fields + [
|
||||
template_fields = BaseSensorOperator.template_fields + (
|
||||
'sql',
|
||||
]
|
||||
)
|
||||
|
||||
@apply_defaults
|
||||
def __init__(self,
|
||||
sql,
|
||||
bigquery_conn_id='bigquery_default_conn',
|
||||
gcp_conn_id='bigquery_default_conn',
|
||||
use_legacy_sql=False,
|
||||
timeout=60*60*24,
|
||||
*args,
|
||||
|
@ -58,7 +56,7 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
|
|||
*args,
|
||||
**kwargs)
|
||||
self.sql = sql
|
||||
self.bigquery_conn_id = bigquery_conn_id
|
||||
self.gcp_conn_id = gcp_conn_id
|
||||
self.use_legacy_sql = use_legacy_sql
|
||||
self.poke_interval = 120
|
||||
self.mode = 'reschedule'
|
||||
|
@ -78,5 +76,5 @@ class BigQuerySQLSensorOperator(BaseSensorOperator):
|
|||
return True
|
||||
|
||||
def get_db_hook(self):
|
||||
return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
|
||||
return BigQueryHook(gcp_conn_id=self.gcp_conn_id,
|
||||
use_legacy_sql=self.use_legacy_sql)
|
||||
|
|
|
@ -1,38 +1,17 @@
|
|||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator as UpstreamGKEPodOperator
|
||||
|
||||
from google.auth.environment_vars import CREDENTIALS
|
||||
|
||||
from airflow import AirflowException
|
||||
|
||||
from airflow.contrib.hooks.gcp_container_hook import GKEClusterHook
|
||||
|
||||
from airflow.contrib.operators.gcp_container_operator import GKEPodOperator as UpstreamGKEPodOperator
|
||||
|
||||
KUBE_CONFIG_ENV_VAR = "KUBECONFIG"
|
||||
GCLOUD_APP_CRED = "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE"
|
||||
|
||||
# Note: In the next version of airflow this will change.
|
||||
# This module is deprecated. Please use `airflow.providers.google.cloud.operators.kubernetes_engine`.
|
||||
|
||||
class GKEPodOperator(UpstreamGKEPodOperator):
|
||||
"""
|
||||
We override execute and _set_env_from_extras methods to support:
|
||||
|
||||
- `CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE` environment variable that is
|
||||
set to the path of the Service Account JSON key file. This is necessary
|
||||
for gcloud to operate.
|
||||
|
||||
- Adjust when NamedTemporaryFile file descriptor is closed.
|
||||
|
||||
- Preserve XCOM result when do_xcom_push is True.
|
||||
|
||||
- Override init to default image_pull_policy=Always, in_cluster=False, do_xcom_push=False and GKE params
|
||||
|
||||
- In 1.10.x this inherited from upstream GKEPodOperator, rather than GKEStartPodOperator(v2)
|
||||
- In 1.10.x we needed to override the execute and helper methods to set an environment
|
||||
variable for authentication to work (CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE). Fixed in v2
|
||||
- We will keep this class and call the upstream GkeStartPodOperator now, because
|
||||
numerous places in our code still reference it
|
||||
- Overrides init to default image_pull_policy=Always, in_cluster=False,
|
||||
do_xcom_push=False and GKE params
|
||||
- Defaults reattach_on_restart=False to address a 1.10.12 regression where GkePodOperators
|
||||
reruns will simply attach to an existing pod and not perform any new work.
|
||||
|
||||
- Hard sets reattach_on_restart=False when do_xcom_push=True to address an error
|
||||
Retrying a failed task with do_xcom_push=True causes airflow to reattach to the pod
|
||||
eventually causing a 'Handshake status 500 Internal Server Error'. Logs will indicate
|
||||
|
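# A minimal sketch (an assumption, not this repo's exact implementation) of the thin
# wrapper described in the docstring above: keep a GKEPodOperator-style class, delegate
# to the upstream GKEStartPodOperator, and pin the defaults listed there. The class name
# and argument defaults below are illustrative only.
from airflow.providers.google.cloud.operators.kubernetes_engine import (
    GKEStartPodOperator,
)


class ExampleGKEPodOperator(GKEStartPodOperator):
    def __init__(self, *args, **kwargs):
        kwargs.setdefault("image_pull_policy", "Always")
        kwargs.setdefault("in_cluster", False)
        kwargs.setdefault("do_xcom_push", False)
        # Default off to avoid the 1.10.12 regression where reruns reattach
        # to an existing pod and perform no new work.
        kwargs.setdefault("reattach_on_restart", False)
        # A retried task with do_xcom_push=True must never reattach to the old pod.
        if kwargs.get("do_xcom_push"):
            kwargs["reattach_on_restart"] = False
        super().__init__(*args, **kwargs)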
@ -75,75 +54,3 @@ class GKEPodOperator(UpstreamGKEPodOperator):
|
|||
namespace=namespace,
|
||||
*args,
|
||||
**kwargs)
|
||||
|
||||
def execute(self, context):
|
||||
# We can remove this override once upgraded to 2.0. https://issues.apache.org/jira/browse/AIRFLOW-4072
|
||||
|
||||
# Moz specific - Commented out key_file references (Jason fixed auth behaviour with 1.10.2)
|
||||
# key_file = None
|
||||
|
||||
# If gcp_conn_id is not specified gcloud will use the default
|
||||
# service account credentials.
|
||||
if self.gcp_conn_id:
|
||||
from airflow.hooks.base_hook import BaseHook
|
||||
# extras is a deserialized json object
|
||||
extras = BaseHook.get_connection(self.gcp_conn_id).extra_dejson
|
||||
self._set_env_from_extras(extras=extras) # Moz specific since func no longer returns value
|
||||
|
||||
# Write config to a temp file and set the environment variable to point to it.
|
||||
# This is to avoid race conditions of reading/writing a single file
|
||||
with tempfile.NamedTemporaryFile() as conf_file:
|
||||
os.environ[KUBE_CONFIG_ENV_VAR] = conf_file.name
|
||||
# Attempt to get/update credentials
|
||||
# We call gcloud directly instead of using google-cloud-python api
|
||||
# because there is no way to write kubernetes config to a file, which is
|
||||
# required by KubernetesPodOperator.
|
||||
# The gcloud command looks at the env variable `KUBECONFIG` for where to save
|
||||
# the kubernetes config file.
|
||||
subprocess.check_call(
|
||||
["gcloud", "container", "clusters", "get-credentials",
|
||||
self.cluster_name,
|
||||
"--zone", self.location,
|
||||
"--project", self.project_id])
|
||||
|
||||
# if key_file: # Moz specific commented out
|
||||
# key_file.close() # Moz specific commented out
|
||||
|
||||
# Tell `KubernetesPodOperator` where the config file is located
|
||||
self.config_file = os.environ[KUBE_CONFIG_ENV_VAR]
|
||||
result = super(UpstreamGKEPodOperator, self).execute(context) # Moz specific
|
||||
if self.do_xcom_push: # Moz specific
|
||||
return result # Moz specific
|
||||
|
||||
|
||||
def _set_env_from_extras(self, extras):
|
||||
"""
|
||||
Sets the environment variable `GOOGLE_APPLICATION_CREDENTIALS` and
|
||||
`CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE` with either:
|
||||
|
||||
- The path to the keyfile from the specified connection id
|
||||
- A generated file's path if the user specified JSON in the connection id. The
|
||||
file is assumed to be deleted after the process dies due to how mkstemp()
|
||||
works.
|
||||
|
||||
The environment variable is used inside the gcloud command to determine correct
|
||||
service account to use.
|
||||
"""
|
||||
key_path = self._get_field(extras, 'key_path', False)
|
||||
keyfile_json_str = self._get_field(extras, 'keyfile_dict', False)
|
||||
|
||||
if not key_path and not keyfile_json_str:
|
||||
self.log.info('Using gcloud with application default credentials.')
|
||||
elif key_path:
|
||||
os.environ[CREDENTIALS] = key_path
|
||||
os.environ[GCLOUD_APP_CRED] = key_path
|
||||
return None
|
||||
else:
|
||||
# Write service account JSON to secure file for gcloud to reference
|
||||
service_key = tempfile.NamedTemporaryFile(delete=False)
|
||||
service_key.write(keyfile_json_str.encode('utf-8'))
|
||||
os.environ[CREDENTIALS] = service_key.name
|
||||
os.environ[GCLOUD_APP_CRED] = service_key.name
|
||||
# Return file object to have a pointer to close after use,
|
||||
# thus deleting from file system.
|
||||
service_key.close() # Moz specific instead of return service_key
|
||||
|
|
|
@ -1,459 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from datetime import timedelta
|
||||
|
||||
from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
|
||||
from airflow.contrib.operators.dataproc_operator import DataprocOperationBaseOperator
|
||||
from airflow.exceptions import AirflowException
|
||||
from airflow.models import BaseOperator
|
||||
from airflow.utils.decorators import apply_defaults
|
||||
from airflow.utils import timezone
|
||||
from airflow.version import version
|
||||
|
||||
"""
|
||||
We overwrite DataprocClusterCreateOperator here to create clusters with an option to
|
||||
install component gateway, which we install by default. We also add labels to the gce
|
||||
cluster config.
|
||||
|
||||
Previously on 1.10.2, we had to include DataprocOperationBaseOperator from master
|
||||
which used the v1beta2 rest api for creating clusters allowing us to install optional
|
||||
components and component gateway, but this class has been updated since 1.10.4.
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=too-many-instance-attributes
|
||||
class DataprocClusterCreateOperator(DataprocOperationBaseOperator):
|
||||
"""
|
||||
--
|
||||
Pulled from 1.10.7
|
||||
|
||||
We modify the _build_gce_cluster_config method to install component gateway.
|
||||
--
|
||||
Create a new cluster on Google Cloud Dataproc. The operator will wait until the
|
||||
creation is successful or an error occurs in the creation process.
|
||||
|
||||
The parameters allow you to configure the cluster. Please refer to
|
||||
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters
|
||||
|
||||
for a detailed explanation on the different parameters. Most of the configuration
|
||||
parameters detailed in the link are available as parameters to this operator.

:param cluster_name: The name of the DataProc cluster to create. (templated)
:type cluster_name: str
:param project_id: The ID of the Google Cloud project in which
    to create the cluster. (templated)
:type project_id: str
:param num_workers: The number of workers to spin up. If set to zero, the
    cluster is spun up in single-node mode.
:type num_workers: int
:param storage_bucket: The storage bucket to use. Setting this to None lets Dataproc
    generate a custom one for you.
:type storage_bucket: str
:param init_actions_uris: List of GCS URIs containing
    Dataproc initialization scripts.
:type init_actions_uris: list[str]
:param init_action_timeout: Amount of time the executable scripts in
    init_actions_uris have to complete.
:type init_action_timeout: str
:param metadata: dict of key-value Google Compute Engine metadata entries
    to add to all instances.
:type metadata: dict
:param image_version: the version of the software inside the Dataproc cluster.
:type image_version: str
:param custom_image: custom Dataproc image; for more info see
    https://cloud.google.com/dataproc/docs/guides/dataproc-images
:type custom_image: str
:param custom_image_project_id: project id for the custom Dataproc image; for more info see
    https://cloud.google.com/dataproc/docs/guides/dataproc-images
:type custom_image_project_id: str
:param autoscaling_policy: The autoscaling policy used by the cluster. Only resource names
    including project id and location (region) are valid. Example:
    ``projects/[projectId]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]``
:type autoscaling_policy: str
:param properties: dict of properties to set on
    config files (e.g. spark-defaults.conf); see
    https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#SoftwareConfig
:type properties: dict
:param optional_components: List of optional cluster components; for more info see
    https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig#Component
:type optional_components: list[str]
:param num_masters: The number of master nodes to spin up.
:type num_masters: int
:param master_machine_type: Compute Engine machine type to use for the master node.
:type master_machine_type: str
:param master_disk_type: Type of the boot disk for the master node
    (default is ``pd-standard``).
    Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
    ``pd-standard`` (Persistent Disk Hard Disk Drive).
:type master_disk_type: str
:param master_disk_size: Disk size for the master node.
:type master_disk_size: int
:param master_num_local_ssds: Number of local SSDs to mount (default is 0).
    Local SSDs are used for reading and writing Apache Hadoop and Apache Spark
    scratch files, such as shuffle outputs; adding SSDs improves Spark runtime
    performance.
:type master_num_local_ssds: int
:param worker_machine_type: Compute Engine machine type to use for the worker nodes.
:type worker_machine_type: str
:param worker_disk_type: Type of the boot disk for the worker nodes
    (default is ``pd-standard``).
    Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
    ``pd-standard`` (Persistent Disk Hard Disk Drive).
:type worker_disk_type: str
:param worker_disk_size: Disk size for the worker nodes.
:type worker_disk_size: int
:param worker_num_local_ssds: Number of local SSDs to mount (default is 0).
    Local SSDs are used for reading and writing Apache Hadoop and Apache Spark
    scratch files, such as shuffle outputs; adding SSDs improves Spark runtime
    performance.
:type worker_num_local_ssds: int
:param num_preemptible_workers: The number of preemptible worker nodes to spin up.
:type num_preemptible_workers: int
:param labels: dict of labels to add to the cluster.
:type labels: dict
:param zone: The zone where the cluster will be located. Set to None to auto-zone. (templated)
:type zone: str
:param network_uri: The network URI to be used for machine communication; cannot be
    specified together with subnetwork_uri.
:type network_uri: str
:param subnetwork_uri: The subnetwork URI to be used for machine communication;
    cannot be specified together with network_uri.
:type subnetwork_uri: str
:param internal_ip_only: If true, all instances in the cluster will only
    have internal IP addresses. This can only be enabled for subnetwork-enabled
    networks.
:type internal_ip_only: bool
:param tags: The GCE tags to add to all instances.
:type tags: list[str]
:param region: leave as 'global'; might become relevant in the future. (templated)
:type region: str
:param gcp_conn_id: The connection ID to use when connecting to Google Cloud Platform.
:type gcp_conn_id: str
:param delegate_to: The account to impersonate, if any.
    For this to work, the service account making the request must have domain-wide
    delegation enabled.
:type delegate_to: str
:param service_account: The service account of the Dataproc instances.
:type service_account: str
:param service_account_scopes: The URIs of service account scopes to be included.
:type service_account_scopes: list[str]
:param idle_delete_ttl: The longest duration the cluster is kept alive while
    staying idle. Passing this threshold causes the cluster to be auto-deleted.
    A duration in seconds.
:type idle_delete_ttl: int
:param auto_delete_time: The time when the cluster will be auto-deleted.
:type auto_delete_time: datetime.datetime
:param auto_delete_ttl: The lifetime of the cluster; the cluster will be
    auto-deleted at the end of this duration.
    A duration in seconds. (If auto_delete_time is set, this parameter is ignored.)
:type auto_delete_ttl: int
:param customer_managed_key: The customer-managed key used for disk encryption, e.g.
    ``projects/[PROJECT_STORING_KEYS]/locations/[LOCATION]/keyRings/[KEY_RING_NAME]/cryptoKeys/[KEY_NAME]``  # noqa  # pylint: disable=line-too-long
:type customer_managed_key: str

Moz specific:
:param install_component_gateway: Install the alpha-feature component gateway.
:type install_component_gateway: bool
"""

template_fields = ['cluster_name', 'project_id', 'zone', 'region']

# pylint: disable=too-many-arguments,too-many-locals
@apply_defaults
def __init__(self,
             project_id,
             cluster_name,
             num_workers,
             job_name=None,  # Moz specific
             zone=None,
             network_uri=None,
             subnetwork_uri=None,
             internal_ip_only=None,
             tags=None,
             storage_bucket=None,
             init_actions_uris=None,
             init_action_timeout="10m",
             metadata=None,
             custom_image=None,
             custom_image_project_id=None,
             image_version=None,
             autoscaling_policy=None,
             properties=None,
             optional_components=['ANACONDA'],  # Moz specific
             num_masters=1,
             master_machine_type='n1-standard-4',
             master_disk_type='pd-standard',
             master_disk_size=500,
             master_num_local_ssds=0,
             worker_machine_type='n1-standard-4',
             worker_disk_type='pd-standard',
             worker_disk_size=500,
             worker_num_local_ssds=0,
             num_preemptible_workers=0,
             labels=None,
             region='global',
             service_account=None,
             service_account_scopes=None,
             idle_delete_ttl=None,
             auto_delete_time=None,
             auto_delete_ttl=None,
             customer_managed_key=None,
             install_component_gateway=True,  # Moz specific
             *args,
             **kwargs):

    super(DataprocClusterCreateOperator, self).__init__(
        project_id=project_id, region=region, *args, **kwargs)
    self.cluster_name = cluster_name
    self.job_name = job_name
    self.num_masters = num_masters
    self.num_workers = num_workers
    self.num_preemptible_workers = num_preemptible_workers
    self.storage_bucket = storage_bucket
    self.init_actions_uris = init_actions_uris
    self.init_action_timeout = init_action_timeout
    self.metadata = metadata
    self.custom_image = custom_image
    self.custom_image_project_id = custom_image_project_id
    self.image_version = image_version
    self.properties = properties or dict()
    self.optional_components = optional_components
    self.master_machine_type = master_machine_type
    self.master_disk_type = master_disk_type
    self.master_disk_size = master_disk_size
    self.master_num_local_ssds = master_num_local_ssds
    self.autoscaling_policy = autoscaling_policy
    self.worker_machine_type = worker_machine_type
    self.worker_disk_type = worker_disk_type
    self.worker_disk_size = worker_disk_size
    self.worker_num_local_ssds = worker_num_local_ssds
    self.labels = labels
    self.zone = zone
    self.network_uri = network_uri
    self.subnetwork_uri = subnetwork_uri
    self.internal_ip_only = internal_ip_only
    self.tags = tags
    self.service_account = service_account
    self.service_account_scopes = service_account_scopes
    self.idle_delete_ttl = idle_delete_ttl
    self.auto_delete_time = auto_delete_time
    self.auto_delete_ttl = auto_delete_ttl
    self.customer_managed_key = customer_managed_key
    self.single_node = num_workers == 0
    self.install_component_gateway = install_component_gateway  # Moz specific

    assert not (self.custom_image and self.image_version), \
        "custom_image and image_version can't be both set"

    assert (
        not self.single_node or (
            self.single_node and self.num_preemptible_workers == 0
        )
    ), "num_workers == 0 means single node mode - no preemptibles allowed"

def _get_init_action_timeout(self):
    match = re.match(r"^(\d+)(s|m)$", self.init_action_timeout)
    if match:
        if match.group(2) == "s":
            return self.init_action_timeout
        elif match.group(2) == "m":
            val = float(match.group(1))
            return "{}s".format(timedelta(minutes=val).seconds)

    raise AirflowException(
        "DataprocClusterCreateOperator init_action_timeout"
        " should be expressed in minutes or seconds. i.e. 10m, 30s")

def _build_gce_cluster_config(self, cluster_data):
    """
    We optionally add alpha feature 'enable component gateway'.
    """
    if self.install_component_gateway:  # Moz specific start
        # Fetch current nested dict and add nested keys
        cluster_config_new = cluster_data['config']
        cluster_config_new.update({'endpointConfig': {'enableHttpPortAccess': True}})

        # Overwrite the config key with newly created
        cluster_data.update({'config': cluster_config_new})  # Moz specific end

    if self.zone:
        zone_uri = \
            'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
                self.project_id, self.zone
            )
        cluster_data['config']['gceClusterConfig']['zoneUri'] = zone_uri

    if self.metadata:
        cluster_data['config']['gceClusterConfig']['metadata'] = self.metadata

    if self.network_uri:
        cluster_data['config']['gceClusterConfig']['networkUri'] = self.network_uri

    if self.subnetwork_uri:
        cluster_data['config']['gceClusterConfig']['subnetworkUri'] = \
            self.subnetwork_uri

    if self.internal_ip_only:
        if not self.subnetwork_uri:
            raise AirflowException("Set internal_ip_only to true only when"
                                   " you pass a subnetwork_uri.")
        cluster_data['config']['gceClusterConfig']['internalIpOnly'] = True

    if self.tags:
        cluster_data['config']['gceClusterConfig']['tags'] = self.tags

    if self.service_account:
        cluster_data['config']['gceClusterConfig']['serviceAccount'] = \
            self.service_account

    if self.service_account_scopes:
        cluster_data['config']['gceClusterConfig']['serviceAccountScopes'] = \
            self.service_account_scopes

    return cluster_data

def _build_lifecycle_config(self, cluster_data):
    if self.idle_delete_ttl:
        cluster_data['config']['lifecycleConfig']['idleDeleteTtl'] = \
            "{}s".format(self.idle_delete_ttl)

    if self.auto_delete_time:
        utc_auto_delete_time = timezone.convert_to_utc(self.auto_delete_time)
        cluster_data['config']['lifecycleConfig']['autoDeleteTime'] = \
            utc_auto_delete_time.format('%Y-%m-%dT%H:%M:%S.%fZ', formatter='classic')
    elif self.auto_delete_ttl:
        cluster_data['config']['lifecycleConfig']['autoDeleteTtl'] = \
            "{}s".format(self.auto_delete_ttl)

    return cluster_data

def _build_cluster_data(self):
    if self.zone:
        master_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}"\
            .format(self.project_id, self.zone, self.master_machine_type)
        worker_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}"\
            .format(self.project_id, self.zone, self.worker_machine_type)
    else:
        master_type_uri = self.master_machine_type
        worker_type_uri = self.worker_machine_type

    cluster_data = {
        'projectId': self.project_id,
        'clusterName': self.cluster_name,
        'labels': {},
        'config': {
            'gceClusterConfig': {
            },
            'masterConfig': {
                'numInstances': self.num_masters,
                'machineTypeUri': master_type_uri,
                'diskConfig': {
                    'bootDiskType': self.master_disk_type,
                    'bootDiskSizeGb': self.master_disk_size,
                    'numLocalSsds': self.master_num_local_ssds,
                }
            },
            'workerConfig': {
                'numInstances': self.num_workers,
                'machineTypeUri': worker_type_uri,
                'diskConfig': {
                    'bootDiskType': self.worker_disk_type,
                    'bootDiskSizeGb': self.worker_disk_size,
                    'numLocalSsds': self.worker_num_local_ssds,
                }
            },
            'secondaryWorkerConfig': {},
            'softwareConfig': {},
            'lifecycleConfig': {},
            'encryptionConfig': {},
            'autoscalingConfig': {},
        }
    }
    if self.num_preemptible_workers > 0:
        cluster_data['config']['secondaryWorkerConfig'] = {
            'numInstances': self.num_preemptible_workers,
            'machineTypeUri': worker_type_uri,
            'diskConfig': {
                'bootDiskType': self.worker_disk_type,
                'bootDiskSizeGb': self.worker_disk_size
            },
            'isPreemptible': True
        }

    cluster_data['labels'] = self.labels or {}

    # Dataproc labels must conform to the following regex:
    # [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version string follows
    # semantic versioning spec: x.y.z).
    cluster_data['labels'].update({'airflow-version':
                                   'v' + version.replace('.', '-').replace('+', '-')})
    # Moz specific
    cluster_data['labels'].update({'owner': self.owner.lower().replace('@mozilla.com', '').replace('.', '-'),
                                   'env': os.getenv('DEPLOY_ENVIRONMENT', 'env_not_set'),
                                   'jobname': self.job_name.lower().replace('_', '-')})

    if self.storage_bucket:
        cluster_data['config']['configBucket'] = self.storage_bucket

    if self.image_version:
        cluster_data['config']['softwareConfig']['imageVersion'] = self.image_version

    elif self.custom_image:
        project_id = self.custom_image_project_id if (self.custom_image_project_id) else self.project_id
        custom_image_url = 'https://www.googleapis.com/compute/beta/projects/' \
                           '{}/global/images/{}'.format(project_id,
                                                        self.custom_image)
        cluster_data['config']['masterConfig']['imageUri'] = custom_image_url
        if not self.single_node:
            cluster_data['config']['workerConfig']['imageUri'] = custom_image_url

    cluster_data = self._build_gce_cluster_config(cluster_data)

    if self.single_node:
        self.properties["dataproc:dataproc.allow.zero.workers"] = "true"

    if self.properties:
        cluster_data['config']['softwareConfig']['properties'] = self.properties

    if self.optional_components:
        cluster_data['config']['softwareConfig']['optionalComponents'] = self.optional_components

    cluster_data = self._build_lifecycle_config(cluster_data)

    if self.init_actions_uris:
        init_actions_dict = [
            {
                'executableFile': uri,
                'executionTimeout': self._get_init_action_timeout()
            } for uri in self.init_actions_uris
        ]
        cluster_data['config']['initializationActions'] = init_actions_dict

    if self.customer_managed_key:
        cluster_data['config']['encryptionConfig'] = \
            {'gcePdKmsKeyName': self.customer_managed_key}
    if self.autoscaling_policy:
        cluster_data['config']['autoscalingConfig'] = {'policyUri': self.autoscaling_policy}

    return cluster_data

def start(self):
    """
    Create a new cluster on Google Cloud Dataproc.
    """
    self.log.info('Creating cluster: %s', self.cluster_name)
    cluster_data = self._build_cluster_data()

    return (
        self.hook.get_conn().projects().regions().clusters().create(  # pylint: disable=no-member
            projectId=self.project_id,
            region=self.region,
            body=cluster_data,
            requestId=str(uuid.uuid4()),
        ).execute())

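For illustration only (not part of this commit): a minimal sketch of the request-body shape the operator above posts to the Dataproc `clusters.create` REST endpoint when `install_component_gateway=True`. The helper name `minimal_cluster_body` and the values are hypothetical; only the nested `endpointConfig.enableHttpPortAccess` key mirrors what the Moz-specific `_build_gce_cluster_config` tweak adds.

def minimal_cluster_body(project_id, cluster_name, num_workers):
    # Sketch of the camelCase REST body built by _build_cluster_data();
    # the Moz-specific tweak adds config.endpointConfig for the component gateway.
    return {
        'projectId': project_id,
        'clusterName': cluster_name,
        'config': {
            'gceClusterConfig': {},
            'masterConfig': {'numInstances': 1},
            'workerConfig': {'numInstances': num_workers},
            'endpointConfig': {'enableHttpPortAccess': True},  # component gateway
        },
    }
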
@ -1,10 +1,8 @@
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
import logging
import time

class SleepOperator(BaseOperator):
    @apply_defaults
    def __init__(self, sleep_time=30, *args, **kwargs):
        super(SleepOperator, self).__init__(*args, **kwargs)
        self.sleep_time = sleep_time

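The hunk above only touches SleepOperator's imports and constructor; its execute method lies outside the hunk and is not shown. As a hedged, self-contained sketch of how such an operator conventionally looks, the class below fills in an assumed execute body (the time.sleep call and log message are assumptions, not taken from this diff):

import time

from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class SleepOperatorSketch(BaseOperator):
    """Illustrative only: sleeps for a fixed number of seconds."""

    @apply_defaults
    def __init__(self, sleep_time=30, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sleep_time = sleep_time

    def execute(self, context):
        # Assumed body; the real execute() is outside this hunk.
        self.log.info("Sleeping for %s seconds", self.sleep_time)
        time.sleep(self.sleep_time)
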
@ -5,9 +5,8 @@ from sqlalchemy import func

from airflow.exceptions import AirflowException
from airflow.models import DagBag, DagModel, DagRun, TaskInstance
from airflow.operators.sensors import ExternalTaskSensor
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.utils.db import provide_session
from airflow.utils.decorators import apply_defaults
from airflow.utils.state import State


@ -28,7 +27,6 @@ class ExternalTaskCompletedSensor(ExternalTaskSensor):
    """

    @apply_defaults
    def __init__(self, failed_states=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.failed_states = failed_states or [State.FAILED, State.UPSTREAM_FAILED, State.SKIPPED]

@ -41,7 +39,7 @@ class ExternalTaskCompletedSensor(ExternalTaskSensor):
            dttm = context['execution_date'] - self.execution_delta
        elif self.execution_date_fn:
            # Moz specific - _handle_execution_date_fn may not be defined in this context
            raise AirflowException("execution_date_fn is not supported by this sensor.")
            raise AirflowException("execution_date_fn is not supported by this custom mozilla sensor.")
        else:
            dttm = context['execution_date']

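For reference, a minimal sketch of how this sensor is typically wired into a DAG, waiting on a task in an upstream DAG. The DAG id, task id, and delta below are illustrative assumptions rather than values from this commit; `dag` is assumed to be a surrounding DAG object.

from datetime import timedelta

from operators.task_sensor import ExternalTaskCompletedSensor

# Illustrative usage: poke until "some_task" in "upstream_dag" finishes for the
# same logical date (both DAGs assumed to run on the same daily schedule).
wait_for_upstream = ExternalTaskCompletedSensor(
    task_id="wait_for_upstream",
    external_dag_id="upstream_dag",       # assumed DAG id
    external_task_id="some_task",         # assumed task id
    execution_delta=timedelta(hours=0),   # same execution_date as this DAG
    check_existence=True,
    mode="reschedule",                    # free the worker slot between pokes
    dag=dag,
)
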
@ -1,14 +1,8 @@
|
|||
from airflow import DAG
|
||||
from datetime import datetime, timedelta
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.executors import get_default_executor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
from utils.dataproc import (
|
||||
moz_dataproc_pyspark_runner,
|
||||
moz_dataproc_jar_runner,
|
||||
get_dataproc_parameters,
|
||||
)
|
||||
|
||||
from utils.gcp import (
|
||||
bigquery_etl_query,
|
||||
bigquery_etl_copy_deduplicate,
|
||||
|
@ -91,7 +85,6 @@ main_summary_export = SubDagOperator(
|
|||
default_args=default_args,
|
||||
num_workers=40),
|
||||
task_id="main_summary_export",
|
||||
executor=get_default_executor(),
|
||||
dag=dag)
|
||||
|
||||
clients_daily_export = SubDagOperator(
|
||||
|
@ -141,7 +134,6 @@ clients_daily_export = SubDagOperator(
|
|||
default_args=default_args,
|
||||
num_preemptible_workers=10),
|
||||
task_id="clients_daily_export",
|
||||
executor=get_default_executor(),
|
||||
dag=dag)
|
||||
|
||||
wait_for_clients_daily = ExternalTaskCompletedSensor(
|
||||
|
|
|
@ -1,17 +1,16 @@
|
|||
from airflow import DAG
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.contrib.operators.dataproc_operator import (
|
||||
DataprocClusterCreateOperator,
|
||||
DataprocClusterDeleteOperator,
|
||||
DataProcPySparkOperator,
|
||||
from airflow.providers.google.cloud.operators.dataproc import (
|
||||
DataprocCreateClusterOperator,
|
||||
DataprocDeleteClusterOperator,
|
||||
DataprocSubmitPySparkJobOperator,
|
||||
)
|
||||
|
||||
|
||||
def spark_subdag(
|
||||
parent_dag_name,
|
||||
child_dag_name,
|
||||
default_args,
|
||||
gcp_conn_id,
|
||||
project_id,
|
||||
service_account,
|
||||
main,
|
||||
pyfiles,
|
||||
|
@ -27,6 +26,7 @@ def spark_subdag(
|
|||
:param str child_dag_name: Name of the child DAG.
|
||||
:param Dict[str, Any] default_args: Default arguments for the child DAG.
|
||||
:param str gcp_conn_id: Name of the connection string.
|
||||
:param str project_id: GCP project id corresponding to the gcp_conn_id.
|
||||
:param str service_account: The address of the service account.
|
||||
:param str dataproc_region: The region of the DataProc cluster.
|
||||
:param str main:
|
||||
|
@ -36,12 +36,10 @@ def spark_subdag(
|
|||
:return: DAG
|
||||
"""
|
||||
|
||||
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
|
||||
|
||||
shared_config = {
|
||||
"cluster_name": "prio-staging-{{ds_nodash}}",
|
||||
"gcp_conn_id": gcp_conn_id,
|
||||
"project_id": connection.project_id,
|
||||
"project_id": project_id,
|
||||
# From an error when not specifying the region:
|
||||
# - Dataproc images 2.0 and higher do not support the to-be
|
||||
# deprecated global region. Please use any non-global Dataproc
|
||||
|
@ -54,7 +52,7 @@ def spark_subdag(
|
|||
}
|
||||
|
||||
with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag:
|
||||
create_dataproc_cluster = DataprocClusterCreateOperator(
|
||||
create_dataproc_cluster = DataprocCreateClusterOperator(
|
||||
task_id="create_dataproc_cluster",
|
||||
image_version="preview-ubuntu18",
|
||||
service_account=service_account,
|
||||
|
@ -68,10 +66,10 @@ def spark_subdag(
|
|||
**shared_config,
|
||||
)
|
||||
|
||||
run_dataproc_spark = DataProcPySparkOperator(
|
||||
run_dataproc_spark = DataprocSubmitPySparkJobOperator(
|
||||
task_id="run_dataproc_spark",
|
||||
main=main,
|
||||
dataproc_pyspark_jars=[
|
||||
dataproc_jars=[
|
||||
"gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
|
||||
],
|
||||
pyfiles=pyfiles,
|
||||
|
@ -80,7 +78,7 @@ def spark_subdag(
|
|||
**shared_config,
|
||||
)
|
||||
|
||||
delete_dataproc_cluster = DataprocClusterDeleteOperator(
|
||||
delete_dataproc_cluster = DataprocDeleteClusterOperator(
|
||||
task_id="delete_dataproc_cluster",
|
||||
trigger_rule="all_done",
|
||||
dag=dag,
|
||||
|
|
|
@ -2,10 +2,9 @@ from datetime import timedelta
|
|||
from os import environ
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.contrib.operators.gcp_container_operator import (
|
||||
GKEClusterCreateOperator,
|
||||
GKEClusterDeleteOperator,
|
||||
from airflow.providers.google.cloud.operators.kubernetes_engine import (
|
||||
GKECreateClusterOperator,
|
||||
GKEDeleteClusterOperator,
|
||||
)
|
||||
from airflow.operators.bash_operator import BashOperator
|
||||
from operators.gcp_container_operator import GKEPodOperator
|
||||
|
@ -17,6 +16,7 @@ def container_subdag(
|
|||
child_dag_name,
|
||||
default_args,
|
||||
gcp_conn_id,
|
||||
project_id,
|
||||
service_account,
|
||||
server_id,
|
||||
env_vars={},
|
||||
|
@ -35,6 +35,7 @@ def container_subdag(
|
|||
:param str child_dag_name: Name of the child DAG.
|
||||
:param Dict[str, Any] default_args: Default arguments for the child DAG.
|
||||
:param str gcp_conn_id: Name of the connection string.
|
||||
:param str project_id: GCP project id associated with the gcp_conn_id.
|
||||
:param str service_account: The address of the service account.
|
||||
:param str server_id: The identifier for the Prio processor
|
||||
:param Dict[str, str] env_vars: Environment variables for configuring
|
||||
|
@ -50,12 +51,10 @@ def container_subdag(
|
|||
"""
|
||||
assert server_id in ["a", "b", "admin"]
|
||||
|
||||
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
|
||||
|
||||
cluster_name = f"gke-prio-{server_id}"
|
||||
|
||||
shared_config = {
|
||||
"project_id": connection.project_id,
|
||||
"project_id": project_id,
|
||||
"gcp_conn_id": gcp_conn_id,
|
||||
"location": location,
|
||||
}
|
||||
|
@ -67,7 +66,7 @@ def container_subdag(
|
|||
# https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
|
||||
# https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator
|
||||
# https://airflow.apache.org/docs/stable/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html
|
||||
create_gke_cluster = GKEClusterCreateOperator(
|
||||
create_gke_cluster = GKECreateClusterOperator(
|
||||
task_id="create_gke_cluster",
|
||||
body=create_gke_config(
|
||||
name=cluster_name,
|
||||
|
@ -141,7 +140,7 @@ def container_subdag(
|
|||
**kwargs,
|
||||
)
|
||||
|
||||
delete_gke_cluster = GKEClusterDeleteOperator(
|
||||
delete_gke_cluster = GKEDeleteClusterOperator(
|
||||
task_id="delete_gke_cluster",
|
||||
name=cluster_name,
|
||||
trigger_rule="all_done",
|
||||
|
|
|
@ -37,10 +37,9 @@ the environment.
|
|||
from functools import partial
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.operators.gcs_to_gcs import (
|
||||
GoogleCloudStorageToGoogleCloudStorageOperator,
|
||||
)
|
||||
from airflow.operators import DummyOperator, PythonOperator
|
||||
from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator
|
||||
from airflow.operators.python import PythonOperator
|
||||
from airflow.operators.dummy import DummyOperator
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from prio import dataproc, kubernetes
|
||||
|
||||
|
@ -56,7 +55,7 @@ def transfer_data_subdag(
|
|||
submission_date,
|
||||
server_id,
|
||||
public_key_hex_external,
|
||||
google_cloud_storage_conn_id,
|
||||
gcp_conn_id,
|
||||
):
|
||||
"""Copy the partitioned data from the staging bucket into the corresponding
|
||||
receiving buckets in each processor. The job then submits a `_SUCCESS` file
|
||||
|
@ -78,22 +77,22 @@ def transfer_data_subdag(
|
|||
"raw/shares",
|
||||
]
|
||||
)
|
||||
transfer_dataset = GoogleCloudStorageToGoogleCloudStorageOperator(
|
||||
transfer_dataset = GCSToGCSOperator(
|
||||
task_id="transfer_dataset",
|
||||
source_bucket=source_bucket,
|
||||
source_object=f"staging/submission_date={submission_date}/server_id={server_id}/*",
|
||||
destination_bucket=destination_bucket,
|
||||
destination_object=f"{prefix}/",
|
||||
google_cloud_storage_conn_id=google_cloud_storage_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
dag=dag,
|
||||
)
|
||||
mark_dataset_success = GoogleCloudStorageToGoogleCloudStorageOperator(
|
||||
mark_dataset_success = GCSToGCSOperator(
|
||||
task_id="mark_dataset_success",
|
||||
source_bucket=source_bucket,
|
||||
source_object="staging/_SUCCESS",
|
||||
destination_bucket=destination_bucket,
|
||||
destination_object=f"{prefix}/_SUCCESS",
|
||||
google_cloud_storage_conn_id=google_cloud_storage_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
dag=dag,
|
||||
)
|
||||
transfer_dataset >> mark_dataset_success
|
||||
|
@ -104,6 +103,7 @@ def ingestion_subdag(
|
|||
dag,
|
||||
default_args,
|
||||
gcp_conn_id,
|
||||
project_id,
|
||||
service_account,
|
||||
bucket_bootstrap_admin,
|
||||
bucket_data_admin,
|
||||
|
@ -125,13 +125,14 @@ def ingestion_subdag(
|
|||
default_args=default_args,
|
||||
server_id="admin",
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id,
|
||||
service_account=service_account,
|
||||
arguments=[
|
||||
"bash",
|
||||
"-xc",
|
||||
f"source bin/dataproc; bootstrap gs://{bucket_bootstrap_admin}",
|
||||
],
|
||||
env_var=dict(SUBMODULE="origin"),
|
||||
env_vars=dict(SUBMODULE="origin"),
|
||||
),
|
||||
task_id="bootstrap",
|
||||
dag=dag,
|
||||
|
@ -144,6 +145,7 @@ def ingestion_subdag(
|
|||
child_dag_name="staging",
|
||||
default_args=default_args,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id,
|
||||
service_account=service_account,
|
||||
main=f"gs://{bucket_bootstrap_admin}/processor-origin.py",
|
||||
pyfiles=[f"gs://{bucket_bootstrap_admin}/prio_processor.egg"],
|
||||
|
@ -175,7 +177,7 @@ def ingestion_subdag(
|
|||
destination_bucket_prefix=bucket_prefix,
|
||||
app_name=app_name,
|
||||
submission_date="{{ ds }}",
|
||||
google_cloud_storage_conn_id=gcp_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
)
|
||||
|
||||
transfer_a = SubDagOperator(
|
||||
|
@ -213,7 +215,7 @@ def ingestion_subdag(
|
|||
|
||||
|
||||
def prio_processor_subdag(
|
||||
dag, default_args, gcp_conn_id, service_account, server_id, env_vars
|
||||
dag, default_args, gcp_conn_id, project_id, service_account, server_id, env_vars
|
||||
):
|
||||
return SubDagOperator(
|
||||
subdag=kubernetes.container_subdag(
|
||||
|
@ -221,6 +223,7 @@ def prio_processor_subdag(
|
|||
child_dag_name=f"processor_{server_id}",
|
||||
default_args=default_args,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id,
|
||||
service_account=service_account,
|
||||
server_id=server_id,
|
||||
arguments=["bin/process"],
|
||||
|
@ -231,7 +234,7 @@ def prio_processor_subdag(
|
|||
)
|
||||
|
||||
|
||||
def load_bigquery_subdag(dag, default_args, gcp_conn_id, service_account, env_vars):
|
||||
def load_bigquery_subdag(dag, default_args, gcp_conn_id, project_id, service_account, env_vars):
|
||||
# Take the resulting aggregates and insert them into a BigQuery table. This
|
||||
# table is effectively append-only, so rerunning the dag will cause duplicate
|
||||
# results. In practice, rerunning the DAG is problematic when operation is
|
||||
|
@ -243,6 +246,7 @@ def load_bigquery_subdag(dag, default_args, gcp_conn_id, service_account, env_va
|
|||
default_args=default_args,
|
||||
server_id="admin",
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id,
|
||||
service_account=service_account,
|
||||
arguments=["bash", "-c", "bin/insert"],
|
||||
env_vars=env_vars,
|
||||
|
|
|
@ -2,7 +2,6 @@ from datetime import datetime, timedelta
|
|||
from os import environ
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
|
||||
from prio.processor import ingestion_subdag, load_bigquery_subdag, prio_processor_subdag
|
||||
|
||||
DEFAULT_ARGS = {
|
||||
|
@ -26,13 +25,12 @@ IS_DEV = environ.get("DEPLOY_ENVIRONMENT") != "prod"
|
|||
ENVIRONMENT = "dev" if IS_DEV else "prod"
|
||||
|
||||
PRIO_ADMIN_CONN = "google_cloud_prio_admin"
|
||||
PRIO_ADMIN_PROJECT_ID = "moz-fx-prio-admin-prod-098j"
|
||||
PRIO_A_CONN = "google_cloud_prio_a"
|
||||
PRIO_A_PROJECT_ID = "moz-fx-prio-a-prod-kju7"
|
||||
|
||||
PROJECT_ADMIN = GoogleCloudStorageHook(PRIO_ADMIN_CONN).project_id
|
||||
PROJECT_A = GoogleCloudStorageHook(PRIO_A_CONN).project_id
|
||||
|
||||
SERVICE_ACCOUNT_ADMIN = f"prio-admin-runner@{PROJECT_ADMIN}.iam.gserviceaccount.com"
|
||||
SERVICE_ACCOUNT_A = f"prio-runner-{ENVIRONMENT}-a@{PROJECT_A}.iam.gserviceaccount.com"
|
||||
SERVICE_ACCOUNT_ADMIN = f"prio-admin-runner@{PRIO_ADMIN_PROJECT_ID}.iam.gserviceaccount.com"
|
||||
SERVICE_ACCOUNT_A = f"prio-runner-{ENVIRONMENT}-a@{PRIO_A_PROJECT_ID}.iam.gserviceaccount.com"
|
||||
|
||||
# Private bucket of server B is necessary for transfer
|
||||
BUCKET_PRIVATE_A = f"moz-fx-prio-{ENVIRONMENT}-a-private"
|
||||
|
@ -58,6 +56,7 @@ ingest = ingestion_subdag(
|
|||
dag,
|
||||
DEFAULT_ARGS,
|
||||
PRIO_ADMIN_CONN,
|
||||
PRIO_ADMIN_PROJECT_ID,
|
||||
SERVICE_ACCOUNT_ADMIN,
|
||||
BUCKET_BOOTSTRAP_ADMIN,
|
||||
BUCKET_DATA_ADMIN,
|
||||
|
@ -73,6 +72,7 @@ processor_a = prio_processor_subdag(
|
|||
dag,
|
||||
DEFAULT_ARGS,
|
||||
PRIO_A_CONN,
|
||||
PRIO_A_PROJECT_ID,
|
||||
SERVICE_ACCOUNT_A,
|
||||
"a",
|
||||
{
|
||||
|
@ -99,6 +99,7 @@ load_bigquery = load_bigquery_subdag(
|
|||
dag,
|
||||
DEFAULT_ARGS,
|
||||
PRIO_ADMIN_CONN,
|
||||
PRIO_ADMIN_PROJECT_ID,
|
||||
SERVICE_ACCOUNT_ADMIN,
|
||||
env_vars={
|
||||
"APP_NAME": APP_NAME,
|
||||
|
|
|
@ -2,7 +2,6 @@ from datetime import datetime, timedelta
|
|||
from os import environ
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
|
||||
from prio.processor import prio_processor_subdag
|
||||
|
||||
DEFAULT_ARGS = {
|
||||
|
@ -25,7 +24,7 @@ DEFAULT_ARGS = {
|
|||
IS_DEV = environ.get("DEPLOY_ENVIRONMENT") != "prod"
|
||||
ENVIRONMENT = "dev" if IS_DEV else "prod"
|
||||
PRIO_B_CONN = "google_cloud_prio_b"
|
||||
PROJECT_B = GoogleCloudStorageHook(PRIO_B_CONN).project_id
|
||||
PROJECT_B = "moz-fx-prio-b-prod-a67n"
|
||||
SERVICE_ACCOUNT_B = f"prio-runner-{ENVIRONMENT}-b@{PROJECT_B}.iam.gserviceaccount.com"
|
||||
BUCKET_PRIVATE_B = f"moz-fx-prio-{ENVIRONMENT}-b-private"
|
||||
BUCKET_SHARED_A = f"moz-fx-prio-{ENVIRONMENT}-a-shared"
|
||||
|
@ -48,6 +47,7 @@ processor_b = prio_processor_subdag(
|
|||
dag,
|
||||
DEFAULT_ARGS,
|
||||
PRIO_B_CONN,
|
||||
PROJECT_B,
|
||||
SERVICE_ACCOUNT_B,
|
||||
"b",
|
||||
{
|
||||
|
|
|
@ -2,13 +2,13 @@ import time
|
|||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
from airflow.models import Variable
|
||||
from airflow.operators.http_operator import SimpleHttpOperator
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from operators.gcp_container_operator import GKEPodOperator
|
||||
|
||||
|
||||
DOCS = """\
|
||||
# Probe Scraper
|
||||
|
||||
|
@ -36,6 +36,9 @@ resource.labels.pod_name="POD_NAME_FROM_AIRFLOW_LOGS" severity>=DEFAULT
|
|||
Adjust the time window as needed and you should be able to see logs associated with the failure.
|
||||
"""
|
||||
|
||||
DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION = "v1.17.0"
|
||||
|
||||
|
||||
default_args = {
|
||||
'owner': 'dthorn@mozilla.com',
|
||||
'depends_on_past': False,
|
||||
|
@ -52,7 +55,7 @@ with DAG('probe_scraper',
|
|||
schedule_interval='0 0 * * 1-5') as dag:
|
||||
|
||||
aws_conn_id='aws_prod_probe_scraper'
|
||||
aws_access_key, aws_secret_key, session = AwsHook(aws_conn_id).get_credentials()
|
||||
aws_access_key, aws_secret_key, session = AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials()
|
||||
|
||||
# Built from repo https://github.com/mozilla/probe-scraper
|
||||
probe_scraper_image='gcr.io/moz-fx-data-airflow-prod-88e0/probe-scraper:latest'
|
||||
|
@ -132,15 +135,20 @@ with DAG('probe_scraper',
|
|||
probe_scraper >> delay_python_task
|
||||
|
||||
gcp_gke_conn_id = "google_cloud_airflow_gke"
|
||||
project_id = "moz-fx-data-airflow-gke-prod"
|
||||
image_tag = Variable.get("lookml_generator_release_str")
|
||||
if image_tag is None:
|
||||
image_tag = DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION
|
||||
|
||||
lookml_generator_prod = GKEPodOperator(
|
||||
owner="ascholtz@mozilla.com",
|
||||
email=["ascholtz@mozilla.com", "dataops+alerts@mozilla.com"],
|
||||
task_id="lookml_generator",
|
||||
name="lookml-generator-1",
|
||||
image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:" + Variable.get("lookml_generator_release_str"),
|
||||
image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:" + image_tag,
|
||||
startup_timeout_seconds=500,
|
||||
gcp_conn_id=gcp_gke_conn_id,
|
||||
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_gke_conn_id).project_id,
|
||||
project_id=project_id,
|
||||
cluster_name="workloads-prod-v1",
|
||||
location="us-west1",
|
||||
dag=dag,
|
||||
|
@ -168,7 +176,7 @@ with DAG('probe_scraper',
|
|||
name="lookml-generator-staging-1",
|
||||
image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:latest",
|
||||
gcp_conn_id=gcp_gke_conn_id,
|
||||
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_gke_conn_id).project_id,
|
||||
project_id=project_id,
|
||||
cluster_name="workloads-prod-v1",
|
||||
location="us-west1",
|
||||
dag=dag,
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
from airflow import DAG
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.contrib.operators.bigquery_table_delete_operator import (
|
||||
BigQueryTableDeleteOperator,
|
||||
from airflow.providers.google.cloud.operators.bigquery import (
|
||||
BigQueryDeleteTableOperator,
|
||||
)
|
||||
from airflow.contrib.operators.gcp_transfer_operator import (
|
||||
S3ToGoogleCloudStorageTransferOperator,
|
||||
|
||||
from airflow.providers.google.cloud.operators.cloud_storage_transfer_service import (
|
||||
CloudDataTransferServiceS3ToGCSOperator
|
||||
)
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
@ -53,7 +52,7 @@ cluster_name = "socorro-import-dataproc-cluster"
|
|||
|
||||
# Defined in Airflow's UI -> Admin -> Connections
|
||||
gcp_conn_id = "google_cloud_airflow_dataproc"
|
||||
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
|
||||
project_id = "airflow-dataproc"
|
||||
|
||||
# Required to copy socorro json data from aws prod s3 to gcs
|
||||
read_aws_conn_id = "aws_socorro_readonly_s3"
|
||||
|
@ -73,14 +72,14 @@ objects_prefix = "{}/{}/{}={}".format(
|
|||
)
|
||||
|
||||
# copy json crashstats from s3 to gcs
|
||||
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
|
||||
s3_to_gcs = CloudDataTransferServiceS3ToGCSOperator(
|
||||
task_id="s3_to_gcs",
|
||||
s3_bucket="crashstats-telemetry-crashes-prod-us-west-2",
|
||||
project_id=project_id,
|
||||
gcs_bucket=gcs_data_bucket,
|
||||
description="socorro crash report copy from s3 to gcs",
|
||||
aws_conn_id=read_aws_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=connection.project_id,
|
||||
object_conditions={"includePrefixes": "v1/crash_report/{{ ds_nodash }}"},
|
||||
transfer_options={"deleteObjectsUniqueInSink": True},
|
||||
timeout=3600,
|
||||
|
@ -116,7 +115,7 @@ crash_report_parquet = SubDagOperator(
|
|||
|
||||
|
||||
bq_gcp_conn_id = "google_cloud_derived_datasets"
|
||||
bq_connection = GoogleCloudBaseHook(gcp_conn_id=bq_gcp_conn_id)
|
||||
bq_project_id = "moz-fx-data-derived-datasets"
|
||||
|
||||
dest_s3_key = "s3://telemetry-parquet"
|
||||
|
||||
|
@ -142,9 +141,9 @@ gke_args = [
|
|||
]
|
||||
|
||||
# We remove the current date partition for idempotency.
|
||||
remove_bq_table_partition = BigQueryTableDeleteOperator(
|
||||
remove_bq_table_partition = BigQueryDeleteTableOperator(
|
||||
task_id="remove_bq_table_partition",
|
||||
bigquery_conn_id=bq_gcp_conn_id,
|
||||
gcp_conn_id=bq_gcp_conn_id,
|
||||
deletion_dataset_table="{}.{}${{{{ds_nodash}}}}".format(bq_dataset, bq_table_name),
|
||||
ignore_if_missing=True,
|
||||
dag=dag,
|
||||
|
@ -153,7 +152,7 @@ remove_bq_table_partition = BigQueryTableDeleteOperator(
|
|||
bq_load = GKEPodOperator(
|
||||
task_id="bigquery_load",
|
||||
gcp_conn_id=bq_gcp_conn_id,
|
||||
project_id=bq_connection.project_id,
|
||||
project_id=bq_project_id,
|
||||
name="load-socorro-crash-parquet-to-bq",
|
||||
image=docker_image,
|
||||
arguments=gke_args,
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.operators.sensors import ExternalTaskSensor
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
from airflow.sensors.external_task import ExternalTaskSensor
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from airflow.models import Variable
|
||||
from itertools import chain
|
||||
|
@ -21,14 +21,16 @@ TAAR_ETL_CONTAINER_IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.
|
|||
|
||||
|
||||
# Dataproc connection to GCP
|
||||
gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
|
||||
taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
|
||||
taar_gcpdataproc_project_id = "airflow-dataproc"
|
||||
|
||||
taar_aws_conn_id = "airflow_taar_rw_s3"
|
||||
taar_aws_access_key, taar_aws_secret_key, session = AwsHook(taar_aws_conn_id).get_credentials()
|
||||
taar_aws_access_key, taar_aws_secret_key, session = AwsBaseHook(
|
||||
aws_conn_id=taar_aws_conn_id, client_type='s3').get_credentials()
|
||||
taarlite_cluster_name = "dataproc-taarlite-guidguid"
|
||||
taar_locale_cluster_name = "dataproc-taar-locale"
|
||||
taar_similarity_cluster_name = "dataproc-taar-similarity"
|
||||
taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
|
||||
|
||||
|
||||
default_args = {
|
||||
"owner": "epavlov@mozilla.com",
|
||||
|
@ -107,7 +109,8 @@ taar_locale = SubDagOperator(
|
|||
"--prefix",
|
||||
"taar/locale",
|
||||
],
|
||||
gcp_conn_id=taar_gcpdataproc_conn_id
|
||||
gcp_conn_id=taar_gcpdataproc_conn_id,
|
||||
project_id=taar_gcpdataproc_project_id
|
||||
),
|
||||
dag=dag
|
||||
)
|
||||
|
@ -136,6 +139,7 @@ taar_similarity = SubDagOperator(
|
|||
"--prefix", "taar/similarity"
|
||||
],
|
||||
gcp_conn_id=taar_gcpdataproc_conn_id,
|
||||
project_id=taar_gcpdataproc_project_id,
|
||||
master_disk_type="pd-ssd",
|
||||
worker_disk_type="pd-ssd",
|
||||
master_disk_size=1024,
|
||||
|
@ -176,6 +180,7 @@ taar_collaborative_recommender = SubDagOperator(
|
|||
init_actions_uris=[],
|
||||
aws_conn_id=taar_aws_conn_id,
|
||||
gcp_conn_id=taar_gcpdataproc_conn_id,
|
||||
project_id=taar_gcpdataproc_project_id,
|
||||
default_args=default_args
|
||||
),
|
||||
dag=dag,
|
||||
|
@ -204,11 +209,13 @@ taar_lite = SubDagOperator(
|
|||
"--prefix", "taar/lite"
|
||||
],
|
||||
gcp_conn_id=taar_gcpdataproc_conn_id,
|
||||
project_id=taar_gcpdataproc_project_id,
|
||||
),
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
||||
|
||||
taar_lite_guidranking = GKEPodOperator(
|
||||
task_id="taar_lite_guidranking",
|
||||
name="taar_lite_guidranking",
|
||||
|
@ -230,3 +237,4 @@ wait_for_clients_daily_export >> taar_locale
|
|||
wait_for_clients_daily_export >> taar_collaborative_recommender
|
||||
wait_for_clients_daily_export >> taar_lite
|
||||
wait_for_clients_daily_export >> taar_lite_guidranking
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ from utils.dataproc import moz_dataproc_pyspark_runner
|
|||
|
||||
taar_ensemble_cluster_name = "dataproc-taar-ensemble"
|
||||
taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc"
|
||||
taar_gcpdataproc_project_id = "airflow-dataproc"
|
||||
|
||||
TAAR_BIGTABLE_INSTANCE_ID = Variable.get("taar_bigtable_instance_id")
|
||||
TAAR_ETL_STORAGE_BUCKET = Variable.get("taar_etl_storage_bucket")
|
||||
|
@ -181,6 +182,7 @@ taar_ensemble = SubDagOperator(
|
|||
"0.005",
|
||||
],
|
||||
gcp_conn_id=taar_gcpdataproc_conn_id,
|
||||
project_id=taar_gcpdataproc_project_id,
|
||||
master_disk_type="pd-ssd",
|
||||
worker_disk_type="pd-ssd",
|
||||
master_disk_size=1024,
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from airflow import DAG
|
||||
from airflow.contrib.hooks.aws_hook import AwsHook
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
|
||||
from airflow.operators.subdag_operator import SubDagOperator
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
@ -33,11 +32,11 @@ cluster_name = 'app-update-out-of-date-dataproc-cluster'
|
|||
|
||||
# Defined in Airflow's UI -> Admin -> Connections
|
||||
gcp_conn_id = 'google_cloud_airflow_dataproc'
|
||||
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
|
||||
|
||||
# Required to write json output back to s3://telemetry-public-analysis-2/app-update/data/out-of-date/
|
||||
write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw'
|
||||
aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials()
|
||||
aws_access_key, aws_secret_key, session = AwsBaseHook(
|
||||
aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials()
|
||||
|
||||
crash_report_parquet = SubDagOperator(
|
||||
task_id="update_orphaning_dashboard_etl",
|
||||
|
|
|
@ -3,21 +3,21 @@ import os
from collections import namedtuple

from airflow import models
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.dataproc_operator import DataprocClusterDeleteOperator, DataProcSparkOperator, DataProcPySparkOperator
from airflow.exceptions import AirflowException
from airflow.utils.trigger_rule import TriggerRule

# Our own dataproc operator used to install component gateway
from operators.moz_dataproc_operator import DataprocClusterCreateOperator

"""
Note: We are currently on 1.10.7 and when we upgrade, the spark operators will move.
This module is deprecated. Please use `airflow.providers.google.cloud.operators.dataproc`.
"""
from airflow.operators.bash_operator import BashOperator
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

# When google deprecates dataproc_v1beta2 in DataprocHook/Operator classes
# We can import these from our patched code, rather than upgrading/deploying
# apache-airflow-providers-google > 6.0.0, and google-cloud-dataproc > 2.5.0
# from utils.patched.dataproc_operator import (
from airflow.providers.google.cloud.operators.dataproc import (
    ClusterGenerator,
    DataprocCreateClusterOperator,
    DataprocDeleteClusterOperator,
    DataprocSubmitPySparkJobOperator,
    DataprocSubmitSparkJobOperator,
)

class DataProcHelper:
    """

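Most of the changes in this commit are mechanical import migrations from `airflow.contrib.*` to the provider packages. Purely as an illustration (this commit does not do this), a compatibility shim can keep a module importable on both 1.10 and 2.x during a staged rollout; the class names below are the ones used in this file and exist in both packages.

# Illustration only: import shim for a mixed 1.10 / 2.x transition period.
try:
    # Airflow 2.x provider package (apache-airflow-providers-google)
    from airflow.providers.google.cloud.operators.dataproc import (
        DataprocDeleteClusterOperator,
        DataprocSubmitPySparkJobOperator,
    )
except ImportError:
    # Airflow 1.10 contrib fallback (old class names differ slightly)
    from airflow.contrib.operators.dataproc_operator import (
        DataprocClusterDeleteOperator as DataprocDeleteClusterOperator,
        DataProcPySparkOperator as DataprocSubmitPySparkJobOperator,
    )
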
@ -32,8 +32,8 @@ class DataProcHelper:
                 region='us-west1',
                 subnetwork_uri=None,
                 internal_ip_only=None,
                 idle_delete_ttl='14400',
                 auto_delete_ttl='28800',
                 idle_delete_ttl=14400,
                 auto_delete_ttl=28800,
                 master_machine_type='n1-standard-8',
                 worker_machine_type='n1-standard-4',
                 num_preemptible_workers=0,

@ -45,6 +45,7 @@ class DataProcHelper:
                 install_component_gateway=True,
                 aws_conn_id=None,
                 gcp_conn_id='google_cloud_airflow_dataproc',
                 project_id='airflow-dataproc',
                 artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
                 storage_bucket='moz-fx-data-prod-dataproc-scratch',
                 master_disk_type='pd-standard',

@ -99,12 +100,11 @@ class DataProcHelper:
        self.install_component_gateway = install_component_gateway
        self.aws_conn_id = aws_conn_id
        self.gcp_conn_id = gcp_conn_id

        self.connection = GoogleCloudBaseHook(gcp_conn_id=self.gcp_conn_id)
        self.project_id = project_id

    def create_cluster(self):
        """
        Returns a DataprocClusterCreateOperator
        Returns a DataprocCreateClusterOperator
        """
        properties = {}

@ -115,7 +115,7 @@ class DataProcHelper:
        if self.aws_conn_id:
            for key, value in zip(
                ("access.key", "secret.key", "session.token"),
                AwsHook(self.aws_conn_id).get_credentials(),
                AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type='s3').get_credentials(),
            ):
                if value is not None:
                    properties["core:fs.s3a." + key] = value

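The hunk above passes the AWS credentials from the Airflow connection straight into Hadoop's `fs.s3a.*` properties. A self-contained sketch of the resulting mapping, using dummy credential values (nothing here comes from a real connection):

def s3a_properties(access_key, secret_key, session_token):
    """Map optional AWS credentials onto the core-site fs.s3a.* keys."""
    props = {}
    creds = zip(
        ("access.key", "secret.key", "session.token"),
        (access_key, secret_key, session_token),
    )
    for key, value in creds:
        if value is not None:  # session token may be absent for static keys
            props["core:fs.s3a." + key] = value
    return props


# Example with dummy values:
# {'core:fs.s3a.access.key': 'AKIA...', 'core:fs.s3a.secret.key': '...'}
print(s3a_properties("AKIA...", "...", None))
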
@ -133,48 +133,71 @@ class DataProcHelper:
        }
        metadata.update(self.additional_metadata)

        return DataprocClusterCreateOperator(
            task_id='create_dataproc_cluster',
            cluster_name=self.cluster_name,
            job_name=self.job_name,
            gcp_conn_id=self.gcp_conn_id,
            service_account=self.service_account,
            project_id=self.connection.project_id,
        cluster_generator = ClusterGenerator(
            project_id=self.project_id,
            num_workers=self.num_workers,
            subnetwork_uri=self.subnetwork_uri,
            internal_ip_only=self.internal_ip_only,
            storage_bucket=self.storage_bucket,
            num_workers=self.num_workers,
            image_version=self.image_version,
            properties=properties,
            region=self.region,
            subnetwork_uri=self.subnetwork_uri,
            internal_ip_only=self.internal_ip_only,
            idle_delete_ttl=self.idle_delete_ttl,
            auto_delete_ttl=self.auto_delete_ttl,
            master_machine_type=self.master_machine_type,
            worker_machine_type=self.worker_machine_type,
            num_preemptible_workers=self.num_preemptible_workers,
            optional_components=self.optional_components,
            install_component_gateway=self.install_component_gateway,
            init_actions_uris=self.init_actions_uris,
            metadata=metadata,
            image_version=self.image_version,
            properties=properties,
            optional_components=self.optional_components,
            master_machine_type=self.master_machine_type,
            master_disk_type=self.master_disk_type,
            master_disk_size=self.master_disk_size,
            worker_machine_type=self.worker_machine_type,
            worker_disk_type=self.worker_disk_type,
            worker_disk_size=self.worker_disk_size,
            master_num_local_ssds=self.master_num_local_ssds,
            worker_num_local_ssds=self.worker_num_local_ssds,
            metadata=metadata,
            num_preemptible_workers=self.num_preemptible_workers,
            service_account=self.service_account,
            idle_delete_ttl=self.idle_delete_ttl,
            auto_delete_ttl=self.auto_delete_ttl
        )

        cluster_config = cluster_generator.make()

        # The DataprocCreateClusterOperator and ClusterGenerator dont support component gateway or local ssds
        # ClusterConfig format is
        # https://cloud.google.com/dataproc/docs/reference/rpc/google.cloud.dataproc.v1#google.cloud.dataproc.v1.ClusterConfig
        if self.install_component_gateway:
            cluster_config.update({'endpoint_config': {'enable_http_port_access': True}})

        if self.master_num_local_ssds > 0:
            master_instance_group_config = cluster_config['master_config']
            master_instance_group_config['disk_config']['num_local_ssds'] = self.master_num_local_ssds
            cluster_config.update({'master_config': master_instance_group_config})

        if self.worker_num_local_ssds > 0:
            worker_instance_group_config = cluster_config['worker_config']
            worker_instance_group_config['disk_config']['num_local_ssds'] = self.worker_num_local_ssds
            cluster_config.update({'worker_config': worker_instance_group_config})

        return DataprocCreateClusterOperator(
            task_id='create_dataproc_cluster',
            cluster_name=self.cluster_name,
            project_id=self.project_id,
            use_if_exists=True,
            delete_on_error=True,
            labels={'env': os.getenv('DEPLOY_ENVIRONMENT', 'env_not_set'),
                    'owner': os.getenv('AIRFLOW_CTX_DAG_OWNER', 'owner_not_set'),
                    'jobname': self.job_name.lower().replace('_', '-')},
            gcp_conn_id=self.gcp_conn_id,
            region=self.region,
            cluster_config=cluster_config
        )

    def delete_cluster(self):
        """
        Returns a DataprocClusterDeleteOperator
        Returns a DataprocDeleteClusterOperator
        """
        return DataprocClusterDeleteOperator(
        return DataprocDeleteClusterOperator(
            task_id='delete_dataproc_cluster',
            trigger_rule=TriggerRule.ALL_DONE,
            cluster_name=self.cluster_name,
            region=self.region,
            gcp_conn_id=self.gcp_conn_id,
            project_id=self.connection.project_id)
            project_id=self.project_id)
# End DataProcHelper

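The rework above builds the base cluster config with the provider's ClusterGenerator and then patches in the two settings it does not expose (component gateway, local SSDs) using the v1 ClusterConfig snake_case field names, which differ from the camelCase keys in the old v1beta2 REST body built by the custom operator earlier in this diff. A minimal sketch of that key difference, for illustration only:

# Old operator (v1beta2 REST body, camelCase keys):
rest_body_fragment = {"endpointConfig": {"enableHttpPortAccess": True}}

# New DataprocCreateClusterOperator cluster_config (v1 proto-style, snake_case keys):
cluster_config_fragment = {"endpoint_config": {"enable_http_port_access": True}}

# The same applies to the local-SSD override patched in above:
disk_fragment = {"master_config": {"disk_config": {"num_local_ssds": 1}}}
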
@ -187,8 +210,8 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
|
|||
region='us-west1',
|
||||
subnetwork_uri=None,
|
||||
internal_ip_only=None,
|
||||
idle_delete_ttl='10800',
|
||||
auto_delete_ttl='21600',
|
||||
idle_delete_ttl=10800,
|
||||
auto_delete_ttl=21600,
|
||||
master_machine_type='n1-standard-8',
|
||||
worker_machine_type='n1-standard-4',
|
||||
num_preemptible_workers=0,
|
||||
|
@ -203,6 +226,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
|
|||
job_name=None,
|
||||
aws_conn_id=None,
|
||||
gcp_conn_id='google_cloud_airflow_dataproc',
|
||||
project_id='airflow-dataproc',
|
||||
artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
|
||||
storage_bucket='moz-fx-data-prod-dataproc-scratch',
|
||||
master_disk_type='pd-standard',
|
||||
|
@ -215,7 +239,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
|
|||
|
||||
"""
|
||||
This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
|
||||
Then we call DataProcPySparkOperator to execute the pyspark script defined by the argument
|
||||
Then we call DataprocSubmitPySparkJobOperator to execute the pyspark script defined by the argument
|
||||
python_driver_code. Once that succeeds, we teardown the cluster.
|
||||
|
||||
**Example**: ::
|
||||
|
@ -281,6 +305,9 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
|
|||
|
||||
:param str aws_conn_id: Airflow connection id for S3 access (if needed).
|
||||
:param str gcp_conn_id: The connection ID to use connecting to GCP.
|
||||
:param str project_id: The project ID corresponding to the gcp_conn_id. We
|
||||
add this because the dev environment doesn't parse it out
|
||||
correctly from the dummy connections.
|
||||
:param str artifact_bucket: Path to resources for bootstrapping the dataproc cluster
|
||||
:param str storage_bucket: Path to scratch bucket for intermediate cluster results
|
||||
:param list optional_components: List of optional components to install on cluster
|
||||
|
@ -338,6 +365,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
|
|||
install_component_gateway=install_component_gateway,
|
||||
aws_conn_id=aws_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id,
|
||||
artifact_bucket=artifact_bucket,
|
||||
storage_bucket=storage_bucket,
|
||||
master_disk_type=master_disk_type,
|
||||
|
@ -353,7 +381,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
|
|||
with models.DAG(_dag_name, default_args=default_args) as dag:
|
||||
create_dataproc_cluster = dataproc_helper.create_cluster()
|
||||
|
||||
run_pyspark_on_dataproc = DataProcPySparkOperator(
|
||||
run_pyspark_on_dataproc = DataprocSubmitPySparkJobOperator(
|
||||
task_id='run_dataproc_pyspark',
|
||||
job_name=job_name,
|
||||
cluster_name=cluster_name,
|
||||
|
@ -361,6 +389,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None,
|
|||
main=python_driver_code,
|
||||
arguments=py_args,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id
|
||||
)
|
||||
|
||||
delete_dataproc_cluster = dataproc_helper.delete_cluster()
|
||||
|
@ -379,8 +408,8 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
|
|||
region='us-west1',
|
||||
subnetwork_uri=None,
|
||||
internal_ip_only=None,
|
||||
idle_delete_ttl='14400',
|
||||
auto_delete_ttl='28800',
|
||||
idle_delete_ttl=14400,
|
||||
auto_delete_ttl=28800,
|
||||
master_machine_type='n1-standard-8',
|
||||
worker_machine_type='n1-standard-4',
|
||||
num_preemptible_workers=0,
|
||||
|
@ -394,6 +423,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
|
|||
job_name=None,
|
||||
aws_conn_id=None,
|
||||
gcp_conn_id='google_cloud_airflow_dataproc',
|
||||
project_id='airflow-dataproc',
|
||||
master_disk_type='pd-standard',
|
||||
worker_disk_type='pd-standard',
|
||||
master_disk_size=1024,
|
||||
|
@ -404,7 +434,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
|
|||
|
||||
"""
|
||||
This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
|
||||
Then we call DataProcSparkOperator to execute the jar defined by the arguments
|
||||
Then we call DataprocSubmitSparkJobOperator to execute the jar defined by the arguments
|
||||
jar_urls and main_class. Once that succeeds, we teardown the cluster.
|
||||
|
||||
**Example**: ::
|
||||
|
@ -468,6 +498,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
|
|||
install_component_gateway=install_component_gateway,
|
||||
aws_conn_id=aws_conn_id,
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id,
|
||||
master_disk_type=master_disk_type,
|
||||
master_disk_size=master_disk_size,
|
||||
worker_disk_type=worker_disk_type,
|
||||
|
@ -481,15 +512,17 @@ def moz_dataproc_jar_runner(parent_dag_name=None,
|
|||
with models.DAG(_dag_name, default_args=default_args) as dag:
|
||||
create_dataproc_cluster = dataproc_helper.create_cluster()
|
||||
|
||||
run_jar_on_dataproc = DataProcSparkOperator(
|
||||
run_jar_on_dataproc = DataprocSubmitSparkJobOperator(
|
||||
cluster_name=cluster_name,
|
||||
region=region,
|
||||
task_id='run_jar_on_dataproc',
|
||||
job_name=job_name,
|
||||
dataproc_spark_jars=jar_urls,
|
||||
dataproc_jars=jar_urls,
|
||||
main_class=main_class,
|
||||
arguments=jar_args,
|
||||
gcp_conn_id=gcp_conn_id)
|
||||
gcp_conn_id=gcp_conn_id,
|
||||
project_id=project_id
|
||||
)
|
||||
|
||||
delete_dataproc_cluster = dataproc_helper.delete_cluster()
|
||||
|
||||
|
@ -512,8 +545,8 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
|
|||
region='us-west1',
|
||||
subnetwork_uri=None,
|
||||
internal_ip_only=None,
|
||||
idle_delete_ttl='14400',
|
||||
auto_delete_ttl='28800',
|
||||
idle_delete_ttl=14400,
|
||||
auto_delete_ttl=28800,
|
||||
master_machine_type='n1-standard-8',
|
||||
worker_machine_type='n1-standard-4',
|
||||
num_preemptible_workers=0,
|
||||
|
@ -527,6 +560,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
|
|||
job_name=None,
|
||||
aws_conn_id=None,
|
||||
gcp_conn_id='google_cloud_airflow_dataproc',
|
||||
project_id='airflow-dataproc',
|
||||
master_disk_type='pd-standard',
|
||||
worker_disk_type='pd-standard',
|
||||
master_disk_size=1024,
|
||||
|
@ -538,7 +572,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
|
|||
"""
|
||||
This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
|
||||
Then we execute a script uri (either https or gcs) similar to how we use our custom AWS
|
||||
EmrSparkOperator. This will call DataProcSparkOperator using EMR's script-runner.jar, which
|
||||
EmrSparkOperator. This will call DataprocSubmitSparkJobOperator using EMR's script-runner.jar, which
|
||||
then executes the airflow_gcp.sh entrypoint script. The entrypoint script expects another
|
||||
script uri, along with it's arguments, as parameters. Once that succeeds, we teardown the
|
||||
cluster.
|
||||
|
@ -609,6 +643,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
install_component_gateway=install_component_gateway,
aws_conn_id=aws_conn_id,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
master_disk_type=master_disk_type,
master_disk_size=master_disk_size,
worker_disk_type=worker_disk_type,

@ -636,17 +671,19 @@ def moz_dataproc_scriptrunner(parent_dag_name=None,
with models.DAG(_dag_name, default_args=default_args) as dag:
create_dataproc_cluster = dataproc_helper.create_cluster()

# Run DataprocSparkOperator with script-runner.jar pointing to airflow_gcp.sh.
# Run DataprocSubmitSparkJobOperator with script-runner.jar pointing to airflow_gcp.sh.

run_script_on_dataproc = DataProcSparkOperator(
run_script_on_dataproc = DataprocSubmitSparkJobOperator(
cluster_name=cluster_name,
region=region,
task_id='run_script_on_dataproc',
job_name=job_name,
dataproc_spark_jars=[jar_url],
dataproc_jars=[jar_url],
main_class='com.amazon.elasticmapreduce.scriptrunner.ScriptRunner',
arguments=args,
gcp_conn_id=gcp_conn_id)
gcp_conn_id=gcp_conn_id,
project_id=project_id
)

delete_dataproc_cluster = dataproc_helper.delete_cluster()

@ -715,13 +752,13 @@ def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"):
and should either be the production default ("dataproc-runner-prod"), or a
service key associated with a sandbox account.
"""
gcp_conn = GoogleCloudBaseHook(conn_id)
keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
dev_project_id = "replace_me"
dev_client_email = "replace_me"

project_id = keyfile["project_id"]
is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
project_id = "airflow-dataproc" if is_dev else dev_project_id
client_email = (
keyfile["client_email"]
dev_client_email
if is_dev
else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com"
)

@ -1,14 +1,12 @@
from operators.gcp_container_operator import GKEPodOperator

from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook

def simpleprophet_forecast(
task_id,
datasource,
project_id,
dataset_id,
table_id,
gcp_conn_id="google_cloud_derived_datasets",
project_id='moz-fx-data-derived-datasets',
gke_location="us-central1-a",
gke_cluster_name="bq-load-gke-1",
gke_namespace="default",

@ -25,6 +23,7 @@ def simpleprophet_forecast(
:param str table_id: [Required] ID of target table

:param str gcp_conn_id: Airflow connection id for GCP access
:param str project_id: GCP project id associated with gcp_conn_id
:param str gke_location: GKE cluster location
:param str gke_cluster_name: GKE cluster name
:param str gke_namespace: GKE cluster namespace

@ -40,7 +39,7 @@ def simpleprophet_forecast(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=project_id,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

@ -1,20 +1,25 @@
from airflow import models
from airflow.utils import trigger_rule
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator

from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.contrib.operators.dataproc_operator import DataprocClusterCreateOperator, DataprocClusterDeleteOperator, DataProcSparkOperator, DataProcPySparkOperator # noqa
from operators.gcp_container_operator import GKEPodOperator
from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator # noqa:E501
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.gcp_transfer_operator import S3ToGoogleCloudStorageTransferOperator # noqa:E501
from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator

from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

from airflow.providers.google.cloud.operators.dataproc import (
DataprocCreateClusterOperator,
DataprocDeleteClusterOperator,
DataprocSubmitPySparkJobOperator,
)

from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator

from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator

import json
import re

GCP_PROJECT_ID = "moz-fx-data-derived-datasets"

def export_to_parquet(
table,
@ -67,7 +72,7 @@ def export_to_parquet(
cluster_name += "-export-{{ ds_nodash }}"

dag_prefix = parent_dag_name + "." if parent_dag_name else ""
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
project_id = GCP_PROJECT_ID

if destination_table is None:
destination_table = unqualified_table

@ -82,11 +87,12 @@ def export_to_parquet(

with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

create_dataproc_cluster = DataprocClusterCreateOperator(
create_dataproc_cluster = DataprocCreateClusterOperator(
task_id="create_dataproc_cluster",
cluster_name=cluster_name,
gcp_conn_id=gcp_conn_id,
project_id=connection.project_id,
region="us-west1",
project_id=project_id,
num_workers=num_workers,
image_version="1.4",
storage_bucket=dataproc_storage_bucket,
@ -100,13 +106,13 @@ def export_to_parquet(
metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
)

run_dataproc_pyspark = DataProcPySparkOperator(
run_dataproc_pyspark = DataprocSubmitPySparkJobOperator(
task_id="run_dataproc_pyspark",
cluster_name=cluster_name,
dataproc_pyspark_jars=[
dataproc_jars=[
"gs://spark-lib/bigquery/spark-bigquery-latest.jar"
],
dataproc_pyspark_properties={
dataproc_properties={
"spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
},
main="https://raw.githubusercontent.com/mozilla/bigquery-etl/main"
@ -125,31 +131,33 @@ def export_to_parquet(
+ [static_partitions]
+ arguments,
gcp_conn_id=gcp_conn_id,
project_id=project_id,
)

delete_dataproc_cluster = DataprocClusterDeleteOperator(
delete_dataproc_cluster = DataprocDeleteClusterOperator(
task_id="delete_dataproc_cluster",
cluster_name=cluster_name,
gcp_conn_id=gcp_conn_id,
project_id=connection.project_id,
trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
project_id=project_id,
trigger_rule="all_done",
region="us-west1",
)

if not use_storage_api:
avro_export = BigQueryToCloudStorageOperator(
avro_export = BigQueryToGCSOperator(
task_id="avro_export",
source_project_dataset_table=table,
destination_cloud_storage_uris=avro_path,
compression=None,
export_format="AVRO",
bigquery_conn_id=gcp_conn_id,
gcp_conn_id=gcp_conn_id,
)
avro_delete = GoogleCloudStorageDeleteOperator(
avro_delete = GCSDeleteObjectsOperator(
task_id="avro_delete",
bucket_name=gcs_output_bucket,
prefix=avro_prefix,
google_cloud_storage_conn_id=gcp_conn_id,
trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
gcp_conn_id=gcp_conn_id,
trigger_rule="all_done",
)
avro_export >> run_dataproc_pyspark >> avro_delete
@ -210,7 +218,7 @@ def bigquery_etl_query(
parameters += (date_partition_parameter + ":DATE:{{ds}}",)
return GKEPodOperator(
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=project_id,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

@ -278,7 +286,7 @@ def bigquery_etl_copy_deduplicate(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

@ -341,7 +349,7 @@ def bigquery_xcom_query(
query = "{{ " + "task_instance.xcom_pull({!r})".format(xcom_task_id) + " }}"
return GKEPodOperator(
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

@ -407,7 +415,7 @@ def gke_command(
key: value
for key, value in zip(
("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"),
AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (),
AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials() if aws_conn_id else (),
)
if value is not None}
context_env_vars["XCOM_PUSH"] = json.dumps(xcom_push)

@ -416,7 +424,7 @@ def gke_command(
return GKEPodOperator(
task_id=task_id,
gcp_conn_id=gcp_conn_id,
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
project_id=GCP_PROJECT_ID,
location=gke_location,
cluster_name=gke_cluster_name,
namespace=gke_namespace,

Diff not shown because the file is too large.
Diff not shown because the file is too large.

@ -126,5 +126,3 @@ AUTH_ROLE_PUBLIC = 'Admin'
# APP_THEME = "superhero.css"
# APP_THEME = "united.css"
# APP_THEME = "yeti.css"

@ -5,6 +5,7 @@ services:
image: mysql:5.7
ports:
- '3306:3306'
# command: ['--explicit_defaults_for_timestamp=1', '--character-set-server=utf8mb4']
command: ['--explicit_defaults_for_timestamp=1']
environment:
MYSQL_ROOT_PASSWORD: secret

@ -51,10 +52,11 @@ services:
- AIRFLOW_EMAIL_BACKEND=airflow.macros.log_email_backend.log_email_backend
- AIRFLOW__KUBERNETES__IN_CLUSTER=False
- URL=http://localhost:8000
- WEBSERVER_USE_RBAC=False
# URL-encoded dummy connections; note that we define some other connections
# in the bin/run script
- AIRFLOW_CONN_ADM_SFTP=ftp://myname:mypassword@myhost.com:8000?known_hosts=myhost.com+AAAABBBBB
# TODO(hwoo) - improve developer workflow by not loading all dags
# - AIRFLOW__CORE__DAGS_FOLDER=$AIRFLOW_HOME/devdags

web:
extends:

@ -8,14 +8,8 @@ from airflow import configuration
# Backfill Plugin Imports
from backfill.main import Backfill

# Get RBAC config.
rbac_authentication_enabled = configuration.getboolean("webserver", "RBAC")

# Init the plugin in Webserver's "Admin" Menu with Menu Item as "Backfill"
if rbac_authentication_enabled == True:
backfill_admin_view = {"category" : "Admin", "name" : "Backfill (Alpha)", "view": Backfill()}
else:
backfill_admin_view = Backfill(category="Admin", name="Backfill (Alpha)")
backfill_admin_view = {"category" : "Admin", "name" : "Backfill (Alpha)", "view": Backfill()}

# Creating a flask blueprint to integrate the templates folder
backfill_blueprint = Blueprint(

@ -33,8 +33,6 @@ else:
# Local file where history will be stored
FILE = airflow_home_path + '/logs/backfill_history.txt'

rbac_authentication_enabled = configuration.getboolean("webserver", "RBAC")

# RE for remove ansi escape characters
ansi_escape = re.compile(r'\x1B[@-_][0-?]*[ -/]*[@-~]')

@ -58,26 +56,15 @@ def file_ops(mode, data=None):
return 1

def get_baseview():
if rbac_authentication_enabled == True:
return AppBuilderBaseView
else:
return BaseView
return AppBuilderBaseView

class Backfill(get_baseview()):

route_base = "/admin/backfill/"

if rbac_authentication_enabled == True:
@app_builder_expose('/')
def list(self):
""" Render the backfill page to client with RBAC"""
return self.render_template("backfill_page.html",
rbac_authentication_enabled=rbac_authentication_enabled)
else:
@expose('/')
def base(self):
""" Render the backfill page to client """
return self.render("backfill_page.html")
@app_builder_expose('/')
def list(self):
return self.render_template("backfill_page.html")

@expose('/stream')
@app_builder_expose('/stream')
@ -106,9 +93,10 @@ class Backfill(get_baseview()):
if use_task_regex == 'true':
cmd.extend(['-t', str(task_regex)])
elif clear == 'false':
cmd.append('dags')
cmd.append('backfill')
if dry_run == 'true':
cmd.append('--dry_run')
cmd.append('--dry-run')

if use_task_regex == 'true':
cmd.extend(['-t', str(task_regex)])

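(For context on the hunk above: the Airflow 2 CLI moves backfill under the `dags` group and switches long options to dashes, so the command list the plugin assembles ends up looking roughly like the sketch below. The task regex, dates, and DAG id are illustrative assumptions, not values from the diff.)::

    # hypothetical resulting command for a dry-run backfill of one DAG
    cmd = ['airflow', 'dags', 'backfill', '--dry-run',
           '-t', 'some_task_regex',
           '-s', '2021-07-01', '-e', '2021-07-02',
           'some_dag_id']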
@ -4,21 +4,20 @@ Plugin that adds a "Mozilla" entry to the top bar with some useful links.
|
|||
Based on an example at
|
||||
https://github.com/airflow-plugins/Getting-Started/blob/master/Tutorial/creating-ui-modification.md
|
||||
"""
|
||||
|
||||
|
||||
from airflow.plugins_manager import AirflowPlugin
|
||||
from flask_admin.base import MenuLink
|
||||
|
||||
|
||||
telemetry_airflow = MenuLink(
|
||||
category="Mozilla",
|
||||
name="telemetry-airflow on GitHub",
|
||||
url="https://github.com/mozilla/telemetry-airflow")
|
||||
telemetry_airflow = {
|
||||
"name": "telemetry-airflow on GitHub",
|
||||
"category": "Mozilla",
|
||||
"href": "https://github.com/mozilla/telemetry-airflow"
|
||||
}
|
||||
|
||||
wtmo_dev = MenuLink(
|
||||
category="Mozilla",
|
||||
name="WTMO Developer Guide",
|
||||
url="https://mana.mozilla.org/wiki/display/DOPS/WTMO+Developer+Guide")
|
||||
wtmo_dev = {
|
||||
"name": "WTMO Developer Guide",
|
||||
"category": "Mozilla",
|
||||
"href": "https://mana.mozilla.org/wiki/display/DOPS/WTMO+Developer+Guide"
|
||||
}
|
||||
|
||||
class MozMenuPlugin(AirflowPlugin):
|
||||
name = "Mozilla"
|
||||
|
@ -26,5 +25,5 @@ class MozMenuPlugin(AirflowPlugin):
|
|||
flask_blueprints = []
|
||||
hooks = []
|
||||
executors = []
|
||||
admin_views = []
|
||||
menu_links = [telemetry_airflow, wtmo_dev]
|
||||
appbuilder_views = []
|
||||
appbuilder_menu_items = [telemetry_airflow, wtmo_dev]
|
||||
|
|
|
@ -1,4 +1,4 @@
{% extends 'airflow/master.html' %}
{% extends base_template %}

{% block title %}Airflow - Backfill Plugin{% endblock %}

@ -320,13 +320,12 @@
</style>
{% endblock %}
{% block body %}
{% if rbac_authentication_enabled %}

{% block navbar %}
<header class="top" role="header">
{% include 'appbuilder/navbar.html' %}
</header>
{% endblock %}
{%endif%}

<div class="container">
<h1>Backfill (Alpha)</h1>

@ -1,16 +1,11 @@
boto3==1.15.18
botocore<1.19.0,>=1.18.0
kombu==4.6.10 # CeleryExecutor issues with 1.10.2 supposedly fixed in 1.10.5 airflow, but still observed issues on 1.10.7
importlib-metadata==2.1.0
importlib-metadata>=1.7
argcomplete==1.12.2
pandas-gbq==0.14.1
# removed hdfs
apache-airflow[celery,postgres,hive,jdbc,async,password,crypto,github_enterprise,datadog,statsd,s3,mysql,google_auth,gcp_api,kubernetes]==1.10.15
apache-airflow-upgrade-check
# Airflow 2.0 backported providers
apache-airflow-backport-providers-google
apache-airflow-backport-providers-amazon
apache-airflow-backport-providers-http
apache-airflow[amazon,celery,postgres,apache.hive,jdbc,async,password,crypto,github_enterprise,datadog,statsd,mysql,google_auth,cncf.kubernetes]==2.1.1
cryptography>=3.2
mozlogging
retrying

@ -19,11 +14,14 @@ redis
hiredis
requests
jsonschema
flask-admin
Flask-OAuthlib
Authlib~=0.15.3
Flask-AppBuilder>=3.3.0
pytz
werkzeug==0.16.0
werkzeug>=1.0.1,~=1.0
# The next requirements are for kubernetes-client/python
urllib3>=1.24.2 # MIT
urllib3>=1.24.2 # MIT
ipaddress>=1.0.17;python_version=="2.7" # PSF
websocket-client>=0.32.0,!=0.40.0,!=0.41.*,!=0.42.* # LGPLv2+
# Pin to older version, newer version has issues

@ -31,4 +29,13 @@ JPype1==0.7.1
shelljob==0.5.6
# Fix no inspection available issue
# https://github.com/apache/airflow/issues/8211
SQLAlchemy==1.3.15
SQLAlchemy>=1.3.18
# Airflow 2 no longer installs http provider by default, until chardet becomes an optional dependency of requests
apache-airflow-providers-http
airflow-provider-fivetran
# Upgrade google dataproc provider to fix beta client clusterConfig and mismatch issues
apache-airflow-providers-google==5.0.0
# 2.4.0 is broken for dataproc cluster create/delete
# 2.6.0 and 3.0.0 are newer but not compatible with apache-airflow-providers-google
# yet until maybe v7.0.0 bc 'google.cloud.dataproc_v1beta2' is deprecated
google-cloud-dataproc==2.5.0

234 requirements.txt
@ -4,156 +4,184 @@
|
|||
#
|
||||
# pip-compile
|
||||
#
|
||||
airflow-provider-fivetran==1.0.1 # via -r requirements.in
|
||||
alembic==1.6.5 # via apache-airflow
|
||||
amqp==2.6.1 # via kombu
|
||||
apache-airflow-backport-providers-amazon==2021.3.3 # via -r requirements.in
|
||||
apache-airflow-backport-providers-google==2021.3.3 # via -r requirements.in
|
||||
apache-airflow-backport-providers-http==2021.4.10 # via -r requirements.in
|
||||
apache-airflow-upgrade-check==1.4.0 # via -r requirements.in
|
||||
apache-airflow[async,celery,crypto,datadog,gcp_api,github_enterprise,google_auth,hive,jdbc,kubernetes,mysql,password,postgres,s3,statsd]==1.10.15 # via -r requirements.in, apache-airflow-backport-providers-amazon, apache-airflow-backport-providers-google, apache-airflow-upgrade-check
|
||||
apispec[yaml]==1.3.3 # via flask-appbuilder
|
||||
argcomplete==1.12.2 # via -r requirements.in, apache-airflow
|
||||
anyio==3.3.0 # via httpcore
|
||||
apache-airflow-providers-amazon==2.1.0 # via apache-airflow
|
||||
apache-airflow-providers-apache-hive==2.0.1 # via apache-airflow
|
||||
apache-airflow-providers-celery==2.0.0 # via apache-airflow
|
||||
apache-airflow-providers-cncf-kubernetes==2.0.2 # via apache-airflow
|
||||
apache-airflow-providers-datadog==2.0.0 # via apache-airflow
|
||||
apache-airflow-providers-ftp==2.0.0 # via apache-airflow
|
||||
apache-airflow-providers-google==5.0.0 # via -r requirements.in
|
||||
apache-airflow-providers-http==2.0.0 # via -r requirements.in
|
||||
apache-airflow-providers-imap==2.0.0 # via apache-airflow
|
||||
apache-airflow-providers-jdbc==2.0.0 # via apache-airflow
|
||||
apache-airflow-providers-mysql==2.1.0 # via apache-airflow
|
||||
apache-airflow-providers-postgres==2.0.0 # via apache-airflow
|
||||
apache-airflow-providers-sqlite==2.0.0 # via apache-airflow
|
||||
apache-airflow[amazon,apache.hive,async,celery,cncf.kubernetes,crypto,datadog,github_enterprise,google_auth,jdbc,mysql,password,postgres,statsd]==2.1.1 # via -r requirements.in, airflow-provider-fivetran, apache-airflow-providers-amazon, apache-airflow-providers-apache-hive, apache-airflow-providers-celery, apache-airflow-providers-cncf-kubernetes, apache-airflow-providers-datadog, apache-airflow-providers-google, apache-airflow-providers-http, apache-airflow-providers-jdbc, apache-airflow-providers-mysql, apache-airflow-providers-postgres
|
||||
apispec[yaml]==3.3.2 # via flask-appbuilder
|
||||
argcomplete==1.12.2 # via -r requirements.in, apache-airflow, nox
|
||||
attrs==20.3.0 # via apache-airflow, cattrs, jsonschema
|
||||
authlib==0.15.4 # via -r requirements.in
|
||||
babel==2.9.1 # via flask-babel
|
||||
backports.entry-points-selectable==1.1.0 # via virtualenv
|
||||
bcrypt==3.2.0 # via apache-airflow, flask-bcrypt
|
||||
billiard==3.6.4.0 # via celery
|
||||
boto3==1.15.18 # via -r requirements.in, apache-airflow, apache-airflow-backport-providers-amazon, watchtower
|
||||
botocore==1.18.18 # via -r requirements.in, apache-airflow-backport-providers-amazon, boto3, s3transfer
|
||||
blinker==1.4 # via apache-airflow
|
||||
boto3==1.15.18 # via -r requirements.in, apache-airflow-providers-amazon, watchtower
|
||||
botocore==1.18.18 # via -r requirements.in, boto3, s3transfer
|
||||
cached-property==1.5.2 # via apache-airflow
|
||||
cachetools==4.2.2 # via google-auth
|
||||
cattrs==1.7.1 # via apache-airflow
|
||||
celery==4.4.7 # via apache-airflow, flower
|
||||
certifi==2021.5.30 # via kubernetes, requests
|
||||
cattrs==1.5.0 # via apache-airflow
|
||||
celery==4.4.7 # via apache-airflow-providers-celery, flower
|
||||
certifi==2021.5.30 # via httpx, kubernetes, requests
|
||||
cffi==1.14.6 # via bcrypt, cryptography, google-crc32c
|
||||
chardet==3.0.4 # via requests
|
||||
click==7.1.2 # via flask, flask-appbuilder, hmsclient
|
||||
colorama==0.4.4 # via flask-appbuilder
|
||||
colorlog==4.0.2 # via apache-airflow
|
||||
configparser==3.5.3 # via apache-airflow
|
||||
charset-normalizer==2.0.4 # via httpx
|
||||
click==7.1.2 # via clickclick, flask, flask-appbuilder, hmsclient
|
||||
clickclick==20.10.2 # via apache-airflow
|
||||
colorama==0.4.4 # via flask-appbuilder, rich
|
||||
colorlog==4.0.2 # via apache-airflow, nox
|
||||
commonmark==0.9.1 # via rich
|
||||
croniter==0.3.37 # via apache-airflow
|
||||
cryptography==3.4.7 # via -r requirements.in, apache-airflow, pyopenssl
|
||||
datadog==0.42.0 # via apache-airflow
|
||||
cryptography==3.4.7 # via -r requirements.in, apache-airflow, apache-airflow-providers-cncf-kubernetes, authlib, pyopenssl
|
||||
datadog==0.42.0 # via apache-airflow-providers-datadog
|
||||
defusedxml==0.7.1 # via python3-openid
|
||||
dill==0.3.4 # via apache-airflow
|
||||
distlib==0.3.2 # via virtualenv
|
||||
dnspython==1.16.0 # via email-validator, eventlet
|
||||
docutils==0.17.1 # via python-daemon
|
||||
email-validator==1.1.3 # via apache-airflow, flask-appbuilder
|
||||
docutils==0.16 # via apache-airflow, python-daemon
|
||||
email-validator==1.1.3 # via flask-appbuilder
|
||||
eventlet==0.31.1 # via apache-airflow
|
||||
flask-admin==1.5.4 # via apache-airflow
|
||||
flask-appbuilder==2.3.4 # via apache-airflow
|
||||
filelock==3.0.12 # via virtualenv
|
||||
flask-admin==1.5.8 # via -r requirements.in
|
||||
flask-appbuilder==3.3.2 # via -r requirements.in, apache-airflow
|
||||
flask-babel==1.0.0 # via flask-appbuilder
|
||||
flask-bcrypt==0.7.1 # via apache-airflow
|
||||
flask-caching==1.3.3 # via apache-airflow
|
||||
flask-caching==1.10.1 # via apache-airflow
|
||||
flask-jwt-extended==3.25.1 # via flask-appbuilder
|
||||
flask-login==0.4.1 # via apache-airflow, flask-appbuilder
|
||||
flask-oauthlib==0.9.5 # via -r requirements.in, apache-airflow
|
||||
flask-openid==1.2.5 # via flask-appbuilder
|
||||
flask-sqlalchemy==2.5.1 # via flask-appbuilder
|
||||
flask-swagger==0.2.14 # via apache-airflow
|
||||
flask-wtf==0.14.3 # via apache-airflow, flask-appbuilder
|
||||
flask==1.1.4 # via apache-airflow, flask-admin, flask-appbuilder, flask-babel, flask-bcrypt, flask-caching, flask-jwt-extended, flask-login, flask-oauthlib, flask-openid, flask-sqlalchemy, flask-swagger, flask-wtf
|
||||
flower==0.9.7 # via apache-airflow
|
||||
funcsigs==1.0.2 # via apache-airflow
|
||||
future==0.18.2 # via apache-airflow, pyhive
|
||||
flask==1.1.4 # via apache-airflow, flask-admin, flask-appbuilder, flask-babel, flask-bcrypt, flask-caching, flask-jwt-extended, flask-login, flask-oauthlib, flask-openid, flask-sqlalchemy, flask-wtf
|
||||
flower==0.9.7 # via apache-airflow-providers-celery
|
||||
future==0.18.2 # via pyhive
|
||||
gevent==21.1.2 # via apache-airflow
|
||||
google-ads==7.0.0 # via apache-airflow-backport-providers-google
|
||||
google-api-core[grpc,grpcgcp]==1.31.0 # via apache-airflow-backport-providers-google, google-ads, google-api-python-client, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-bigtable, google-cloud-container, google-cloud-core, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-dlp, google-cloud-kms, google-cloud-language, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-secret-manager, google-cloud-spanner, google-cloud-speech, google-cloud-tasks, google-cloud-texttospeech, google-cloud-translate, google-cloud-videointelligence, google-cloud-vision, google-cloud-workflows
|
||||
google-api-python-client==1.12.8 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-auth-httplib2==0.1.0 # via apache-airflow, apache-airflow-backport-providers-google, google-api-python-client
|
||||
google-ads==13.0.0 # via apache-airflow-providers-google
|
||||
google-api-core[grpc,grpcgcp]==1.31.0 # via apache-airflow-providers-google, google-ads, google-api-python-client, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-bigtable, google-cloud-container, google-cloud-core, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-dlp, google-cloud-kms, google-cloud-language, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-secret-manager, google-cloud-spanner, google-cloud-speech, google-cloud-tasks, google-cloud-texttospeech, google-cloud-translate, google-cloud-videointelligence, google-cloud-vision, google-cloud-workflows
|
||||
google-api-python-client==1.12.8 # via apache-airflow-providers-google
|
||||
google-auth-httplib2==0.1.0 # via apache-airflow-providers-google, google-api-python-client
|
||||
google-auth-oauthlib==0.4.4 # via google-ads, pandas-gbq, pydata-google-auth
|
||||
google-auth==1.32.1 # via apache-airflow, apache-airflow-backport-providers-google, google-api-core, google-api-python-client, google-auth-httplib2, google-auth-oauthlib, google-cloud-core, google-cloud-storage, kubernetes, pandas-gbq, pydata-google-auth
|
||||
google-auth==1.32.1 # via apache-airflow-providers-google, google-api-core, google-api-python-client, google-auth-httplib2, google-auth-oauthlib, google-cloud-core, google-cloud-storage, kubernetes, pandas-gbq, pydata-google-auth
|
||||
google-cloud-appengine-logging==0.1.1 # via google-cloud-logging
|
||||
google-cloud-audit-log==0.1.0 # via google-cloud-logging
|
||||
google-cloud-automl==2.4.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-bigquery-datatransfer==3.3.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-automl==2.4.0 # via apache-airflow-providers-google
|
||||
google-cloud-bigquery-datatransfer==3.3.0 # via apache-airflow-providers-google
|
||||
google-cloud-bigquery-storage==2.6.0 # via google-cloud-bigquery
|
||||
google-cloud-bigquery[bqstorage,pandas]==2.20.0 # via pandas-gbq
|
||||
google-cloud-bigtable==1.7.0 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-container==1.0.1 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-bigtable==1.7.0 # via apache-airflow-providers-google
|
||||
google-cloud-container==1.0.1 # via apache-airflow-providers-google
|
||||
google-cloud-core==1.7.1 # via google-cloud-bigquery, google-cloud-bigtable, google-cloud-logging, google-cloud-spanner, google-cloud-storage, google-cloud-translate
|
||||
google-cloud-datacatalog==3.3.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-dataproc==2.4.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-dlp==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-kms==2.4.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-language==1.3.0 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-logging==2.5.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-memcache==1.1.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-monitoring==2.4.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-os-login==2.2.1 # via apache-airflow-backport-providers-google
|
||||
google-cloud-pubsub==2.6.1 # via apache-airflow-backport-providers-google
|
||||
google-cloud-redis==2.2.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-secret-manager==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-spanner==1.19.1 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-speech==1.3.2 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-storage==1.40.0 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-tasks==2.4.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-texttospeech==1.0.1 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-translate==1.7.0 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-videointelligence==1.16.1 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-vision==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
google-cloud-workflows==1.1.0 # via apache-airflow-backport-providers-google
|
||||
google-cloud-datacatalog==3.3.0 # via apache-airflow-providers-google
|
||||
google-cloud-dataproc==2.5.0 # via -r requirements.in, apache-airflow-providers-google
|
||||
google-cloud-dlp==1.0.0 # via apache-airflow-providers-google
|
||||
google-cloud-kms==2.4.0 # via apache-airflow-providers-google
|
||||
google-cloud-language==1.3.0 # via apache-airflow-providers-google
|
||||
google-cloud-logging==2.5.0 # via apache-airflow-providers-google
|
||||
google-cloud-memcache==1.0.0 # via apache-airflow-providers-google
|
||||
google-cloud-monitoring==2.4.0 # via apache-airflow-providers-google
|
||||
google-cloud-os-login==2.2.1 # via apache-airflow-providers-google
|
||||
google-cloud-pubsub==2.6.1 # via apache-airflow-providers-google
|
||||
google-cloud-redis==2.2.0 # via apache-airflow-providers-google
|
||||
google-cloud-secret-manager==1.0.0 # via apache-airflow-providers-google
|
||||
google-cloud-spanner==1.19.1 # via apache-airflow-providers-google
|
||||
google-cloud-speech==1.3.2 # via apache-airflow-providers-google
|
||||
google-cloud-storage==1.40.0 # via apache-airflow-providers-google
|
||||
google-cloud-tasks==2.4.0 # via apache-airflow-providers-google
|
||||
google-cloud-texttospeech==1.0.1 # via apache-airflow-providers-google
|
||||
google-cloud-translate==1.7.0 # via apache-airflow-providers-google
|
||||
google-cloud-videointelligence==1.16.1 # via apache-airflow-providers-google
|
||||
google-cloud-vision==1.0.0 # via apache-airflow-providers-google
|
||||
google-cloud-workflows==1.1.0 # via apache-airflow-providers-google
|
||||
google-crc32c==1.1.2 # via google-resumable-media
|
||||
google-resumable-media==1.3.1 # via google-cloud-bigquery, google-cloud-storage
|
||||
googleapis-common-protos[grpc]==1.53.0 # via google-ads, google-api-core, google-cloud-audit-log, grpc-google-iam-v1
|
||||
graphviz==0.16 # via apache-airflow
|
||||
greenlet==1.1.0 # via apache-airflow, eventlet, gevent
|
||||
grpc-google-iam-v1==0.12.3 # via google-cloud-bigtable, google-cloud-container, google-cloud-datacatalog, google-cloud-kms, google-cloud-pubsub, google-cloud-secret-manager, google-cloud-spanner, google-cloud-tasks
|
||||
grpcio-gcp==0.2.2 # via apache-airflow, apache-airflow-backport-providers-google, google-api-core
|
||||
grpcio-gcp==0.2.2 # via apache-airflow-providers-google, google-api-core
|
||||
grpcio==1.38.1 # via google-ads, google-api-core, google-cloud-bigquery, google-cloud-pubsub, googleapis-common-protos, grpc-google-iam-v1, grpcio-gcp
|
||||
gunicorn==20.1.0 # via apache-airflow
|
||||
h11==0.12.0 # via httpcore
|
||||
hiredis==2.0.0 # via -r requirements.in
|
||||
hmsclient==0.1.1 # via apache-airflow
|
||||
hmsclient==0.1.1 # via apache-airflow-providers-apache-hive
|
||||
httpcore==0.13.6 # via httpx
|
||||
httplib2==0.19.1 # via google-api-python-client, google-auth-httplib2
|
||||
httpx==0.19.0 # via apache-airflow, apache-airflow-providers-google
|
||||
humanize==3.10.0 # via flower
|
||||
idna==2.10 # via email-validator, requests
|
||||
importlib-metadata==2.1.0 # via -r requirements.in, apache-airflow, apache-airflow-upgrade-check, argcomplete, importlib-resources, jsonschema, kombu
|
||||
idna==2.10 # via anyio, email-validator, requests, rfc3986
|
||||
importlib-metadata==1.7.0 # via -r requirements.in, apache-airflow, argcomplete, importlib-resources, jsonschema, kombu, nox, virtualenv
|
||||
importlib-resources==1.5.0 # via apache-airflow
|
||||
inflection==0.5.1 # via apache-airflow
|
||||
iso8601==0.1.14 # via apache-airflow
|
||||
itsdangerous==1.1.0 # via flask, flask-wtf
|
||||
jaydebeapi==1.2.3 # via apache-airflow
|
||||
jinja2==2.11.3 # via apache-airflow, flask, flask-babel, python-nvd3
|
||||
isodate==0.6.0 # via openapi-schema-validator
|
||||
itsdangerous==1.1.0 # via apache-airflow, flask, flask-wtf
|
||||
jaydebeapi==1.2.3 # via apache-airflow-providers-jdbc
|
||||
jinja2==2.11.3 # via apache-airflow, flask, flask-babel, python-nvd3, swagger-ui-bundle
|
||||
jmespath==0.10.0 # via boto3, botocore
|
||||
jpype1==0.7.1 # via -r requirements.in, apache-airflow, jaydebeapi
|
||||
json-merge-patch==0.2 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
jsonschema==3.2.0 # via -r requirements.in, apache-airflow, flask-appbuilder
|
||||
jpype1==0.7.1 # via -r requirements.in, jaydebeapi
|
||||
json-merge-patch==0.2 # via apache-airflow-providers-google
|
||||
jsonschema==3.2.0 # via -r requirements.in, apache-airflow, flask-appbuilder, openapi-schema-validator, openapi-spec-validator
|
||||
kombu==4.6.10 # via -r requirements.in, celery
|
||||
kubernetes==11.0.0 # via apache-airflow
|
||||
kubernetes==11.0.0 # via apache-airflow-providers-cncf-kubernetes
|
||||
lazy-object-proxy==1.4.3 # via apache-airflow
|
||||
libcst==0.3.19 # via google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-os-login, google-cloud-pubsub, google-cloud-workflows
|
||||
lockfile==0.12.2 # via python-daemon
|
||||
lockfile==0.12.2 # via apache-airflow, python-daemon
|
||||
mako==1.1.4 # via alembic
|
||||
markdown==2.6.11 # via apache-airflow
|
||||
markupsafe==2.0.1 # via jinja2, mako, wtforms
|
||||
markupsafe==1.1.1 # via apache-airflow, jinja2, mako, wtforms
|
||||
marshmallow-enum==1.5.1 # via flask-appbuilder
|
||||
marshmallow-sqlalchemy==0.23.1 # via apache-airflow, flask-appbuilder
|
||||
marshmallow==2.21.0 # via flask-appbuilder, marshmallow-enum, marshmallow-sqlalchemy
|
||||
marshmallow-oneofschema==3.0.1 # via apache-airflow
|
||||
marshmallow-sqlalchemy==0.23.1 # via flask-appbuilder
|
||||
marshmallow==3.13.0 # via flask-appbuilder, marshmallow-enum, marshmallow-oneofschema, marshmallow-sqlalchemy
|
||||
mozlogging==0.1.0 # via -r requirements.in
|
||||
mypy-extensions==0.4.3 # via typing-inspect
|
||||
mysqlclient==1.3.14 # via apache-airflow
|
||||
mysql-connector-python==8.0.22 # via apache-airflow-providers-mysql
|
||||
mysqlclient==1.3.14 # via apache-airflow-providers-mysql
|
||||
natsort==7.1.1 # via croniter
|
||||
newrelic==6.4.4.161 # via -r requirements.in
|
||||
numpy==1.21.0 # via pandas, pyarrow
|
||||
nox==2020.12.31 # via google-ads
|
||||
numpy==1.21.0 # via apache-airflow, pandas, pyarrow
|
||||
oauthlib==2.1.0 # via apache-airflow, flask-oauthlib, requests-oauthlib
|
||||
packaging==21.0 # via apache-airflow, apache-airflow-upgrade-check, google-api-core, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
|
||||
pandas-gbq==0.14.1 # via -r requirements.in, apache-airflow, apache-airflow-backport-providers-google
|
||||
openapi-schema-validator==0.1.5 # via openapi-spec-validator
|
||||
openapi-spec-validator==0.3.1 # via apache-airflow
|
||||
packaging==21.0 # via google-api-core, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-monitoring, google-cloud-os-login, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
|
||||
pandas-gbq==0.14.1 # via -r requirements.in, apache-airflow-providers-google
|
||||
pandas==1.3.0 # via apache-airflow, google-cloud-bigquery, pandas-gbq
|
||||
pendulum==1.4.4 # via apache-airflow
|
||||
pendulum==2.1.2 # via apache-airflow
|
||||
platformdirs==2.2.0 # via virtualenv
|
||||
prison==0.1.3 # via flask-appbuilder
|
||||
prometheus-client==0.8.0 # via flower
|
||||
proto-plus==1.19.0 # via google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
|
||||
protobuf==3.17.3 # via google-ads, google-api-core, google-cloud-audit-log, google-cloud-bigquery, googleapis-common-protos, proto-plus
|
||||
proto-plus==1.19.0 # via google-ads, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
|
||||
protobuf==3.17.3 # via google-api-core, google-cloud-audit-log, google-cloud-bigquery, googleapis-common-protos, mysql-connector-python, proto-plus
|
||||
psutil==5.8.0 # via apache-airflow
|
||||
psycopg2-binary==2.9.1 # via apache-airflow
|
||||
psycopg2-binary==2.9.1 # via apache-airflow-providers-postgres
|
||||
pure-sasl==0.6.2 # via thrift-sasl
|
||||
py==1.10.0 # via nox
|
||||
pyarrow==4.0.1 # via google-cloud-bigquery
|
||||
pyasn1-modules==0.2.8 # via google-auth
|
||||
pyasn1==0.4.8 # via pyasn1-modules, rsa
|
||||
pycparser==2.20 # via cffi
|
||||
pydata-google-auth==1.2.0 # via pandas-gbq
|
||||
pygments==2.9.0 # via apache-airflow
|
||||
pyhive[hive]==0.6.4 # via apache-airflow
|
||||
pyjwt==1.7.1 # via flask-appbuilder, flask-jwt-extended
|
||||
pyopenssl==20.0.1 # via apache-airflow, apache-airflow-backport-providers-google
|
||||
pygments==2.9.0 # via apache-airflow, rich
|
||||
pyhive[hive]==0.6.4 # via apache-airflow-providers-apache-hive
|
||||
pyjwt==1.7.1 # via apache-airflow, flask-appbuilder, flask-jwt-extended
|
||||
pyopenssl==20.0.1 # via apache-airflow-providers-google
|
||||
pyparsing==2.4.7 # via httplib2, packaging
|
||||
pyrsistent==0.18.0 # via jsonschema
|
||||
python-daemon==2.3.0 # via apache-airflow
|
||||
|
@ -161,43 +189,47 @@ python-dateutil==2.8.1 # via alembic, apache-airflow, botocore, croniter, fla
|
|||
python-editor==1.0.4 # via alembic
|
||||
python-nvd3==0.15.0 # via apache-airflow
|
||||
python-slugify==4.0.1 # via apache-airflow, python-nvd3
|
||||
python3-openid==3.2.0 # via flask-openid
|
||||
pytz==2021.1 # via -r requirements.in, babel, celery, flask-babel, flower, google-api-core, pandas, tzlocal
|
||||
python3-openid==3.2.0 # via apache-airflow, flask-openid
|
||||
pytz==2021.1 # via -r requirements.in, babel, celery, flask-babel, flower, google-api-core, pandas
|
||||
pytzdata==2020.1 # via pendulum
|
||||
pyyaml==5.4.1 # via apispec, flask-swagger, google-ads, kubernetes, libcst
|
||||
pyyaml==5.4.1 # via apache-airflow, apispec, clickclick, google-ads, kubernetes, libcst, openapi-spec-validator
|
||||
redis==3.5.3 # via -r requirements.in
|
||||
requests-oauthlib==1.1.0 # via apache-airflow, flask-oauthlib, google-auth-oauthlib, kubernetes
|
||||
requests==2.23.0 # via -r requirements.in, apache-airflow, datadog, google-api-core, google-cloud-bigquery, google-cloud-storage, kubernetes, requests-oauthlib
|
||||
requests==2.23.0 # via -r requirements.in, airflow-provider-fivetran, apache-airflow-providers-http, datadog, google-api-core, google-cloud-bigquery, google-cloud-storage, kubernetes, requests-oauthlib
|
||||
retrying==1.3.3 # via -r requirements.in
|
||||
rfc3986[idna2008]==1.5.0 # via httpx
|
||||
rich==10.9.0 # via apache-airflow
|
||||
rsa==4.7.2 # via google-auth
|
||||
s3transfer==0.3.7 # via boto3
|
||||
sasl==0.3.1 # via pyhive
|
||||
setproctitle==1.2.2 # via apache-airflow
|
||||
shelljob==0.5.6 # via -r requirements.in
|
||||
six==1.16.0 # via bcrypt, eventlet, flask-jwt-extended, google-api-core, google-api-python-client, google-auth, google-auth-httplib2, google-cloud-core, google-resumable-media, grpcio, jsonschema, kubernetes, prison, protobuf, pyopenssl, python-dateutil, retrying, sasl, sqlalchemy-utils, tenacity, thrift, thrift-sasl
|
||||
sqlalchemy-jsonfield==0.9.0 # via apache-airflow
|
||||
six==1.16.0 # via bcrypt, eventlet, flask-jwt-extended, google-api-core, google-api-python-client, google-auth, google-auth-httplib2, google-cloud-core, google-resumable-media, grpcio, isodate, jsonschema, kubernetes, openapi-schema-validator, openapi-spec-validator, prison, protobuf, pyopenssl, python-dateutil, retrying, sasl, sqlalchemy-utils, tenacity, thrift, thrift-sasl, virtualenv
|
||||
sniffio==1.2.0 # via anyio, httpcore, httpx
|
||||
sqlalchemy-jsonfield==1.0.0 # via apache-airflow
|
||||
sqlalchemy-utils==0.37.8 # via flask-appbuilder
|
||||
sqlalchemy==1.3.15 # via -r requirements.in, alembic, apache-airflow, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-jsonfield, sqlalchemy-utils
|
||||
sqlalchemy==1.3.24 # via -r requirements.in, alembic, apache-airflow, flask-appbuilder, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-jsonfield, sqlalchemy-utils
|
||||
statsd==3.3.0 # via apache-airflow
|
||||
swagger-ui-bundle==0.0.8 # via apache-airflow
|
||||
tabulate==0.8.9 # via apache-airflow
|
||||
tenacity==4.12.0 # via apache-airflow
|
||||
tenacity==6.2.0 # via apache-airflow
|
||||
termcolor==1.1.0 # via apache-airflow
|
||||
text-unidecode==1.3 # via python-slugify
|
||||
thrift-sasl==0.4.3 # via pyhive
|
||||
thrift==0.13.0 # via apache-airflow, hmsclient, pyhive, thrift-sasl
|
||||
tornado==5.1.1 # via apache-airflow, flower
|
||||
typing-extensions==3.10.0.0 # via apache-airflow, libcst, typing-inspect
|
||||
thrift==0.13.0 # via apache-airflow-providers-apache-hive, hmsclient, pyhive, thrift-sasl
|
||||
tornado==5.1.1 # via flower
|
||||
typing-extensions==3.10.0.0 # via anyio, apache-airflow, libcst, rich, typing-inspect
|
||||
typing-inspect==0.7.1 # via libcst
|
||||
tzlocal==1.5.1 # via apache-airflow, pendulum
|
||||
unicodecsv==0.14.1 # via apache-airflow
|
||||
uritemplate==3.0.1 # via google-api-python-client
|
||||
urllib3==1.25.11 # via -r requirements.in, botocore, kubernetes, requests
|
||||
vine==1.3.0 # via amqp, apache-airflow, celery, flower
|
||||
watchtower==0.7.3 # via apache-airflow-backport-providers-amazon
|
||||
vine==1.3.0 # via amqp, apache-airflow-providers-celery, celery, flower
|
||||
virtualenv==20.7.2 # via nox
|
||||
watchtower==1.0.6 # via apache-airflow-providers-amazon
|
||||
websocket-client==1.1.0 # via -r requirements.in, kubernetes
|
||||
werkzeug==0.16.0 # via -r requirements.in, apache-airflow, flask, flask-caching, flask-jwt-extended
|
||||
werkzeug==1.0.1 # via -r requirements.in, apache-airflow, flask, flask-jwt-extended
|
||||
wtforms==2.3.3 # via flask-admin, flask-wtf
|
||||
zipp==3.5.0 # via importlib-metadata, importlib-resources
|
||||
zope.deprecation==4.4.0 # via apache-airflow
|
||||
zope.event==4.5.0 # via gevent
|
||||
zope.interface==5.4.0 # via gevent
|
||||
|
||||
|
|
|
@ -70,14 +70,14 @@ OAUTH_PROVIDERS = [{
'token_key':'access_token',
'icon':'fa-google',
'remote_app': {
'base_url':'https://www.googleapis.com/oauth2/v2/',
'request_token_params':{
'api_base_url':'https://www.googleapis.com/oauth2/v2/',
'client_kwargs':{
'scope': 'email profile'
},
'access_token_url':'https://accounts.google.com/o/oauth2/token',
'authorize_url':'https://accounts.google.com/o/oauth2/auth',
'request_token_url': None,
'consumer_key': GOOGLE_KEY,
'consumer_secret': GOOGLE_SECRET,
'client_id': GOOGLE_KEY,
'client_secret': GOOGLE_SECRET,
}
}]