From f4e3e352e14b67c1e8fa00036e64ccc4813fbf68 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Mon, 5 Feb 2018 10:48:00 +0100 Subject: [PATCH] [AIRFLOW-2063] Add missing docs for GCP - Add missing operator in `code.rst` and `integration.rst` - Fix documentation in DataProc operator - Minor doc fix in GCS operators - Fixed codeblocks & links in docstrings for BigQuery, DataProc, DataFlow, MLEngine, GCS hooks & operators Closes #3003 from kaxil/doc_update --- airflow/contrib/hooks/datastore_hook.py | 19 ++- airflow/contrib/hooks/gcp_mlengine_hook.py | 16 +-- airflow/contrib/hooks/gcs_hook.py | 11 +- .../operators/bigquery_check_operator.py | 6 +- .../contrib/operators/bigquery_get_data.py | 33 ++--- .../contrib/operators/bigquery_to_bigquery.py | 8 +- airflow/contrib/operators/bigquery_to_gcs.py | 8 +- .../contrib/operators/dataflow_operator.py | 115 ++++++++++-------- .../contrib/operators/dataproc_operator.py | 48 ++++---- .../contrib/operators/gcs_copy_operator.py | 23 ++-- .../contrib/operators/gcs_list_operator.py | 19 +-- .../contrib/operators/mlengine_operator.py | 16 ++- docs/code.rst | 22 ++++ docs/integration.rst | 111 ++++++++++++++++- 14 files changed, 303 insertions(+), 152 deletions(-) diff --git a/airflow/contrib/hooks/datastore_hook.py b/airflow/contrib/hooks/datastore_hook.py index cf98dc749f..ba690e0f2e 100644 --- a/airflow/contrib/hooks/datastore_hook.py +++ b/airflow/contrib/hooks/datastore_hook.py @@ -66,7 +66,9 @@ class DatastoreHook(GoogleCloudBaseHook): def commit(self, body): """ Commit a transaction, optionally creating, deleting or modifying some entities. - see https://cloud.google.com/datastore/docs/reference/rest/v1/projects/commit + + .. seealso:: + https://cloud.google.com/datastore/docs/reference/rest/v1/projects/commit :param body: the body of the commit request :return: the response body of the commit request @@ -77,7 +79,10 @@ class DatastoreHook(GoogleCloudBaseHook): def lookup(self, keys, read_consistency=None, transaction=None): """ Lookup some entities by key - see https://cloud.google.com/datastore/docs/reference/rest/v1/projects/lookup + + .. seealso:: + https://cloud.google.com/datastore/docs/reference/rest/v1/projects/lookup + :param keys: the keys to lookup :param read_consistency: the read consistency to use. default, strong or eventual. Cannot be used with a transaction. @@ -94,7 +99,10 @@ class DatastoreHook(GoogleCloudBaseHook): def rollback(self, transaction): """ Roll back a transaction - see https://cloud.google.com/datastore/docs/reference/rest/v1/projects/rollback + + .. seealso:: + https://cloud.google.com/datastore/docs/reference/rest/v1/projects/rollback + :param transaction: the transaction to roll back """ self.connection.projects().rollback(projectId=self.project_id, body={'transaction': transaction})\ @@ -103,7 +111,10 @@ class DatastoreHook(GoogleCloudBaseHook): def run_query(self, body): """ Run a query for entities. - see https://cloud.google.com/datastore/docs/reference/rest/v1/projects/runQuery + + .. seealso:: + https://cloud.google.com/datastore/docs/reference/rest/v1/projects/runQuery + :param body: the body of the query request :return: the batch of query results. 
""" diff --git a/airflow/contrib/hooks/gcp_mlengine_hook.py b/airflow/contrib/hooks/gcp_mlengine_hook.py index c17b614a4a..bf25f0662e 100644 --- a/airflow/contrib/hooks/gcp_mlengine_hook.py +++ b/airflow/contrib/hooks/gcp_mlengine_hook.py @@ -67,14 +67,14 @@ class MLEngineHook(GoogleCloudBaseHook): :type project_id: string :param job: MLEngine Job object that should be provided to the MLEngine - API, such as: - { - 'jobId': 'my_job_id', - 'trainingInput': { - 'scaleTier': 'STANDARD_1', - ... - } - } + API, such as: :: + { + 'jobId': 'my_job_id', + 'trainingInput': { + 'scaleTier': 'STANDARD_1', + ... + } + } :type job: dict :param use_existing_job_fn: In case that a MLEngine job with the same diff --git a/airflow/contrib/hooks/gcs_hook.py b/airflow/contrib/hooks/gcs_hook.py index ac8f2e0078..f959f950b0 100644 --- a/airflow/contrib/hooks/gcs_hook.py +++ b/airflow/contrib/hooks/gcs_hook.py @@ -231,7 +231,7 @@ class GoogleCloudStorageHook(GoogleCloudBaseHook): :param prefix: prefix string which filters objects whose name begin with this prefix :type prefix: string :param delimiter: filters objects based on the delimiter (for e.g '.csv') - :type delimiter:string + :type delimiter: string :return: a stream of object names matching the filtering criteria """ service = self.get_conn() @@ -273,11 +273,12 @@ class GoogleCloudStorageHook(GoogleCloudBaseHook): def get_size(self, bucket, object): """ Gets the size of a file in Google Cloud Storage. + :param bucket: The Google cloud storage bucket where the object is. :type bucket: string - :param object: The name of the object to check in the Google cloud - storage bucket. + :param object: The name of the object to check in the Google cloud storage bucket. :type object: string + """ self.log.info('Checking the file size of object: %s in bucket: %s', object, bucket) service = self.get_conn() @@ -290,7 +291,7 @@ class GoogleCloudStorageHook(GoogleCloudBaseHook): if 'name' in response and response['name'][-1] != '/': # Remove Directories & Just check size of files size = response['size'] - self.log.info('The file size of %s is %s', object, size) + self.log.info('The file size of %s is %s bytes.', object, size) return size else: raise ValueError('Object is not a file') @@ -301,6 +302,7 @@ class GoogleCloudStorageHook(GoogleCloudBaseHook): def get_crc32c(self, bucket, object): """ Gets the CRC32c checksum of an object in Google Cloud Storage. + :param bucket: The Google cloud storage bucket where the object is. :type bucket: string :param object: The name of the object to check in the Google cloud @@ -327,6 +329,7 @@ class GoogleCloudStorageHook(GoogleCloudBaseHook): def get_md5hash(self, bucket, object): """ Gets the MD5 hash of an object in Google Cloud Storage. + :param bucket: The Google cloud storage bucket where the object is. :type bucket: string :param object: The name of the object to check in the Google cloud diff --git a/airflow/contrib/operators/bigquery_check_operator.py b/airflow/contrib/operators/bigquery_check_operator.py index e5a8f0f608..8089840445 100644 --- a/airflow/contrib/operators/bigquery_check_operator.py +++ b/airflow/contrib/operators/bigquery_check_operator.py @@ -93,10 +93,10 @@ class BigQueryIntervalCheckOperator(IntervalCheckOperator): Checks that the values of metrics given as SQL expressions are within a certain tolerance of the ones from days_back before. 
- This method constructs a query like so: + This method constructs a query like so :: - SELECT {metrics_threshold_dict_key} FROM {table} - WHERE {date_filter_column}= + SELECT {metrics_threshold_dict_key} FROM {table} + WHERE {date_filter_column}= :param table: the table name :type table: str diff --git a/airflow/contrib/operators/bigquery_get_data.py b/airflow/contrib/operators/bigquery_get_data.py index b3b25f2502..4c12f01dc3 100644 --- a/airflow/contrib/operators/bigquery_get_data.py +++ b/airflow/contrib/operators/bigquery_get_data.py @@ -26,25 +26,26 @@ class BigQueryGetDataOperator(BaseOperator): be equal to the number of rows fetched. Each element in the list will again be a list where element would represent the columns values for that row. - Example Result: [['Tony', '10'], ['Mike', '20'], ['Steve', '15']] + **Example Result**: ``[['Tony', '10'], ['Mike', '20'], ['Steve', '15']]`` - Note: If you pass fields to `selected_fields` which are in different order than the - order of columns already in - BQ table, the data will still be in the order of BQ table. - For example if the BQ table has 3 columns as - [A,B,C] and you pass 'B,A' in the `selected_fields` - the data would still be of the form 'A,B'. + .. note:: + If you pass fields to ``selected_fields`` which are in different order than the + order of columns already in + BQ table, the data will still be in the order of BQ table. + For example if the BQ table has 3 columns as + ``[A,B,C]`` and you pass 'B,A' in the ``selected_fields`` + the data would still be of the form ``'A,B'``. - Example: + **Example**: :: - get_data = BigQueryGetDataOperator( - task_id='get_data_from_bq', - dataset_id='test_dataset', - table_id='Transaction_partitions', - max_results='100', - # selected_fields='DATE', - bigquery_conn_id='airflow-service-account' - ) + get_data = BigQueryGetDataOperator( + task_id='get_data_from_bq', + dataset_id='test_dataset', + table_id='Transaction_partitions', + max_results='100', + selected_fields='DATE', + bigquery_conn_id='airflow-service-account' + ) :param dataset_id: The dataset ID of the requested table. :type destination_dataset_table: string diff --git a/airflow/contrib/operators/bigquery_to_bigquery.py b/airflow/contrib/operators/bigquery_to_bigquery.py index 2bc4a8b8fe..2736d7c17c 100644 --- a/airflow/contrib/operators/bigquery_to_bigquery.py +++ b/airflow/contrib/operators/bigquery_to_bigquery.py @@ -19,11 +19,11 @@ from airflow.utils.decorators import apply_defaults class BigQueryToBigQueryOperator(BaseOperator): """ - Copies data from one BigQuery table to another. See here: + Copies data from one BigQuery table to another. - https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.copy - - For more details about these parameters. + .. seealso:: + For more details about these parameters: + https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.copy :param source_project_dataset_tables: One or more dotted (project:|project.). BigQuery tables to use as the diff --git a/airflow/contrib/operators/bigquery_to_gcs.py b/airflow/contrib/operators/bigquery_to_gcs.py index 800e7bdf32..ff20276011 100644 --- a/airflow/contrib/operators/bigquery_to_gcs.py +++ b/airflow/contrib/operators/bigquery_to_gcs.py @@ -21,11 +21,9 @@ class BigQueryToCloudStorageOperator(BaseOperator): """ Transfers a BigQuery table to a Google Cloud Storage bucket. - See here: - - https://cloud.google.com/bigquery/docs/reference/v2/jobs - - For more details about these parameters. + .. 
seealso::
+        For more details about these parameters:
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
 
     :param source_project_dataset_table: The dotted (<project>.|<project>:)<dataset>.<table>
BigQuery table to use as the source diff --git a/airflow/contrib/operators/dataflow_operator.py b/airflow/contrib/operators/dataflow_operator.py index b8c94318ef..c5bfc9791d 100644 --- a/airflow/contrib/operators/dataflow_operator.py +++ b/airflow/contrib/operators/dataflow_operator.py @@ -31,34 +31,33 @@ class DataFlowJavaOperator(BaseOperator): It's a good practice to define dataflow_* parameters in the default_args of the dag like the project, zone and staging location. - ``` - default_args = { - 'dataflow_default_options': { - 'project': 'my-gcp-project', - 'zone': 'europe-west1-d', - 'stagingLocation': 'gs://my-staging-bucket/staging/' + .. code-block:: python + + default_args = { + 'dataflow_default_options': { + 'project': 'my-gcp-project', + 'zone': 'europe-west1-d', + 'stagingLocation': 'gs://my-staging-bucket/staging/' + } } - } - ``` You need to pass the path to your dataflow as a file reference with the ``jar`` parameter, the jar needs to be a self executing jar. Use ``options`` to pass on options to your job. - ``` - t1 = DataFlowOperation( - task_id='datapflow_example', - jar='{{var.value.gcp_dataflow_base}}pipeline/build/libs/pipeline-example-1.0.jar', - options={ - 'autoscalingAlgorithm': 'BASIC', - 'maxNumWorkers': '50', - 'start': '{{ds}}', - 'partitionType': 'DAY', - 'labels': {'foo' : 'bar'} - }, - gcp_conn_id='gcp-airflow-service-account', - dag=my-dag) - ``` + .. code-block:: python + t1 = DataFlowOperation( + task_id='datapflow_example', + jar='{{var.value.gcp_dataflow_base}}pipeline/build/libs/pipeline-example-1.0.jar', + options={ + 'autoscalingAlgorithm': 'BASIC', + 'maxNumWorkers': '50', + 'start': '{{ds}}', + 'partitionType': 'DAY', + 'labels': {'foo' : 'bar'} + }, + gcp_conn_id='gcp-airflow-service-account', + dag=my-dag) Both ``jar`` and ``options`` are templated so you can use variables in them. """ @@ -84,9 +83,10 @@ class DataFlowJavaOperator(BaseOperator): high-level options, for instances, project and zone information, which apply to all dataflow operators in the DAG. - For more detail on job submission have a look at the reference: - https://cloud.google.com/dataflow/pipelines/specifying-exec-params + .. seealso:: + For more detail on job submission have a look at the reference: + https://cloud.google.com/dataflow/pipelines/specifying-exec-params :param jar: The reference to a self executing DataFlow jar. :type jar: string @@ -144,33 +144,37 @@ class DataflowTemplateOperator(BaseOperator): will be passed to the job. It's a good practice to define dataflow_* parameters in the default_args of the dag like the project, zone and staging location. - https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters - https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment - ``` - default_args = { - 'dataflow_default_options': { - 'project': 'my-gcp-project' - 'zone': 'europe-west1-d', - 'tempLocation': 'gs://my-staging-bucket/staging/' + + .. seealso:: + https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters + https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment + + .. code-block:: python + default_args = { + 'dataflow_default_options': { + 'project': 'my-gcp-project' + 'zone': 'europe-west1-d', + 'tempLocation': 'gs://my-staging-bucket/staging/' + } } } - } - ``` + You need to pass the path to your dataflow template as a file reference with the ``template`` parameter. Use ``parameters`` to pass on parameters to your job. 
Use ``environment`` to pass on runtime environment variables to your job. - ``` - t1 = DataflowTemplateOperator( - task_id='datapflow_example', - template='{{var.value.gcp_dataflow_base}}', - parameters={ - 'inputFile': "gs://bucket/input/my_input.txt", - 'outputFile': "gs://bucket/output/my_output.txt" - }, - gcp_conn_id='gcp-airflow-service-account', - dag=my-dag) - ``` - ``template`` ``dataflow_default_options`` and ``parameters`` are templated so you can + + .. code-block:: python + t1 = DataflowTemplateOperator( + task_id='datapflow_example', + template='{{var.value.gcp_dataflow_base}}', + parameters={ + 'inputFile': "gs://bucket/input/my_input.txt", + 'outputFile': "gs://bucket/output/my_output.txt" + }, + gcp_conn_id='gcp-airflow-service-account', + dag=my-dag) + + ``template``, ``dataflow_default_options`` and ``parameters`` are templated so you can use variables in them. """ template_fields = ['parameters', 'dataflow_default_options', 'template'] @@ -191,11 +195,14 @@ class DataflowTemplateOperator(BaseOperator): Create a new DataflowTemplateOperator. Note that dataflow_default_options is expected to save high-level options for project information, which apply to all dataflow operators in the DAG. - https://cloud.google.com/dataflow/docs/reference/rest/v1b3 - /LaunchTemplateParameters - https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment - For more detail on job template execution have a look at the reference: - https://cloud.google.com/dataflow/docs/templates/executing-templates + + .. seealso:: + https://cloud.google.com/dataflow/docs/reference/rest/v1b3 + /LaunchTemplateParameters + https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment + For more detail on job template execution have a look at the reference: + https://cloud.google.com/dataflow/docs/templates/executing-templates + :param template: The reference to the DataFlow template. :type template: string :param dataflow_default_options: Map of default job environment options. @@ -258,9 +265,9 @@ class DataFlowPythonOperator(BaseOperator): high-level options, for instances, project and zone information, which apply to all dataflow operators in the DAG. - For more detail on job submission have a look at the reference: - - https://cloud.google.com/dataflow/pipelines/specifying-exec-params + .. seealso:: + For more detail on job submission have a look at the reference: + https://cloud.google.com/dataflow/pipelines/specifying-exec-params :param py_file: Reference to the python dataflow pipleline file.py, e.g., /some/local/file/path/to/your/python/pipeline/file. diff --git a/airflow/contrib/operators/dataproc_operator.py b/airflow/contrib/operators/dataproc_operator.py index ece6d512e7..3444cc63a8 100644 --- a/airflow/contrib/operators/dataproc_operator.py +++ b/airflow/contrib/operators/dataproc_operator.py @@ -63,7 +63,7 @@ class DataprocClusterCreateOperator(BaseOperator): :param master_machine_type: Compute engine machine type to use for the master node :type master_machine_type: string :param master_disk_size: Disk size for the master node - :type int + :type master_disk_size: int :param worker_machine_type:Compute engine machine type to use for the worker nodes :type worker_machine_type: string :param worker_disk_size: Disk size for the worker nodes @@ -398,31 +398,31 @@ class DataProcPigOperator(BaseOperator): It's a good practice to define dataproc_* parameters in the default_args of the dag like the cluster name and UDFs. 
- ``` - default_args = { - 'cluster_name': 'cluster-1', - 'dataproc_pig_jars': [ - 'gs://example/udf/jar/datafu/1.2.0/datafu.jar', - 'gs://example/udf/jar/gpig/1.2/gpig.jar' - ] - } - ``` + .. code-block:: python + + default_args = { + 'cluster_name': 'cluster-1', + 'dataproc_pig_jars': [ + 'gs://example/udf/jar/datafu/1.2.0/datafu.jar', + 'gs://example/udf/jar/gpig/1.2/gpig.jar' + ] + } You can pass a pig script as string or file reference. Use variables to pass on variables for the pig script to be resolved on the cluster or use the parameters to be resolved in the script as template parameters. - ``` - t1 = DataProcPigOperator( - task_id='dataproc_pig', - query='a_pig_script.pig', - variables={'out': 'gs://example/output/{{ds}}'}, - dag=dag) - ``` + **Example**: :: - For more detail on about job submission have a look at the reference: + t1 = DataProcPigOperator( + task_id='dataproc_pig', + query='a_pig_script.pig', + variables={'out': 'gs://example/output/{{ds}}'}, + dag=dag) - https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs + .. seealso:: + For more detail on about job submission have a look at the reference: + https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs :param query: The query or reference to the query file (pg or pig extension). :type query: string @@ -967,8 +967,9 @@ class DataprocWorkflowTemplateInstantiateOperator(DataprocWorkflowTemplateBaseOp Instantiate a WorkflowTemplate on Google Cloud Dataproc. The operator will wait until the WorkflowTemplate is finished executing. - Please refer to: - https://cloud.google.com/dataproc/docs/reference/rest/v1beta2/projects.regions.workflowTemplates/instantiate + .. seealso:: + Please refer to: + https://cloud.google.com/dataproc/docs/reference/rest/v1beta2/projects.regions.workflowTemplates/instantiate :param template_id: The id of the template. :type template_id: string @@ -1008,8 +1009,9 @@ class DataprocWorkflowTemplateInstantiateInlineOperator( Instantiate a WorkflowTemplate Inline on Google Cloud Dataproc. The operator will wait until the WorkflowTemplate is finished executing. - Please refer to: - https://cloud.google.com/dataproc/docs/reference/rest/v1beta2/projects.regions.workflowTemplates/instantiateInline + .. seealso:: + Please refer to: + https://cloud.google.com/dataproc/docs/reference/rest/v1beta2/projects.regions.workflowTemplates/instantiateInline :param template: The template contents. :type template: map diff --git a/airflow/contrib/operators/gcs_copy_operator.py b/airflow/contrib/operators/gcs_copy_operator.py index 55d98a3a09..e679ccb90c 100644 --- a/airflow/contrib/operators/gcs_copy_operator.py +++ b/airflow/contrib/operators/gcs_copy_operator.py @@ -42,18 +42,19 @@ class GoogleCloudStorageCopyOperator(BaseOperator): For this to work, the service account making the request must have domain-wide delegation enabled. :type delegate_to: string - Example: The following Operator would move all the CSV files from `sales/sales-2017` folder in `data` bucket to - `sales` folder in `archive` bucket. + **Example**: + The following Operator would move all the CSV files from `sales/sales-2017` folder in + `data` bucket to `sales` folder in `archive` bucket. 
:: - move_file = GoogleCloudStorageCopyOperator( - task_id='move_file', - source_bucket='data', - source_object='sales/sales-2017/', - source_files_delimiter='.csv' - destination_bucket='archive', - destination_directory='sales', - google_cloud_storage_conn_id='airflow-service-account' - ) + move_file = GoogleCloudStorageCopyOperator( + task_id='move_file', + source_bucket='data', + source_object='sales/sales-2017/', + source_files_delimiter='.csv' + destination_bucket='archive', + destination_directory='sales', + google_cloud_storage_conn_id='airflow-service-account' + ) """ template_fields = ('source_bucket', 'source_object', 'source_files_delimiter', 'destination_bucket', 'destination_directory') diff --git a/airflow/contrib/operators/gcs_list_operator.py b/airflow/contrib/operators/gcs_list_operator.py index e991766b60..c374551e55 100644 --- a/airflow/contrib/operators/gcs_list_operator.py +++ b/airflow/contrib/operators/gcs_list_operator.py @@ -40,16 +40,17 @@ class GoogleCloudStorageListOperator(BaseOperator): domain-wide delegation enabled. :type delegate_to: string - Example: The following Operator would list all the Avro files from `sales/sales-2017` - folder in `data` bucket. + **Example**: + The following Operator would list all the Avro files from `sales/sales-2017` + folder in `data` bucket. :: - GCS_Files = GoogleCloudStorageListOperator( - task_id='GCS_Files', - bucket='data', - prefix='sales/sales-2017/', - delimiter='.avro', - google_cloud_storage_conn_id=google_cloud_conn_id - ) + GCS_Files = GoogleCloudStorageListOperator( + task_id='GCS_Files', + bucket='data', + prefix='sales/sales-2017/', + delimiter='.avro', + google_cloud_storage_conn_id=google_cloud_conn_id + ) """ template_fields = ('bucket', 'prefix', 'delimiter') ui_color = '#f0eee4' diff --git a/airflow/contrib/operators/mlengine_operator.py b/airflow/contrib/operators/mlengine_operator.py index 4d8943b184..e4451ab8e9 100644 --- a/airflow/contrib/operators/mlengine_operator.py +++ b/airflow/contrib/operators/mlengine_operator.py @@ -120,11 +120,15 @@ class MLEngineBatchPredictionOperator(BaseOperator): In options 2 and 3, both model and version name should contain the minimal identifier. For instance, call + + :: + MLEngineBatchPredictionOperator( ..., model_name='my_model', version_name='my_version', ...) + if the desired model version is "projects/my_project/models/my_model/versions/my_version". @@ -189,7 +193,7 @@ class MLEngineBatchPredictionOperator(BaseOperator): :type delegate_to: string Raises: - ValueError: if a unique model/version origin cannot be determined. + ``ValueError``: if a unique model/version origin cannot be determined. """ template_fields = [ @@ -346,23 +350,23 @@ class MLEngineVersionOperator(BaseOperator): :type version: dict :param operation: The operation to perform. Available operations are: - 'create': Creates a new version in the model specified by `model_name`, + * ``create``: Creates a new version in the model specified by `model_name`, in which case the `version` parameter should contain all the information to create that version (e.g. `name`, `deploymentUrl`). - 'get': Gets full information of a particular version in the model + * ``get``: Gets full information of a particular version in the model specified by `model_name`. The name of the version should be specified in the `version` parameter. - 'list': Lists all available versions of the model specified + * ``list``: Lists all available versions of the model specified by `model_name`. 
- 'delete': Deletes the version specified in `version` parameter from the + * ``delete``: Deletes the version specified in `version` parameter from the model specified by `model_name`). The name of the version should be specified in the `version` parameter. - :type operation: string + :type operation: string :param gcp_conn_id: The connection ID to use when fetching connection info. :type gcp_conn_id: string diff --git a/docs/code.rst b/docs/code.rst index 41a61d7d13..883f48cc54 100644 --- a/docs/code.rst +++ b/docs/code.rst @@ -94,14 +94,36 @@ Community-contributed Operators .. deprecated:: 1.8 Use :code:`from airflow.operators.bash_operator import BashOperator` instead. +.. autoclass:: airflow.contrib.operators.bigquery_check_operator.BigQueryCheckOperator +.. autoclass:: airflow.contrib.operators.bigquery_check_operator.BigQueryValueCheckOperator +.. autoclass:: airflow.contrib.operators.bigquery_check_operator.BigQueryIntervalCheckOperator .. autoclass:: airflow.contrib.operators.bigquery_get_data.BigQueryGetDataOperator .. autoclass:: airflow.contrib.operators.bigquery_operator.BigQueryOperator +.. autoclass:: airflow.contrib.operators.bigquery_table_delete_operator.BigQueryTableDeleteOperator +.. autoclass:: airflow.contrib.operators.bigquery_to_bigquery.BigQueryToBigQueryOperator .. autoclass:: airflow.contrib.operators.bigquery_to_gcs.BigQueryToCloudStorageOperator .. autoclass:: airflow.contrib.operators.databricks_operator.DatabricksSubmitRunOperator +.. autoclass:: airflow.contrib.operators.dataflow_operator.DataFlowJavaOperator +.. autoclass:: airflow.contrib.operators.dataflow_operator.DataflowTemplateOperator +.. autoclass:: airflow.contrib.operators.dataflow_operator.DataFlowPythonOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocClusterCreateOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocClusterDeleteOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataProcPigOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataProcHiveOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataProcSparkSqlOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataProcSparkOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataProcHadoopOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataProcPySparkOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocWorkflowTemplateInstantiateOperator +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocWorkflowTemplateInstantiateInlineOperator +.. autoclass:: airflow.contrib.operators.datastore_export_operator.DatastoreExportOperator +.. autoclass:: airflow.contrib.operators.datastore_import_operator.DatastoreImportOperator .. autoclass:: airflow.contrib.operators.ecs_operator.ECSOperator +.. autoclass:: airflow.contrib.operators.file_to_gcs.FileToGoogleCloudStorageOperator .. autoclass:: airflow.contrib.operators.file_to_wasb.FileToWasbOperator .. autoclass:: airflow.contrib.operators.gcs_copy_operator.GoogleCloudStorageCopyOperator .. autoclass:: airflow.contrib.operators.gcs_download_operator.GoogleCloudStorageDownloadOperator +.. autoclass:: airflow.contrib.operators.gcs_list_operator.GoogleCloudStorageListOperator .. autoclass:: airflow.contrib.operators.gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator .. autoclass:: airflow.contrib.operators.pubsub_operator.PubSubTopicCreateOperator .. 
autoclass:: airflow.contrib.operators.pubsub_operator.PubSubTopicDeleteOperator diff --git a/docs/integration.rst b/docs/integration.rst index 734ecad68e..5c26f9aebb 100644 --- a/docs/integration.rst +++ b/docs/integration.rst @@ -62,8 +62,8 @@ Your reverse proxy (ex: nginx) should be configured as follow: proxy_set_header Connection "upgrade"; } } - - + + .. _Azure: Azure: Microsoft Azure @@ -374,6 +374,13 @@ BigQueryIntervalCheckOperator .. autoclass:: airflow.contrib.operators.bigquery_check_operator.BigQueryIntervalCheckOperator +.. _BigQueryGetDataOperator: + +BigQueryGetDataOperator +^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.bigquery_get_data.BigQueryGetDataOperator + .. _BigQueryOperator: BigQueryOperator @@ -381,6 +388,13 @@ BigQueryOperator .. autoclass:: airflow.contrib.operators.bigquery_operator.BigQueryOperator +.. _BigQueryTableDeleteOperator: + +BigQueryTableDeleteOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.bigquery_table_delete_operator.BigQueryTableDeleteOperator + .. _BigQueryToBigQueryOperator: BigQueryToBigQueryOperator @@ -410,6 +424,7 @@ DataFlow Operators """""""""""""""""" - :ref:`DataFlowJavaOperator` : launching Cloud Dataflow jobs written in Java. +- :ref:`DataflowTemplateOperator` : launching a templated Cloud DataFlow batch job. - :ref:`DataFlowPythonOperator` : launching Cloud Dataflow jobs written in python. .. _DataFlowJavaOperator: @@ -453,6 +468,13 @@ DataFlowJavaOperator }, dag=dag) +.. _DataflowTemplateOperator: + +DataflowTemplateOperator +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.dataflow_operator.DataflowTemplateOperator + .. _DataFlowPythonOperator: DataFlowPythonOperator @@ -475,12 +497,30 @@ Cloud DataProc DataProc Operators """""""""""""""""" +- :ref:`DataprocClusterCreateOperator` : Create a new cluster on Google Cloud Dataproc. +- :ref:`DataprocClusterDeleteOperator` : Delete a cluster on Google Cloud Dataproc. - :ref:`DataProcPigOperator` : Start a Pig query Job on a Cloud DataProc cluster. - :ref:`DataProcHiveOperator` : Start a Hive query Job on a Cloud DataProc cluster. - :ref:`DataProcSparkSqlOperator` : Start a Spark SQL query Job on a Cloud DataProc cluster. - :ref:`DataProcSparkOperator` : Start a Spark Job on a Cloud DataProc cluster. - :ref:`DataProcHadoopOperator` : Start a Hadoop Job on a Cloud DataProc cluster. - :ref:`DataProcPySparkOperator` : Start a PySpark Job on a Cloud DataProc cluster. +- :ref:`DataprocWorkflowTemplateInstantiateOperator` : Instantiate a WorkflowTemplate on Google Cloud Dataproc. +- :ref:`DataprocWorkflowTemplateInstantiateInlineOperator` : Instantiate a WorkflowTemplate Inline on Google Cloud Dataproc. + +.. _DataprocClusterCreateOperator: + +DataprocClusterCreateOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocClusterCreateOperator + +.. _DataprocClusterDeleteOperator: + +DataprocClusterDeleteOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocClusterDeleteOperator .. _DataProcPigOperator: @@ -523,12 +563,44 @@ DataProcPySparkOperator ^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: airflow.contrib.operators.dataproc_operator.DataProcPySparkOperator - :members: +.. _DataprocWorkflowTemplateInstantiateOperator: + +DataprocWorkflowTemplateInstantiateOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocWorkflowTemplateInstantiateOperator + +.. 
_DataprocWorkflowTemplateInstantiateInlineOperator: + +DataprocWorkflowTemplateInstantiateInlineOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.dataproc_operator.DataprocWorkflowTemplateInstantiateInlineOperator Cloud Datastore ''''''''''''''' +Datastore Operators +""""""""""""""""""" + +- :ref:`DatastoreExportOperator` : Export entities from Google Cloud Datastore to Cloud Storage. +- :ref:`DatastoreImportOperator` : Import entities from Cloud Storage to Google Cloud Datastore. + +.. _DatastoreExportOperator: + +DatastoreExportOperator +^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.datastore_export_operator.DatastoreExportOperator + +.. _DatastoreImportOperator: + +DatastoreImportOperator +^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.datastore_import_operator.DatastoreImportOperator + DatastoreHook """"""""""""" @@ -597,8 +669,26 @@ Cloud Storage Storage Operators """"""""""""""""" +- :ref:`FileToGoogleCloudStorageOperator` : Uploads a file to Google Cloud Storage. +- :ref:`GoogleCloudStorageCopyOperator` : Copies objects (optionally from a directory) filtered by 'delimiter' (file extension for e.g .json) from a bucket to another bucket in a different directory, if required. +- :ref:`GoogleCloudStorageListOperator` : List all objects from the bucket with the give string prefix and delimiter in name. - :ref:`GoogleCloudStorageDownloadOperator` : Downloads a file from Google Cloud Storage. - :ref:`GoogleCloudStorageToBigQueryOperator` : Loads files from Google cloud storage into BigQuery. +- :ref:`GoogleCloudStorageToGoogleCloudStorageOperator` : Copies a single object from a bucket to another, with renaming if requested. + +.. _FileToGoogleCloudStorageOperator: + +FileToGoogleCloudStorageOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.file_to_gcs.FileToGoogleCloudStorageOperator + +.. _GoogleCloudStorageCopyOperator: + +GoogleCloudStorageCopyOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.gcs_copy_operator.GoogleCloudStorageCopyOperator .. _GoogleCloudStorageDownloadOperator: @@ -606,7 +696,13 @@ GoogleCloudStorageDownloadOperator ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: airflow.contrib.operators.gcs_download_operator.GoogleCloudStorageDownloadOperator - :members: + +.. _GoogleCloudStorageListOperator: + +GoogleCloudStorageListOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.gcs_list_operator.GoogleCloudStorageListOperator .. _GoogleCloudStorageToBigQueryOperator: @@ -614,8 +710,13 @@ GoogleCloudStorageToBigQueryOperator ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: airflow.contrib.operators.gcs_to_bq.GoogleCloudStorageToBigQueryOperator - :members: +.. _GoogleCloudStorageToGoogleCloudStorageOperator: + +GoogleCloudStorageToGoogleCloudStorageOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: airflow.contrib.operators.gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator GoogleCloudStorageHook """"""""""""""""""""""
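
For reference, a minimal DAG that wires together two of the operators documented above might look like the sketch below. This is an illustrative usage example only, not part of the patch: the bucket, prefix, dataset, table and connection IDs are placeholder values, and the task ordering is arbitrary.

.. code-block:: python

    # Illustrative sketch; all resource names and connection IDs are placeholders.
    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.gcs_list_operator import GoogleCloudStorageListOperator
    from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator

    dag = DAG('gcp_docs_example',
              start_date=datetime(2018, 1, 1),
              schedule_interval=None)

    # List Avro files under a prefix in a GCS bucket; the object names are
    # pushed to XCom by the operator.
    list_files = GoogleCloudStorageListOperator(
        task_id='list_sales_files',
        bucket='data',                    # placeholder bucket
        prefix='sales/sales-2017/',
        delimiter='.avro',
        google_cloud_storage_conn_id='google_cloud_default',
        dag=dag)

    # Fetch up to 100 rows from a BigQuery table; the rows are returned
    # via XCom as a list of lists (one inner list per row).
    get_rows = BigQueryGetDataOperator(
        task_id='get_recent_rows',
        dataset_id='test_dataset',        # placeholder dataset
        table_id='Transaction_partitions',
        max_results='100',
        bigquery_conn_id='bigquery_default',
        dag=dag)

    list_files >> get_rows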