* Add BigQueryInsertJobOperator

* fixup! Add BigQueryInsertJobOperator

* fixup! fixup! Add BigQueryInsertJobOperator

* fixup! fixup! fixup! Add BigQueryInsertJobOperator
Tomek Urbaszek, 2020-06-01 11:54:38 +02:00, committed by GitHub
Parent 7c0e6ede60
Commit 7898525468
5 changed files: 266 additions and 53 deletions

View file

@@ -27,7 +27,7 @@ from airflow.operators.bash import BashOperator
 from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryExecuteQueryOperator, BigQueryGetDataOperator,
-    BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
 )
 from airflow.utils.dates import days_ago
@@ -40,10 +40,10 @@ TABLE_2 = "table2"
 INSERT_DATE = datetime.now().strftime("%Y-%m-%d")
 # [START howto_operator_bigquery_query]
-INSERT_ROWS_QUERY = f"""
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "monthy python", "{INSERT_DATE}");
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "fishy fish", "{INSERT_DATE}");
-"""
+INSERT_ROWS_QUERY = \
+    f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
+    f"(42, 'monthy python', '{INSERT_DATE}'), " \
+    f"(42, 'fishy fish', '{INSERT_DATE}');"
 # [END howto_operator_bigquery_query]
 SCHEMA = [
@@ -84,13 +84,22 @@ with models.DAG(
         task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
     )
 
-    # [START howto_operator_bigquery_execute_query]
+    # [START howto_operator_bigquery_insert_job]
+    insert_query_job = BigQueryInsertJobOperator(
+        task_id="insert_query_job",
+        configuration={
+            "query": {
+                "query": INSERT_ROWS_QUERY,
+                "useLegacySql": False,
+            }
+        },
+    )
+    # [END howto_operator_bigquery_insert_job]
+
     execute_insert_query = BigQueryExecuteQueryOperator(
         task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False
     )
-    # [END howto_operator_bigquery_execute_query]
 
-    # [START howto_operator_bigquery_execute_query_list]
     bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
         task_id="execute_multi_query",
         sql=[
@@ -99,16 +108,13 @@ with models.DAG(
         ],
         use_legacy_sql=False,
     )
-    # [END howto_operator_bigquery_execute_query_list]
 
-    # [START howto_operator_bigquery_execute_query_save]
     execute_query_save = BigQueryExecuteQueryOperator(
         task_id="execute_query_save",
         sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
         use_legacy_sql=False,
         destination_dataset_table=f"{DATASET_NAME}.{TABLE_2}",
     )
-    # [END howto_operator_bigquery_execute_query_save]
 
     # [START howto_operator_bigquery_get_data]
     get_data = BigQueryGetDataOperator(
@@ -137,7 +143,7 @@ with models.DAG(
     check_value = BigQueryValueCheckOperator(
         task_id="check_value",
         sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
-        pass_value=2,
+        pass_value=4,
         use_legacy_sql=False,
     )
     # [END howto_operator_bigquery_value_check]
@@ -152,8 +158,9 @@ with models.DAG(
     )
     # [END howto_operator_bigquery_interval_check]
 
-    [create_table_1, create_table_2] >> execute_insert_query
+    [create_table_1, create_table_2] >> insert_query_job
+    insert_query_job >> execute_insert_query
     execute_insert_query >> get_data >> get_data_result >> delete_dataset
     execute_insert_query >> execute_query_save >> bigquery_execute_multi_query >> delete_dataset
     execute_insert_query >> [check_count, check_value, check_interval] >> delete_dataset
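Two details of the example DAG change worth noting: INSERT_ROWS_QUERY is now a single multi-row INSERT instead of two statements, and pass_value moves from 2 to 4 because the same two-row insert now runs twice, once through insert_query_job and once through execute_insert_query. A minimal sketch (not part of the commit; the dataset, table and date values are placeholders) of what the new statement renders to:

DATASET_NAME = "test_dataset"   # placeholder
TABLE_1 = "table1"              # placeholder
INSERT_DATE = "2020-06-01"      # placeholder

INSERT_ROWS_QUERY = \
    f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
    f"(42, 'monthy python', '{INSERT_DATE}'), " \
    f"(42, 'fishy fish', '{INSERT_DATE}');"

print(INSERT_ROWS_QUERY)
# INSERT test_dataset.table1 VALUES (42, 'monthy python', '2020-06-01'), (42, 'fishy fish', '2020-06-01');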

View file

@@ -1397,13 +1397,42 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 self.log.info('Waiting for canceled job with id %s to finish.', job_id)
                 time.sleep(5)
 
+    @GoogleBaseHook.fallback_to_default_project_id
+    def get_job(
+        self,
+        job_id: Optional[str] = None,
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
+        """
+        Retrieves a BigQuery job. For more information see:
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then uuid will be generated.
+        :type job_id: str
+        :param project_id: Google Cloud Project where the job is running
+        :type project_id: str
+        :param location: location the job is running
+        :type location: str
+        """
+        client = self.get_client(project_id=project_id, location=location)
+        job = client.get_job(
+            job_id=job_id,
+            project=project_id,
+            location=location
+        )
+        return job
+
     @GoogleBaseHook.fallback_to_default_project_id
     def insert_job(
         self,
         configuration: Dict,
+        job_id: Optional[str] = None,
         project_id: Optional[str] = None,
         location: Optional[str] = None,
-    ) -> str:
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
         """
         Executes a BigQuery job. Waits for the job to complete and returns job id.
         See here:
@@ -1414,17 +1443,23 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             BigQuery's configuration field in the job object. See
             https://cloud.google.com/bigquery/docs/reference/v2/jobs for
             details.
+        :type configuration: Dict[str, Any]
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then uuid will be generated.
+        :type job_id: str
         :param project_id: Google Cloud Project where the job is running
         :type project_id: str
         :param location: location the job is running
         :type location: str
         """
+        job_id = job_id or str(uuid.uuid4())
         location = location or self.location
         client = self.get_client(project_id=project_id, location=location)
         job_data = {
             "configuration": configuration,
             "jobReference": {
-                "jobId": str(uuid.uuid4()),
+                "jobId": job_id,
                 "projectId": project_id,
                 "location": location
             }
@@ -1446,9 +1481,7 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if not job:
             raise AirflowException(f"Unknown job type. Supported types: {supported_jobs.keys()}")
         job = job.from_api_repr(job_data, client)
-        # Start the job and wait for it to complete and get the result.
-        job.result()
-        return job.job_id
+        return job
 
     def run_with_configuration(self, configuration: Dict) -> str:
         """
@@ -1467,8 +1500,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             "This method is deprecated. Please use `BigQueryHook.insert_job`",
             DeprecationWarning
         )
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
     def run_load(self,  # pylint: disable=too-many-locals,too-many-arguments,invalid-name
                  destination_project_dataset_table: str,
@@ -1709,8 +1745,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if allow_jagged_rows:
             configuration['load']['allowJaggedRows'] = allow_jagged_rows
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
     def run_copy(self,  # pylint: disable=invalid-name
                  source_project_dataset_tables: Union[List, str],
@@ -1803,8 +1842,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 "destinationEncryptionConfiguration"
             ] = encryption_configuration
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
     def run_extract(
         self,
@@ -1878,8 +1920,9 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             configuration['extract']['fieldDelimiter'] = field_delimiter
             configuration['extract']['printHeader'] = print_header
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        self.running_job_id = job.job_id
+        return job.job_id
 
     # pylint: disable=too-many-locals,too-many-arguments, too-many-branches
     def run_query(self,
@@ -2123,8 +2166,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 "destinationEncryptionConfiguration"
             ] = encryption_configuration
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
 class BigQueryPandasConnector(GbqConnector):
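With this change BigQueryHook.insert_job no longer blocks: it returns the google-cloud-bigquery job object and leaves waiting to the caller, which is why most of the run_* wrappers above now call job.result() themselves. A rough usage sketch (not from the commit; the connection id, project and location values are placeholder assumptions):

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook(gcp_conn_id="google_cloud_default")  # assumed connection id
job = hook.insert_job(
    configuration={"query": {"query": "SELECT 1", "useLegacySql": False}},
    job_id="my-job-20200601",   # optional; a uuid is generated when omitted
    project_id="my-project",    # placeholder
    location="EU",              # placeholder
)
job.result()                    # block until the job finishes
print(job.job_id, job.state)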

View file

@@ -23,10 +23,12 @@ This module contains Google BigQuery operators.
 import enum
 import json
 import warnings
+from time import sleep
 from typing import Any, Dict, Iterable, List, Optional, SupportsAbs, Union
 
 import attr
 from google.api_core.exceptions import Conflict
+from google.api_core.retry import exponential_sleep_generator
 from google.cloud.bigquery import TableReference
 
 from airflow.exceptions import AirflowException
@@ -546,6 +548,11 @@ class BigQueryExecuteQueryOperator(BaseOperator):
                 "the gcp_conn_id parameter.", DeprecationWarning, stacklevel=3)
             gcp_conn_id = bigquery_conn_id
 
+        warnings.warn(
+            "This operator is deprecated. Please use `BigQueryInsertJobOperator`.",
+            DeprecationWarning, stacklevel=3,
+        )
+
         self.sql = sql
         self.destination_dataset_table = destination_dataset_table
         self.write_disposition = write_disposition
@@ -1570,3 +1577,78 @@ class BigQueryUpsertTableOperator(BaseOperator):
             table_resource=self.table_resource,
             project_id=self.project_id,
         )
+
+
+class BigQueryInsertJobOperator(BaseOperator):
+    """
+    Executes a BigQuery job. Waits for the job to complete and returns job id.
+    See here:
+    https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+    :param configuration: The configuration parameter maps directly to BigQuery's
+        configuration field in the job object. For more details see
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+    :type configuration: Dict[str, Any]
+    :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+        numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+        characters. If not provided then uuid will be generated.
+    :type job_id: str
+    :param project_id: Google Cloud Project where the job is running
+    :type project_id: str
+    :param location: location the job is running
+    :type location: str
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    """
+
+    template_fields = ("configuration", "job_id")
+    ui_color = BigQueryUIColors.QUERY.value
+
+    def __init__(
+        self,
+        configuration: Dict[str, Any],
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+        job_id: Optional[str] = None,
+        gcp_conn_id: str = 'google_cloud_default',
+        delegate_to: Optional[str] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.configuration = configuration
+        self.location = location
+        self.job_id = job_id
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.delegate_to = delegate_to
+
+    def execute(self, context: Any):
+        hook = BigQueryHook(
+            gcp_conn_id=self.gcp_conn_id,
+            delegate_to=self.delegate_to,
+        )
+
+        try:
+            job = hook.insert_job(
+                configuration=self.configuration,
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Start the job and wait for it to complete and get the result.
+            job.result()
+        except Conflict:
+            job = hook.get_job(
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Get existing job and wait for it to be ready
+            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
+                sleep(time_to_wait)
+                job.reload()
+                if job.done():
+                    break
+        return job.job_id
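Since BigQueryExecuteQueryOperator now warns that it is deprecated in favour of BigQueryInsertJobOperator, here is a hedged migration sketch (not part of the commit; the field names follow the BigQuery REST jobs API, and the project, dataset and table values are placeholders):

from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryExecuteQueryOperator, BigQueryInsertJobOperator,
)

# Before: emits a DeprecationWarning on construction after this change.
save_old = BigQueryExecuteQueryOperator(
    task_id="execute_query_save",
    sql="SELECT * FROM test_dataset.table1",
    use_legacy_sql=False,
    destination_dataset_table="test_dataset.table2",
)

# After: the same query expressed as an explicit query-job configuration.
save_new = BigQueryInsertJobOperator(
    task_id="insert_query_save",
    configuration={
        "query": {
            "query": "SELECT * FROM test_dataset.table1",
            "useLegacySql": False,
            "destinationTable": {
                "projectId": "my-project",       # placeholder
                "datasetId": "test_dataset",
                "tableId": "table2",
            },
            "writeDisposition": "WRITE_EMPTY",   # matches the old operator's default
        }
    },
)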

View file

@@ -241,10 +241,10 @@ You can also use this operator to delete a view.
     :start-after: [START howto_operator_bigquery_delete_view]
     :end-before: [END howto_operator_bigquery_delete_view]
 
-.. _howto/operator:BigQueryExecuteQueryOperator:
+.. _howto/operator:BigQueryInsertJobOperator:
 
-Execute queries
-^^^^^^^^^^^^^^^
+Execute BigQuery jobs
+^^^^^^^^^^^^^^^^^^^^^
 
 Let's say you would like to execute the following query.
@@ -255,32 +255,23 @@ Let's say you would like to execute the following query.
     :end-before: [END howto_operator_bigquery_query]
 
 To execute the SQL query in a specific BigQuery database you can use
-:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryExecuteQueryOperator`.
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` with
+proper query job configuration.
 
 .. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
     :language: python
     :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query]
-    :end-before: [END howto_operator_bigquery_execute_query]
+    :start-after: [START howto_operator_bigquery_insert_job]
+    :end-before: [END howto_operator_bigquery_insert_job]
 
-``sql`` argument can receive a str representing a sql statement, a list of str
-(sql statements), or reference to a template file. Template reference are recognized
-by str ending in '.sql'.
+For more information on the types of BigQuery jobs please check the
+`documentation <https://cloud.google.com/bigquery/docs/reference/v2/jobs>`__.
 
-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_list]
-    :end-before: [END howto_operator_bigquery_execute_query_list]
+Additionally you can use the ``job_id`` parameter of
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` to improve
+idempotency. If this parameter is not passed then a uuid will be used as ``job_id``. If provided,
+the operator will try to submit a new job with this ``job_id``. If there's already a job with such
+``job_id`` then it will reattach to the existing job.
 
-You can store the results of the query in a table by specifying
-``destination_dataset_table``.
-
-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_save]
-    :end-before: [END howto_operator_bigquery_execute_query_save]
 
 Validate data
 ^^^^^^^^^^^^^
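The idempotency paragraph above can be made concrete with a hedged sketch (not part of the commit; the templated id is an illustrative assumption). Because job_id is a template field it is rendered per task instance, so resubmitting the same id makes BigQuery answer with a 409 Conflict and the operator reattaches to the already-submitted job instead of running the query twice:

from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

idempotent_insert = BigQueryInsertJobOperator(
    task_id="idempotent_insert",
    configuration={
        "query": {
            "query": "SELECT 1",   # placeholder query
            "useLegacySql": False,
        }
    },
    job_id="insert_rows_{{ ds_nodash }}_{{ task.task_id }}",  # unique per DAG run
)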

View file

@@ -21,6 +21,7 @@ from datetime import datetime
 from unittest.mock import MagicMock
 
 import mock
+from google.cloud.exceptions import Conflict
 from parameterized import parameterized
 
 from airflow import models
@@ -31,8 +32,8 @@ from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator, BigQueryCreateExternalTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryDeleteTableOperator, BigQueryExecuteQueryOperator,
     BigQueryGetDataOperator, BigQueryGetDatasetOperator, BigQueryGetDatasetTablesOperator,
-    BigQueryIntervalCheckOperator, BigQueryPatchDatasetOperator, BigQueryUpdateDatasetOperator,
-    BigQueryUpsertTableOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryPatchDatasetOperator,
+    BigQueryUpdateDatasetOperator, BigQueryUpsertTableOperator, BigQueryValueCheckOperator,
 )
 from airflow.serialization.serialized_objects import SerializedDAG
 from airflow.settings import Session
@@ -788,3 +789,89 @@ class TestBigQueryUpsertTableOperator(unittest.TestCase):
             project_id=TEST_GCP_PROJECT_ID,
             table_resource=TEST_TABLE_RESOURCES
         )
+
+
+class TestBigQueryInsertJobOperator:
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.BigQueryHook')
+    def test_execute(self, mock_hook):
+        job_id = "123456"
+        configuration = {
+            "query": {
+                "query": "SELECT * FROM any",
+                "useLegacySql": False,
+            }
+        }
+        mock_hook.return_value.insert_job.return_value = MagicMock(job_id=job_id)
+
+        op = BigQueryInsertJobOperator(
+            task_id="insert_query_job",
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID
+        )
+        result = op.execute({})
+
+        mock_hook.return_value.insert_job.assert_called_once_with(
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID,
+        )
+
+        assert result == job_id
+
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.exponential_sleep_generator')
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.BigQueryHook')
+    def test_execute_idempotency(self, mock_hook, mock_sleep_generator):
+        job_id = "123456"
+        configuration = {
+            "query": {
+                "query": "SELECT * FROM any",
+                "useLegacySql": False,
+            }
+        }
+
+        class MockJob:
+            _call_no = 0
+            _done = False
+
+            def __init__(self):
+                pass
+
+            def reload(self):
+                if MockJob._call_no == 3:
+                    MockJob._done = True
+                else:
+                    MockJob._call_no += 1
+
+            def done(self):
+                return MockJob._done
+
+            @property
+            def job_id(self):
+                return job_id
+
+        mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict("any")
+        mock_sleep_generator.return_value = [0, 0, 0, 0, 0]
+        mock_hook.return_value.get_job.return_value = MockJob()
+
+        op = BigQueryInsertJobOperator(
+            task_id="insert_query_job",
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID
+        )
+        result = op.execute({})
+
+        assert MockJob._call_no == 3
+        mock_hook.return_value.get_job.assert_called_once_with(
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID,
+        )
+        assert result == job_id