* Add BigQueryInsertJobOperator

* fixup! Add BigQueryInsertJobOperator

* fixup! fixup! Add BigQueryInsertJobOperator

* fixup! fixup! fixup! Add BigQueryInsertJobOperator
Tomek Urbaszek, 2020-06-01 11:54:38 +02:00, committed by GitHub
Parent 7c0e6ede60
Commit 7898525468
5 changed files: 266 additions and 53 deletions

View file

@@ -27,7 +27,7 @@ from airflow.operators.bash import BashOperator
 from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryExecuteQueryOperator, BigQueryGetDataOperator,
-    BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
 )
 from airflow.utils.dates import days_ago
@@ -40,10 +40,10 @@ TABLE_2 = "table2"
 INSERT_DATE = datetime.now().strftime("%Y-%m-%d")
 # [START howto_operator_bigquery_query]
-INSERT_ROWS_QUERY = f"""
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "monthy python", "{INSERT_DATE}");
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "fishy fish", "{INSERT_DATE}");
-"""
+INSERT_ROWS_QUERY = \
+    f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
+    f"(42, 'monthy python', '{INSERT_DATE}'), " \
+    f"(42, 'fishy fish', '{INSERT_DATE}');"
 # [END howto_operator_bigquery_query]
 SCHEMA = [
@@ -84,13 +84,22 @@ with models.DAG(
         task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
     )
 
-    # [START howto_operator_bigquery_execute_query]
+    # [START howto_operator_bigquery_insert_job]
+    insert_query_job = BigQueryInsertJobOperator(
+        task_id="insert_query_job",
+        configuration={
+            "query": {
+                "query": INSERT_ROWS_QUERY,
+                "useLegacySql": False,
+            }
+        },
+    )
+    # [END howto_operator_bigquery_insert_job]
+
     execute_insert_query = BigQueryExecuteQueryOperator(
         task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False
     )
-    # [END howto_operator_bigquery_execute_query]
 
-    # [START howto_operator_bigquery_execute_query_list]
     bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
         task_id="execute_multi_query",
         sql=[
@@ -99,16 +108,13 @@ with models.DAG(
         ],
         use_legacy_sql=False,
     )
-    # [END howto_operator_bigquery_execute_query_list]
 
-    # [START howto_operator_bigquery_execute_query_save]
     execute_query_save = BigQueryExecuteQueryOperator(
         task_id="execute_query_save",
         sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
         use_legacy_sql=False,
         destination_dataset_table=f"{DATASET_NAME}.{TABLE_2}",
     )
-    # [END howto_operator_bigquery_execute_query_save]
 
     # [START howto_operator_bigquery_get_data]
     get_data = BigQueryGetDataOperator(
@@ -137,7 +143,7 @@ with models.DAG(
     check_value = BigQueryValueCheckOperator(
         task_id="check_value",
         sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
-        pass_value=2,
+        pass_value=4,
         use_legacy_sql=False,
     )
     # [END howto_operator_bigquery_value_check]
@@ -152,8 +158,9 @@ with models.DAG(
     )
     # [END howto_operator_bigquery_interval_check]
 
-    [create_table_1, create_table_2] >> execute_insert_query
+    [create_table_1, create_table_2] >> insert_query_job
+    insert_query_job >> execute_insert_query
     execute_insert_query >> get_data >> get_data_result >> delete_dataset
     execute_insert_query >> execute_query_save >> bigquery_execute_multi_query >> delete_dataset
     execute_insert_query >> [check_count, check_value, check_interval] >> delete_dataset
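Two details of the example DAG change worth noting: INSERT_ROWS_QUERY is now a single multi-row INSERT instead of two statements, and pass_value moves from 2 to 4 because the same two-row insert now runs twice, once through insert_query_job and once through execute_insert_query. A minimal sketch (not part of the commit; the dataset, table and date values are placeholders) of what the new statement renders to:

DATASET_NAME = "test_dataset"   # placeholder
TABLE_1 = "table1"              # placeholder
INSERT_DATE = "2020-06-01"      # placeholder

INSERT_ROWS_QUERY = \
    f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
    f"(42, 'monthy python', '{INSERT_DATE}'), " \
    f"(42, 'fishy fish', '{INSERT_DATE}');"

print(INSERT_ROWS_QUERY)
# INSERT test_dataset.table1 VALUES (42, 'monthy python', '2020-06-01'), (42, 'fishy fish', '2020-06-01');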

View file

@@ -1397,13 +1397,42 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 self.log.info('Waiting for canceled job with id %s to finish.', job_id)
                 time.sleep(5)
 
+    @GoogleBaseHook.fallback_to_default_project_id
+    def get_job(
+        self,
+        job_id: Optional[str] = None,
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
+        """
+        Retrieves a BigQuery job. For more information see:
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then uuid will be generated.
+        :type job_id: str
+        :param project_id: Google Cloud Project where the job is running
+        :type project_id: str
+        :param location: location the job is running
+        :type location: str
+        """
+        client = self.get_client(project_id=project_id, location=location)
+        job = client.get_job(
+            job_id=job_id,
+            project=project_id,
+            location=location
+        )
+        return job
+
     @GoogleBaseHook.fallback_to_default_project_id
     def insert_job(
         self,
         configuration: Dict,
+        job_id: Optional[str] = None,
         project_id: Optional[str] = None,
         location: Optional[str] = None,
-    ) -> str:
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
         """
         Executes a BigQuery job. Waits for the job to complete and returns job id.
         See here:
@@ -1414,17 +1443,23 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             BigQuery's configuration field in the job object. See
             https://cloud.google.com/bigquery/docs/reference/v2/jobs for
             details.
+        :type configuration: Dict[str, Any]
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then uuid will be generated.
+        :type job_id: str
         :param project_id: Google Cloud Project where the job is running
         :type project_id: str
         :param location: location the job is running
         :type location: str
         """
+        job_id = job_id or str(uuid.uuid4())
         location = location or self.location
         client = self.get_client(project_id=project_id, location=location)
         job_data = {
             "configuration": configuration,
             "jobReference": {
-                "jobId": str(uuid.uuid4()),
+                "jobId": job_id,
                 "projectId": project_id,
                 "location": location
             }
@@ -1446,9 +1481,7 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if not job:
             raise AirflowException(f"Unknown job type. Supported types: {supported_jobs.keys()}")
         job = job.from_api_repr(job_data, client)
-        # Start the job and wait for it to complete and get the result.
-        job.result()
-        return job.job_id
+        return job
 
     def run_with_configuration(self, configuration: Dict) -> str:
         """
@@ -1467,8 +1500,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             "This method is deprecated. Please use `BigQueryHook.insert_job`",
             DeprecationWarning
         )
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
     def run_load(self,  # pylint: disable=too-many-locals,too-many-arguments,invalid-name
                  destination_project_dataset_table: str,
@@ -1709,8 +1745,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if allow_jagged_rows:
             configuration['load']['allowJaggedRows'] = allow_jagged_rows
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
     def run_copy(self,  # pylint: disable=invalid-name
                  source_project_dataset_tables: Union[List, str],
@@ -1803,8 +1842,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 "destinationEncryptionConfiguration"
             ] = encryption_configuration
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
     def run_extract(
         self,
@@ -1878,8 +1920,9 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             configuration['extract']['fieldDelimiter'] = field_delimiter
             configuration['extract']['printHeader'] = print_header
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        self.running_job_id = job.job_id
+        return job.job_id
 
     # pylint: disable=too-many-locals,too-many-arguments, too-many-branches
     def run_query(self,
@@ -2123,8 +2166,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 "destinationEncryptionConfiguration"
             ] = encryption_configuration
 
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id
 
 class BigQueryPandasConnector(GbqConnector):
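With this change BigQueryHook.insert_job no longer blocks: it returns the google-cloud-bigquery job object and leaves waiting to the caller, which is why most of the run_* wrappers above now call job.result() themselves. A rough usage sketch (not from the commit; the connection id, project and location values are placeholder assumptions):

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook(gcp_conn_id="google_cloud_default")  # assumed connection id
job = hook.insert_job(
    configuration={"query": {"query": "SELECT 1", "useLegacySql": False}},
    job_id="my-job-20200601",   # optional; a uuid is generated when omitted
    project_id="my-project",    # placeholder
    location="EU",              # placeholder
)
job.result()                    # block until the job finishes
print(job.job_id, job.state)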

View file

@@ -23,10 +23,12 @@ This module contains Google BigQuery operators.
 import enum
 import json
 import warnings
+from time import sleep
 from typing import Any, Dict, Iterable, List, Optional, SupportsAbs, Union
 
 import attr
 from google.api_core.exceptions import Conflict
+from google.api_core.retry import exponential_sleep_generator
 from google.cloud.bigquery import TableReference
 
 from airflow.exceptions import AirflowException
@@ -546,6 +548,11 @@ class BigQueryExecuteQueryOperator(BaseOperator):
                 "the gcp_conn_id parameter.", DeprecationWarning, stacklevel=3)
             gcp_conn_id = bigquery_conn_id
 
+        warnings.warn(
+            "This operator is deprecated. Please use `BigQueryInsertJobOperator`.",
+            DeprecationWarning, stacklevel=3,
+        )
+
         self.sql = sql
         self.destination_dataset_table = destination_dataset_table
         self.write_disposition = write_disposition
@@ -1570,3 +1577,78 @@ class BigQueryUpsertTableOperator(BaseOperator):
             table_resource=self.table_resource,
             project_id=self.project_id,
         )
+
+
+class BigQueryInsertJobOperator(BaseOperator):
+    """
+    Executes a BigQuery job. Waits for the job to complete and returns job id.
+    See here:
+    https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+    :param configuration: The configuration parameter maps directly to BigQuery's
+        configuration field in the job object. For more details see
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+    :type configuration: Dict[str, Any]
+    :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+        numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+        characters. If not provided then uuid will be generated.
+    :type job_id: str
+    :param project_id: Google Cloud Project where the job is running
+    :type project_id: str
+    :param location: location the job is running
+    :type location: str
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    """
+
+    template_fields = ("configuration", "job_id")
+    ui_color = BigQueryUIColors.QUERY.value
+
+    def __init__(
+        self,
+        configuration: Dict[str, Any],
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+        job_id: Optional[str] = None,
+        gcp_conn_id: str = 'google_cloud_default',
+        delegate_to: Optional[str] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.configuration = configuration
+        self.location = location
+        self.job_id = job_id
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.delegate_to = delegate_to
+
+    def execute(self, context: Any):
+        hook = BigQueryHook(
+            gcp_conn_id=self.gcp_conn_id,
+            delegate_to=self.delegate_to,
+        )
+
+        try:
+            job = hook.insert_job(
+                configuration=self.configuration,
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Start the job and wait for it to complete and get the result.
+            job.result()
+        except Conflict:
+            job = hook.get_job(
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Get existing job and wait for it to be ready
+            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
+                sleep(time_to_wait)
+                job.reload()
+                if job.done():
+                    break
+        return job.job_id
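Since BigQueryExecuteQueryOperator now warns that it is deprecated in favour of BigQueryInsertJobOperator, here is a hedged migration sketch (not part of the commit; the field names follow the BigQuery REST jobs API, and the project, dataset and table values are placeholders):

from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryExecuteQueryOperator, BigQueryInsertJobOperator,
)

# Before: emits a DeprecationWarning on construction after this change.
save_old = BigQueryExecuteQueryOperator(
    task_id="execute_query_save",
    sql="SELECT * FROM test_dataset.table1",
    use_legacy_sql=False,
    destination_dataset_table="test_dataset.table2",
)

# After: the same query expressed as an explicit query-job configuration.
save_new = BigQueryInsertJobOperator(
    task_id="insert_query_save",
    configuration={
        "query": {
            "query": "SELECT * FROM test_dataset.table1",
            "useLegacySql": False,
            "destinationTable": {
                "projectId": "my-project",       # placeholder
                "datasetId": "test_dataset",
                "tableId": "table2",
            },
            "writeDisposition": "WRITE_EMPTY",   # matches the old operator's default
        }
    },
)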

View file

@@ -241,10 +241,10 @@ You can also use this operator to delete a view.
     :start-after: [START howto_operator_bigquery_delete_view]
     :end-before: [END howto_operator_bigquery_delete_view]
 
-.. _howto/operator:BigQueryExecuteQueryOperator:
+.. _howto/operator:BigQueryInsertJobOperator:
 
-Execute queries
-^^^^^^^^^^^^^^^
+Execute BigQuery jobs
+^^^^^^^^^^^^^^^^^^^^^
 
 Let's say you would like to execute the following query.
@@ -255,32 +255,23 @@ Let's say you would like to execute the following query.
     :end-before: [END howto_operator_bigquery_query]
 
 To execute the SQL query in a specific BigQuery database you can use
-:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryExecuteQueryOperator`.
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` with
+proper query job configuration.
 
 .. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
     :language: python
     :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query]
-    :end-before: [END howto_operator_bigquery_execute_query]
+    :start-after: [START howto_operator_bigquery_insert_job]
+    :end-before: [END howto_operator_bigquery_insert_job]
 
-``sql`` argument can receive a str representing a sql statement, a list of str
-(sql statements), or reference to a template file. Template reference are recognized
-by str ending in '.sql'.
+For more information on the types of BigQuery jobs please check the
+`documentation <https://cloud.google.com/bigquery/docs/reference/v2/jobs>`__.
 
-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_list]
-    :end-before: [END howto_operator_bigquery_execute_query_list]
+Additionally you can use the ``job_id`` parameter of
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` to improve
+idempotency. If this parameter is not passed then a uuid will be used as ``job_id``. If provided,
+the operator will try to submit a new job with this ``job_id``. If there's already a job with such
+``job_id`` then it will reattach to the existing job.
 
-You can store the results of the query in a table by specifying
-``destination_dataset_table``.
-
-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_save]
-    :end-before: [END howto_operator_bigquery_execute_query_save]
 
 Validate data
 ^^^^^^^^^^^^^
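The idempotency paragraph above can be made concrete with a hedged sketch (not part of the commit; the templated id is an illustrative assumption). Because job_id is a template field it is rendered per task instance, so resubmitting the same id makes BigQuery answer with a 409 Conflict and the operator reattaches to the already-submitted job instead of running the query twice:

from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

idempotent_insert = BigQueryInsertJobOperator(
    task_id="idempotent_insert",
    configuration={
        "query": {
            "query": "SELECT 1",   # placeholder query
            "useLegacySql": False,
        }
    },
    job_id="insert_rows_{{ ds_nodash }}_{{ task.task_id }}",  # unique per DAG run
)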

View file

@@ -21,6 +21,7 @@ from datetime import datetime
 from unittest.mock import MagicMock
 
 import mock
+from google.cloud.exceptions import Conflict
 from parameterized import parameterized
 
 from airflow import models
@@ -31,8 +32,8 @@ from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator, BigQueryCreateExternalTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryDeleteTableOperator, BigQueryExecuteQueryOperator,
     BigQueryGetDataOperator, BigQueryGetDatasetOperator, BigQueryGetDatasetTablesOperator,
-    BigQueryIntervalCheckOperator, BigQueryPatchDatasetOperator, BigQueryUpdateDatasetOperator,
-    BigQueryUpsertTableOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryPatchDatasetOperator,
+    BigQueryUpdateDatasetOperator, BigQueryUpsertTableOperator, BigQueryValueCheckOperator,
 )
 from airflow.serialization.serialized_objects import SerializedDAG
 from airflow.settings import Session
@@ -788,3 +789,89 @@ class TestBigQueryUpsertTableOperator(unittest.TestCase):
             project_id=TEST_GCP_PROJECT_ID,
             table_resource=TEST_TABLE_RESOURCES
         )
+
+
+class TestBigQueryInsertJobOperator:
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.BigQueryHook')
+    def test_execute(self, mock_hook):
+        job_id = "123456"
+        configuration = {
+            "query": {
+                "query": "SELECT * FROM any",
+                "useLegacySql": False,
+            }
+        }
+        mock_hook.return_value.insert_job.return_value = MagicMock(job_id=job_id)
+
+        op = BigQueryInsertJobOperator(
+            task_id="insert_query_job",
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID
+        )
+        result = op.execute({})
+
+        mock_hook.return_value.insert_job.assert_called_once_with(
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID,
+        )
+
+        assert result == job_id
+
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.exponential_sleep_generator')
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.BigQueryHook')
+    def test_execute_idempotency(self, mock_hook, mock_sleep_generator):
+        job_id = "123456"
+        configuration = {
+            "query": {
+                "query": "SELECT * FROM any",
+                "useLegacySql": False,
+            }
+        }
+
+        class MockJob:
+            _call_no = 0
+            _done = False
+
+            def __init__(self):
+                pass
+
+            def reload(self):
+                if MockJob._call_no == 3:
+                    MockJob._done = True
+                else:
+                    MockJob._call_no += 1
+
+            def done(self):
+                return MockJob._done
+
+            @property
+            def job_id(self):
+                return job_id
+
+        mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict("any")
+        mock_sleep_generator.return_value = [0, 0, 0, 0, 0]
+        mock_hook.return_value.get_job.return_value = MockJob()
+
+        op = BigQueryInsertJobOperator(
+            task_id="insert_query_job",
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID
+        )
+        result = op.execute({})
+
+        assert MockJob._call_no == 3
+        mock_hook.return_value.get_job.assert_called_once_with(
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID,
+        )
+        assert result == job_id