Add BigQueryInsertJobOperator (#8868)
* Add BigQueryInsertJobOperator
* fixup! Add BigQueryInsertJobOperator
* fixup! fixup! Add BigQueryInsertJobOperator
* fixup! fixup! fixup! Add BigQueryInsertJobOperator
This commit is contained in:
Parent: 7c0e6ede60
Commit: 7898525468
@@ -27,7 +27,7 @@ from airflow.operators.bash import BashOperator
 from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryExecuteQueryOperator, BigQueryGetDataOperator,
-    BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
 )
 from airflow.utils.dates import days_ago

@@ -40,10 +40,10 @@ TABLE_2 = "table2"
 INSERT_DATE = datetime.now().strftime("%Y-%m-%d")

 # [START howto_operator_bigquery_query]
-INSERT_ROWS_QUERY = f"""
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "monthy python", "{INSERT_DATE}");
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "fishy fish", "{INSERT_DATE}");
-"""
+INSERT_ROWS_QUERY = \
+    f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
+    f"(42, 'monthy python', '{INSERT_DATE}'), " \
+    f"(42, 'fishy fish', '{INSERT_DATE}');"
 # [END howto_operator_bigquery_query]

 SCHEMA = [
@@ -84,13 +84,22 @@ with models.DAG(
         task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
     )

-    # [START howto_operator_bigquery_execute_query]
+    # [START howto_operator_bigquery_insert_job]
+    insert_query_job = BigQueryInsertJobOperator(
+        task_id="insert_query_job",
+        configuration={
+            "query": {
+                "query": INSERT_ROWS_QUERY,
+                "useLegacySql": False,
+            }
+        },
+    )
+    # [END howto_operator_bigquery_insert_job]
+
     execute_insert_query = BigQueryExecuteQueryOperator(
         task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False
     )
-    # [END howto_operator_bigquery_execute_query]
-
-    # [START howto_operator_bigquery_execute_query_list]
+
     bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
         task_id="execute_multi_query",
         sql=[
@@ -99,16 +108,13 @@ with models.DAG(
         ],
         use_legacy_sql=False,
     )
-    # [END howto_operator_bigquery_execute_query_list]

-    # [START howto_operator_bigquery_execute_query_save]
     execute_query_save = BigQueryExecuteQueryOperator(
         task_id="execute_query_save",
         sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
         use_legacy_sql=False,
         destination_dataset_table=f"{DATASET_NAME}.{TABLE_2}",
     )
-    # [END howto_operator_bigquery_execute_query_save]

     # [START howto_operator_bigquery_get_data]
     get_data = BigQueryGetDataOperator(
@@ -137,7 +143,7 @@ with models.DAG(
     check_value = BigQueryValueCheckOperator(
         task_id="check_value",
         sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
-        pass_value=2,
+        pass_value=4,
         use_legacy_sql=False,
     )
     # [END howto_operator_bigquery_value_check]
@@ -152,8 +158,9 @@ with models.DAG(
     )
     # [END howto_operator_bigquery_interval_check]

-    [create_table_1, create_table_2] >> execute_insert_query
+    [create_table_1, create_table_2] >> insert_query_job

+    insert_query_job >> execute_insert_query
     execute_insert_query >> get_data >> get_data_result >> delete_dataset
     execute_insert_query >> execute_query_save >> bigquery_execute_multi_query >> delete_dataset
     execute_insert_query >> [check_count, check_value, check_interval] >> delete_dataset
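
The configuration dict passed to BigQueryInsertJobOperator maps directly to a BigQuery job
configuration, so the same operator can submit job types other than "query". A minimal sketch,
not part of this commit (the bucket, project and format values below are made up for illustration):

    load_csv_job = BigQueryInsertJobOperator(
        task_id="load_csv_job",
        configuration={
            "load": {
                "sourceUris": ["gs://example-bucket/data.csv"],  # hypothetical bucket
                "destinationTable": {
                    "projectId": "example-project",  # hypothetical project
                    "datasetId": DATASET_NAME,
                    "tableId": TABLE_2,
                },
                "sourceFormat": "CSV",
                "writeDisposition": "WRITE_TRUNCATE",
            }
        },
    )
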
@@ -1397,13 +1397,42 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             self.log.info('Waiting for canceled job with id %s to finish.', job_id)
             time.sleep(5)

+    @GoogleBaseHook.fallback_to_default_project_id
+    def get_job(
+        self,
+        job_id: Optional[str] = None,
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
+        """
+        Retrieves a BigQuery job. For more information see:
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then uuid will be generated.
+        :type job_id: str
+        :param project_id: Google Cloud Project where the job is running
+        :type project_id: str
+        :param location: location the job is running
+        :type location: str
+        """
+        client = self.get_client(project_id=project_id, location=location)
+        job = client.get_job(
+            job_id=job_id,
+            project=project_id,
+            location=location
+        )
+        return job
+
     @GoogleBaseHook.fallback_to_default_project_id
     def insert_job(
         self,
         configuration: Dict,
+        job_id: Optional[str] = None,
         project_id: Optional[str] = None,
         location: Optional[str] = None,
-    ) -> str:
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
         """
         Executes a BigQuery job. Waits for the job to complete and returns job id.
         See here:
@@ -1414,17 +1443,23 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
            BigQuery's configuration field in the job object. See
            https://cloud.google.com/bigquery/docs/reference/v2/jobs for
            details.
+        :type configuration: Dict[str, Any]
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then uuid will be generated.
+        :type job_id: str
         :param project_id: Google Cloud Project where the job is running
         :type project_id: str
         :param location: location the job is running
         :type location: str
         """
+        job_id = job_id or str(uuid.uuid4())
         location = location or self.location
         client = self.get_client(project_id=project_id, location=location)
         job_data = {
             "configuration": configuration,
             "jobReference": {
-                "jobId": str(uuid.uuid4()),
+                "jobId": job_id,
                 "projectId": project_id,
                 "location": location
             }
@@ -1446,9 +1481,7 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if not job:
             raise AirflowException(f"Unknown job type. Supported types: {supported_jobs.keys()}")
         job = job.from_api_repr(job_data, client)
-        # Start the job and wait for it to complete and get the result.
-        job.result()
-        return job.job_id
+        return job

     def run_with_configuration(self, configuration: Dict) -> str:
         """
@@ -1467,8 +1500,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             "This method is deprecated. Please use `BigQueryHook.insert_job`",
             DeprecationWarning
         )
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_load(self, # pylint: disable=too-many-locals,too-many-arguments,invalid-name
                  destination_project_dataset_table: str,
@@ -1709,8 +1745,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if allow_jagged_rows:
             configuration['load']['allowJaggedRows'] = allow_jagged_rows

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_copy(self, # pylint: disable=invalid-name
                  source_project_dataset_tables: Union[List, str],
@@ -1803,8 +1842,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 "destinationEncryptionConfiguration"
             ] = encryption_configuration

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_extract(
         self,
@@ -1878,8 +1920,9 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         configuration['extract']['fieldDelimiter'] = field_delimiter
         configuration['extract']['printHeader'] = print_header

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        self.running_job_id = job.job_id
+        return job.job_id

     # pylint: disable=too-many-locals,too-many-arguments, too-many-branches
     def run_query(self,
@@ -2123,8 +2166,11 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
                 "destinationEncryptionConfiguration"
             ] = encryption_configuration

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id


 class BigQueryPandasConnector(GbqConnector):
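
With this change BigQueryHook.insert_job no longer blocks: it returns a job object and the caller
decides whether to wait, as the run_* methods above now do. A minimal sketch of the caller-side
pattern, not part of this commit (it assumes a configured default Google Cloud connection and uses
a made-up project id):

    from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

    hook = BigQueryHook()  # uses the default Google Cloud connection
    job = hook.insert_job(
        configuration={"query": {"query": "SELECT 1", "useLegacySql": False}},
        project_id="example-project",  # hypothetical project
    )
    job.result()            # block until the job finishes
    finished = hook.get_job(job_id=job.job_id, project_id="example-project")
    print(finished.state)   # e.g. "DONE"
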
@@ -23,10 +23,12 @@ This module contains Google BigQuery operators.
 import enum
 import json
 import warnings
+from time import sleep
 from typing import Any, Dict, Iterable, List, Optional, SupportsAbs, Union

 import attr
 from google.api_core.exceptions import Conflict
+from google.api_core.retry import exponential_sleep_generator
 from google.cloud.bigquery import TableReference

 from airflow.exceptions import AirflowException
@@ -546,6 +548,11 @@ class BigQueryExecuteQueryOperator(BaseOperator):
                 "the gcp_conn_id parameter.", DeprecationWarning, stacklevel=3)
             gcp_conn_id = bigquery_conn_id

+        warnings.warn(
+            "This operator is deprecated. Please use `BigQueryInsertJobOperator`.",
+            DeprecationWarning, stacklevel=3,
+        )
+
         self.sql = sql
         self.destination_dataset_table = destination_dataset_table
         self.write_disposition = write_disposition
@@ -1570,3 +1577,78 @@ class BigQueryUpsertTableOperator(BaseOperator):
             table_resource=self.table_resource,
             project_id=self.project_id,
         )
+
+
+class BigQueryInsertJobOperator(BaseOperator):
+    """
+    Executes a BigQuery job. Waits for the job to complete and returns job id.
+    See here:
+
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+    :param configuration: The configuration parameter maps directly to BigQuery's
+        configuration field in the job object. For more details see
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+    :type configuration: Dict[str, Any]
+    :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+        numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+        characters. If not provided then uuid will be generated.
+    :type job_id: str
+    :param project_id: Google Cloud Project where the job is running
+    :type project_id: str
+    :param location: location the job is running
+    :type location: str
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    """
+
+    template_fields = ("configuration", "job_id")
+    ui_color = BigQueryUIColors.QUERY.value
+
+    def __init__(
+        self,
+        configuration: Dict[str, Any],
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+        job_id: Optional[str] = None,
+        gcp_conn_id: str = 'google_cloud_default',
+        delegate_to: Optional[str] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.configuration = configuration
+        self.location = location
+        self.job_id = job_id
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.delegate_to = delegate_to
+
+    def execute(self, context: Any):
+        hook = BigQueryHook(
+            gcp_conn_id=self.gcp_conn_id,
+            delegate_to=self.delegate_to,
+        )
+
+        try:
+            job = hook.insert_job(
+                configuration=self.configuration,
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Start the job and wait for it to complete and get the result.
+            job.result()
+        except Conflict:
+            job = hook.get_job(
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Get existing job and wait for it to be ready
+            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
+                sleep(time_to_wait)
+                job.reload()
+                if job.done():
+                    break
+        return job.job_id
@@ -241,10 +241,10 @@ You can also use this operator to delete a view.
     :start-after: [START howto_operator_bigquery_delete_view]
     :end-before: [END howto_operator_bigquery_delete_view]

-.. _howto/operator:BigQueryExecuteQueryOperator:
+.. _howto/operator:BigQueryInsertJobOperator:

-Execute queries
-^^^^^^^^^^^^^^^
+Execute BigQuery jobs
+^^^^^^^^^^^^^^^^^^^^^

 Let's say you would like to execute the following query.
@@ -255,32 +255,23 @@ Let's say you would like to execute the following query.
     :end-before: [END howto_operator_bigquery_query]

 To execute the SQL query in a specific BigQuery database you can use
-:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryExecuteQueryOperator`.
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` with
+proper query job configuration.

 .. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
     :language: python
     :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query]
-    :end-before: [END howto_operator_bigquery_execute_query]
+    :start-after: [START howto_operator_bigquery_insert_job]
+    :end-before: [END howto_operator_bigquery_insert_job]

-``sql`` argument can receive a str representing a sql statement, a list of str
-(sql statements), or reference to a template file. Template reference are recognized
-by str ending in '.sql'.
+For more information on types of BigQuery jobs please check the
+`documentation <https://cloud.google.com/bigquery/docs/reference/v2/jobs>`__.

-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_list]
-    :end-before: [END howto_operator_bigquery_execute_query_list]
-
-You can store the results of the query in a table by specifying
-``destination_dataset_table``.
-
-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_save]
-    :end-before: [END howto_operator_bigquery_execute_query_save]
+Additionally you can use the ``job_id`` parameter of
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` to improve
+idempotency. If this parameter is not passed then a uuid will be used as ``job_id``. If provided,
+the operator will try to submit a new job with this ``job_id``. If there's already a job with
+such ``job_id`` then it will reattach to the existing job.

 Validate data
 ^^^^^^^^^^^^^
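
A minimal sketch of the job_id behaviour described above, not part of this commit (the id and
location values are made up). Re-running the task with the same job_id makes the operator hit a
Conflict and reattach to the already-submitted job instead of starting a new one:

    insert_query_job = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration={
            "query": {
                "query": INSERT_ROWS_QUERY,
                "useLegacySql": False,
            }
        },
        job_id="insert-rows-2020-05-01",  # hypothetical, fixed id
        location="EU",                    # hypothetical location
    )
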
@@ -21,6 +21,7 @@ from datetime import datetime
 from unittest.mock import MagicMock

 import mock
+from google.cloud.exceptions import Conflict
 from parameterized import parameterized

 from airflow import models
@@ -31,8 +32,8 @@ from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator, BigQueryCreateExternalTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryDeleteTableOperator, BigQueryExecuteQueryOperator,
     BigQueryGetDataOperator, BigQueryGetDatasetOperator, BigQueryGetDatasetTablesOperator,
-    BigQueryIntervalCheckOperator, BigQueryPatchDatasetOperator, BigQueryUpdateDatasetOperator,
-    BigQueryUpsertTableOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryPatchDatasetOperator,
+    BigQueryUpdateDatasetOperator, BigQueryUpsertTableOperator, BigQueryValueCheckOperator,
 )
 from airflow.serialization.serialized_objects import SerializedDAG
 from airflow.settings import Session
@@ -788,3 +789,89 @@ class TestBigQueryUpsertTableOperator(unittest.TestCase):
             project_id=TEST_GCP_PROJECT_ID,
             table_resource=TEST_TABLE_RESOURCES
         )
+
+
+class TestBigQueryInsertJobOperator:
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.BigQueryHook')
+    def test_execute(self, mock_hook):
+        job_id = "123456"
+        configuration = {
+            "query": {
+                "query": "SELECT * FROM any",
+                "useLegacySql": False,
+            }
+        }
+        mock_hook.return_value.insert_job.return_value = MagicMock(job_id=job_id)
+
+        op = BigQueryInsertJobOperator(
+            task_id="insert_query_job",
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID
+        )
+        result = op.execute({})
+
+        mock_hook.return_value.insert_job.assert_called_once_with(
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID,
+        )
+
+        assert result == job_id
+
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.exponential_sleep_generator')
+    @mock.patch('airflow.providers.google.cloud.operators.bigquery.BigQueryHook')
+    def test_execute_idempotency(self, mock_hook, mock_sleep_generator):
+        job_id = "123456"
+
+        configuration = {
+            "query": {
+                "query": "SELECT * FROM any",
+                "useLegacySql": False,
+            }
+        }
+
+        class MockJob:
+            _call_no = 0
+            _done = False
+
+            def __init__(self):
+                pass
+
+            def reload(self):
+                if MockJob._call_no == 3:
+                    MockJob._done = True
+                else:
+                    MockJob._call_no += 1
+
+            def done(self):
+                return MockJob._done
+
+            @property
+            def job_id(self):
+                return job_id
+
+        mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict("any")
+        mock_sleep_generator.return_value = [0, 0, 0, 0, 0]
+        mock_hook.return_value.get_job.return_value = MockJob()
+
+        op = BigQueryInsertJobOperator(
+            task_id="insert_query_job",
+            configuration=configuration,
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID
+        )
+        result = op.execute({})
+
+        assert MockJob._call_no == 3
+
+        mock_hook.return_value.get_job.assert_called_once_with(
+            location=TEST_DATASET_LOCATION,
+            job_id=job_id,
+            project_id=TEST_GCP_PROJECT_ID,
+        )
+
+        assert result == job_id