зеркало из
1
0
Форкнуть 0
* streaming from blob

* managed streaming
Q: why don't we use the aio library for storage?

* format

* fix test

* preparing client_base from_stream methods

* maybe better

* async impl
test fallback to queue

* format

* fix usage of ExecuteRequestParams in tests

* fix usage of ExecuteRequestParams in tests

* b

* fix aio

* add span headers for debug

* add span headers for debug

* add span headers for debug

* t

* f

* revert

* print

* try copy

* remove print

---------

Co-authored-by: Ohad Bitton <ohbitton@microsoft.com>
This commit is contained in:
ohad bitton 2024-01-24 10:13:14 +02:00 коммит произвёл GitHub
Родитель 40197266f2
Коммит c6150c0056
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
11 изменённых файлов: 397 добавлений и 152 удалений

Просмотреть файл

@ -5,6 +5,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased ## Unreleased
### Added
- Streaming ingestion from blob
### Fixed
- Managed streaming fallback to queued
-
### Changed ### Changed
- Changed binary files data format compression to false - Changed binary files data format compression to false

Просмотреть файл

@ -51,42 +51,85 @@ class KustoClient(_KustoClientBase):
return await self.execute_mgmt(database, query, properties) return await self.execute_mgmt(database, query, properties)
return await self.execute_query(database, query, properties) return await self.execute_query(database, query, properties)
@distributed_trace_async(name_of_span="KustoClient.query_cmd", kind=SpanKind.CLIENT) @distributed_trace_async(name_of_span="AioKustoClient.query_cmd", kind=SpanKind.CLIENT)
@aio_documented_by(KustoClientSync.execute_query) @aio_documented_by(KustoClientSync.execute_query)
async def execute_query(self, database: str, query: str, properties: ClientRequestProperties = None) -> KustoResponseDataSet: async def execute_query(self, database: str, query: str, properties: ClientRequestProperties = None) -> KustoResponseDataSet:
database = self._get_database_or_default(database) database = self._get_database_or_default(database)
Span.set_query_attributes(self._kusto_cluster, database, properties) Span.set_query_attributes(self._kusto_cluster, database, properties)
request = ExecuteRequestParams._from_query(
query,
database,
properties,
self._request_headers,
self._query_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
return await self._execute(self._query_endpoint, request, properties)
return await self._execute(self._query_endpoint, database, query, None, KustoClient._query_default_timeout, properties) @distributed_trace_async(name_of_span="AioKustoClient.control_cmd", kind=SpanKind.CLIENT)
@distributed_trace_async(name_of_span="KustoClient.control_cmd", kind=SpanKind.CLIENT)
@aio_documented_by(KustoClientSync.execute_mgmt) @aio_documented_by(KustoClientSync.execute_mgmt)
async def execute_mgmt(self, database: str, query: str, properties: ClientRequestProperties = None) -> KustoResponseDataSet: async def execute_mgmt(self, database: str, query: str, properties: ClientRequestProperties = None) -> KustoResponseDataSet:
database = self._get_database_or_default(database) database = self._get_database_or_default(database)
Span.set_query_attributes(self._kusto_cluster, database, properties) Span.set_query_attributes(self._kusto_cluster, database, properties)
request = ExecuteRequestParams._from_query(
query,
database,
properties,
self._request_headers,
self._mgmt_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
return await self._execute(self._mgmt_endpoint, request, properties)
return await self._execute(self._mgmt_endpoint, database, query, None, KustoClient._mgmt_default_timeout, properties) @distributed_trace_async(name_of_span="AioKustoClient.streaming_ingest", kind=SpanKind.CLIENT)
@distributed_trace_async(name_of_span="KustoClient.streaming_ingest", kind=SpanKind.CLIENT)
@aio_documented_by(KustoClientSync.execute_streaming_ingest) @aio_documented_by(KustoClientSync.execute_streaming_ingest)
async def execute_streaming_ingest( async def execute_streaming_ingest(
self, self,
database: Optional[str], database: Optional[str],
table: str, table: str,
stream: io.IOBase, stream: Optional[io.IOBase],
blob_url: Optional[str],
stream_format: Union[DataFormat, str], stream_format: Union[DataFormat, str],
properties: ClientRequestProperties = None, properties: ClientRequestProperties = None,
mapping_name: str = None, mapping_name: str = None,
): ):
database = self._get_database_or_default(database) database = self._get_database_or_default(database)
Span.set_streaming_ingest_attributes(self._kusto_cluster, database, table, properties)
stream_format = stream_format.kusto_value if isinstance(stream_format, DataFormat) else DataFormat[stream_format.upper()].kusto_value stream_format = stream_format.kusto_value if isinstance(stream_format, DataFormat) else DataFormat[stream_format.upper()].kusto_value
endpoint = self._streaming_ingest_endpoint + database + "/" + table + "?streamFormat=" + stream_format endpoint = self._streaming_ingest_endpoint + database + "/" + table + "?streamFormat=" + stream_format
if mapping_name is not None: if mapping_name is not None:
endpoint = endpoint + "&mappingName=" + mapping_name endpoint = endpoint + "&mappingName=" + mapping_name
await self._execute(endpoint, database, None, stream, self._streaming_ingest_default_timeout, properties) if blob_url:
endpoint += "&sourceKind=uri"
request = ExecuteRequestParams._from_blob_url(
blob_url,
properties,
self._request_headers,
self._streaming_ingest_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
elif stream:
request = ExecuteRequestParams._from_stream(
stream,
properties,
self._request_headers,
self._streaming_ingest_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
else:
raise Exception("execute_streaming_ingest is expecting either a stream or blob url")
Span.set_streaming_ingest_attributes(self._kusto_cluster, database, table, properties)
await self._execute(endpoint, request, properties)
@aio_documented_by(KustoClientSync._execute_streaming_query_parsed) @aio_documented_by(KustoClientSync._execute_streaming_query_parsed)
async def _execute_streaming_query_parsed( async def _execute_streaming_query_parsed(
@ -96,10 +139,13 @@ class KustoClient(_KustoClientBase):
timeout: timedelta = _KustoClientBase._query_default_timeout, timeout: timedelta = _KustoClientBase._query_default_timeout,
properties: Optional[ClientRequestProperties] = None, properties: Optional[ClientRequestProperties] = None,
) -> StreamingDataSetEnumerator: ) -> StreamingDataSetEnumerator:
response = await self._execute(self._query_endpoint, database, query, None, timeout, properties, stream_response=True) request = ExecuteRequestParams._from_query(
query, database, properties, self._request_headers, timeout, self._mgmt_default_timeout, self._client_server_delta, self.client_details
)
response = await self._execute(self._query_endpoint, request, properties, stream_response=True)
return StreamingDataSetEnumerator(JsonTokenReader(response.content)) return StreamingDataSetEnumerator(JsonTokenReader(response.content))
@distributed_trace_async(name_of_span="KustoClient.streaming_query", kind=SpanKind.CLIENT) @distributed_trace_async(name_of_span="AioKustoClient.streaming_query", kind=SpanKind.CLIENT)
@aio_documented_by(KustoClientSync.execute_streaming_query) @aio_documented_by(KustoClientSync.execute_streaming_query)
async def execute_streaming_query( async def execute_streaming_query(
self, self,
@ -118,41 +164,33 @@ class KustoClient(_KustoClientBase):
async def _execute( async def _execute(
self, self,
endpoint: str, endpoint: str,
database: Optional[str], request: ExecuteRequestParams,
query: Optional[str], properties: Optional[ClientRequestProperties] = None,
payload: Optional[io.IOBase],
timeout: timedelta,
properties: ClientRequestProperties = None,
stream_response: bool = False, stream_response: bool = False,
) -> Union[KustoResponseDataSet, ClientResponse]: ) -> Union[KustoResponseDataSet, ClientResponse]:
"""Executes given query against this client""" """Executes given query against this client"""
if self._is_closed: if self._is_closed:
raise KustoClosedError() raise KustoClosedError()
self.validate_endpoint() self.validate_endpoint()
request_params = ExecuteRequestParams(
database, request_headers = request.request_headers
payload, timeout = request.timeout
properties,
query,
timeout,
self._request_headers,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
json_payload = request_params.json_payload
request_headers = request_params.request_headers
timeout = request_params.timeout
if self._aad_helper: if self._aad_helper:
request_headers["Authorization"] = await self._aad_helper.acquire_authorization_header_async() request_headers["Authorization"] = await self._aad_helper.acquire_authorization_header_async()
invoker = lambda: self._session.post( invoker = lambda: self._session.post(
endpoint, headers=request_headers, json=json_payload, data=payload, timeout=timeout.seconds, proxy=self._proxy_url, allow_redirects=False endpoint,
headers=request_headers,
json=request.json_payload,
data=request.payload,
timeout=timeout.seconds,
proxy=self._proxy_url,
allow_redirects=False,
) )
try: try:
response = await MonitoredActivity.invoke_async( response = await MonitoredActivity.invoke_async(
invoker, name_of_span="KustoClient.http_post", tracing_attributes=Span.create_http_attributes("POST", endpoint, request_headers) invoker, name_of_span="AioKustoClient.http_post", tracing_attributes=Span.create_http_attributes("POST", endpoint, request_headers)
) )
except Exception as e: except Exception as e:
raise KustoNetworkError(endpoint, None if properties is None else properties.client_request_id) from e raise KustoNetworkError(endpoint, None if properties is None else properties.client_request_id) from e
@ -172,7 +210,7 @@ class KustoClient(_KustoClientBase):
response_json = await response.json() response_json = await response.json()
except Exception: except Exception:
response_json = None response_json = None
raise self._handle_http_error(e, endpoint, payload, response, response.status, response_json, response_text) raise self._handle_http_error(e, endpoint, request.payload, response, response.status, response_json, response_text)
async with response: async with response:
response_json = None response_json = None
@ -186,5 +224,5 @@ class KustoClient(_KustoClientBase):
response_text = await response.text() response_text = await response.text()
except Exception: except Exception:
response_text = None response_text = None
raise self._handle_http_error(e, endpoint, payload, response, response.status, response_json, response_text) raise self._handle_http_error(e, endpoint, request.payload, response, response.status, response_json, response_text)
return MonitoredActivity.invoke(lambda: self._kusto_parse_by_endpoint(endpoint, response_json), name_of_span="KustoClient.processing_response") return MonitoredActivity.invoke(lambda: self._kusto_parse_by_endpoint(endpoint, response_json), name_of_span="AioKustoClient.processing_response")

Просмотреть файл

@ -175,8 +175,17 @@ class KustoClient(_KustoClientBase):
""" """
database = self._get_database_or_default(database) database = self._get_database_or_default(database)
Span.set_query_attributes(self._kusto_cluster, database, properties) Span.set_query_attributes(self._kusto_cluster, database, properties)
request = ExecuteRequestParams._from_query(
return self._execute(self._query_endpoint, database, query, None, self._query_default_timeout, properties) query,
database,
properties,
self._request_headers,
self._query_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
return self._execute(self._query_endpoint, request, properties)
@distributed_trace(name_of_span="KustoClient.control_cmd", kind=SpanKind.CLIENT) @distributed_trace(name_of_span="KustoClient.control_cmd", kind=SpanKind.CLIENT)
def execute_mgmt(self, database: Optional[str], query: str, properties: Optional[ClientRequestProperties] = None) -> KustoResponseDataSet: def execute_mgmt(self, database: Optional[str], query: str, properties: Optional[ClientRequestProperties] = None) -> KustoResponseDataSet:
@ -191,15 +200,25 @@ class KustoClient(_KustoClientBase):
""" """
database = self._get_database_or_default(database) database = self._get_database_or_default(database)
Span.set_query_attributes(self._kusto_cluster, database, properties) Span.set_query_attributes(self._kusto_cluster, database, properties)
request = ExecuteRequestParams._from_query(
return self._execute(self._mgmt_endpoint, database, query, None, self._mgmt_default_timeout, properties) query,
database,
properties,
self._request_headers,
self._mgmt_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
return self._execute(self._mgmt_endpoint, request, properties)
@distributed_trace(name_of_span="KustoClient.streaming_ingest", kind=SpanKind.CLIENT) @distributed_trace(name_of_span="KustoClient.streaming_ingest", kind=SpanKind.CLIENT)
def execute_streaming_ingest( def execute_streaming_ingest(
self, self,
database: Optional[str], database: Optional[str],
table: str, table: str,
stream: IO[AnyStr], stream: Optional[IO[AnyStr]],
blob_url: Optional[str],
stream_format: Union[DataFormat, str], stream_format: Union[DataFormat, str],
properties: Optional[ClientRequestProperties] = None, properties: Optional[ClientRequestProperties] = None,
mapping_name: str = None, mapping_name: str = None,
@ -211,20 +230,44 @@ class KustoClient(_KustoClientBase):
https://docs.microsoft.com/en-us/azure/data-explorer/ingest-data-streaming https://docs.microsoft.com/en-us/azure/data-explorer/ingest-data-streaming
:param Optional[str] database: Target database. If not provided, will default to the "Initial Catalog" value in the connection string :param Optional[str] database: Target database. If not provided, will default to the "Initial Catalog" value in the connection string
:param str table: Target table. :param str table: Target table.
:param io.BaseIO stream: stream object which contains the data to ingest. :param Optional[IO[AnyStr]] stream: a stream object or which contains the data to ingest.
:param Optional[str] blob_url: An url to a blob which contains the data to ingest. Provide either this or stream.
:param DataFormat stream_format: Format of the data in the stream. :param DataFormat stream_format: Format of the data in the stream.
:param ClientRequestProperties properties: additional request properties. :param ClientRequestProperties properties: additional request properties.
:param str mapping_name: Pre-defined mapping of the table. Required when stream_format is json/avro. :param str mapping_name: Pre-defined mapping of the table. Required when stream_format is json/avro.
""" """
database = self._get_database_or_default(database) database = self._get_database_or_default(database)
Span.set_streaming_ingest_attributes(self._kusto_cluster, database, table, properties)
stream_format = stream_format.kusto_value if isinstance(stream_format, DataFormat) else DataFormat[stream_format.upper()].kusto_value stream_format = stream_format.kusto_value if isinstance(stream_format, DataFormat) else DataFormat[stream_format.upper()].kusto_value
endpoint = self._streaming_ingest_endpoint + database + "/" + table + "?streamFormat=" + stream_format endpoint = self._streaming_ingest_endpoint + database + "/" + table + "?streamFormat=" + stream_format
if mapping_name is not None: if mapping_name is not None:
endpoint = endpoint + "&mappingName=" + mapping_name endpoint = endpoint + "&mappingName=" + mapping_name
if blob_url:
endpoint += "&sourceKind=uri"
request = ExecuteRequestParams._from_blob_url(
blob_url,
properties,
self._request_headers,
self._streaming_ingest_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
elif stream:
request = ExecuteRequestParams._from_stream(
stream,
properties,
self._request_headers,
self._streaming_ingest_default_timeout,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
else:
raise Exception("execute_streaming_ingest is expecting either a stream or blob url")
self._execute(endpoint, database, None, stream, self._streaming_ingest_default_timeout, properties) Span.set_streaming_ingest_attributes(self._kusto_cluster, database, table, properties)
self._execute(endpoint, request, properties)
def _execute_streaming_query_parsed( def _execute_streaming_query_parsed(
self, self,
@ -233,7 +276,10 @@ class KustoClient(_KustoClientBase):
timeout: timedelta = _KustoClientBase._query_default_timeout, timeout: timedelta = _KustoClientBase._query_default_timeout,
properties: Optional[ClientRequestProperties] = None, properties: Optional[ClientRequestProperties] = None,
) -> StreamingDataSetEnumerator: ) -> StreamingDataSetEnumerator:
response = self._execute(self._query_endpoint, database, query, None, timeout, properties, stream_response=True) request = ExecuteRequestParams._from_query(
query, database, properties, self._request_headers, timeout, self._mgmt_default_timeout, self._client_server_delta, self.client_details
)
response = self._execute(self._query_endpoint, request, properties, stream_response=True)
response.raw.decode_content = True response.raw.decode_content = True
return StreamingDataSetEnumerator(JsonTokenReader(response.raw)) return StreamingDataSetEnumerator(JsonTokenReader(response.raw))
@ -262,10 +308,7 @@ class KustoClient(_KustoClientBase):
def _execute( def _execute(
self, self,
endpoint: str, endpoint: str,
database: Optional[str], request: ExecuteRequestParams,
query: Optional[str],
payload: Optional[IO[AnyStr]],
timeout: timedelta,
properties: Optional[ClientRequestProperties] = None, properties: Optional[ClientRequestProperties] = None,
stream_response: bool = False, stream_response: bool = False,
) -> Union[KustoResponseDataSet, Response]: ) -> Union[KustoResponseDataSet, Response]:
@ -273,20 +316,8 @@ class KustoClient(_KustoClientBase):
if self._is_closed: if self._is_closed:
raise KustoClosedError() raise KustoClosedError()
self.validate_endpoint() self.validate_endpoint()
request_params = ExecuteRequestParams(
database, request_headers = request.request_headers
payload,
properties,
query,
timeout,
self._request_headers,
self._mgmt_default_timeout,
self._client_server_delta,
self.client_details,
)
json_payload = request_params.json_payload
request_headers = request_params.request_headers
timeout = request_params.timeout
if self._aad_helper: if self._aad_helper:
request_headers["Authorization"] = self._aad_helper.acquire_authorization_header() request_headers["Authorization"] = self._aad_helper.acquire_authorization_header()
@ -294,9 +325,9 @@ class KustoClient(_KustoClientBase):
invoker = lambda: self._session.post( invoker = lambda: self._session.post(
endpoint, endpoint,
headers=request_headers, headers=request_headers,
json=json_payload, json=request.json_payload,
data=payload, data=request.payload,
timeout=timeout.seconds, timeout=request.timeout.seconds,
stream=stream_response, stream=stream_response,
allow_redirects=False, allow_redirects=False,
) )
@ -324,6 +355,6 @@ class KustoClient(_KustoClientBase):
response_json = response.json() response_json = response.json()
response.raise_for_status() response.raise_for_status()
except Exception as e: except Exception as e:
raise self._handle_http_error(e, endpoint, payload, response, response.status_code, response_json, response.text) raise self._handle_http_error(e, endpoint, request.payload, response, response.status_code, response_json, response.text)
# trace response processing # trace response processing
return MonitoredActivity.invoke(lambda: self._kusto_parse_by_endpoint(endpoint, response_json), name_of_span="KustoClient.processing_response") return MonitoredActivity.invoke(lambda: self._kusto_parse_by_endpoint(endpoint, response_json), name_of_span="KustoClient.processing_response")

Просмотреть файл

@ -125,36 +125,82 @@ class _KustoClientBase(abc.ABC):
class ExecuteRequestParams: class ExecuteRequestParams:
def __init__( @staticmethod
self, def _from_stream(
database: str, stream: io.IOBase,
payload: Optional[io.IOBase],
properties: ClientRequestProperties, properties: ClientRequestProperties,
query: str, request_headers: Any,
timeout: timedelta, timeout: timedelta,
request_headers: dict,
mgmt_default_timeout: timedelta, mgmt_default_timeout: timedelta,
client_server_delta: timedelta, client_server_delta: timedelta,
client_details: ClientDetails, client_details: ClientDetails,
): ):
request_headers = copy(request_headers) # Before 3.0 it was KPC.execute_streaming_ingest, but was changed to align with the other SDKs
request_headers["Connection"] = "Keep-Alive" client_request_id_prefix = "KPC.executeStreamingIngest;"
json_payload = None request_headers = request_headers.copy()
if not payload: request_headers["Content-Encoding"] = "gzip"
json_payload = {"db": database, "csl": query} if properties:
if properties: request_headers.update(json.loads(properties.to_json())["Options"])
json_payload["properties"] = properties.to_json()
client_request_id_prefix = "KPC.execute;" return ExecuteRequestParams(
request_headers["Content-Type"] = "application/json; charset=utf-8" stream, None, request_headers, client_request_id_prefix, properties, timeout, mgmt_default_timeout, client_server_delta, client_details
else: )
if properties:
request_headers.update(json.loads(properties.to_json())["Options"])
# Before 3.0 it was KPC.execute_streaming_ingest, but was changed to align with the other SDKs @staticmethod
client_request_id_prefix = "KPC.executeStreamingIngest;" def _from_query(
request_headers["Content-Encoding"] = "gzip" query: str,
database: str,
properties: ClientRequestProperties,
request_headers: Any,
timeout: timedelta,
mgmt_default_timeout: timedelta,
client_server_delta: timedelta,
client_details: ClientDetails,
):
json_payload = {"db": database, "csl": query}
if properties:
json_payload["properties"] = properties.to_json()
client_request_id_prefix = "KPC.execute;"
request_headers = request_headers.copy()
request_headers["Content-Type"] = "application/json; charset=utf-8"
return ExecuteRequestParams(
None, json_payload, request_headers, client_request_id_prefix, properties, timeout, mgmt_default_timeout, client_server_delta, client_details
)
@staticmethod
def _from_blob_url(
blob: str,
properties: ClientRequestProperties,
request_headers: Any,
timeout: timedelta,
mgmt_default_timeout: timedelta,
client_server_delta: timedelta,
client_details: ClientDetails,
):
json_payload = {"sourceUri": blob}
client_request_id_prefix = "KPC.executeStreamingIngestFromBlob;"
request_headers = request_headers.copy()
request_headers["Content-Type"] = "application/json; charset=utf-8"
if properties:
request_headers.update(json.loads(properties.to_json())["Options"])
return ExecuteRequestParams(
None, json_payload, request_headers, client_request_id_prefix, properties, timeout, mgmt_default_timeout, client_server_delta, client_details
)
def __init__(
self,
payload,
json_payload,
request_headers,
client_request_id_prefix,
properties: ClientRequestProperties,
timeout: timedelta,
mgmt_default_timeout: timedelta,
client_server_delta: timedelta,
client_details: ClientDetails,
):
special_headers = [ special_headers = [
{ {
"name": "x-ms-client-request-id", "name": "x-ms-client-request-id",
@ -201,3 +247,4 @@ class ExecuteRequestParams:
self.json_payload = json_payload self.json_payload = json_payload
self.request_headers = request_headers self.request_headers = request_headers
self.timeout = timeout self.timeout = timeout
self.payload = payload

Просмотреть файл

@ -37,15 +37,14 @@ def test_properties():
def test_default_tracing_properties(): def test_default_tracing_properties():
kcsb = KustoConnectionStringBuilder("test") kcsb = KustoConnectionStringBuilder("test")
params = ExecuteRequestParams( params = ExecuteRequestParams._from_query(
"somedatabase",
None,
ClientRequestProperties(),
"somequery", "somequery",
timedelta(seconds=10), "somedatabase",
ClientRequestProperties(),
{}, {},
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10),
kcsb.client_details, kcsb.client_details,
) )
@ -60,15 +59,14 @@ def test_custom_kcsb_tracing_properties():
kcsb.application_for_tracing = "myApp" kcsb.application_for_tracing = "myApp"
kcsb.user_name_for_tracing = "myUser" kcsb.user_name_for_tracing = "myUser"
params = ExecuteRequestParams( params = ExecuteRequestParams._from_query(
"somedatabase",
None,
ClientRequestProperties(),
"somequery", "somequery",
timedelta(seconds=10), "somedatabase",
ClientRequestProperties(),
{}, {},
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10),
kcsb.client_details, kcsb.client_details,
) )
@ -84,15 +82,14 @@ def test_custom_crp_tracing_properties():
crp.application = "myApp2" crp.application = "myApp2"
crp.user = "myUser2" crp.user = "myUser2"
params = ExecuteRequestParams( params = ExecuteRequestParams._from_query(
"somedatabase",
None,
crp,
"somequery", "somequery",
timedelta(seconds=10), "somedatabase",
crp,
{}, {},
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10),
kcsb.client_details, kcsb.client_details,
) )
@ -110,15 +107,14 @@ def test_custom_crp_tracing_properties_override_kcsb():
crp.application = "myApp2" crp.application = "myApp2"
crp.user = "myUser2" crp.user = "myUser2"
params = ExecuteRequestParams( params = ExecuteRequestParams._from_query(
"somedatabase",
None,
crp,
"somequery", "somequery",
timedelta(seconds=10), "somedatabase",
crp,
{}, {},
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10),
kcsb.client_details, kcsb.client_details,
) )
@ -133,15 +129,14 @@ def test_set_connector_name_and_version():
kcsb._set_connector_details("myConnector", "myVersion", send_user=False) kcsb._set_connector_details("myConnector", "myVersion", send_user=False)
crp = ClientRequestProperties() crp = ClientRequestProperties()
params = ExecuteRequestParams( params = ExecuteRequestParams._from_query(
"somedatabase",
None,
crp,
"somequery", "somequery",
timedelta(seconds=10), "somedatabase",
ClientRequestProperties(),
{}, {},
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10),
kcsb.client_details, kcsb.client_details,
) )
@ -157,15 +152,14 @@ def test_set_connector_no_app_version():
kcsb._set_connector_details("myConnector", "myVersion", app_name="myApp", send_user=True) kcsb._set_connector_details("myConnector", "myVersion", app_name="myApp", send_user=True)
crp = ClientRequestProperties() crp = ClientRequestProperties()
params = ExecuteRequestParams( params = ExecuteRequestParams._from_query(
"somedatabase",
None,
crp,
"somequery", "somequery",
timedelta(seconds=10), "somedatabase",
ClientRequestProperties(),
{}, {},
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10),
kcsb.client_details, kcsb.client_details,
) )
@ -189,15 +183,14 @@ def test_set_connector_full():
) )
crp = ClientRequestProperties() crp = ClientRequestProperties()
params = ExecuteRequestParams( params = ExecuteRequestParams._from_query(
"somedatabase",
None,
crp,
"somequery", "somequery",
timedelta(seconds=10), "somedatabase",
crp,
{}, {},
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10), timedelta(seconds=10),
timedelta(seconds=10),
kcsb.client_details, kcsb.client_details,
) )

Просмотреть файл

@ -12,6 +12,8 @@ from io import BytesIO, SEEK_END
from typing import Union, Optional, AnyStr, IO, List, Dict from typing import Union, Optional, AnyStr, IO, List, Dict
from zipfile import ZipFile from zipfile import ZipFile
from azure.storage.blob import BlobClient
OptionalUUID = Optional[Union[str, uuid.UUID]] OptionalUUID = Optional[Union[str, uuid.UUID]]
@ -150,6 +152,10 @@ class BlobDescriptor(DescriptorBase):
obfuscated_path = self.path.split("?")[0].split(";")[0] obfuscated_path = self.path.split("?")[0].split(";")[0]
return {self._BLOB_URI: obfuscated_path, self._SOURCE_ID: str(self.source_id)} return {self._BLOB_URI: obfuscated_path, self._SOURCE_ID: str(self.source_id)}
def fill_size(self):
if not self.size:
self.size = BlobClient.from_blob_url(self.path).get_blob_properties().size
class StreamDescriptor(DescriptorBase): class StreamDescriptor(DescriptorBase):
"""StreamDescriptor is used to describe a stream that will be used as ingestion source""" """StreamDescriptor is used to describe a stream that will be used as ingestion source"""

Просмотреть файл

@ -1,7 +1,8 @@
import uuid import uuid
from io import SEEK_SET from io import SEEK_SET
from typing import AnyStr, IO, TYPE_CHECKING, Union from typing import AnyStr, IO, TYPE_CHECKING, Union, Optional
from azure.kusto.ingest.descriptors import DescriptorBase
from tenacity import Retrying, _utils, stop_after_attempt, wait_random_exponential from tenacity import Retrying, _utils, stop_after_attempt, wait_random_exponential
from azure.core.tracing.decorator import distributed_trace from azure.core.tracing.decorator import distributed_trace
@ -11,7 +12,6 @@ from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.data.exceptions import KustoApiError, KustoClosedError from azure.kusto.data.exceptions import KustoApiError, KustoClosedError
from azure.kusto.data._telemetry import MonitoredActivity from azure.kusto.data._telemetry import MonitoredActivity
from . import BlobDescriptor, FileDescriptor, IngestionProperties, StreamDescriptor from . import BlobDescriptor, FileDescriptor, IngestionProperties, StreamDescriptor
from ._ingest_telemetry import IngestTracingAttributes from ._ingest_telemetry import IngestTracingAttributes
from ._stream_extensions import chain_streams, read_until_size_or_end from ._stream_extensions import chain_streams, read_until_size_or_end
@ -108,30 +108,20 @@ class ManagedStreamingIngestClient(BaseIngestClient):
stream = stream_descriptor.stream stream = stream_descriptor.stream
buffered_stream = read_until_size_or_end(stream, self.MAX_STREAMING_SIZE_IN_BYTES + 1) buffered_stream = read_until_size_or_end(stream, self.MAX_STREAMING_SIZE_IN_BYTES + 1)
length = len(buffered_stream.getbuffer())
if len(buffered_stream.getbuffer()) > self.MAX_STREAMING_SIZE_IN_BYTES:
stream_descriptor.stream = chain_streams([buffered_stream, stream])
return self.queued_client.ingest_from_stream(stream_descriptor, ingestion_properties)
stream_descriptor.stream = buffered_stream stream_descriptor.stream = buffered_stream
try: try:
for attempt in Retrying( res = self._stream_with_retries(length, stream_descriptor, ingestion_properties)
stop=stop_after_attempt(self._num_of_attempts), wait=wait_random_exponential(max=self._max_seconds_per_retry), reraise=True if res:
): return res
with attempt: stream_descriptor.stream = chain_streams([buffered_stream, stream])
stream.seek(0, SEEK_SET)
client_request_id = ManagedStreamingIngestClient._get_request_id(stream_descriptor.source_id, attempt.retry_state.attempt_number - 1)
# trace attempt to ingest from stream
invoker = lambda: self.streaming_client._ingest_from_stream_with_client_request_id(
stream_descriptor, ingestion_properties, client_request_id
)
return MonitoredActivity.invoke(invoker, name_of_span="ManagedStreamingIngestClient.ingest_from_stream_attempt")
except KustoApiError as ex: except KustoApiError as ex:
error = ex.get_api_error() error = ex.get_api_error()
if error.permanent: if error.permanent:
raise raise
buffered_stream.seek(0, SEEK_SET)
return self.queued_client.ingest_from_stream(stream_descriptor, ingestion_properties) return self.queued_client.ingest_from_stream(stream_descriptor, ingestion_properties)
@ -151,9 +141,42 @@ class ManagedStreamingIngestClient(BaseIngestClient):
if self._is_closed: if self._is_closed:
raise KustoClosedError() raise KustoClosedError()
blob_descriptor.fill_size()
try:
res = self._stream_with_retries(blob_descriptor.size, blob_descriptor, ingestion_properties)
if res:
return res
except KustoApiError as ex:
error = ex.get_api_error()
if error.permanent:
raise
return self.queued_client.ingest_from_blob(blob_descriptor, ingestion_properties) return self.queued_client.ingest_from_blob(blob_descriptor, ingestion_properties)
def _stream_with_retries(
    self,
    length: int,
    descriptor: DescriptorBase,
    props: IngestionProperties,
) -> Optional[IngestionResult]:
    """Attempt streaming ingestion of ``descriptor``, retrying failed attempts.

    :param length: Payload size in bytes; compared against ``MAX_STREAMING_SIZE_IN_BYTES``.
    :param descriptor: Source to ingest. A ``StreamDescriptor`` is rewound and sent as a
        stream; any other descriptor is handed to the streaming client's ``ingest_from_blob``.
    :param props: Ingestion properties forwarded unchanged to the streaming client.
    :return: The streaming client's result, or ``None`` when ``length`` exceeds the
        streaming limit and no attempt was made.
    :raises Exception: With ``reraise=True`` the error of the last attempt propagates
        once the configured number of attempts is exhausted.
    """
    # Oversized payloads are not streamed at all; the caller decides what to do with None.
    if length > self.MAX_STREAMING_SIZE_IN_BYTES:
        return None

    from_stream = isinstance(descriptor, StreamDescriptor)
    retryer = Retrying(
        stop=stop_after_attempt(self._num_of_attempts),
        wait=wait_random_exponential(max=self._max_seconds_per_retry),
        reraise=True,
    )
    for attempt in retryer:
        with attempt:
            # tenacity counts attempts from 1; the request id keeps its 0-based suffix.
            attempt_number = attempt.retry_state.attempt_number
            client_request_id = ManagedStreamingIngestClient._get_request_id(descriptor.source_id, attempt_number - 1)
            # trace attempt to ingest from stream
            if from_stream:
                # Rewind so every attempt sends the payload from its first byte.
                descriptor.stream.seek(0, SEEK_SET)
                invoker = lambda: self.streaming_client._ingest_from_stream_with_client_request_id(descriptor, props, client_request_id)
            else:
                invoker = lambda: self.streaming_client.ingest_from_blob(descriptor, props, client_request_id)
            return MonitoredActivity.invoke(
                invoker,
                name_of_span="ManagedStreamingIngestClient.ingest_from_stream_attempt",
                # Report the integer attempt number, not the tenacity attempt object itself.
                tracing_attributes={"attemptNumber": attempt_number, "sourceIsStream": from_stream},
            )
@staticmethod @staticmethod
def _get_request_id(source_id: uuid.UUID, attempt: int): def _get_request_id(source_id: uuid.UUID, attempt: int):
return f"KPC.executeManagedStreamingIngest;{source_id};{attempt}" return f"KPC.executeManagedStreamingIngest;{source_id};{attempt}"

Просмотреть файл

@ -10,7 +10,7 @@ from azure.kusto.data import KustoClient, KustoConnectionStringBuilder, ClientRe
from ._ingest_telemetry import IngestTracingAttributes from ._ingest_telemetry import IngestTracingAttributes
from .base_ingest_client import BaseIngestClient, IngestionResult, IngestionStatus from .base_ingest_client import BaseIngestClient, IngestionResult, IngestionStatus
from .descriptors import FileDescriptor, StreamDescriptor from .descriptors import FileDescriptor, StreamDescriptor, BlobDescriptor
from .ingestion_properties import IngestionProperties from .ingestion_properties import IngestionProperties
@ -79,9 +79,30 @@ class KustoStreamingIngestClient(BaseIngestClient):
ingestion_properties.database, ingestion_properties.database,
ingestion_properties.table, ingestion_properties.table,
stream_descriptor.stream, stream_descriptor.stream,
None,
ingestion_properties.format.name, ingestion_properties.format.name,
additional_properties, additional_properties,
mapping_name=ingestion_properties.ingestion_mapping_reference, mapping_name=ingestion_properties.ingestion_mapping_reference,
) )
return IngestionResult(IngestionStatus.SUCCESS, ingestion_properties.database, ingestion_properties.table, stream_descriptor.source_id) return IngestionResult(IngestionStatus.SUCCESS, ingestion_properties.database, ingestion_properties.table, stream_descriptor.source_id)
def ingest_from_blob(
    self, blob_descriptor: BlobDescriptor, ingestion_properties: IngestionProperties, client_request_id: Optional[str] = None
) -> IngestionResult:
    """Run a streaming ingest whose source is the blob at ``blob_descriptor.path``.

    :param blob_descriptor: Descriptor supplying the blob path and the source id.
    :param ingestion_properties: Supplies the database, table, format and mapping reference.
    :param client_request_id: Optional request id; when given it is set on a fresh
        ``ClientRequestProperties`` that is passed along with the request.
    :return: An ``IngestionResult`` with ``IngestionStatus.SUCCESS`` for the target table.
    """
    IngestTracingAttributes.set_ingest_descriptor_attributes(blob_descriptor, ingestion_properties)

    # Request properties are only created when a request id has to be attached.
    request_properties = None
    if client_request_id:
        request_properties = ClientRequestProperties()
        request_properties.client_request_id = client_request_id

    database = ingestion_properties.database
    table = ingestion_properties.table
    self._kusto_client.execute_streaming_ingest(
        database,
        table,
        None,  # no in-memory stream here; the blob path in the next argument is the source
        blob_descriptor.path,
        ingestion_properties.format.name,
        request_properties,
        mapping_name=ingestion_properties.ingestion_mapping_reference,
    )
    return IngestionResult(IngestionStatus.SUCCESS, database, table, blob_descriptor.source_id)

Просмотреть файл

@ -538,3 +538,32 @@ class TestE2E:
self.ingest_client.ingest_from_dataframe(df, ingestion_properties) self.ingest_client.ingest_from_dataframe(df, ingestion_properties)
await self.assert_rows_added(1, timeout=120) await self.assert_rows_added(1, timeout=120)
@pytest.mark.asyncio
async def test_streaming_ingest_from_blob(self, is_managed_streaming):
    """Upload the JSON sample file to a blob, stream-ingest that blob, expect two new rows."""
    ingestion_properties = IngestionProperties(
        database=self.test_db,
        table=self.test_table,
        data_format=DataFormat.JSON,
        ingestion_mapping_reference="JsonMapping",
        ingestion_mapping_kind=IngestionMappingKind.JSON,
    )
    blob_containers = self.ingest_client._resource_manager.get_containers()

    with FileDescriptor(self.json_file_path).open(False) as json_stream:
        blob_descriptor = self.ingest_client.upload_blob(
            blob_containers,
            FileDescriptor(self.json_file_path),
            ingestion_properties.database,
            ingestion_properties.table,
            json_stream,
            None,
            10 * 60,
            3,
        )
        # Same call on either client; the fixture decides which one is under test.
        client = self.managed_streaming_ingest_client if is_managed_streaming else self.streaming_ingest_client
        client.ingest_from_blob(blob_descriptor, ingestion_properties)

    await self.assert_rows_added(2, timeout=120)

Просмотреть файл

@ -44,6 +44,36 @@ STORAGE_QUEUE4_URL = "https://storageaccount2.queue.core.windows.net/readyforagg
STORAGE_QUEUE5_URL = "https://storageaccount2.queue.core.windows.net/readyforaggregation-secured?5" STORAGE_QUEUE5_URL = "https://storageaccount2.queue.core.windows.net/readyforaggregation-secured?5"
def request_callback_throw_transient(request):
    """Mock response callback that always answers with a non-permanent Kusto error.

    :param request: The intercepted request; it is not inspected.
    :return: ``(status, headers, body)`` with status 400, empty headers and a JSON
        body whose ``error`` object carries ``"@permanent": False``.
    """
    context = {
        "timestamp": "2021-10-12T06:05:35.6602087Z",
        "serviceAlias": "SomeCluster",
        "machineName": "KEngine000000",
        "processName": "Kusto.WinSvc.Svc",
        "processId": 2648,
        "threadId": 472,
        "appDomainName": "Kusto.WinSvc.Svc.exe",
        "clientRequestId": "KPC.execute;a3dfb878-9d2b-49d6-89a5-e9b3a9f1f674",
        "activityId": "87eb8fc9-78b3-4580-bcc8-6c90482f9118",
        "subActivityId": "bbfb038b-4467-4f96-afd4-945904fc6278",
        "activityType": "DN.AdminCommand.IngestionResourcesGetCommand",
        "parentActivityId": "00e678e9-4204-4143-8c94-6afd94c27430",
        "activityStack": "(Activity stack: CRID=KPC.execute;a3dfb878-9d2b-49d6-89a5-e9b3a9f1f674 ARID=87eb8fc9-78b3-4580-bcc8-6c90482f9118 > DN.Admin.Client.ExecuteControlCommand/833dfb85-5d67-44b7-882d-eb2283e65780 > P.WCF.Service.ExecuteControlCommand..IInterNodeCommunicationAdminContract/3784e74f-1d89-4c15-adef-0a360c4c431e > DN.FE.ExecuteControlCommand/00e678e9-4204-4143-8c94-6afd94c27430 > DN.AdminCommand.IngestionResourcesGetCommand/bbfb038b-4467-4f96-afd4-945904fc6278)",
    }
    error = {
        "code": "BadRequest",
        "message": "Request is invalid and cannot be executed.",
        "@type": "Kusto.Common.Svc.Exceptions.AdminCommandWrongEndpointException",
        "@message": "Cannot get ingestion resources from this service endpoint. The appropriate endpoint is most likely "
        "'https://ingest-somecluster.kusto.windows.net/'.",
        "@context": context,
        # Marks the failure as transient (non-permanent).
        "@permanent": False,
    }
    return 400, {}, json.dumps({"error": error})
def request_callback(request): def request_callback(request):
body = json.loads(request.body.decode()) if type(request.body) == bytes else json.loads(request.body) body = json.loads(request.body.decode()) if type(request.body) == bytes else json.loads(request.body)
response_status = 400 response_status = 400

Просмотреть файл

@ -11,7 +11,7 @@ import responses
from azure.kusto.data.data_format import DataFormat from azure.kusto.data.data_format import DataFormat
from azure.kusto.data.exceptions import KustoApiError from azure.kusto.data.exceptions import KustoApiError
from azure.kusto.ingest import ManagedStreamingIngestClient, IngestionProperties, IngestionStatus, BlobDescriptor from azure.kusto.ingest import ManagedStreamingIngestClient, IngestionProperties, IngestionStatus, BlobDescriptor
from test_kusto_ingest_client import request_callback as queued_request_callback, assert_queued_upload from test_kusto_ingest_client import request_callback as queued_request_callback, assert_queued_upload, request_callback_throw_transient
from test_kusto_streaming_ingest_client import request_callback as streaming_request_callback, assert_managed_streaming_request_id from test_kusto_streaming_ingest_client import request_callback as streaming_request_callback, assert_managed_streaming_request_id
@ -21,6 +21,11 @@ class TransientResponseHelper:
self.total_calls = 0 self.total_calls = 0
@pytest.fixture(params=["Blob", "File"])
def is_blob(request):
    """Parametrized fixture: ``True`` for the "Blob" parameter, ``False`` for "File"."""
    source_kind = request.param
    return source_kind == "Blob"
def transient_error_callback(helper: TransientResponseHelper, request, custom_request_id=None): def transient_error_callback(helper: TransientResponseHelper, request, custom_request_id=None):
if custom_request_id: if custom_request_id:
assert request.headers["x-ms-client-request-id"] == custom_request_id assert request.headers["x-ms-client-request-id"] == custom_request_id
@ -84,7 +89,7 @@ class TestManagedStreamingIngestClient:
@patch("azure.storage.blob.BlobClient.upload_blob") @patch("azure.storage.blob.BlobClient.upload_blob")
@patch("azure.storage.queue.QueueClient.send_message") @patch("azure.storage.queue.QueueClient.send_message")
@patch("uuid.uuid4", return_value=MOCKED_UUID_4) @patch("uuid.uuid4", return_value=MOCKED_UUID_4)
def test_fallback_big_file(self, mock_uuid, mock_put_message_in_queue, mock_upload_blob_from_stream, mock_aad): def test_fallback_big_file(self, mock_uuid, mock_put_message_in_queue, mock_upload_blob_from_stream, mock_aad, is_blob):
responses.add_callback( responses.add_callback(
responses.POST, "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt", callback=queued_request_callback, content_type="application/json" responses.POST, "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt", callback=queued_request_callback, content_type="application/json"
) )
@ -108,10 +113,20 @@ class TestManagedStreamingIngestClient:
mock_upload_blob_from_stream.side_effect = check_bytes mock_upload_blob_from_stream.side_effect = check_bytes
f = NamedTemporaryFile(dir=".", mode="wb", delete=False) f = NamedTemporaryFile(dir=".", mode="wb", delete=False)
blob_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__{}?".format(
os.path.basename(f.name)
)
try: try:
f.write(initial_bytes) if is_blob:
f.close() result = ingest_client.ingest_from_blob(BlobDescriptor(blob_url + "sas", 5 * 1024 * 1024), ingestion_properties=ingestion_properties)
result = ingest_client.ingest_from_file(f.name, ingestion_properties=ingestion_properties) f.close()
else:
f.write(initial_bytes)
f.close()
result = ingest_client.ingest_from_file(f.name, ingestion_properties=ingestion_properties)
except Exception as e:
print(e)
finally: finally:
os.unlink(f.name) os.unlink(f.name)
@ -119,14 +134,13 @@ class TestManagedStreamingIngestClient:
assert_queued_upload( assert_queued_upload(
mock_put_message_in_queue, mock_put_message_in_queue,
mock_upload_blob_from_stream, mock_upload_blob_from_stream if not is_blob else None,
"https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__{}?".format( blob_url,
os.path.basename(f.name)
),
format=data_format.kusto_value, format=data_format.kusto_value,
) )
mock_upload_blob_from_stream.assert_called() if not is_blob:
mock_upload_blob_from_stream.assert_called()
@responses.activate @responses.activate
@patch("azure.kusto.data.security._AadHelper.acquire_authorization_header", return_value=None) @patch("azure.kusto.data.security._AadHelper.acquire_authorization_header", return_value=None)
@ -314,6 +328,12 @@ class TestManagedStreamingIngestClient:
responses.add_callback( responses.add_callback(
responses.POST, "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt", callback=queued_request_callback, content_type="application/json" responses.POST, "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt", callback=queued_request_callback, content_type="application/json"
) )
responses.add_callback(
responses.POST,
"https://somecluster.kusto.windows.net/v1/rest/ingest/database/table?streamFormat=csv&sourceKind=uri",
callback=request_callback_throw_transient,
content_type="application/json",
)
ingest_client = ManagedStreamingIngestClient.from_dm_kcsb("https://ingest-somecluster.kusto.windows.net") ingest_client = ManagedStreamingIngestClient.from_dm_kcsb("https://ingest-somecluster.kusto.windows.net")
ingestion_properties = IngestionProperties(database="database", table="table") ingestion_properties = IngestionProperties(database="database", table="table")