From 5c1909339d30228e736b2241a823d9fde9c4d69a Mon Sep 17 00:00:00 2001 From: mayamarom10 Date: Tue, 18 Jun 2024 17:22:22 +0300 Subject: [PATCH] Fix df from result table error message + old types (#543) * Added dictionary options as parameters for type matching for dataframe_from_result_table func * reformatted the file * reformatted the file * reformatted the file * Fixes after PR * Fixes after PR * Fixes after PR * Fixes after PR * Fixes after PR * For python 3.7 3.8 * Nullable bools fix * Nullable bools fix * Nullable bools fix * Nullable bools fix2 * PR comment fixes * LRU cache default parameters for python 3.7 * LRU cache maxsize=1 * modified changes requested * black * fix for numpy 2.0 * fix numpy 2.0 nan * fix numpy 2.0 ninf, inf * fix numpy 2.0 -inf * fix tenacity>=8.3 * fix tenacity>=8.3 * black * fixed error message in case of nonexistent type + added old type names in kusto * Delete azure-kusto-ingest/azure/kusto/ingest/V2/__init__.py * Delete azure-kusto-ingest/azure/kusto/ingest/V2/blob_source.py * Delete azure-kusto-ingest/azure/kusto/ingest/V2/compression_type.py * Delete azure-kusto-ingest/azure/kusto/ingest/V2/ingestion_source.py * Update setup.py * Delete azure-kusto-ingest/azure/kusto/ingest/V2/kusto_storage_uploader.py * Delete azure-kusto-ingest/azure/kusto/ingest/V2/local_source.py * fixed error message in case of nonexistent type + added old type names in kusto * added test for missing and old types * black * using pytest.raises --- azure-kusto-data/azure/kusto/data/helpers.py | 16 +++++++-- azure-kusto-data/tests/input/dataframe.json | 36 ++++++++++++-------- azure-kusto-data/tests/test_helpers.py | 19 +++++++++-- 3 files changed, 52 insertions(+), 19 deletions(-) diff --git a/azure-kusto-data/azure/kusto/data/helpers.py b/azure-kusto-data/azure/kusto/data/helpers.py index 0a5638a..653a66e 100644 --- a/azure-kusto-data/azure/kusto/data/helpers.py +++ b/azure-kusto-data/azure/kusto/data/helpers.py @@ -18,14 +18,22 @@ def default_dict() 
-> Converter: return { "string": lambda col, df: df[col].astype(pd.StringDtype()) if hasattr(pd, "StringDType") else df[col], "guid": lambda col, df: df[col], + "uuid": lambda col, df: df[col], + "uniqueid": lambda col, df: df[col], "dynamic": lambda col, df: df[col], "bool": lambda col, df: df[col].astype(bool), + "boolean": lambda col, df: df[col].astype(bool), "int": lambda col, df: df[col].astype(pd.Int32Dtype()), + "int32": lambda col, df: df[col].astype(pd.Int32Dtype()), + "int64": lambda col, df: df[col].astype(pd.Int64Dtype()), "long": lambda col, df: df[col].astype(pd.Int64Dtype()), "real": lambda col, df: parse_float(df, col), + "double": lambda col, df: parse_float(df, col), "decimal": lambda col, df: parse_float(df, col), "datetime": lambda col, df: parse_datetime(df, col), + "date": lambda col, df: parse_datetime(df, col), "timespan": lambda col, df: df[col].apply(parse_timedelta), + "time": lambda col, df: df[col].apply(parse_timedelta), } @@ -67,13 +75,15 @@ def dataframe_from_result_table( column_name = col.column_name column_type = col.column_type if converters_by_column_name and column_name in converters_by_column_name: - converter = converters_by_column_name[column_name] + converter = converters_by_column_name.get(column_name) elif converters_by_type and column_type in converters_by_type: - converter = converters_by_type[column_type] + converter = converters_by_type.get(column_type) elif nullable_bools and column_type == "bool": converter = lambda col, df: df[col].astype(pd.BooleanDtype()) else: - converter = default[column_type] + converter = default.get(column_type) + if converter is None: + raise Exception("Unexpected type " + column_type) if isinstance(converter, str): frame[column_name] = frame[column_name].astype(converter) else: diff --git a/azure-kusto-data/tests/input/dataframe.json b/azure-kusto-data/tests/input/dataframe.json index 18d56c9..0c82aea 100644 --- a/azure-kusto-data/tests/input/dataframe.json +++ 
b/azure-kusto-data/tests/input/dataframe.json @@ -73,6 +73,10 @@ "ColumnName": "RecordReal", "ColumnType": "real" }, + { + "ColumnName": "RecordDouble", + "ColumnType": "double" + }, { "ColumnName": "RecordDecimal", "ColumnType": "decimal" @@ -80,6 +84,10 @@ { "ColumnName": "RecordDynamic", "ColumnType": "dynamic" + }, + { + "ColumnName": "MissingType", + "ColumnType": "missing" } ], "Rows": [ @@ -92,8 +100,8 @@ 222, 92233720368, "6f3c1072-2739-461c-8aa7-3cfc8ff528a8", - 3.14159, 1.2, - "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}" + 3.14159, 7.89, 1.2, + "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}", "miss" ], [ @@ -105,8 +113,8 @@ 222, 92233720368, "6f3c1072-2739-461c-8aa7-3cfc8ff528a8", - "NaN", "NaN", - "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}" + "NaN", "NaN", "NaN", + "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}", "miss" ], [ @@ -118,8 +126,8 @@ 222, 92233720368, "6f3c1072-2739-461c-8aa7-3cfc8ff528a8", - "Infinity", "Infinity", - "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}" 
+ "Infinity", "Infinity", "Infinity", + "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}", "miss" ], [ @@ -131,8 +139,8 @@ 222, 92233720368, "6f3c1072-2739-461c-8aa7-3cfc8ff528a8", - "-Infinity", "-Infinity", - "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}" + "-Infinity", "-Infinity", "-Infinity", + "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}", "miss" ], [ @@ -144,8 +152,8 @@ 222, 92233720368, "6f3c1072-2739-461c-8aa7-3cfc8ff528a8", - 3.14159, 1.2, - "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}" + 3.14159, 7.89, 1.2, + "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}", "miss" ], [ @@ -157,8 +165,8 @@ 222, 92233720368, "6f3c1072-2739-461c-8aa7-3cfc8ff528a8", - 3.14159, 1.2, - "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}" + 3.14159, 7.89, 1.2, + 
"{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}", "miss" ], [ @@ -170,8 +178,8 @@ 222, 92233720368, "6f3c1072-2739-461c-8aa7-3cfc8ff528a8", - 3.14159, 1.2, - "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}" + 3.14159, 7.89, 1.2, + "{\"Visualization\":null,\"Title\":null,\"XColumn\":null,\"Series\":null,\"YColumns\":null,\"XTitle\":null,\"YTitle\":null,\"XAxis\":null,\"YAxis\":null,\"Legend\":null,\"YSplit\":null,\"Accumulate\":false,\"IsQuerySorted\":false,\"Kind\":null}", "miss" ], [ diff --git a/azure-kusto-data/tests/test_helpers.py b/azure-kusto-data/tests/test_helpers.py index 989a4ea..3d0f0d8 100644 --- a/azure-kusto-data/tests/test_helpers.py +++ b/azure-kusto-data/tests/test_helpers.py @@ -4,6 +4,8 @@ import datetime import json import os +import pytest + from azure.kusto.data._models import KustoResultTable from azure.kusto.data.helpers import dataframe_from_result_table from azure.kusto.data.response import KustoResponseDataSetV2 @@ -20,7 +22,11 @@ with open(os.path.join(os.path.dirname(__file__), "input", "dataframe.json"), "r response = KustoResponseDataSetV2(json.loads(data)) # Test when given both types of dictionary parameters that type conversion doesn't override column name conversion -test_dict_by_name = {"RecordName": lambda col, frame: frame[col].astype("str"), "RecordInt64": lambda col, frame: frame[col].astype("int64")} +test_dict_by_name = { + "RecordName": lambda col, frame: frame[col].astype("str"), + "RecordInt64": lambda col, frame: frame[col].astype("int64"), + "MissingType": lambda col, frame: frame[col].astype("str"), +} test_dict_by_type = {"int": 
lambda col, frame: frame[col].astype("int32")} df = dataframe_from_result_table(response.primary_results[0], converters_by_type=test_dict_by_type, converters_by_column_name=test_dict_by_name) @@ -52,6 +58,8 @@ assert type(df.iloc[0].RecordLong) is numpy.int64 assert df.iloc[0].RecordLong == 92233720368 assert type(df.iloc[0].RecordReal) is numpy.float64 assert df.iloc[0].RecordReal == 3.14159 +assert type(df.iloc[0].RecordDouble) is numpy.float64 +assert df.iloc[0].RecordDouble == 7.89 assert type(df.iloc[0].RecordDecimal) is numpy.float64 assert df.iloc[0].RecordDecimal == 1.2 @@ -90,10 +98,17 @@ assert df.iloc[6].RecordOffset == pandas.to_timedelta("1 days 01:01:01") # Testing int to float conversion test_int_to_float = {"int": "float64"} -df_int_to_float = dataframe_from_result_table(response.primary_results[0], converters_by_type=test_int_to_float) +ignore_missing_type = { + "MissingType": lambda col, frame: frame[col].astype("str"), +} +df_int_to_float = dataframe_from_result_table(response.primary_results[0], converters_by_type=test_int_to_float, converters_by_column_name=ignore_missing_type) assert type(df_int_to_float.iloc[0].RecordInt) is numpy.float64 assert df.iloc[0].RecordInt == 5678 +# Testing missing type conversion +with pytest.raises(Exception): + df_missing_type = dataframe_from_result_table(response.primary_results[0]) + def test_pandas_mixed_date(): df = dataframe_from_result_table(