From 69da685d1e56490ba607dee6874e473040a626b9 Mon Sep 17 00:00:00 2001
From: Li Jiang
Date: Tue, 29 Oct 2024 11:58:40 +0800
Subject: [PATCH] Fix data transform issue, spark log_loss metric compute
 error and json dumps TypeError (Sync Fabric till 3c545e67) (#1371)

* Merged PR 1444697: Fix json dumps TypeError

Fix json dumps TypeError

----

Bug fix to address a `TypeError` in `json.dumps`.

This pull request fixes a `TypeError` encountered when using `json.dumps` on `automl._automl_user_configurations` by introducing a safe JSON serialization function.

- Added `safe_json_dumps` function in `flaml/fabric/mlflow.py` to handle non-serializable objects.
- Updated `MLflowIntegration` class in `flaml/fabric/mlflow.py` to use `safe_json_dumps` for JSON serialization.
- Modified `test/automl/test_multiclass.py` to test the new `safe_json_dumps` function.

Related work items: #3439408

* Fix data transform issue and spark log_loss metric compute error
---
 flaml/automl/data.py                |  2 +-
 flaml/automl/spark/metrics.py       | 18 +++++++++++++++++-
 flaml/automl/time_series/ts_data.py |  2 +-
 flaml/fabric/mlflow.py              |  9 ++++++++-
 test/automl/test_multiclass.py      |  4 ++--
 5 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/flaml/automl/data.py b/flaml/automl/data.py
index f18d9e829..747236dad 100644
--- a/flaml/automl/data.py
+++ b/flaml/automl/data.py
@@ -293,7 +293,7 @@ class DataTransformer:
             y = y.rename(TS_VALUE_COL)
         for column in X.columns:
             # sklearn\utils\validation.py needs int/float values
-            if X[column].dtype.name in ("object", "category"):
+            if X[column].dtype.name in ("object", "category", "string"):
                 if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
                     X.drop(columns=column, inplace=True)
                     drop = True
diff --git a/flaml/automl/spark/metrics.py b/flaml/automl/spark/metrics.py
index 674c5e5e4..7cb85c09d 100644
--- a/flaml/automl/spark/metrics.py
+++ b/flaml/automl/spark/metrics.py
@@ -1,3 +1,4 @@
+import json
 from typing import Union
 
 import numpy as np
@@ -9,7 +10,7 @@ from pyspark.ml.evaluation import (
     RegressionEvaluator,
 )
 
-from flaml.automl.spark import F, psSeries
+from flaml.automl.spark import F, T, psDataFrame, psSeries, sparkDataFrame
 
 
 def ps_group_counts(groups: Union[psSeries, np.ndarray]) -> np.ndarray:
@@ -36,6 +37,16 @@ def _compute_label_from_probability(df, probability_col, prediction_col):
     return df
 
 
+def string_to_array(s):
+    try:
+        return json.loads(s)
+    except json.JSONDecodeError:
+        return []
+
+
+string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))
+
+
 def spark_metric_loss_score(
     metric_name: str,
     y_predict: psSeries,
@@ -135,6 +146,11 @@ def spark_metric_loss_score(
         )
     elif metric_name == "log_loss":
         # For log_loss, prediction_col should be probability, and we need to convert it to label
+        # handle data like "{'type': '1', 'values': '[1, 2, 3]'}"
+        # Fix cannot resolve "array_max(prediction)" due to data type mismatch: Parameter 1 requires the "ARRAY" type,
+        # however "prediction" has the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>"
+        df = df.withColumn(prediction_col, df[prediction_col].cast(T.StringType()))
+        df = df.withColumn(prediction_col, string_to_array_udf(df[prediction_col]))
         df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
         evaluator = MulticlassClassificationEvaluator(
             metricName="logLoss",
diff --git a/flaml/automl/time_series/ts_data.py b/flaml/automl/time_series/ts_data.py
index 95cb91f83..2587a70e7 100644
--- a/flaml/automl/time_series/ts_data.py
+++ b/flaml/automl/time_series/ts_data.py
@@ -393,7 +393,7 @@ class DataTransformerTS:
 
         for column in X.columns:
             # sklearn/utils/validation.py needs int/float values
-            if X[column].dtype.name in ("object", "category"):
+            if X[column].dtype.name in ("object", "category", "string"):
                 if (
                     # drop columns where all values are the same
                     X[column].nunique() == 1
diff --git a/flaml/fabric/mlflow.py b/flaml/fabric/mlflow.py
index 6786ec583..4cfb0e2cd 100644
--- a/flaml/fabric/mlflow.py
+++ b/flaml/fabric/mlflow.py
@@ -127,6 +127,13 @@ def _get_notebook_name():
         return None
 
 
+def safe_json_dumps(obj):
+    def default(o):
+        return str(o)
+
+    return json.dumps(obj, default=default)
+
+
 class MLflowIntegration:
     def __init__(self, experiment_type="automl", mlflow_exp_name=None, extra_tag=None):
         try:
@@ -438,7 +445,7 @@ class MLflowIntegration:
                 "flaml.meric": automl_metric_name,
                 "flaml.run_source": "flaml-automl",
                 "flaml.log_type": self.log_type,
-                "flaml.automl_user_configurations": json.dumps(automl._automl_user_configurations),
+                "flaml.automl_user_configurations": safe_json_dumps(automl._automl_user_configurations),
             },
             "params": {
                 "sample_size": search_state.sample_size,
diff --git a/test/automl/test_multiclass.py b/test/automl/test_multiclass.py
index 1641b5f84..9be63cff6 100644
--- a/test/automl/test_multiclass.py
+++ b/test/automl/test_multiclass.py
@@ -187,7 +187,6 @@ class TestMultiClass(unittest.TestCase):
     def test_custom_metric(self):
         df, y = load_iris(return_X_y=True, as_frame=True)
         df["label"] = y
-        automl = AutoML()
         settings = {
             "dataframe": df,
             "label": "label",
@@ -204,7 +203,8 @@
             "pred_time_limit": 1e-5,
             "ensemble": True,
         }
-        automl.fit(**settings)
+        automl = AutoML(**settings)  # test safe_json_dumps
+        automl.fit(dataframe=df, label="label")
         print(automl.classes_)
         print(automl.model)
         print(automl.config_history)
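Reviewer note: a minimal standalone sketch of the behavior `safe_json_dumps` introduces, for verifying the fix without checking out the branch. The helper is copied verbatim from the diff above; `my_metric` and `config` are hypothetical stand-ins for the non-serializable entries (such as a custom metric callable) that can appear in `automl._automl_user_configurations`:

```python
import json


# Copied from the patch: json.dumps with a default hook that falls back to
# str() for any object the encoder cannot serialize natively.
def safe_json_dumps(obj):
    def default(o):
        return str(o)

    return json.dumps(obj, default=default)


# Hypothetical stand-in for a user-supplied custom metric callable.
def my_metric(*args, **kwargs):
    return 0.0


config = {"metric": my_metric, "time_budget": 10}

# json.dumps(config) raises "TypeError: Object of type function is not JSON
# serializable"; the safe variant stringifies the callable instead.
print(safe_json_dumps(config))
```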
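Similarly, a sketch of the parse-or-fallback behavior of the `string_to_array` helper behind the log_loss fix. The function is copied verbatim from the diff; the sample strings are illustrative only, since in the real pipeline the input comes from casting the Spark probability column to a string before the UDF (registered with return type `T.ArrayType(T.DoubleType())`) is applied:

```python
import json


# Copied from the patch: parse a JSON-encoded array string into a Python
# list; anything that is not valid JSON degrades to an empty list.
def string_to_array(s):
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        return []


# A well-formed probability array string parses into a list of floats.
print(string_to_array("[0.1, 0.7, 0.2]"))  # [0.1, 0.7, 0.2]

# A struct-style string with single quotes is not valid JSON, so the
# fallback returns [] rather than propagating JSONDecodeError.
print(string_to_array("{'type': '1', 'values': '[1, 2, 3]'}"))  # []
```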