Fix data transform issue, Spark log_loss metric compute error, and json.dumps TypeError (Sync Fabric till 3c545e67) (#1371)

* Merged PR 1444697: Fix json dumps TypeError

----
Bug fix to address a `TypeError` in `json.dumps`.

This pull request fixes a `TypeError` encountered when using `json.dumps` on `automl._automl_user_configurations` by introducing a safe JSON serialization function.
- Added `safe_json_dumps` function in `flaml/fabric/mlflow.py` to handle non-serializable objects (see the sketch after this list).
- Updated `MLflowIntegration` class in `flaml/fabric/mlflow.py` to use `safe_json_dumps` for JSON serialization.
- Modified `test/automl/test_multiclass.py` to test the new `safe_json_dumps` function.
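
For context, here is the failure mode and the fix in miniature, as a runnable sketch (the `config` dict below is illustrative, not the actual `automl._automl_user_configurations` payload):

```python
import json


def safe_json_dumps(obj):
    # Fall back to str() for any value the default JSON encoder cannot handle.
    def default(o):
        return str(o)

    return json.dumps(obj, default=default)


# Illustrative config: a callable (e.g. a custom metric) is not JSON serializable.
config = {"metric": "accuracy", "custom_metric": lambda y_true, y_pred: 0.0}

# json.dumps(config) raises:
#   TypeError: Object of type function is not JSON serializable
print(safe_json_dumps(config))  # the lambda is rendered via str(); no TypeError
```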

Related work items: #3439408

* Fix data transform issue and Spark log_loss metric compute error

Author: Li Jiang, 2024-10-29 11:58:40 +08:00, committed via GitHub
Parent: c01c3910eb
Commit: 69da685d1e
5 changed files: 29 additions and 6 deletions


```diff
@@ -293,7 +293,7 @@ class DataTransformer:
             y = y.rename(TS_VALUE_COL)
         for column in X.columns:
             # sklearn\utils\validation.py needs int/float values
-            if X[column].dtype.name in ("object", "category"):
+            if X[column].dtype.name in ("object", "category", "string"):
                 if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
                     X.drop(columns=column, inplace=True)
                     drop = True
```

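The dtype-check change above matters because pandas has a dedicated `string` dtype that is distinct from `object`, so such columns previously bypassed the drop logic. A minimal sketch (the column name and values are illustrative):

```python
import pandas as pd

# A column built with the dedicated pandas "string" dtype rather than "object".
X = pd.DataFrame({"season": pd.array(["spring", "summer", "fall"], dtype="string")})

print(X["season"].dtype.name)                                      # string
print(X["season"].dtype.name in ("object", "category"))            # False: old check misses it
print(X["season"].dtype.name in ("object", "category", "string"))  # True: fixed check catches it
```
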

```diff
@@ -1,3 +1,4 @@
+import json
 from typing import Union

 import numpy as np
@@ -9,7 +10,7 @@ from pyspark.ml.evaluation import (
     RegressionEvaluator,
 )

-from flaml.automl.spark import F, psSeries
+from flaml.automl.spark import F, T, psDataFrame, psSeries, sparkDataFrame


 def ps_group_counts(groups: Union[psSeries, np.ndarray]) -> np.ndarray:
@@ -36,6 +37,16 @@ def _compute_label_from_probability(df, probability_col, prediction_col):
     return df


+def string_to_array(s):
+    try:
+        return json.loads(s)
+    except json.JSONDecodeError:
+        return []
+
+
+string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))
+
+
 def spark_metric_loss_score(
     metric_name: str,
     y_predict: psSeries,
@@ -135,6 +146,11 @@ def spark_metric_loss_score(
         )
     elif metric_name == "log_loss":
         # For log_loss, prediction_col should be probability, and we need to convert it to label
+        # handle data like "{'type': '1', 'values': '[1, 2, 3]'}"
+        # Fix cannot resolve "array_max(prediction)" due to data type mismatch: Parameter 1 requires the "ARRAY" type,
+        # however "prediction" has the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>"
+        df = df.withColumn(prediction_col, df[prediction_col].cast(T.StringType()))
+        df = df.withColumn(prediction_col, string_to_array_udf(df[prediction_col]))
         df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
         evaluator = MulticlassClassificationEvaluator(
             metricName="logLoss",
```

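The `log_loss` fix works by serializing the struct-typed probability column to its string form and parsing it back into `ARRAY<DOUBLE>`, so array functions such as `array_max` can resolve. A minimal runnable sketch, starting from the already-stringified column (the SparkSession setup and column name are illustrative):

```python
import json

from pyspark.sql import SparkSession, functions as F, types as T

spark = SparkSession.builder.master("local[1]").appName("log-loss-fix").getOrCreate()


def string_to_array(s):
    # Parse a string like "[0.1, 0.2, 0.7]" into a list of floats;
    # return an empty list when the string is not valid JSON.
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        return []


string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))

# Probability column after the StringType cast performed in the fix above.
df = spark.createDataFrame([("[0.1, 0.2, 0.7]",), ("[0.8, 0.1, 0.1]",)], ["prediction"])
df = df.withColumn("prediction", string_to_array_udf(F.col("prediction")))

# "prediction" is now ARRAY<DOUBLE>, so array_max no longer hits a type mismatch.
df.select(F.array_max("prediction").alias("max_prob")).show()
```
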

```diff
@@ -393,7 +393,7 @@ class DataTransformerTS:
         for column in X.columns:
             # sklearn/utils/validation.py needs int/float values
-            if X[column].dtype.name in ("object", "category"):
+            if X[column].dtype.name in ("object", "category", "string"):
                 if (
                     # drop columns where all values are the same
                     X[column].nunique() == 1
```


```diff
@@ -127,6 +127,13 @@ def _get_notebook_name():
         return None


+def safe_json_dumps(obj):
+    def default(o):
+        return str(o)
+
+    return json.dumps(obj, default=default)
+
+
 class MLflowIntegration:
     def __init__(self, experiment_type="automl", mlflow_exp_name=None, extra_tag=None):
         try:
@@ -438,7 +445,7 @@ class MLflowIntegration:
                     "flaml.meric": automl_metric_name,
                     "flaml.run_source": "flaml-automl",
                     "flaml.log_type": self.log_type,
-                    "flaml.automl_user_configurations": json.dumps(automl._automl_user_configurations),
+                    "flaml.automl_user_configurations": safe_json_dumps(automl._automl_user_configurations),
                 },
                 "params": {
                     "sample_size": search_state.sample_size,
```


```diff
@@ -187,7 +187,6 @@ class TestMultiClass(unittest.TestCase):
     def test_custom_metric(self):
         df, y = load_iris(return_X_y=True, as_frame=True)
         df["label"] = y
-        automl = AutoML()
         settings = {
             "dataframe": df,
             "label": "label",
@@ -204,7 +203,8 @@
             "pred_time_limit": 1e-5,
             "ensemble": True,
         }
-        automl.fit(**settings)
+        automl = AutoML(**settings)  # test safe_json_dumps
+        automl.fit(dataframe=df, label="label")
         print(automl.classes_)
         print(automl.model)
         print(automl.config_history)
```