зеркало из https://github.com/microsoft/FLAML.git
Fix data transform issue, spark log_loss metric compute error and json dumps TypeError (Sync Fabric till 3c545e67) (#1371)
* Merged PR 1444697: Fix json dumps TypeError ---- Bug fix to address a `TypeError` in `json.dumps`. This pull request fixes a `TypeError` encountered when using `json.dumps` on `automl._automl_user_configurations` by introducing a safe JSON serialization function. - Added `safe_json_dumps` function in `flaml/fabric/mlflow.py` to handle non-serializable objects. - Updated `MLflowIntegration` class in `flaml/fabric/mlflow.py` to use `safe_json_dumps` for JSON serialization. - Modified `test/automl/test_multiclass.py` to test the new `safe_json_dumps` function. Related work items: #3439408 * Fix data transform issue and spark log_loss metric compute error
This commit is contained in:
Родитель
c01c3910eb
Коммит
69da685d1e
|
@ -293,7 +293,7 @@ class DataTransformer:
|
|||
y = y.rename(TS_VALUE_COL)
|
||||
for column in X.columns:
|
||||
# sklearn\utils\validation.py needs int/float values
|
||||
if X[column].dtype.name in ("object", "category"):
|
||||
if X[column].dtype.name in ("object", "category", "string"):
|
||||
if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
|
||||
X.drop(columns=column, inplace=True)
|
||||
drop = True
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import json
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
|
@ -9,7 +10,7 @@ from pyspark.ml.evaluation import (
|
|||
RegressionEvaluator,
|
||||
)
|
||||
|
||||
from flaml.automl.spark import F, psSeries
|
||||
from flaml.automl.spark import F, T, psDataFrame, psSeries, sparkDataFrame
|
||||
|
||||
|
||||
def ps_group_counts(groups: Union[psSeries, np.ndarray]) -> np.ndarray:
|
||||
|
@ -36,6 +37,16 @@ def _compute_label_from_probability(df, probability_col, prediction_col):
|
|||
return df
|
||||
|
||||
|
||||
def string_to_array(s):
    """Deserialize *s* as JSON, falling back to an empty list.

    Used to recover array-valued prediction columns that were cast to
    string; any string that is not valid JSON yields ``[]`` instead of
    raising.
    """
    try:
        parsed = json.loads(s)
    except json.JSONDecodeError:
        parsed = []
    return parsed
|
||||
|
||||
|
||||
# Spark UDF wrapping string_to_array so a stringified probability column can
# be converted back to ARRAY<DOUBLE> (see the log_loss branch below).
string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))
|
||||
|
||||
|
||||
def spark_metric_loss_score(
|
||||
metric_name: str,
|
||||
y_predict: psSeries,
|
||||
|
@ -135,6 +146,11 @@ def spark_metric_loss_score(
|
|||
)
|
||||
elif metric_name == "log_loss":
|
||||
# For log_loss, prediction_col should be probability, and we need to convert it to label
|
||||
# handle data like "{'type': '1', 'values': '[1, 2, 3]'}"
|
||||
# Fix cannot resolve "array_max(prediction)" due to data type mismatch: Parameter 1 requires the "ARRAY" type,
|
||||
# however "prediction" has the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>"
|
||||
df = df.withColumn(prediction_col, df[prediction_col].cast(T.StringType()))
|
||||
df = df.withColumn(prediction_col, string_to_array_udf(df[prediction_col]))
|
||||
df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
|
||||
evaluator = MulticlassClassificationEvaluator(
|
||||
metricName="logLoss",
|
||||
|
|
|
@ -393,7 +393,7 @@ class DataTransformerTS:
|
|||
|
||||
for column in X.columns:
|
||||
# sklearn/utils/validation.py needs int/float values
|
||||
if X[column].dtype.name in ("object", "category"):
|
||||
if X[column].dtype.name in ("object", "category", "string"):
|
||||
if (
|
||||
# drop columns where all values are the same
|
||||
X[column].nunique() == 1
|
||||
|
|
|
@ -127,6 +127,13 @@ def _get_notebook_name():
|
|||
return None
|
||||
|
||||
|
||||
def safe_json_dumps(obj):
    """Serialize *obj* to a JSON string without raising ``TypeError``.

    Any value the standard encoder cannot handle is rendered via ``str``,
    so arbitrary user configuration objects serialize to something loggable.
    """
    # json.dumps calls the ``default`` hook only for non-serializable values,
    # so passing ``str`` directly is equivalent to a wrapper returning str(o).
    return json.dumps(obj, default=str)
|
||||
|
||||
|
||||
class MLflowIntegration:
|
||||
def __init__(self, experiment_type="automl", mlflow_exp_name=None, extra_tag=None):
|
||||
try:
|
||||
|
@ -438,7 +445,7 @@ class MLflowIntegration:
|
|||
"flaml.meric": automl_metric_name,
|
||||
"flaml.run_source": "flaml-automl",
|
||||
"flaml.log_type": self.log_type,
|
||||
"flaml.automl_user_configurations": json.dumps(automl._automl_user_configurations),
|
||||
"flaml.automl_user_configurations": safe_json_dumps(automl._automl_user_configurations),
|
||||
},
|
||||
"params": {
|
||||
"sample_size": search_state.sample_size,
|
||||
|
|
|
@ -187,7 +187,6 @@ class TestMultiClass(unittest.TestCase):
|
|||
def test_custom_metric(self):
|
||||
df, y = load_iris(return_X_y=True, as_frame=True)
|
||||
df["label"] = y
|
||||
automl = AutoML()
|
||||
settings = {
|
||||
"dataframe": df,
|
||||
"label": "label",
|
||||
|
@ -204,7 +203,8 @@ class TestMultiClass(unittest.TestCase):
|
|||
"pred_time_limit": 1e-5,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(**settings)
|
||||
automl = AutoML(**settings) # test safe_json_dumps
|
||||
automl.fit(dataframe=df, label="label")
|
||||
print(automl.classes_)
|
||||
print(automl.model)
|
||||
print(automl.config_history)
|
||||
|
|
Загрузка…
Ссылка в новой задаче