Bug Fix: Data Diagnosis - Fix failure test bug and pandas warnings in data diagnosis (#638)
**Description**

Fix the failure test bug and pandas warnings in data diagnosis.

**Major Revision**
- Fix pandas warnings in `replace` and `fillna` caused by implicit dtype downcasting.
- Fix a bug where the failure check function only checked one matched metric rather than all matched metrics.
- Fix a bug when converting metric names into their regex-shortened form when the pattern has more than one match group.
Parent: 46a5792915
Commit: 7af75df392
@@ -243,7 +243,7 @@ def aggregate(raw_data_df, pattern=None):
                 match = re.search(pattern, metric)
                 if match:
                     metric_in_list = list(metric)
-                    for i in range(1, len(match.groups()) + 1):
+                    for i in range(len(match.groups()), 0, -1):
                         metric_in_list[match.start(i):match.end(i)] = '*'
                     short = ''.join(metric_in_list)
             if short not in metric_store:
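The reversed range is the multi-group fix: replacing a capture group with a single `*` shortens the character list, so the recorded `match.start(i)`/`match.end(i)` offsets of every group to its right go stale. Walking the groups from last to first only ever edits positions to the right of the groups still pending, so their offsets stay valid. A minimal standalone sketch with a made-up metric name and pattern (the `shorten` helper is for illustration only, not a function from the repo):

```python
import re


def shorten(metric, pattern):
    """Replace every capture group in `metric` with '*', as the fixed loop does."""
    match = re.search(pattern, metric)
    if not match:
        return metric
    metric_in_list = list(metric)
    # Iterate groups right-to-left so earlier groups' offsets remain valid
    # even after the list shrinks.
    for i in range(len(match.groups()), 0, -1):
        metric_in_list[match.start(i):match.end(i)] = '*'
    return ''.join(metric_in_list)


# Hypothetical metric with two numeric capture groups.
pattern = r'gpu_(\d+):bandwidth_(\d+)'
print(shorten('gpu_12:bandwidth_345', pattern))    # -> gpu_*:bandwidth_*
# The old forward loop replaced group 1 first, shifting group 2's offsets and
# producing a wrong result such as 'gpu_*:bandwidth_3*'.
```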
@@ -262,9 +262,8 @@ class DataDiagnosis(RuleBase):
                 all_data_df = data_not_accept_df[[
                     append_columns[index]
                 ]].merge(all_data_df, left_index=True, right_index=True, how='right')
-        all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, True)
-        all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].replace(np.nan, 0)
-        all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].astype(int)
+        all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, 1).astype('bool')
+        all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].replace(np.nan, 0).astype('int')
 
         return all_data_df
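Chaining `replace` with an explicit `astype` removes the dependence on pandas downcasting the result of `replace` implicitly, which newer pandas releases flag with a FutureWarning. A small sketch with a made-up merge result standing in for `all_data_df`:

```python
import numpy as np
import pandas as pd

# Rows without issues carry NaN after the right-merge, so the columns end up
# as object/float rather than bool/int.
all_data_df = pd.DataFrame({
    'Accept': [True, np.nan, False],
    'Number Of Issues': [2, np.nan, 1],
})

# Fill the gaps and cast explicitly instead of letting replace() downcast.
all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, 1).astype('bool')
all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].replace(np.nan, 0).astype('int')

print(all_data_df.dtypes)    # Accept -> bool, Number Of Issues -> int
```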
@@ -296,7 +295,7 @@ class DataDiagnosis(RuleBase):
             data_not_accept_df (DataFrame): the DataFrame to output
             output_path (str): the path of output jsonl file
         """
-        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').fillna(self.na)
+        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').infer_objects().fillna(self.na)
         p = Path(output_path)
         try:
             data_not_accept_json = data_not_accept_df.to_json(orient='index')
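The added `infer_objects()` call is the explicit conversion that pandas' FutureWarning about downcasting object-dtype arrays in `fillna` asks for: after `astype('object')`, letting pandas re-infer concrete dtypes first means `fillna` no longer has to downcast anything implicitly. A minimal sketch with made-up columns, using `na` in place of `self.na`:

```python
import pandas as pd

# Made-up diagnosis output with missing values.
data_not_accept_df = pd.DataFrame({
    'kernel-launch/event_time': [0.0055, None],
    'Category': ['KernelLaunch', None],
})
na = ''    # stand-in for self.na

# infer_objects() restores concrete dtypes where possible before the fill,
# so fillna() does not need to downcast object columns behind the scenes.
clean = data_not_accept_df.convert_dtypes().astype('object').infer_objects().fillna(na)
print(clean.dtypes)
```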
@@ -327,7 +326,7 @@ class DataDiagnosis(RuleBase):
             data_not_accept_df (DataFrame): the DataFrame to output
             output_path (str): the path of output jsonl file
         """
-        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').fillna(self.na)
+        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').infer_objects().fillna(self.na)
         data_not_accept_df = data_not_accept_df.reset_index()
         data_not_accept_df = data_not_accept_df.rename(
             columns={
@@ -378,7 +377,7 @@ class DataDiagnosis(RuleBase):
             data_not_accept_df = data_analysis.round_significant_decimal_places(
                 data_not_accept_df, round, [metric]
             )
-        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').fillna(self.na)
+        data_not_accept_df = data_not_accept_df.convert_dtypes().astype('object').infer_objects().fillna(self.na)
         lines = file_handler.generate_md_table(data_not_accept_df, header)
         return lines
@@ -239,19 +239,22 @@ class RuleOp:
         violated_metric_num = 0
         for metric_regex in raw_rule['metrics']:
             match = False
+            violate = False
             for metric in rule['metrics']:
                 if re.search(metric_regex, metric):
                     match = True
                     # metric not in raw_data or the value is none, miss test
-                    if metric not in data_row or pd.isna(data_row[metric]):
-                        violated_metric_num += 1
-                        break
+                    if RuleOp.miss_test(metric, rule, data_row, details, categories):
+                        violate = True
             # metric_regex written in rules is not matched by any metric, miss test
             if not match:
-                violated_metric_num += 1
+                violate = True
+                RuleOp.add_categories_and_details(metric_regex + '_miss', rule['categories'], details, categories)
+            if violate:
+                violated_metric_num += 1
         # return code != 0, failed test
         violated_metric_num += RuleOp.value(data_row, rule, summary_data_row, details, categories)
         details[:] = list(dict.fromkeys(details))    # remove duplicate details
         return violated_metric_num
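The reworked loop is what turns "check one matched metric" into "check all matched metrics": instead of bumping the counter and breaking on the first missing metric, a per-regex `violate` flag is set by every matched metric that misses the test (and by a regex that matches nothing at all), and the counter is incremented at most once per rule entry. A simplified standalone sketch of that control flow; it does not reproduce the real failure-check method on `RuleOp` and uses a plain `None` check in place of `RuleOp.miss_test`:

```python
import re


def count_violations(metric_regexes, available_metrics, data_row):
    """Count rule entries that are violated, checking every matched metric."""
    violated_metric_num = 0
    details = []
    for metric_regex in metric_regexes:
        match = False
        violate = False
        for metric in available_metrics:
            if re.search(metric_regex, metric):
                match = True
                # metric missing from the row or value is None -> miss test
                if metric not in data_row or data_row[metric] is None:
                    violate = True
                    details.append(metric + '_miss')
        if not match:
            # the regex in the rule matched no metric at all -> also a miss
            violate = True
            details.append(metric_regex + '_miss')
        if violate:
            # at most one violation per rule entry, however many metrics missed
            violated_metric_num += 1
    return violated_metric_num, details


# Hypothetical run: two GPUs report bandwidth, one value is missing,
# and the 'cpu_bw' rule matches nothing.
metrics = ['gpu_0_bw', 'gpu_1_bw']
row = {'gpu_0_bw': 123.4, 'gpu_1_bw': None}
print(count_violations([r'gpu_\d+_bw', r'cpu_bw'], metrics, row))
# -> (2, ['gpu_1_bw_miss', 'cpu_bw_miss'])
```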