July'20 release (#216)
* pin pmdarima, matplotlib versions (#212)

  * pin pmdarima to 1.1.1
  * try 1.2.0
  * fix envsetup, revert to ==1.1.1
  * also update test yml
  * fix this script as well
  * pin matplotlib

  Co-authored-by: Chenhui Hu <chenhhu@microsoft.com>

* Testing and refactoring fclib 1 (#214)

  * start utils testing
  * testing ojdata
  * clean
  * rm unneeded file
  * use conftest.py
  * loss fn tests
  * tweak R datagen
  * feature testing
  * feature testing 2
  * test dcnn
  * fine tune
  * modelling tests
  * rm unneeded imports
  * more pred tests
  * set to run code coverage
  * add coveragerc
  * install pytest-cov
  * fixing, removing unneeded cruft
  * rm unused energy lag stuff
  * more fixes
  * yet more fixes
  * more tests
  * split feature tests up
  * use codecov task
  * install .net core for cobertura
  * don't output html
  * no reportDirectory
  * tidying up
  * linting

  Co-authored-by: Hong Ooi <hongooi@microsoft.com>
  Co-authored-by: Chenhui Hu <chenhhu@microsoft.com>
This commit is contained in:
Parent: bc47741cb1
Commit: 12e2044946
@@ -0,0 +1,8 @@
[run]
include =
    fclib/fclib/*

omit =
    fclib/fclib/azureml/*
    fclib/tests/*
    fclib/setup.py
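The [run] block above is the coverage.py configuration that the new pytest-cov pipeline step relies on. A minimal sketch of the same settings driving a programmatic run (the import target is illustrative; coverage.py picks up .coveragerc from the working directory by default):

```python
import coverage

# 'include' limits measurement to fclib/fclib/*; 'omit' drops the azureml code,
# the tests themselves, and setup.py from the report.
cov = coverage.Coverage()  # reads .coveragerc automatically
cov.start()
import fclib  # anything imported or executed here is measured
cov.stop()
cov.report()
```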
@@ -61,7 +61,7 @@ def day_type(datetime_col, holiday_col=None, semi_holiday_offset=timedelta(days=
    datetype = pd.DataFrame({"DayType": datetime_col.dt.dayofweek})
    datetype.replace({"DayType": WEEK_DAY_TYPE_MAP}, inplace=True)

-    if holiday_col:
+    if holiday_col is not None:
        holiday_mask = holiday_col > 0
        datetype.loc[holiday_mask, "DayType"] = HOLIDAY_CODE

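The `is not None` change above matters because truth-testing a whole pandas Series is ambiguous and raises an error; a small sketch with made-up values:

```python
import pandas as pd

holiday_col = pd.Series([1, 0, 0])  # hypothetical holiday indicator column

# if holiday_col:            # would raise ValueError: truth value of a Series is ambiguous
if holiday_col is not None:  # only checks whether a column was passed at all
    holiday_mask = holiday_col > 0
    print(holiday_mask.tolist())  # [True, False, False]
```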
@@ -165,58 +165,58 @@ def day_of_year(date_time_col):
    return date_time_col.dt.dayofyear


-def encoded_month_of_year(month_of_year):
-    """
-    Create one hot encoding of month of year.
-    """
-    month_of_year = pd.get_dummies(month_of_year, prefix="MonthOfYear")
+# def encoded_month_of_year(month_of_year):
+#     """
+#     Create one hot encoding of month of year.
+#     """
+#     month_of_year = pd.get_dummies(month_of_year, prefix="MonthOfYear")

-    return month_of_year
+#     return month_of_year


-def encoded_day_of_week(day_of_week):
-    """
-    Create one hot encoding of day_of_week.
-    """
-    day_of_week = pd.get_dummies(day_of_week, prefix="DayOfWeek")
+# def encoded_day_of_week(day_of_week):
+#     """
+#     Create one hot encoding of day_of_week.
+#     """
+#     day_of_week = pd.get_dummies(day_of_week, prefix="DayOfWeek")

-    return day_of_week
+#     return day_of_week


-def encoded_day_of_month(day_of_month):
-    """
-    Create one hot encoding of day_of_month.
-    """
-    day_of_month = pd.get_dummies(day_of_month, prefix="DayOfMonth")
+# def encoded_day_of_month(day_of_month):
+#     """
+#     Create one hot encoding of day_of_month.
+#     """
+#     day_of_month = pd.get_dummies(day_of_month, prefix="DayOfMonth")

-    return day_of_month
+#     return day_of_month


-def encoded_day_of_year(day_of_year):
-    """
-    Create one hot encoding of day_of_year.
-    """
-    day_of_year = pd.get_dummies(day_of_year)
+# def encoded_day_of_year(day_of_year):
+#     """
+#     Create one hot encoding of day_of_year.
+#     """
+#     day_of_year = pd.get_dummies(day_of_year)

-    return day_of_year
+#     return day_of_year


-def encoded_hour_of_day(hour_of_day):
-    """
-    Create one hot encoding of hour_of_day.
-    """
-    hour_of_day = pd.get_dummies(hour_of_day, prefix="HourOfDay")
+# def encoded_hour_of_day(hour_of_day):
+#     """
+#     Create one hot encoding of hour_of_day.
+#     """
+#     hour_of_day = pd.get_dummies(hour_of_day, prefix="HourOfDay")

-    return hour_of_day
+#     return hour_of_day


-def encoded_week_of_year(week_of_year):
-    """
-    Create one hot encoding of week_of_year.
-    """
-    week_of_year = pd.get_dummies(week_of_year, prefix="WeekOfYear")
+# def encoded_week_of_year(week_of_year):
+#     """
+#     Create one hot encoding of week_of_year.
+#     """
+#     week_of_year = pd.get_dummies(week_of_year, prefix="WeekOfYear")

-    return week_of_year
+#     return week_of_year


def normalized_current_year(datetime_col, min_year, max_year):
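The encoders disabled in the hunk above are thin wrappers around pandas.get_dummies; a hedged sketch of the underlying call (sample dates are illustrative, not from the repo):

```python
import pandas as pd

dates = pd.to_datetime(pd.Series(["2000-01-15", "2000-06-15", "2000-11-15"]))

# One indicator column per observed month: MonthOfYear_1, MonthOfYear_6, MonthOfYear_11
encoded = pd.get_dummies(dates.dt.month, prefix="MonthOfYear")
print(encoded.shape)  # (3, 3)
```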
@@ -236,8 +236,8 @@ def normalized_current_year(datetime_col, min_year, max_year):

    if max_year != min_year:
        current_year = (year - min_year) / (max_year - min_year)
-    elif max_year == min_year:
-        current_year = 0
+    else:
+        current_year = pd.Series([0 for x in range(len(datetime_col))])

    return current_year

@@ -255,13 +255,13 @@ def normalized_current_date(datetime_col, min_date, max_date):
    Returns:
        float: the position of the current date in the min_date:max_date range
    """
-    date = datetime_col.dt.date
-    current_date = (date - min_date).apply(lambda x: x.days)
+    date = datetime_col  # .dt.date
+    current_date = (date - min_date)  # .apply(lambda x: x.days)

    if max_date != min_date:
-        current_date = current_date / (max_date - min_date).days
-    elif max_date == min_date:
-        current_date = 0
+        current_date = current_date / (max_date - min_date)  # .days
+    else:
+        current_date = pd.Series([0 for x in range(len(datetime_col))])

    return current_date

@@ -285,8 +285,8 @@ def normalized_current_datehour(datetime_col, min_datehour, max_datehour):

    if max_min_diff != 0:
        current_datehour = current_datehour / (max_min_diff.days * 24 + max_min_diff.seconds / 3600)
-    elif max_min_diff == 0:
-        current_datehour = 0
+    else:
+        current_datehour = pd.Series([0 for x in range(len(datetime_col))])

    return current_datehour

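All three normalizers above follow the same min-max pattern over the training span; a small illustrative sketch (dates chosen arbitrarily):

```python
import pandas as pd

dates = pd.to_datetime(pd.Series(["2000-01-01", "2010-01-01", "2020-01-01"]))
min_date, max_date = dates.min(), dates.max()

# Position of each timestamp within [min_date, max_date], scaled to [0, 1]
normalized = (dates - min_date) / (max_date - min_date)
print(normalized.round(3).tolist())  # approximately [0.0, 0.5, 1.0]
```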
@ -426,425 +426,6 @@ def daily_fourier(datetime_col, n_harmonics):
|
|||
return output_dict
|
||||
|
||||
|
||||
def same_week_day_hour_lag(
|
||||
datetime_col, value_col, n_years=3, week_window=1, agg_func="mean", q=None, output_colname="SameWeekHourLag"
|
||||
):
|
||||
"""
|
||||
Creates a lag feature by calculating quantiles, mean and std of values of and
|
||||
around the same week, same day of week, and same hour of day, of previous years.
|
||||
|
||||
Args:
|
||||
datetime_col: Datetime column.
|
||||
value_col: Feature value column to create lag feature from.
|
||||
n_years: Number of previous years data to use. Default value 3.
|
||||
week_window: Number of weeks before and after the same week to use,
|
||||
which should help reduce noise in the data. Default value 1.
|
||||
agg_func: Aggregation function to apply on multiple previous values,
|
||||
accepted values are 'mean', 'quantile', 'std'. Default value 'mean'.
|
||||
q: If agg_func is 'quantile', taking value between 0 and 1.
|
||||
output_colname: name of the output lag feature column.
|
||||
Default value 'SameWeekHourLag'.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: data frame containing the newly created lag
|
||||
feature as a column.
|
||||
"""
|
||||
|
||||
if not is_datetime_like(datetime_col):
|
||||
datetime_col = pd.to_datetime(datetime_col, format=DATETIME_FORMAT)
|
||||
min_time_stamp = min(datetime_col)
|
||||
max_time_stamp = max(datetime_col)
|
||||
|
||||
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
|
||||
df.set_index("Datetime", inplace=True)
|
||||
|
||||
week_lag_base = 52
|
||||
week_lag_last_year = list(range(week_lag_base - week_window, week_lag_base + week_window + 1))
|
||||
week_lag_all = []
|
||||
for y in range(n_years):
|
||||
week_lag_all += [x + y * 52 for x in week_lag_last_year]
|
||||
|
||||
week_lag_cols = []
|
||||
for w in week_lag_all:
|
||||
if (max_time_stamp - timedelta(weeks=w)) >= min_time_stamp:
|
||||
col_name = "week_lag_" + str(w)
|
||||
week_lag_cols.append(col_name)
|
||||
|
||||
lag_datetime = df.index.get_level_values(0) - timedelta(weeks=w)
|
||||
valid_lag_mask = lag_datetime >= min_time_stamp
|
||||
|
||||
df[col_name] = np.nan
|
||||
|
||||
df.loc[valid_lag_mask, col_name] = df.loc[lag_datetime[valid_lag_mask], "value"].values
|
||||
|
||||
# Additional aggregation options will be added as needed
|
||||
if agg_func == "mean" and q is None:
|
||||
df[output_colname] = round(df[week_lag_cols].mean(axis=1))
|
||||
elif agg_func == "quantile" and q is not None:
|
||||
df[output_colname] = round(df[week_lag_cols].quantile(q, axis=1))
|
||||
elif agg_func == "std" and q is None:
|
||||
df[output_colname] = round(df[week_lag_cols].std(axis=1))
|
||||
|
||||
return df[[output_colname]]
|
||||
|
||||
|
||||
def same_day_hour_lag(
|
||||
datetime_col, value_col, n_years=3, day_window=1, agg_func="mean", q=None, output_colname="SameDayHourLag"
|
||||
):
|
||||
"""
|
||||
Creates a lag feature by calculating quantiles, mean, and std of values of
|
||||
and around the same day of year, and same hour of day, of previous years.
|
||||
|
||||
Args:
|
||||
datetime_col: Datetime column.
|
||||
value_col: Feature value column to create lag feature from.
|
||||
n_years: Number of previous years data to use. Default value 3.
|
||||
day_window: Number of days before and after the same day to use,
|
||||
which should help reduce noise in the data. Default value 1.
|
||||
agg_func: Aggregation function to apply on multiple previous values,
|
||||
accepted values are 'mean', 'quantile', 'std'. Default value 'mean'.
|
||||
q: If agg_func is 'quantile', taking value between 0 and 1.
|
||||
output_colname: name of the output lag feature column.
|
||||
Default value 'SameDayHourLag'.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: data frame containing the newly created lag
|
||||
feature as a column.
|
||||
"""
|
||||
|
||||
if not is_datetime_like(datetime_col):
|
||||
datetime_col = pd.to_datetime(datetime_col, format=DATETIME_FORMAT)
|
||||
min_time_stamp = min(datetime_col)
|
||||
max_time_stamp = max(datetime_col)
|
||||
|
||||
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
|
||||
df.set_index("Datetime", inplace=True)
|
||||
|
||||
day_lag_base = 365
|
||||
day_lag_last_year = list(range(day_lag_base - day_window, day_lag_base + day_window + 1))
|
||||
day_lag_all = []
|
||||
for y in range(n_years):
|
||||
day_lag_all += [x + y * 365 for x in day_lag_last_year]
|
||||
|
||||
day_lag_cols = []
|
||||
for d in day_lag_all:
|
||||
if (max_time_stamp - timedelta(days=d)) >= min_time_stamp:
|
||||
col_name = "day_lag_" + str(d)
|
||||
day_lag_cols.append(col_name)
|
||||
|
||||
lag_datetime = df.index.get_level_values(0) - timedelta(days=d)
|
||||
valid_lag_mask = lag_datetime >= min_time_stamp
|
||||
|
||||
df[col_name] = np.nan
|
||||
|
||||
df.loc[valid_lag_mask, col_name] = df.loc[lag_datetime[valid_lag_mask], "value"].values
|
||||
|
||||
# Additional aggregation options will be added as needed
|
||||
if agg_func == "mean" and q is None:
|
||||
df[output_colname] = round(df[day_lag_cols].mean(axis=1))
|
||||
elif agg_func == "quantile" and q is not None:
|
||||
df[output_colname] = round(df[day_lag_cols].quantile(q, axis=1))
|
||||
elif agg_func == "std" and q is None:
|
||||
df[output_colname] = round(df[day_lag_cols].std(axis=1))
|
||||
|
||||
return df[[output_colname]]
|
||||
|
||||
|
||||
def same_day_hour_moving_average(
|
||||
datetime_col,
|
||||
value_col,
|
||||
window_size,
|
||||
start_week,
|
||||
average_count,
|
||||
forecast_creation_time,
|
||||
output_col_prefix="moving_average_lag_",
|
||||
):
|
||||
"""
|
||||
Creates moving average features by averaging values of the same day of
|
||||
week and same hour of day of previous weeks.
|
||||
|
||||
Args:
|
||||
datetime_col: Datetime column
|
||||
value_col: Feature value column to create moving average features from.
|
||||
window_size: Number of weeks used to compute the average.
|
||||
start_week: First week of the first moving average feature.
|
||||
average_count: Number of moving average features to create.
|
||||
forecast_creation_time: The time point when the feature is created.
|
||||
This value is used to prevent using data that are not available
|
||||
at forecast creation time to compute features.
|
||||
output_col_prefix: Prefix of the output columns. The start week of each
|
||||
moving average feature is added at the end. Default value 'moving_average_lag_'.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: data frame containing the newly created lag features as
|
||||
columns.
|
||||
|
||||
For example, start_week = 9, window_size=4, and average_count = 3 will
|
||||
create three moving average features.
|
||||
1) moving_average_lag_9: average the same day and hour values of the 9th,
|
||||
10th, 11th, and 12th weeks before the current week.
|
||||
2) moving_average_lag_10: average the same day and hour values of the
|
||||
10th, 11th, 12th, and 13th weeks before the current week.
|
||||
3) moving_average_lag_11: average the same day and hour values of the
|
||||
11th, 12th, 13th, and 14th weeks before the current week.
|
||||
"""
|
||||
|
||||
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
|
||||
df.set_index("Datetime", inplace=True)
|
||||
|
||||
df = df.asfreq("H")
|
||||
|
||||
if not df.index.is_monotonic:
|
||||
df.sort_index(inplace=True)
|
||||
|
||||
df["fct_diff"] = df.index - forecast_creation_time
|
||||
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
|
||||
max_diff = max(df["fct_diff"])
|
||||
|
||||
for i in range(average_count):
|
||||
output_col = output_col_prefix + str(start_week + i)
|
||||
week_lag_start = start_week + i
|
||||
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
|
||||
hour_lags = [h for h in hour_lags if h > max_diff]
|
||||
if hour_lags:
|
||||
tmp_df = df[["value"]].copy()
|
||||
tmp_col_all = []
|
||||
for h in hour_lags:
|
||||
tmp_col = "tmp_lag_" + str(h)
|
||||
tmp_col_all.append(tmp_col)
|
||||
tmp_df[tmp_col] = tmp_df["value"].shift(h)
|
||||
|
||||
df[output_col] = round(tmp_df[tmp_col_all].mean(axis=1))
|
||||
df.drop(["fct_diff", "value"], inplace=True, axis=1)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def same_day_hour_moving_quantile(
|
||||
datetime_col,
|
||||
value_col,
|
||||
window_size,
|
||||
start_week,
|
||||
quantile_count,
|
||||
q,
|
||||
forecast_creation_time,
|
||||
output_col_prefix="moving_quatile_lag_",
|
||||
):
|
||||
"""
|
||||
Creates a series of quantiles features by calculating quantiles of values of
|
||||
the same day of week and same hour of day of previous weeks.
|
||||
|
||||
Args:
|
||||
datetime_col: Datetime column
|
||||
value_col: Feature value column to create quantile features from.
|
||||
window_size: Number of weeks used to compute the quantile.
|
||||
start_week: First week of the first moving quantile feature.
|
||||
quantile_count: Number of quantile features to create.
|
||||
q: quantile to compute from history values, should be between 0 and 1.
|
||||
forecast_creation_time: The time point when the feature is created.
|
||||
This value is used to prevent using data that are not available
|
||||
at forecast creation time to compute features.
|
||||
output_col_prefix: Prefix of the output columns. The start week of each
|
||||
moving average feature is added at the end. Default value 'moving_quatile_lag_'.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: data frame containing the newly created lag features as
|
||||
columns.
|
||||
|
||||
For example, start_week = 9, window_size=4, and quantile_count = 3 will
|
||||
create three quantiles features.
|
||||
1) moving_quantile_lag_9: calculate quantile of the same day and hour values of the 9th,
|
||||
10th, 11th, and 12th weeks before the current week.
|
||||
2) moving_quantile_lag_10: calculate quantile of average the same day and hour values of the
|
||||
10th, 11th, 12th, and 13th weeks before the current week.
|
||||
3) moving_quantile_lag_11: calculate quantile of average the same day and hour values of the
|
||||
11th, 12th, 13th, and 14th weeks before the current week.
|
||||
"""
|
||||
|
||||
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
|
||||
df.set_index("Datetime", inplace=True)
|
||||
|
||||
df = df.asfreq("H")
|
||||
|
||||
if not df.index.is_monotonic:
|
||||
df.sort_index(inplace=True)
|
||||
|
||||
df["fct_diff"] = df.index - forecast_creation_time
|
||||
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
|
||||
max_diff = max(df["fct_diff"])
|
||||
|
||||
for i in range(quantile_count):
|
||||
output_col = output_col_prefix + str(start_week + i)
|
||||
week_lag_start = start_week + i
|
||||
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
|
||||
hour_lags = [h for h in hour_lags if h > max_diff]
|
||||
if hour_lags:
|
||||
tmp_df = df[["value"]].copy()
|
||||
tmp_col_all = []
|
||||
for h in hour_lags:
|
||||
tmp_col = "tmp_lag_" + str(h)
|
||||
tmp_col_all.append(tmp_col)
|
||||
tmp_df[tmp_col] = tmp_df["value"].shift(h)
|
||||
|
||||
df[output_col] = round(tmp_df[tmp_col_all].quantile(q, axis=1))
|
||||
|
||||
df.drop(["fct_diff", "value"], inplace=True, axis=1)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def same_day_hour_moving_std(
|
||||
datetime_col,
|
||||
value_col,
|
||||
window_size,
|
||||
start_week,
|
||||
std_count,
|
||||
forecast_creation_time,
|
||||
output_col_prefix="moving_std_lag_",
|
||||
):
|
||||
"""
|
||||
Creates standard deviation features by calculating std of values of the
|
||||
same day of week and same hour of day of previous weeks.
|
||||
|
||||
Args:
|
||||
datetime_col: Datetime column
|
||||
value_col: Feature value column to create moving std features from.
|
||||
window_size: Number of weeks used to compute the std.
|
||||
start_week: First week of the first moving std feature.
|
||||
std_count: Number of moving std features to create.
|
||||
forecast_creation_time: The time point when the feature is created.
|
||||
This value is used to prevent using data that are not available at
|
||||
forecast creation time to compute features.
|
||||
output_col_prefix: Prefix of the output columns. The start week of each
|
||||
moving average feature is added at the end. Default value 'moving_std_lag_'.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: data frame containing the newly created lag features as
|
||||
columns.
|
||||
|
||||
For example, start_week = 9, window_size=4, and std_count = 3 will
|
||||
create three moving std features.
|
||||
1) moving_std_lag_9: calculate std of the same day and hour values of the 9th,
|
||||
10th, 11th, and 12th weeks before the current week.
|
||||
2) moving_std_lag_10: calculate std of the same day and hour values of the
|
||||
10th, 11th, 12th, and 13th weeks before the current week.
|
||||
3) moving_std_lag_11: calculate std of the same day and hour values of the
|
||||
11th, 12th, 13th, and 14th weeks before the current week.
|
||||
"""
|
||||
|
||||
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
|
||||
df.set_index("Datetime", inplace=True)
|
||||
|
||||
df = df.asfreq("H")
|
||||
|
||||
if not df.index.is_monotonic:
|
||||
df.sort_index(inplace=True)
|
||||
|
||||
df["fct_diff"] = df.index - forecast_creation_time
|
||||
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
|
||||
max_diff = max(df["fct_diff"])
|
||||
|
||||
for i in range(std_count):
|
||||
output_col = output_col_prefix + str(start_week + i)
|
||||
week_lag_start = start_week + i
|
||||
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
|
||||
hour_lags = [h for h in hour_lags if h > max_diff]
|
||||
if hour_lags:
|
||||
tmp_df = df[["value"]].copy()
|
||||
tmp_col_all = []
|
||||
for h in hour_lags:
|
||||
tmp_col = "tmp_lag_" + str(h)
|
||||
tmp_col_all.append(tmp_col)
|
||||
tmp_df[tmp_col] = tmp_df["value"].shift(h)
|
||||
|
||||
df[output_col] = round(tmp_df[tmp_col_all].std(axis=1))
|
||||
|
||||
df.drop(["value", "fct_diff"], inplace=True, axis=1)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def same_day_hour_moving_agg(
|
||||
datetime_col,
|
||||
value_col,
|
||||
window_size,
|
||||
start_week,
|
||||
count,
|
||||
forecast_creation_time,
|
||||
agg_func="mean",
|
||||
q=None,
|
||||
output_col_prefix="moving_agg_lag_",
|
||||
):
|
||||
"""
|
||||
Creates a series of aggregation features by calculating mean, quantiles,
|
||||
or std of values of the same day of week and same hour of day of previous weeks.
|
||||
|
||||
Args:
|
||||
datetime_col: Datetime column
|
||||
value_col: Feature value column to create aggregation features from.
|
||||
window_size: Number of weeks used to compute the aggregation.
|
||||
start_week: First week of the first aggregation feature.
|
||||
count: Number of aggregation features to create.
|
||||
forecast_creation_time: The time point when the feature is created.
|
||||
This value is used to prevent using data that are not available
|
||||
at forecast creation time to compute features.
|
||||
agg_func: Aggregation function to apply on multiple previous values,
|
||||
accepted values are 'mean', 'quantile', 'std'.
|
||||
q: If agg_func is 'quantile', taking value between 0 and 1.
|
||||
output_col_prefix: Prefix of the output columns. The start week of each
|
||||
moving average feature is added at the end. Default value 'moving_agg_lag_'.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: data frame containing the newly created lag features as
|
||||
columns.
|
||||
|
||||
For example, start_week = 9, window_size=4, and count = 3 will
|
||||
create three aggregation of features.
|
||||
1) moving_agg_lag_9: aggregate the same day and hour values of the 9th,
|
||||
10th, 11th, and 12th weeks before the current week.
|
||||
2) moving_agg_lag_10: aggregate the same day and hour values of the
|
||||
10th, 11th, 12th, and 13th weeks before the current week.
|
||||
3) moving_agg_lag_11: aggregate the same day and hour values of the
|
||||
11th, 12th, 13th, and 14th weeks before the current week.
|
||||
"""
|
||||
|
||||
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
|
||||
df.set_index("Datetime", inplace=True)
|
||||
|
||||
df = df.asfreq("H")
|
||||
|
||||
if not df.index.is_monotonic:
|
||||
df.sort_index(inplace=True)
|
||||
|
||||
df["fct_diff"] = df.index - forecast_creation_time
|
||||
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
|
||||
max_diff = max(df["fct_diff"])
|
||||
|
||||
for i in range(count):
|
||||
output_col = output_col_prefix + str(start_week + i)
|
||||
week_lag_start = start_week + i
|
||||
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
|
||||
hour_lags = [h for h in hour_lags if h > max_diff]
|
||||
if hour_lags:
|
||||
tmp_df = df[["value"]].copy()
|
||||
tmp_col_all = []
|
||||
for h in hour_lags:
|
||||
tmp_col = "tmp_lag_" + str(h)
|
||||
tmp_col_all.append(tmp_col)
|
||||
tmp_df[tmp_col] = tmp_df["value"].shift(h)
|
||||
|
||||
if agg_func == "mean" and q is None:
|
||||
df[output_col] = round(tmp_df[tmp_col_all].mean(axis=1))
|
||||
elif agg_func == "quantile" and q is not None:
|
||||
df[output_col] = round(tmp_df[tmp_col_all].quantile(q, axis=1))
|
||||
elif agg_func == "std" and q is None:
|
||||
df[output_col] = round(tmp_df[tmp_col_all].std(axis=1))
|
||||
|
||||
df.drop(["fct_diff", "value"], inplace=True, axis=1)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def df_from_cartesian_product(dict_in):
|
||||
"""Generate a Pandas dataframe from Cartesian product of lists.
|
||||
|
||||
|
@@ -1075,17 +656,17 @@ def add_datetime(input_datetime, unit, add_count):
        Exception: if invalid unit is provided. Valid units are:
            'year', 'month', 'week', 'day', 'hour', 'minute'.
    """
-    if unit == "Y":
+    if unit == "year":
        new_datetime = input_datetime + relativedelta(years=add_count)
-    elif unit == "M":
+    elif unit == "month":
        new_datetime = input_datetime + relativedelta(months=add_count)
-    elif unit == "W":
+    elif unit == "week":
        new_datetime = input_datetime + relativedelta(weeks=add_count)
-    elif unit == "D":
+    elif unit == "day":
        new_datetime = input_datetime + relativedelta(days=add_count)
-    elif unit == "h":
+    elif unit == "hour":
        new_datetime = input_datetime + relativedelta(hours=add_count)
-    elif unit == "m":
+    elif unit == "minute":
        new_datetime = input_datetime + relativedelta(minutes=add_count)
    else:
        raise Exception(
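Spelling the units out makes the call sites self-documenting; a quick sketch of the underlying dateutil behaviour these branches rely on (dates are illustrative):

```python
import datetime
from dateutil.relativedelta import relativedelta

start = datetime.datetime(2000, 1, 31)

print(start + relativedelta(months=1))  # 2000-02-29 00:00:00 (clamped to the month end)
print(start + relativedelta(weeks=2))   # 2000-02-14 00:00:00
```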
@@ -19,7 +19,7 @@ def create_dcnn_model(
    kernel_size=2,
    n_filters=3,
    dropout_rate=0.1,
-    max_cat_id=[1e3, 1e3],
+    max_cat_id=[100, 100],
):
    """Create a Dilated CNN model.

@@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import os
import pandas as pd
import numpy as np
import pytest
from itertools import product


class GenerateData:
    @staticmethod
    def ojdata(start=50, stop=61):
        keyvars = {"store": [1, 2], "brand": [1, 2, 3], "week": list(range(start, stop))}
        df = pd.DataFrame([row for row in product(*keyvars.values())], columns=keyvars.keys())

        n = len(df)
        np.random.seed(12345)
        df["constant"] = 1
        df["logmove"] = np.random.normal(9, 1, n)
        df["price1"] = np.random.normal(0.55, 0.003, n)
        df["price2"] = np.random.normal(0.55, 0.003, n)
        df["price3"] = np.random.normal(0.55, 0.003, n)
        df["price4"] = np.random.normal(0.55, 0.003, n)
        df["price5"] = np.random.normal(0.55, 0.003, n)
        df["price6"] = np.random.normal(0.55, 0.003, n)
        df["price7"] = np.random.normal(0.55, 0.003, n)
        df["price8"] = np.random.normal(0.55, 0.003, n)
        df["price9"] = np.random.normal(0.55, 0.003, n)
        df["price10"] = np.random.normal(0.55, 0.003, n)
        df["price11"] = np.random.normal(0.55, 0.003, n)
        df["deal"] = np.random.binomial(1, 0.5, n)
        df["feat"] = np.random.binomial(1, 0.25, n)
        df["profit"] = np.random.normal(30, 7.5, n)
        return df


@pytest.fixture(scope="session")
def generate_ojdata():

    # data file that will be created and deleted each time test is run
    ojdata_csv = "fclib/tests/resources/ojdatagen.csv"
    df = GenerateData.ojdata()
    df.to_csv(ojdata_csv, index_label=False, index=False)

    yield generate_ojdata

    # teardown code
    try:
        os.remove(ojdata_csv)
        os.remove(os.path.dirname(ojdata_csv) + "/yx.csv")
    except Exception:
        pass


@pytest.fixture
def generate_data():
    return GenerateData
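A hedged sketch of how a test consumes these fixtures (the test name and assertions below are illustrative, not taken from the repo):

```python
import pandas as pd


def test_ojdata_fixture_roundtrip(generate_ojdata, generate_data):
    # generate_ojdata has already written the synthetic CSV into the resources folder
    data = pd.read_csv("fclib/tests/resources/ojdatagen.csv")
    assert {"store", "brand", "week", "logmove"}.issubset(data.columns)

    # generate_data exposes the GenerateData helper class directly
    newdata = generate_data.ojdata(61, 70)
    assert newdata.week.between(61, 69).all()
```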
@ -1,34 +0,0 @@
|
|||
#!/usr/bin/Rscript
|
||||
#
|
||||
# Source entire R testing files
|
||||
#
|
||||
# Note that we first define a function to source entire folder including R files. Then,
|
||||
# we simply source all .R files within the specified folder.
|
||||
|
||||
## Define a function to source entire folder
|
||||
source_entire_folder <- function(folderName, verbose=FALSE, showWarnings=TRUE) {
|
||||
# Find all .R files within a folder and soruces them
|
||||
#
|
||||
# Args:
|
||||
# folderName: Name of the folder including R files to be sourced.
|
||||
# verbose: If TRUE, print message; if not, not. Default is FALSE.
|
||||
#
|
||||
# Returns:
|
||||
# NULL.
|
||||
files <- list.files(folderName, full.names=FALSE)
|
||||
# Grab only R files that start with the word 'test'
|
||||
files <- files[grepl("\\.[rR]$", files)]
|
||||
files <- files[grepl("^test", files)]
|
||||
if (!length(files) && showWarnings)
|
||||
warning("No R test files in ", folderName)
|
||||
for (f in files) {
|
||||
if (verbose)
|
||||
cat("sourcing: ", f, "\n")
|
||||
## TODO: add caught whether error or not and return that
|
||||
try(source(paste(folderName, f, sep='/'), local=FALSE, echo=FALSE), silent=!verbose)
|
||||
}
|
||||
return(invisible(NULL))
|
||||
}
|
||||
|
||||
## Source all .R files within the folder of tests/unit
|
||||
source_entire_folder('./tests/unit', verbose=TRUE)
|
|
@ -1,47 +0,0 @@
|
|||
#!/usr/bin/Rscript
|
||||
#
|
||||
# Test download retail data
|
||||
#
|
||||
# Note that we define a function to test download_data.r file in retail benchmarking,
|
||||
# based on output checking.
|
||||
|
||||
## Define a function to test download_data.r file in retail benchmarking.
|
||||
test_download_retail_data <- function() {
|
||||
# Test download_data.r file in retail benchmarking
|
||||
#
|
||||
# Args:
|
||||
# NULL.
|
||||
#
|
||||
# Returns:
|
||||
# NULL.
|
||||
BENCHMARK_DIR <- file.path('./retail_sales', 'OrangeJuice_Pt_3Weeks_Weekly')
|
||||
DATA_DIR <- file.path(BENCHMARK_DIR, 'data')
|
||||
SCRIPT_PATH <- file.path(BENCHMARK_DIR, 'common', 'download_data.r')
|
||||
# Call data download script
|
||||
source(SCRIPT_PATH)
|
||||
# Check downloaded data
|
||||
sales <- read.csv(file.path(DATA_DIR, 'yx.csv'))
|
||||
if(all(dim(sales) == c(106139, 19)) == FALSE) {
|
||||
stop("There is something wrong")
|
||||
}
|
||||
column_names <- c('store', 'brand', 'week', 'logmove', 'constant',
|
||||
'price1', 'price2', 'price3', 'price4', 'price5',
|
||||
'price6', 'price7', 'price8', 'price9', 'price10',
|
||||
'price11', 'deal', 'feat', 'profit')
|
||||
if(all(colnames(sales) == column_names) == FALSE) {
|
||||
stop("There is something wrong")
|
||||
}
|
||||
storedemo <- read.csv(file.path(DATA_DIR, 'storedemo.csv'))
|
||||
if(all(dim(storedemo) == c(83, 12)) == FALSE) {
|
||||
stop("There is something wrong")
|
||||
}
|
||||
column_names <- c('STORE', 'AGE60', 'EDUC', 'ETHNIC', 'INCOME',
|
||||
'HHLARGE', 'WORKWOM', 'HVAL150', 'SSTRDIST',
|
||||
'SSTRVOL', 'CPDIST5', 'CPWVOL5')
|
||||
if(all(colnames(storedemo) == column_names) == FALSE) {
|
||||
stop("There is something wrong")
|
||||
}
|
||||
}
|
||||
|
||||
## Test download_data.r file in retail benchmarking.
|
||||
test_download_retail_data()
|
|
@ -1,64 +0,0 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from fclib.dataset.ojdata import download_ojdata
|
||||
|
||||
|
||||
def test_download_retail_data():
|
||||
|
||||
DATA_FILE_LIST = ["yx.csv", "storedemo.csv"]
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
print("Created temporary directory", tmpdirname)
|
||||
|
||||
# Download the data to the temp directory
|
||||
download_ojdata(tmpdirname)
|
||||
# Check downloaded data
|
||||
DATA_DIM_LIST = [(106139, 19), (83, 12)]
|
||||
COLUMN_NAME_LIST = [
|
||||
[
|
||||
"store",
|
||||
"brand",
|
||||
"week",
|
||||
"logmove",
|
||||
"constant",
|
||||
"price1",
|
||||
"price2",
|
||||
"price3",
|
||||
"price4",
|
||||
"price5",
|
||||
"price6",
|
||||
"price7",
|
||||
"price8",
|
||||
"price9",
|
||||
"price10",
|
||||
"price11",
|
||||
"deal",
|
||||
"feat",
|
||||
"profit",
|
||||
],
|
||||
[
|
||||
"STORE",
|
||||
"AGE60",
|
||||
"EDUC",
|
||||
"ETHNIC",
|
||||
"INCOME",
|
||||
"HHLARGE",
|
||||
"WORKWOM",
|
||||
"HVAL150",
|
||||
"SSTRDIST",
|
||||
"SSTRVOL",
|
||||
"CPDIST5",
|
||||
"CPWVOL5",
|
||||
],
|
||||
]
|
||||
for idx, f in enumerate(DATA_FILE_LIST):
|
||||
file_path = os.path.join(tmpdirname, f)
|
||||
assert os.path.exists(file_path)
|
||||
df = pd.read_csv(file_path, index_col=None)
|
||||
assert df.shape == DATA_DIM_LIST[idx]
|
||||
assert list(df) == COLUMN_NAME_LIST[idx]
|
|
@@ -0,0 +1,19 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from fclib.models.dilated_cnn import create_dcnn_model


def test_create_dcnn_model():
    mod0 = create_dcnn_model(seq_len=1)  # default args
    assert mod0 is not None

    mod1 = create_dcnn_model(
        seq_len=1, n_dyn_fea=1, n_outputs=2, n_dilated_layers=1, kernel_size=2, dropout_rate=0.05, max_cat_id=[30, 120]
    )
    assert mod1 is not None

    mod2 = create_dcnn_model(
        seq_len=1, n_dyn_fea=1, n_outputs=2, n_dilated_layers=2, kernel_size=2, dropout_rate=0.05, max_cat_id=[30, 120]
    )
    assert mod2 is not None
@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import numpy as np
import pandas as pd
from fclib.evaluation.evaluation_utils import MAPE, sMAPE, pinball_loss


y = np.array([1, 2, 3])
yhat = np.array([1.1, 2.2, 3.3])
TOLERANCE = 1e-5


def test_MAPE():
    assert abs(MAPE(yhat, y) - 0.1) < TOLERANCE


def test_sMAPE():
    assert abs(sMAPE(yhat, y) - 0.04761904) < TOLERANCE


def test_pinball_loss():
    df = pd.DataFrame({"yhat": yhat, "y": y})
    assert all(abs(pinball_loss(df.yhat, df.y, 0.5) - pd.Series([0.05, 0.1, 0.15])) < TOLERANCE)
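For reference, the expected values in these tests follow directly from the metric definitions; a sketch of the arithmetic in plain numpy (mirroring, not importing, the library functions, and using the sMAPE convention that matches the asserted value):

```python
import numpy as np

y = np.array([1, 2, 3])
yhat = np.array([1.1, 2.2, 3.3])

mape = np.mean(np.abs((y - yhat) / y))                          # every term is 0.1 -> 0.1
smape = np.mean(np.abs(y - yhat) / (np.abs(y) + np.abs(yhat)))  # every term is 0.1 / 2.1 -> ~0.047619
print(mape, smape)
```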
@ -0,0 +1,119 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pandas as pd
|
||||
import datetime
|
||||
|
||||
|
||||
from fclib.feature_engineering.feature_utils import (
|
||||
is_datetime_like,
|
||||
day_type,
|
||||
hour_of_day,
|
||||
time_of_year,
|
||||
week_of_year,
|
||||
week_of_month,
|
||||
month_of_year,
|
||||
day_of_week,
|
||||
day_of_month,
|
||||
day_of_year,
|
||||
)
|
||||
|
||||
|
||||
def test_is_datetime_like():
|
||||
st = "2000-01-01"
|
||||
assert not is_datetime_like(st)
|
||||
|
||||
dt = datetime.datetime.now()
|
||||
assert is_datetime_like(dt)
|
||||
|
||||
pdt = pd.DatetimeIndex(["2000-01-01"])
|
||||
assert is_datetime_like(pdt)
|
||||
|
||||
pts = pd.Timestamp("2000-01-01T12:00:00")
|
||||
assert is_datetime_like(pts)
|
||||
|
||||
d = datetime.date(2000, 1, 1)
|
||||
assert is_datetime_like(d)
|
||||
|
||||
|
||||
def test_day_type():
|
||||
dates = pd.to_datetime(pd.Series(["2000-01-01", "2000-01-02", "2000-01-03"]))
|
||||
hols = pd.Series([True, False, False])
|
||||
|
||||
dty = day_type(dates)
|
||||
assert all(dty == [5, 6, 0])
|
||||
|
||||
dty2 = day_type(dates, hols)
|
||||
assert all(dty2 == [7, 8, 0])
|
||||
|
||||
|
||||
# date component extractors
|
||||
|
||||
sample_date = pd.to_datetime(pd.Series(["2000-01-01 12:30:59"]))
|
||||
|
||||
|
||||
def test_hour_of_day():
|
||||
dates = sample_date
|
||||
assert all(hour_of_day(dates) == 12)
|
||||
|
||||
|
||||
def test_time_of_year():
|
||||
dates = sample_date
|
||||
tyr = time_of_year(dates)
|
||||
assert all(tyr >= 0 and tyr <= 1)
|
||||
|
||||
|
||||
def test_week_of_year():
|
||||
dates = sample_date
|
||||
assert week_of_year(dates)[0] == 52 # first day of 2000 is in last week of 1999
|
||||
|
||||
|
||||
def test_week_of_month():
|
||||
dates = sample_date
|
||||
assert week_of_month(dates)[0] == 1 # first day of 2000 is in first month of 2000
|
||||
|
||||
|
||||
def test_month_of_year():
|
||||
dates = sample_date
|
||||
assert month_of_year(dates)[0] == 1
|
||||
|
||||
|
||||
def test_day_of_week():
|
||||
dates = sample_date
|
||||
assert day_of_week(dates)[0] == 5
|
||||
|
||||
|
||||
def test_day_of_month():
|
||||
dates = sample_date
|
||||
assert day_of_month(dates)[0] == 1
|
||||
|
||||
|
||||
def test_day_of_year():
|
||||
dates = sample_date
|
||||
assert day_of_year(dates)[0] == 1
|
||||
|
||||
|
||||
# def test_encoded_month_of_year():
|
||||
# dates = sample_date
|
||||
# enc = encoded_month_of_year(dates)
|
||||
# assert len(enc.columns) == 12
|
||||
|
||||
# def test_encoded_day_of_week():
|
||||
# dates = sample_date
|
||||
# enc = encoded_day_of_week(dates)
|
||||
# assert len(enc.columns) == 7
|
||||
|
||||
# def test_encoded_day_of_year():
|
||||
# dates = sample_date
|
||||
# enc = encoded_day_of_year(dates)
|
||||
# assert len(enc.columns) >= 365
|
||||
|
||||
# def test_encoded_hour_of_day():
|
||||
# dates = sample_date
|
||||
# enc = encoded_hour_of_day(dates)
|
||||
# assert len(enc.columns) == 24
|
||||
|
||||
# def test_encoded_week_of_year():
|
||||
# dates = sample_date
|
||||
# enc = encoded_week_of_year(dates)
|
||||
# assert len(enc.columns) == 53
|
|
@ -0,0 +1,100 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pandas as pd
|
||||
import datetime
|
||||
import pytest
|
||||
|
||||
|
||||
from fclib.feature_engineering.feature_utils import (
|
||||
df_from_cartesian_product,
|
||||
lagged_features,
|
||||
moving_averages,
|
||||
combine_features,
|
||||
gen_sequence_array,
|
||||
static_feature_array,
|
||||
normalize_columns,
|
||||
get_datetime_col,
|
||||
get_month_day_range,
|
||||
add_datetime,
|
||||
)
|
||||
|
||||
# misc utilities
|
||||
|
||||
|
||||
def test_df_from_cartesian_product():
|
||||
d = {"x1": [1, 2, 3], "x2": [4, 5, 6], "x3": ["a", "b", "c"]}
|
||||
df = df_from_cartesian_product(d)
|
||||
assert len(df) == 27
|
||||
assert list(df.columns) == ["x1", "x2", "x3"]
|
||||
|
||||
|
||||
def test_lagged_features():
|
||||
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "x3": ["a", "b", "c"]})
|
||||
dflag = lagged_features(df, [1, 2])
|
||||
assert dflag.shape == (3, 6)
|
||||
assert all(pd.isna(dflag.iloc[0, :]))
|
||||
|
||||
|
||||
def test_moving_averages():
|
||||
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6]})
|
||||
dfma = moving_averages(df, 1, 2)
|
||||
assert dfma.shape == (3, 2)
|
||||
assert all(pd.isna(dfma.iloc[0, :]))
|
||||
|
||||
|
||||
def test_combine_features():
|
||||
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6]})
|
||||
dfcomb = combine_features(df, ["x1", "x2"], [1, 2], 2, ["x1", "x2"])
|
||||
assert dfcomb.shape == (3, 8)
|
||||
|
||||
|
||||
def test_gen_sequence_array():
|
||||
val = pd.Series(x for x in range(8))
|
||||
df0 = df_from_cartesian_product({"x1": [1, 2], "x2": [1, 2, 3, 4]})
|
||||
df = pd.concat([val.to_frame("y"), df0], axis=1)
|
||||
arr = gen_sequence_array(df, 2, ["y"], "x1", "x2")
|
||||
assert len(arr) == 8
|
||||
|
||||
|
||||
def test_static_feature_array():
|
||||
val = pd.Series(x for x in range(8))
|
||||
df0 = df_from_cartesian_product({"x1": [1, 2], "x2": [1, 2, 3, 4]})
|
||||
df = pd.concat([val.to_frame("y"), df0], axis=1)
|
||||
arr = static_feature_array(df, 8, ["x1", "x2"], "x1", "x2")
|
||||
assert len(arr) == 8
|
||||
|
||||
|
||||
def test_normalize_columns():
|
||||
df = pd.Series((x * 1.0) for x in range(20)).to_frame("x")
|
||||
(sc, _) = normalize_columns(df, ["x"])
|
||||
assert len(sc) == len(df)
|
||||
assert all(sc["x"] >= 0) and all(sc["x"] <= 1)
|
||||
|
||||
|
||||
def test_get_datetime_col():
|
||||
df = pd.DataFrame({"x1": ["2001-01-01", "2001-01-02", "2001-01-03"], "x2": [1, 2, 3], "x3": ["a", "b", "c"]})
|
||||
dt1 = get_datetime_col(df, "x1")
|
||||
assert len(dt1) == 3
|
||||
|
||||
with pytest.raises(Exception):
|
||||
get_datetime_col(df, "x3")
|
||||
|
||||
|
||||
def test_get_month_day_range():
|
||||
x = datetime.datetime(2000, 1, 15)
|
||||
(first, last) = get_month_day_range(x)
|
||||
assert first == datetime.datetime(2000, 1, 1, 0, 0)
|
||||
assert last == datetime.datetime(2000, 1, 31, 23, 0)
|
||||
|
||||
|
||||
def test_add_datetime():
|
||||
x = datetime.datetime(2000, 1, 1)
|
||||
xy = add_datetime(x, "year", 1)
|
||||
assert xy == datetime.datetime(2001, 1, 1)
|
||||
|
||||
xm = add_datetime(x, "month", 1)
|
||||
assert xm == datetime.datetime(2000, 2, 1)
|
||||
|
||||
xd = add_datetime(x, "day", 1)
|
||||
assert xd == datetime.datetime(2000, 1, 2)
|
|
@ -0,0 +1,63 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
|
||||
from fclib.feature_engineering.feature_utils import (
|
||||
normalized_current_year,
|
||||
normalized_current_date,
|
||||
normalized_current_datehour,
|
||||
normalized_columns,
|
||||
)
|
||||
|
||||
# normalization functions
|
||||
|
||||
sample_date = pd.to_datetime(pd.Series(["2000-01-01 12:30:59"]))
|
||||
|
||||
|
||||
def test_normalized_current_year():
|
||||
dates = sample_date
|
||||
nyr = normalized_current_year(dates, 1980, 2020)
|
||||
assert all(nyr >= 0) and all(nyr <= 1)
|
||||
|
||||
bad = normalized_current_year(dates, 2000, 2000)
|
||||
assert len(bad) == len(dates)
|
||||
|
||||
|
||||
def test_normalized_current_date():
|
||||
dates = sample_date
|
||||
span = pd.to_datetime(pd.Series(["1980-01-01 00:00:00", "2020-01-01 23:59:59"]))
|
||||
ndt = normalized_current_date(dates, span[0], span[1])
|
||||
assert all(ndt >= 0) and all(ndt <= 1)
|
||||
|
||||
badspan = pd.to_datetime(pd.Series(["2000-01-01 00:00:00", "2000-01-01 00:00:00"]))
|
||||
bad = normalized_current_date(dates, badspan[0], badspan[1])
|
||||
assert len(bad) == len(dates)
|
||||
|
||||
|
||||
def test_normalized_current_datehour():
|
||||
dates = sample_date
|
||||
span = pd.to_datetime(pd.Series(["1980-01-01 00:00:00", "2020-01-01 23:59:59"]))
|
||||
ndt = normalized_current_datehour(dates, span[0], span[1])
|
||||
assert all(ndt >= 0) and all(ndt <= 1)
|
||||
|
||||
badspan = pd.to_datetime(pd.Series(["2000-01-01 00:00:00", "2000-01-01 00:00:00"]))
|
||||
bad = normalized_current_datehour(dates, badspan[0], badspan[1])
|
||||
assert len(bad) == len(dates)
|
||||
|
||||
|
||||
def test_normalized_columns():
|
||||
dates = pd.to_datetime(pd.Series(["2000-01-01", "2000-01-02", "2000-01-03"]))
|
||||
vals = pd.Series([1, 2, 3])
|
||||
|
||||
nc1 = normalized_columns(dates, vals, mode="log")
|
||||
assert type(nc1).__name__ == "DataFrame"
|
||||
assert nc1.columns[0] == "normalized_columns"
|
||||
|
||||
nc2 = normalized_columns(dates, vals, mode="minmax")
|
||||
assert all(nc2["normalized_columns"] >= 0) and all(nc2["normalized_columns"] <= 1)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
normalized_columns(dates, vals, mode="foo")
|
|
@@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd
import datetime


from fclib.feature_engineering.feature_utils import annual_fourier, weekly_fourier, daily_fourier, fourier_approximation

# Fourier stuff


def test_fourier_approximation():
    dates = pd.Series([x for x in range(1, 366)])
    (fsin, fcos) = fourier_approximation(dates, 1, 365.24)
    assert len(fsin) == len(dates)
    assert len(fcos) == len(dates)
    assert all(abs(fsin) <= 1) and all(abs(fcos) <= 1)


def test_annual_fourier():
    dates = pd.to_datetime(pd.Series([datetime.date(2000, 1, 1) + datetime.timedelta(days=x) for x in range(365)]))
    fa = annual_fourier(dates, 5)
    assert len(fa) == 10


def test_weekly_fourier():
    dates = pd.to_datetime(pd.Series([datetime.date(2000, 1, 1) + datetime.timedelta(days=x) for x in range(365)]))
    fw = weekly_fourier(dates, 5)
    assert len(fw) == 10


def test_daily_fourier():
    dates = pd.to_datetime(pd.Series([datetime.date(2000, 1, 1) + datetime.timedelta(days=x) for x in range(365)]))
    fd = daily_fourier(dates, 5)
    assert len(fd) == 10
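The Fourier features exercised here are the usual sine/cosine pairs of a seasonal cycle; a hedged sketch of the n-th harmonic (this mirrors the idea, not the exact library signature):

```python
import numpy as np


def fourier_pair(t, n, period):
    # n-th harmonic of a cycle with the given period (e.g. 365.24 for annual seasonality)
    angle = 2 * np.pi * n * t / period
    return np.sin(angle), np.cos(angle)


t = np.arange(1, 366)
fsin, fcos = fourier_pair(t, 1, 365.24)
print(abs(fsin).max() <= 1, abs(fcos).max() <= 1)  # True True
```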
@@ -0,0 +1,23 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd
import lightgbm as lgb

from fclib.models.lightgbm import predict


def test_predict(generate_ojdata, generate_data):
    data = pd.read_csv("fclib/tests/resources/ojdatagen.csv")
    newdata = generate_data.ojdata(61, 70)

    params = {"objective": "mape"}
    target = "logmove"

    lgb_data = lgb.Dataset(data.drop(columns=[target]), label=data[target])
    lgb_model = lgb.train(params, lgb_data, valid_sets=[lgb_data])

    predint = predict(newdata, lgb_model, target, ["store", "brand"], True)
    assert predint.logmove.dtype.name == "int64"
    predfloat = predict(newdata.drop(columns=[target]), lgb_model, "logmove", ["store", "brand"], False)
    assert predfloat.logmove.dtype.name == "float64"
@ -0,0 +1,116 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from fclib.dataset.ojdata import download_ojdata, complete_and_fill_df, _gen_split_indices, split_train_test
|
||||
|
||||
|
||||
# data file that will be created and deleted each time test is run
|
||||
ojdata_csv = "fclib/tests/resources/ojdatagen.csv"
|
||||
|
||||
|
||||
def test_download_retail_data():
|
||||
|
||||
DATA_FILE_LIST = ["yx.csv", "storedemo.csv"]
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
print("Created temporary directory", tmpdirname)
|
||||
|
||||
# Download the data to the temp directory
|
||||
download_ojdata(tmpdirname)
|
||||
# Check downloaded data
|
||||
DATA_DIM_LIST = [(106139, 19), (83, 12)]
|
||||
COLUMN_NAME_LIST = [
|
||||
[
|
||||
"store",
|
||||
"brand",
|
||||
"week",
|
||||
"logmove",
|
||||
"constant",
|
||||
"price1",
|
||||
"price2",
|
||||
"price3",
|
||||
"price4",
|
||||
"price5",
|
||||
"price6",
|
||||
"price7",
|
||||
"price8",
|
||||
"price9",
|
||||
"price10",
|
||||
"price11",
|
||||
"deal",
|
||||
"feat",
|
||||
"profit",
|
||||
],
|
||||
[
|
||||
"STORE",
|
||||
"AGE60",
|
||||
"EDUC",
|
||||
"ETHNIC",
|
||||
"INCOME",
|
||||
"HHLARGE",
|
||||
"WORKWOM",
|
||||
"HVAL150",
|
||||
"SSTRDIST",
|
||||
"SSTRVOL",
|
||||
"CPDIST5",
|
||||
"CPWVOL5",
|
||||
],
|
||||
]
|
||||
for idx, f in enumerate(DATA_FILE_LIST):
|
||||
file_path = os.path.join(tmpdirname, f)
|
||||
assert os.path.exists(file_path)
|
||||
df = pd.read_csv(file_path, index_col=None)
|
||||
assert df.shape == DATA_DIM_LIST[idx]
|
||||
assert list(df) == COLUMN_NAME_LIST[idx]
|
||||
|
||||
|
||||
def test_complete_and_fill_df(generate_ojdata):
|
||||
ojdata = pd.read_csv(ojdata_csv, index_col=False)
|
||||
|
||||
base_out = complete_and_fill_df(ojdata, stores=[1, 2], brands=[1, 2, 3], weeks=list(range(50, 61)))
|
||||
assert ojdata.equals(base_out)
|
||||
|
||||
# remove cells
|
||||
cell_na = ojdata
|
||||
cell_na.loc[3:5, "logmove"] = np.nan
|
||||
cell_out = complete_and_fill_df(cell_na, stores=[1, 2], brands=[1, 2, 3], weeks=list(range(50, 61)))
|
||||
assert not any(pd.isna(cell_out["logmove"]))
|
||||
|
||||
# remove rows
|
||||
row_na = ojdata
|
||||
row_na.drop(3, axis=0)
|
||||
row_out = complete_and_fill_df(row_na, stores=[1, 2], brands=[1, 2, 3], weeks=list(range(50, 61)))
|
||||
assert len(row_out) == len(ojdata)
|
||||
|
||||
|
||||
def test_gen_split_indices(generate_ojdata):
|
||||
base = _gen_split_indices()
|
||||
assert len(base) == 3
|
||||
assert all([len(i) == 12 for i in base])
|
||||
|
||||
small = _gen_split_indices(3, 2, 0, 50, 60)
|
||||
assert all([len(i) == 3 for i in small])
|
||||
|
||||
|
||||
def test_split_train_test(generate_ojdata):
|
||||
resdir = os.path.dirname(ojdata_csv)
|
||||
shutil.copyfile(ojdata_csv, resdir + "/yx.csv")
|
||||
|
||||
(traindf, testdf, auxdf) = split_train_test(resdir, 1, 2, 1, 50, 60)
|
||||
assert len(traindf) == 1
|
||||
assert len(testdf) == 1
|
||||
assert len(auxdf) == 1
|
||||
|
||||
(traindf, testdf, auxdf) = split_train_test(resdir, 3, 2, 1, 50, 60)
|
||||
assert len(traindf) == 3
|
||||
assert len(testdf) == 3
|
||||
assert len(auxdf) == 3
|
||||
|
||||
for i in list(range(3)):
|
||||
assert max(traindf[i].week) < min(testdf[i].week)
|
|
@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd

from fclib.common.plot import plot_predictions_with_history


def test_plot_predictions_with_history(generate_ojdata, generate_data):
    data = pd.read_csv("fclib/tests/resources/ojdatagen.csv")
    pred = generate_data.ojdata(61, 70)
    # implicit assert no-exceptions
    plot_predictions_with_history(
        data,
        pred,
        grain1_unique_vals=[1, 2],
        grain2_unique_vals=[1, 2, 3],
        time_col_name="week",
        target_col_name="logmove",
        grain1_name="store",
        grain2_name="brand",
        min_timestep=min(data.week),
        num_samples=4,
    )
@@ -0,0 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd

from fclib.models.multiple_linear_regression import fit, predict


def test_fit_and_predict(generate_ojdata, generate_data):
    data = pd.read_csv("fclib/tests/resources/ojdatagen.csv")
    newdata = generate_data.ojdata(61, 70)

    keyvars = ["store", "brand"]
    xvars = ["price1", "price2", "price3"]
    target = "logmove"

    mods = fit(data, keyvars, xvars, target)
    predint = predict(newdata, mods, "week", keyvars, xvars, False, True)
    assert predint.prediction.dtype.name == "int64"
    predfloat = predict(newdata, mods, "week", keyvars, xvars, False, False)
    assert predfloat.prediction.dtype.name == "float64"
@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from fclib.common.utils import git_repo_path, module_exists, system_type, module_path


def test_git_repo_path():
    # implicitly test for no exception
    assert git_repo_path() is not None


def test_module_exists():
    assert module_exists("numpy")
    assert not module_exists("fakepkgxyz")


def test_system_type():
    assert system_type() in ["linux", "mac", "win"]


def test_module_path():
    # look for binaries we use in this repo
    assert module_path("forecasting_env", "python") != ""
    assert module_path("forecasting_env", "tensorboard") != ""
@@ -41,7 +41,7 @@ jobs:
      yes | conda env create -n forecasting_env -f tools/environment.yml
      eval "$(conda shell.bash hook)" && conda activate forecasting_env
      pip install -e fclib
-      pip install ray>=0.8.2
+      pip install "ray>=0.8.2"
      echo "Conda env installed."
    displayName: 'Creating conda environment with dependencies'
|
|
|
@@ -41,7 +41,8 @@ jobs:
      yes | conda env create -n forecasting_env -f tools/environment.yml
      eval "$(conda shell.bash hook)" && conda activate forecasting_env
      pip install -e fclib
-      pip install ray>=0.8.2
+      pip install "ray>=0.8.2"
+      pip install pytest-cov
      echo "Conda env installed."
    displayName: 'Creating Conda Environment with dependencies'
||||
|
@@ -49,9 +50,20 @@ jobs:
      sudo ln -sf /usr/lib/R/modules/lapack.so /usr/lib/libRlapack.so
      eval "$(conda shell.bash hook)" && conda activate forecasting_env
      python -m ipykernel install --user --name forecasting_env
-      pytest --durations=0 fclib/tests -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml
+      pytest --durations=0 fclib/tests -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml --cov=fclib --cov-report=xml
    displayName: 'Run Unit tests'

+  # required for Cobertura
+  - task: UseDotNet@2
+    displayName: 'Use .NET Core sdk'
+    inputs:
+      version: 2.x
+
+  - task: PublishCodeCoverageResults@1
+    inputs:
+      codeCoverageTool: Cobertura
+      summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
+
  - task: PublishTestResults@2
    inputs:
      testResultsFiles: '**/test-unitttest.xml'
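For parity with this pipeline step, the same coverage run can be reproduced locally; a hedged sketch using pytest's Python entry point (flags copied from the step above, assuming pytest-cov is installed and the command is run from the repo root):

```python
import pytest

# Local equivalent of the CI step: run the fclib unit tests with coverage enabled
pytest.main([
    "--durations=0",
    "fclib/tests",
    "-m", "not notebooks and not gpu and not azureml",
    "--cov=fclib",
    "--cov-report=xml",
])
```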
|
|
|
@@ -28,7 +28,7 @@ dependencies:
  - pytest>=3.6.4
  - pylint
  - papermill>=1.0.1
- - matplotlib>=3.1.2
+ - matplotlib=3.1.2
  - r-base>=3.3.0
  - pip:
    - black>=18.6b4
||||
|
@@ -39,5 +39,5 @@ dependencies:
    - tensorboard==2.1.0
    - nteract-scrapbook==0.3.1
    - statsmodels>=0.11.1
-    - pmdarima>=1.1.1
+    - pmdarima==1.1.1
    - azureml-sdk[automl,notebooks]==1.0.85
|
|
|
@@ -17,7 +17,7 @@ eval "$(conda shell.bash hook)" && conda activate forecasting_env
pip install -e fclib

# Install ray (available only on Linux and MacOS)
-pip install ray>=0.8.2
+pip install 'ray>=0.8.2'

# Register conda environment in Jupyter
python -m ipykernel install --user --name forecasting_env
|
|