* pin pmdarima, matplotlib versions (#212)

* pin pmdarima to 1.1.1

* try 1.2.0

* fix envsetup, revert to ==1.1.1

* also update test yml

* fix this script as well

* pin matplotlib

Co-authored-by: Chenhui Hu <chenhhu@microsoft.com>

* Testing and refactoring fclib 1 (#214)

* start utils testing

* testing ojdata

* clean

* rm unneeded file

* use conftest.py

* loss fn tests

* tweak R datagen

* feature testing

* feature testing 2

* test dcnn

* fine tune

* modelling tests

* rm unneeded imports

* more pred tests

* set to run code coverage

* add coveragerc

* install pytest-cov

* fixing, removing unneeded cruft

* rm unused energy lag stuff

* more fixes

* yet more fixes

* more tests

* split feature tests up

* use codecov task

* install .net core for cobertura

* don't output html

* no reportDirectory

* tidying up

* linting

Co-authored-by: Hong Ooi <hongooi@microsoft.com>
Co-authored-by: Chenhui Hu <chenhhu@microsoft.com>
vapaunic 2020-07-16 11:20:25 -07:00 committed by GitHub
Parent: bc47741cb1
Commit: 12e2044946
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
23 changed files with 706 additions and 623 deletions

.coveragerc (new file, 8 lines)
View file

@@ -0,0 +1,8 @@
[run]
include =
fclib/fclib/*
omit =
fclib/fclib/azureml/*
fclib/tests/*
fclib/setup.py
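This config limits measurement to the library code under fclib/fclib, skipping the AzureML helpers, the tests themselves, and setup.py. A sketch of driving the same config through coverage.py's Python API (the CI pipeline below uses pytest --cov instead):

import coverage

# uses the [run] include/omit rules from the .coveragerc above
cov = coverage.Coverage(config_file=".coveragerc")
cov.start()
# ... import and exercise fclib code here ...
cov.stop()
cov.xml_report()  # writes Cobertura-style coverage.xml, as the pipeline publishes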

View file

@@ -61,7 +61,7 @@ def day_type(datetime_col, holiday_col=None, semi_holiday_offset=timedelta(days=
datetype = pd.DataFrame({"DayType": datetime_col.dt.dayofweek})
datetype.replace({"DayType": WEEK_DAY_TYPE_MAP}, inplace=True)
if holiday_col:
if holiday_col is not None:
holiday_mask = holiday_col > 0
datetype.loc[holiday_mask, "DayType"] = HOLIDAY_CODE
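The change from "if holiday_col:" to "if holiday_col is not None:" is needed because truth-testing a pandas Series raises; a minimal sketch of the failure (illustrative values):

import pandas as pd

holiday_col = pd.Series([1, 0, 0])
# bool(holiday_col), i.e. "if holiday_col:", raises
# ValueError: The truth value of a Series is ambiguous.
if holiday_col is not None:  # explicit None check works for Series and None alike
    holiday_mask = holiday_col > 0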
@@ -165,58 +165,58 @@ def day_of_year(date_time_col):
return date_time_col.dt.dayofyear
def encoded_month_of_year(month_of_year):
"""
Create one hot encoding of month of year.
"""
month_of_year = pd.get_dummies(month_of_year, prefix="MonthOfYear")
# def encoded_month_of_year(month_of_year):
# """
# Create one hot encoding of month of year.
# """
# month_of_year = pd.get_dummies(month_of_year, prefix="MonthOfYear")
return month_of_year
# return month_of_year
def encoded_day_of_week(day_of_week):
"""
Create one hot encoding of day_of_week.
"""
day_of_week = pd.get_dummies(day_of_week, prefix="DayOfWeek")
# def encoded_day_of_week(day_of_week):
# """
# Create one hot encoding of day_of_week.
# """
# day_of_week = pd.get_dummies(day_of_week, prefix="DayOfWeek")
return day_of_week
# return day_of_week
def encoded_day_of_month(day_of_month):
"""
Create one hot encoding of day_of_month.
"""
day_of_month = pd.get_dummies(day_of_month, prefix="DayOfMonth")
# def encoded_day_of_month(day_of_month):
# """
# Create one hot encoding of day_of_month.
# """
# day_of_month = pd.get_dummies(day_of_month, prefix="DayOfMonth")
return day_of_month
# return day_of_month
def encoded_day_of_year(day_of_year):
"""
Create one hot encoding of day_of_year.
"""
day_of_year = pd.get_dummies(day_of_year)
# def encoded_day_of_year(day_of_year):
# """
# Create one hot encoding of day_of_year.
# """
# day_of_year = pd.get_dummies(day_of_year)
return day_of_year
# return day_of_year
def encoded_hour_of_day(hour_of_day):
"""
Create one hot encoding of hour_of_day.
"""
hour_of_day = pd.get_dummies(hour_of_day, prefix="HourOfDay")
# def encoded_hour_of_day(hour_of_day):
# """
# Create one hot encoding of hour_of_day.
# """
# hour_of_day = pd.get_dummies(hour_of_day, prefix="HourOfDay")
return hour_of_day
# return hour_of_day
def encoded_week_of_year(week_of_year):
"""
Create one hot encoding of week_of_year.
"""
week_of_year = pd.get_dummies(week_of_year, prefix="WeekOfYear")
# def encoded_week_of_year(week_of_year):
# """
# Create one hot encoding of week_of_year.
# """
# week_of_year = pd.get_dummies(week_of_year, prefix="WeekOfYear")
return week_of_year
# return week_of_year
def normalized_current_year(datetime_col, min_year, max_year):
@@ -236,8 +236,8 @@ def normalized_current_year(datetime_col, min_year, max_year):
if max_year != min_year:
current_year = (year - min_year) / (max_year - min_year)
elif max_year == min_year:
current_year = 0
else:
current_year = pd.Series([0 for x in range(len(datetime_col))])
return current_year
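The branch rewritten here is a min-max scaling of the year; when max_year equals min_year it now returns a zero Series of matching length instead of the scalar 0. A small worked example (years are illustrative):

import pandas as pd

dates = pd.to_datetime(pd.Series(["2000-06-15"]))
year = dates.dt.year
print((year - 1980) / (2020 - 1980))              # 0.5: midpoint of 1980..2020
print(pd.Series([0 for _ in range(len(dates))]))  # degenerate case, same length as input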
@@ -255,13 +255,13 @@
Returns:
float: the position of the current date in the min_date:max_date range
"""
date = datetime_col.dt.date
current_date = (date - min_date).apply(lambda x: x.days)
date = datetime_col # .dt.date
current_date = (date - min_date) # .apply(lambda x: x.days)
if max_date != min_date:
current_date = current_date / (max_date - min_date).days
elif max_date == min_date:
current_date = 0
current_date = current_date / (max_date - min_date) # .days
else:
current_date = pd.Series([0 for x in range(len(datetime_col))])
return current_date
@@ -285,8 +285,8 @@ def normalized_current_datehour(datetime_col, min_datehour, max_datehour):
if max_min_diff != 0:
current_datehour = current_datehour / (max_min_diff.days * 24 + max_min_diff.seconds / 3600)
elif max_min_diff == 0:
current_datehour = 0
else:
current_datehour = pd.Series([0 for x in range(len(datetime_col))])
return current_datehour
@@ -426,425 +426,6 @@ def daily_fourier(datetime_col, n_harmonics):
return output_dict
def same_week_day_hour_lag(
datetime_col, value_col, n_years=3, week_window=1, agg_func="mean", q=None, output_colname="SameWeekHourLag"
):
"""
Creates a lag feature by calculating quantiles, mean and std of values of and
around the same week, same day of week, and same hour of day, of previous years.
Args:
datetime_col: Datetime column.
value_col: Feature value column to create lag feature from.
n_years: Number of previous years data to use. Default value 3.
week_window: Number of weeks before and after the same week to use,
which should help reduce noise in the data. Default value 1.
agg_func: Aggregation function to apply on multiple previous values,
accepted values are 'mean', 'quantile', 'std'. Default value 'mean'.
q: If agg_func is 'quantile', taking value between 0 and 1.
output_colname: name of the output lag feature column.
Default value 'SameWeekHourLag'.
Returns:
pd.DataFrame: data frame containing the newly created lag
feature as a column.
"""
if not is_datetime_like(datetime_col):
datetime_col = pd.to_datetime(datetime_col, format=DATETIME_FORMAT)
min_time_stamp = min(datetime_col)
max_time_stamp = max(datetime_col)
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
df.set_index("Datetime", inplace=True)
week_lag_base = 52
week_lag_last_year = list(range(week_lag_base - week_window, week_lag_base + week_window + 1))
week_lag_all = []
for y in range(n_years):
week_lag_all += [x + y * 52 for x in week_lag_last_year]
week_lag_cols = []
for w in week_lag_all:
if (max_time_stamp - timedelta(weeks=w)) >= min_time_stamp:
col_name = "week_lag_" + str(w)
week_lag_cols.append(col_name)
lag_datetime = df.index.get_level_values(0) - timedelta(weeks=w)
valid_lag_mask = lag_datetime >= min_time_stamp
df[col_name] = np.nan
df.loc[valid_lag_mask, col_name] = df.loc[lag_datetime[valid_lag_mask], "value"].values
# Additional aggregation options will be added as needed
if agg_func == "mean" and q is None:
df[output_colname] = round(df[week_lag_cols].mean(axis=1))
elif agg_func == "quantile" and q is not None:
df[output_colname] = round(df[week_lag_cols].quantile(q, axis=1))
elif agg_func == "std" and q is None:
df[output_colname] = round(df[week_lag_cols].std(axis=1))
return df[[output_colname]]
def same_day_hour_lag(
datetime_col, value_col, n_years=3, day_window=1, agg_func="mean", q=None, output_colname="SameDayHourLag"
):
"""
Creates a lag feature by calculating quantiles, mean, and std of values of
and around the same day of year, and same hour of day, of previous years.
Args:
datetime_col: Datetime column.
value_col: Feature value column to create lag feature from.
n_years: Number of previous years data to use. Default value 3.
day_window: Number of days before and after the same day to use,
which should help reduce noise in the data. Default value 1.
agg_func: Aggregation function to apply on multiple previous values,
accepted values are 'mean', 'quantile', 'std'. Default value 'mean'.
q: If agg_func is 'quantile', taking value between 0 and 1.
output_colname: name of the output lag feature column.
Default value 'SameDayHourLag'.
Returns:
pd.DataFrame: data frame containing the newly created lag
feature as a column.
"""
if not is_datetime_like(datetime_col):
datetime_col = pd.to_datetime(datetime_col, format=DATETIME_FORMAT)
min_time_stamp = min(datetime_col)
max_time_stamp = max(datetime_col)
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
df.set_index("Datetime", inplace=True)
day_lag_base = 365
day_lag_last_year = list(range(day_lag_base - day_window, day_lag_base + day_window + 1))
day_lag_all = []
for y in range(n_years):
day_lag_all += [x + y * 365 for x in day_lag_last_year]
day_lag_cols = []
for d in day_lag_all:
if (max_time_stamp - timedelta(days=d)) >= min_time_stamp:
col_name = "day_lag_" + str(d)
day_lag_cols.append(col_name)
lag_datetime = df.index.get_level_values(0) - timedelta(days=d)
valid_lag_mask = lag_datetime >= min_time_stamp
df[col_name] = np.nan
df.loc[valid_lag_mask, col_name] = df.loc[lag_datetime[valid_lag_mask], "value"].values
# Additional aggregation options will be added as needed
if agg_func == "mean" and q is None:
df[output_colname] = round(df[day_lag_cols].mean(axis=1))
elif agg_func == "quantile" and q is not None:
df[output_colname] = round(df[day_lag_cols].quantile(q, axis=1))
elif agg_func == "std" and q is None:
df[output_colname] = round(df[day_lag_cols].std(axis=1))
return df[[output_colname]]
def same_day_hour_moving_average(
datetime_col,
value_col,
window_size,
start_week,
average_count,
forecast_creation_time,
output_col_prefix="moving_average_lag_",
):
"""
Creates moving average features by averaging values of the same day of
week and same hour of day of previous weeks.
Args:
datetime_col: Datetime column
value_col: Feature value column to create moving average features from.
window_size: Number of weeks used to compute the average.
start_week: First week of the first moving average feature.
average_count: Number of moving average features to create.
forecast_creation_time: The time point when the feature is created.
This value is used to prevent using data that are not available
at forecast creation time to compute features.
output_col_prefix: Prefix of the output columns. The start week of each
moving average feature is added at the end. Default value 'moving_average_lag_'.
Returns:
pd.DataFrame: data frame containing the newly created lag features as
columns.
For example, start_week = 9, window_size=4, and average_count = 3 will
create three moving average features.
1) moving_average_lag_9: average the same day and hour values of the 9th,
10th, 11th, and 12th weeks before the current week.
2) moving_average_lag_10: average the same day and hour values of the
10th, 11th, 12th, and 13th weeks before the current week.
3) moving_average_lag_11: average the same day and hour values of the
11th, 12th, 13th, and 14th weeks before the current week.
"""
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
df.set_index("Datetime", inplace=True)
df = df.asfreq("H")
if not df.index.is_monotonic:
df.sort_index(inplace=True)
df["fct_diff"] = df.index - forecast_creation_time
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
max_diff = max(df["fct_diff"])
for i in range(average_count):
output_col = output_col_prefix + str(start_week + i)
week_lag_start = start_week + i
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
hour_lags = [h for h in hour_lags if h > max_diff]
if hour_lags:
tmp_df = df[["value"]].copy()
tmp_col_all = []
for h in hour_lags:
tmp_col = "tmp_lag_" + str(h)
tmp_col_all.append(tmp_col)
tmp_df[tmp_col] = tmp_df["value"].shift(h)
df[output_col] = round(tmp_df[tmp_col_all].mean(axis=1))
df.drop(["fct_diff", "value"], inplace=True, axis=1)
return df
def same_day_hour_moving_quantile(
datetime_col,
value_col,
window_size,
start_week,
quantile_count,
q,
forecast_creation_time,
output_col_prefix="moving_quatile_lag_",
):
"""
Creates a series of quantiles features by calculating quantiles of values of
the same day of week and same hour of day of previous weeks.
Args:
datetime_col: Datetime column
value_col: Feature value column to create quantile features from.
window_size: Number of weeks used to compute the quantile.
start_week: First week of the first moving quantile feature.
quantile_count: Number of quantile features to create.
q: quantile to compute from history values, should be between 0 and 1.
forecast_creation_time: The time point when the feature is created.
This value is used to prevent using data that are not available
at forecast creation time to compute features.
output_col_prefix: Prefix of the output columns. The start week of each
moving average feature is added at the end. Default value 'moving_quatile_lag_'.
Returns:
pd.DataFrame: data frame containing the newly created lag features as
columns.
For example, start_week = 9, window_size=4, and quantile_count = 3 will
create three quantiles features.
1) moving_quantile_lag_9: calculate quantile of the same day and hour values of the 9th,
10th, 11th, and 12th weeks before the current week.
2) moving_quantile_lag_10: calculate quantile of average the same day and hour values of the
10th, 11th, 12th, and 13th weeks before the current week.
3) moving_quantile_lag_11: calculate quantile of average the same day and hour values of the
11th, 12th, 13th, and 14th weeks before the current week.
"""
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
df.set_index("Datetime", inplace=True)
df = df.asfreq("H")
if not df.index.is_monotonic:
df.sort_index(inplace=True)
df["fct_diff"] = df.index - forecast_creation_time
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
max_diff = max(df["fct_diff"])
for i in range(quantile_count):
output_col = output_col_prefix + str(start_week + i)
week_lag_start = start_week + i
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
hour_lags = [h for h in hour_lags if h > max_diff]
if hour_lags:
tmp_df = df[["value"]].copy()
tmp_col_all = []
for h in hour_lags:
tmp_col = "tmp_lag_" + str(h)
tmp_col_all.append(tmp_col)
tmp_df[tmp_col] = tmp_df["value"].shift(h)
df[output_col] = round(tmp_df[tmp_col_all].quantile(q, axis=1))
df.drop(["fct_diff", "value"], inplace=True, axis=1)
return df
def same_day_hour_moving_std(
datetime_col,
value_col,
window_size,
start_week,
std_count,
forecast_creation_time,
output_col_prefix="moving_std_lag_",
):
"""
Creates standard deviation features by calculating std of values of the
same day of week and same hour of day of previous weeks.
Args:
datetime_col: Datetime column
value_col: Feature value column to create moving std features from.
window_size: Number of weeks used to compute the std.
start_week: First week of the first moving std feature.
std_count: Number of moving std features to create.
forecast_creation_time: The time point when the feature is created.
This value is used to prevent using data that are not available at
forecast creation time to compute features.
output_col_prefix: Prefix of the output columns. The start week of each
moving average feature is added at the end. Default value 'moving_std_lag_'.
Returns:
pd.DataFrame: data frame containing the newly created lag features as
columns.
For example, start_week = 9, window_size=4, and std_count = 3 will
create three moving std features.
1) moving_std_lag_9: calculate std of the same day and hour values of the 9th,
10th, 11th, and 12th weeks before the current week.
2) moving_std_lag_10: calculate std of the same day and hour values of the
10th, 11th, 12th, and 13th weeks before the current week.
3) moving_std_lag_11: calculate std of the same day and hour values of the
11th, 12th, 13th, and 14th weeks before the current week.
"""
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
df.set_index("Datetime", inplace=True)
df = df.asfreq("H")
if not df.index.is_monotonic:
df.sort_index(inplace=True)
df["fct_diff"] = df.index - forecast_creation_time
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
max_diff = max(df["fct_diff"])
for i in range(std_count):
output_col = output_col_prefix + str(start_week + i)
week_lag_start = start_week + i
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
hour_lags = [h for h in hour_lags if h > max_diff]
if hour_lags:
tmp_df = df[["value"]].copy()
tmp_col_all = []
for h in hour_lags:
tmp_col = "tmp_lag_" + str(h)
tmp_col_all.append(tmp_col)
tmp_df[tmp_col] = tmp_df["value"].shift(h)
df[output_col] = round(tmp_df[tmp_col_all].std(axis=1))
df.drop(["value", "fct_diff"], inplace=True, axis=1)
return df
def same_day_hour_moving_agg(
datetime_col,
value_col,
window_size,
start_week,
count,
forecast_creation_time,
agg_func="mean",
q=None,
output_col_prefix="moving_agg_lag_",
):
"""
Creates a series of aggregation features by calculating mean, quantiles,
or std of values of the same day of week and same hour of day of previous weeks.
Args:
datetime_col: Datetime column
value_col: Feature value column to create aggregation features from.
window_size: Number of weeks used to compute the aggregation.
start_week: First week of the first aggregation feature.
count: Number of aggregation features to create.
forecast_creation_time: The time point when the feature is created.
This value is used to prevent using data that are not available
at forecast creation time to compute features.
agg_func: Aggregation function to apply on multiple previous values,
accepted values are 'mean', 'quantile', 'std'.
q: If agg_func is 'quantile', taking value between 0 and 1.
output_col_prefix: Prefix of the output columns. The start week of each
moving average feature is added at the end. Default value 'moving_agg_lag_'.
Returns:
pd.DataFrame: data frame containing the newly created lag features as
columns.
For example, start_week = 9, window_size=4, and count = 3 will
create three aggregation of features.
1) moving_agg_lag_9: aggregate the same day and hour values of the 9th,
10th, 11th, and 12th weeks before the current week.
2) moving_agg_lag_10: aggregate the same day and hour values of the
10th, 11th, 12th, and 13th weeks before the current week.
3) moving_agg_lag_11: aggregate the same day and hour values of the
11th, 12th, 13th, and 14th weeks before the current week.
"""
df = pd.DataFrame({"Datetime": datetime_col, "value": value_col})
df.set_index("Datetime", inplace=True)
df = df.asfreq("H")
if not df.index.is_monotonic:
df.sort_index(inplace=True)
df["fct_diff"] = df.index - forecast_creation_time
df["fct_diff"] = df["fct_diff"].apply(lambda x: x.days * 24 + x.seconds / 3600)
max_diff = max(df["fct_diff"])
for i in range(count):
output_col = output_col_prefix + str(start_week + i)
week_lag_start = start_week + i
hour_lags = [(week_lag_start + w) * 24 * 7 for w in range(window_size)]
hour_lags = [h for h in hour_lags if h > max_diff]
if hour_lags:
tmp_df = df[["value"]].copy()
tmp_col_all = []
for h in hour_lags:
tmp_col = "tmp_lag_" + str(h)
tmp_col_all.append(tmp_col)
tmp_df[tmp_col] = tmp_df["value"].shift(h)
if agg_func == "mean" and q is None:
df[output_col] = round(tmp_df[tmp_col_all].mean(axis=1))
elif agg_func == "quantile" and q is not None:
df[output_col] = round(tmp_df[tmp_col_all].quantile(q, axis=1))
elif agg_func == "std" and q is None:
df[output_col] = round(tmp_df[tmp_col_all].std(axis=1))
df.drop(["fct_diff", "value"], inplace=True, axis=1)
return df
def df_from_cartesian_product(dict_in):
"""Generate a Pandas dataframe from Cartesian product of lists.
@@ -1075,17 +656,17 @@ def add_datetime(input_datetime, unit, add_count):
Exception: if invalid unit is provided. Valid units are:
'year', 'month', 'week', 'day', 'hour', 'minute'.
"""
if unit == "Y":
if unit == "year":
new_datetime = input_datetime + relativedelta(years=add_count)
elif unit == "M":
elif unit == "month":
new_datetime = input_datetime + relativedelta(months=add_count)
elif unit == "W":
elif unit == "week":
new_datetime = input_datetime + relativedelta(weeks=add_count)
elif unit == "D":
elif unit == "day":
new_datetime = input_datetime + relativedelta(days=add_count)
elif unit == "h":
elif unit == "hour":
new_datetime = input_datetime + relativedelta(hours=add_count)
elif unit == "m":
elif unit == "minute":
new_datetime = input_datetime + relativedelta(minutes=add_count)
else:
raise Exception(
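With this change the unit argument takes spelled-out names rather than single letters. A quick sketch of the equivalent relativedelta arithmetic behind the new branches (dates are illustrative):

import datetime
from dateutil.relativedelta import relativedelta

x = datetime.datetime(2000, 1, 1)
print(x + relativedelta(months=1))  # what add_datetime(x, "month", 1) computes
print(x + relativedelta(weeks=1))   # what add_datetime(x, "week", 1) computes
# single-letter units such as "M" or "W" now fall through to the exception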

View file

@@ -19,7 +19,7 @@ def create_dcnn_model(
kernel_size=2,
n_filters=3,
dropout_rate=0.1,
max_cat_id=[1e3, 1e3],
max_cat_id=[100, 100],
):
"""Create a Dilated CNN model.

fclib/tests/conftest.py (new file, 58 lines)
View file

@@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import pandas as pd
import numpy as np
import pytest
from itertools import product
class GenerateData:
@staticmethod
def ojdata(start=50, stop=61):
keyvars = {"store": [1, 2], "brand": [1, 2, 3], "week": list(range(start, stop))}
df = pd.DataFrame([row for row in product(*keyvars.values())], columns=keyvars.keys())
n = len(df)
np.random.seed(12345)
df["constant"] = 1
df["logmove"] = np.random.normal(9, 1, n)
df["price1"] = np.random.normal(0.55, 0.003, n)
df["price2"] = np.random.normal(0.55, 0.003, n)
df["price3"] = np.random.normal(0.55, 0.003, n)
df["price4"] = np.random.normal(0.55, 0.003, n)
df["price5"] = np.random.normal(0.55, 0.003, n)
df["price6"] = np.random.normal(0.55, 0.003, n)
df["price7"] = np.random.normal(0.55, 0.003, n)
df["price8"] = np.random.normal(0.55, 0.003, n)
df["price9"] = np.random.normal(0.55, 0.003, n)
df["price10"] = np.random.normal(0.55, 0.003, n)
df["price11"] = np.random.normal(0.55, 0.003, n)
df["deal"] = np.random.binomial(1, 0.5, n)
df["feat"] = np.random.binomial(1, 0.25, n)
df["profit"] = np.random.normal(30, 7.5, n)
return df
@pytest.fixture(scope="session")
def generate_ojdata():
# data file that will be created and deleted each time test is run
ojdata_csv = "fclib/tests/resources/ojdatagen.csv"
df = GenerateData.ojdata()
df.to_csv(ojdata_csv, index_label=False, index=False)
yield generate_ojdata
# teardown code
try:
os.remove(ojdata_csv)
os.remove(os.path.dirname(ojdata_csv) + "/yx.csv")
except Exception:
pass
@pytest.fixture
def generate_data():
return GenerateData

View file

View file

@@ -1,34 +0,0 @@
#!/usr/bin/Rscript
#
# Source entire R testing files
#
# Note that we first define a function to source entire folder including R files. Then,
# we simply source all .R files within the specified folder.
## Define a function to source entire folder
source_entire_folder <- function(folderName, verbose=FALSE, showWarnings=TRUE) {
# Find all .R files within a folder and sources them
#
# Args:
# folderName: Name of the folder including R files to be sourced.
# verbose: If TRUE, print message; if not, not. Default is FALSE.
#
# Returns:
# NULL.
files <- list.files(folderName, full.names=FALSE)
# Grab only R files that start with the word 'test'
files <- files[grepl("\\.[rR]$", files)]
files <- files[grepl("^test", files)]
if (!length(files) && showWarnings)
warning("No R test files in ", folderName)
for (f in files) {
if (verbose)
cat("sourcing: ", f, "\n")
## TODO: add caught whether error or not and return that
try(source(paste(folderName, f, sep='/'), local=FALSE, echo=FALSE), silent=!verbose)
}
return(invisible(NULL))
}
## Source all .R files within the folder of tests/unit
source_entire_folder('./tests/unit', verbose=TRUE)

View file

@@ -1,47 +0,0 @@
#!/usr/bin/Rscript
#
# Test download retail data
#
# Note that we define a function to test download_data.r file in retail benchmarking,
# based on output checking.
## Define a function to test download_data.r file in retail benchmarking.
test_download_retail_data <- function() {
# Test download_data.r file in retail benchmarking
#
# Args:
# NULL.
#
# Returns:
# NULL.
BENCHMARK_DIR <- file.path('./retail_sales', 'OrangeJuice_Pt_3Weeks_Weekly')
DATA_DIR <- file.path(BENCHMARK_DIR, 'data')
SCRIPT_PATH <- file.path(BENCHMARK_DIR, 'common', 'download_data.r')
# Call data download script
source(SCRIPT_PATH)
# Check downloaded data
sales <- read.csv(file.path(DATA_DIR, 'yx.csv'))
if(all(dim(sales) == c(106139, 19)) == FALSE) {
stop("There is something wrong")
}
column_names <- c('store', 'brand', 'week', 'logmove', 'constant',
'price1', 'price2', 'price3', 'price4', 'price5',
'price6', 'price7', 'price8', 'price9', 'price10',
'price11', 'deal', 'feat', 'profit')
if(all(colnames(sales) == column_names) == FALSE) {
stop("There is something wrong")
}
storedemo <- read.csv(file.path(DATA_DIR, 'storedemo.csv'))
if(all(dim(storedemo) == c(83, 12)) == FALSE) {
stop("There is something wrong")
}
column_names <- c('STORE', 'AGE60', 'EDUC', 'ETHNIC', 'INCOME',
'HHLARGE', 'WORKWOM', 'HVAL150', 'SSTRDIST',
'SSTRVOL', 'CPDIST5', 'CPWVOL5')
if(all(colnames(storedemo) == column_names) == FALSE) {
stop("There is something wrong")
}
}
## Test download_data.r file in retail benchmarking.
test_download_retail_data()

View file

@@ -1,64 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import pandas as pd
from tempfile import TemporaryDirectory
from fclib.dataset.ojdata import download_ojdata
def test_download_retail_data():
DATA_FILE_LIST = ["yx.csv", "storedemo.csv"]
with TemporaryDirectory() as tmpdirname:
print("Created temporary directory", tmpdirname)
# Download the data to the temp directory
download_ojdata(tmpdirname)
# Check downloaded data
DATA_DIM_LIST = [(106139, 19), (83, 12)]
COLUMN_NAME_LIST = [
[
"store",
"brand",
"week",
"logmove",
"constant",
"price1",
"price2",
"price3",
"price4",
"price5",
"price6",
"price7",
"price8",
"price9",
"price10",
"price11",
"deal",
"feat",
"profit",
],
[
"STORE",
"AGE60",
"EDUC",
"ETHNIC",
"INCOME",
"HHLARGE",
"WORKWOM",
"HVAL150",
"SSTRDIST",
"SSTRVOL",
"CPDIST5",
"CPWVOL5",
],
]
for idx, f in enumerate(DATA_FILE_LIST):
file_path = os.path.join(tmpdirname, f)
assert os.path.exists(file_path)
df = pd.read_csv(file_path, index_col=None)
assert df.shape == DATA_DIM_LIST[idx]
assert list(df) == COLUMN_NAME_LIST[idx]

fclib/tests/test_dcnn.py (new file, 19 lines)
View file

@@ -0,0 +1,19 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from fclib.models.dilated_cnn import create_dcnn_model
def test_create_dcnn_model():
mod0 = create_dcnn_model(seq_len=1) # default args
assert mod0 is not None
mod1 = create_dcnn_model(
seq_len=1, n_dyn_fea=1, n_outputs=2, n_dilated_layers=1, kernel_size=2, dropout_rate=0.05, max_cat_id=[30, 120]
)
assert mod1 is not None
mod2 = create_dcnn_model(
seq_len=1, n_dyn_fea=1, n_outputs=2, n_dilated_layers=2, kernel_size=2, dropout_rate=0.05, max_cat_id=[30, 120]
)
assert mod2 is not None

View file

@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import pandas as pd
from fclib.evaluation.evaluation_utils import MAPE, sMAPE, pinball_loss
y = np.array([1, 2, 3])
yhat = np.array([1.1, 2.2, 3.3])
TOLERANCE = 1e-5
def test_MAPE():
assert abs(MAPE(yhat, y) - 0.1) < TOLERANCE
def test_sMAPE():
assert abs(sMAPE(yhat, y) - 0.04761904) < TOLERANCE
def test_pinball_loss():
df = pd.DataFrame({"yhat": yhat, "y": y})
assert all(abs(pinball_loss(df.yhat, df.y, 0.5) - pd.Series([0.05, 0.1, 0.15])) < TOLERANCE)
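The asserted constants check out by hand. A sketch, assuming (as the 0.04761904 value implies) that this sMAPE divides by |y| + |yhat| without the conventional factor of 2:

import numpy as np

y = np.array([1, 2, 3])
yhat = np.array([1.1, 2.2, 3.3])

print(np.mean(np.abs(yhat - y) / np.abs(y)))                   # MAPE = 0.1
print(np.mean(np.abs(yhat - y) / (np.abs(y) + np.abs(yhat))))  # ~0.0476190, matches sMAPE
print(0.5 * (yhat - y))  # pinball loss at q=0.5 when over-predicting: [0.05, 0.1, 0.15]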

View file

@@ -0,0 +1,119 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
import datetime
from fclib.feature_engineering.feature_utils import (
is_datetime_like,
day_type,
hour_of_day,
time_of_year,
week_of_year,
week_of_month,
month_of_year,
day_of_week,
day_of_month,
day_of_year,
)
def test_is_datetime_like():
st = "2000-01-01"
assert not is_datetime_like(st)
dt = datetime.datetime.now()
assert is_datetime_like(dt)
pdt = pd.DatetimeIndex(["2000-01-01"])
assert is_datetime_like(pdt)
pts = pd.Timestamp("2000-01-01T12:00:00")
assert is_datetime_like(pts)
d = datetime.date(2000, 1, 1)
assert is_datetime_like(d)
def test_day_type():
dates = pd.to_datetime(pd.Series(["2000-01-01", "2000-01-02", "2000-01-03"]))
hols = pd.Series([True, False, False])
dty = day_type(dates)
assert all(dty == [5, 6, 0])
dty2 = day_type(dates, hols)
assert all(dty2 == [7, 8, 0])
# date component extractors
sample_date = pd.to_datetime(pd.Series(["2000-01-01 12:30:59"]))
def test_hour_of_day():
dates = sample_date
assert all(hour_of_day(dates) == 12)
def test_time_of_year():
dates = sample_date
tyr = time_of_year(dates)
assert all(tyr >= 0 and tyr <= 1)
def test_week_of_year():
dates = sample_date
assert week_of_year(dates)[0] == 52 # first day of 2000 is in last week of 1999
def test_week_of_month():
dates = sample_date
assert week_of_month(dates)[0] == 1 # first day of 2000 is in the first week of the month
def test_month_of_year():
dates = sample_date
assert month_of_year(dates)[0] == 1
def test_day_of_week():
dates = sample_date
assert day_of_week(dates)[0] == 5
def test_day_of_month():
dates = sample_date
assert day_of_month(dates)[0] == 1
def test_day_of_year():
dates = sample_date
assert day_of_year(dates)[0] == 1
# def test_encoded_month_of_year():
# dates = sample_date
# enc = encoded_month_of_year(dates)
# assert len(enc.columns) == 12
# def test_encoded_day_of_week():
# dates = sample_date
# enc = encoded_day_of_week(dates)
# assert len(enc.columns) == 7
# def test_encoded_day_of_year():
# dates = sample_date
# enc = encoded_day_of_year(dates)
# assert len(enc.columns) >= 365
# def test_encoded_hour_of_day():
# dates = sample_date
# enc = encoded_hour_of_day(dates)
# assert len(enc.columns) == 24
# def test_encoded_week_of_year():
# dates = sample_date
# enc = encoded_week_of_year(dates)
# assert len(enc.columns) == 53
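A plausible reason these assertions are disabled, consistent with the encoders being commented out of feature_utils above: pd.get_dummies only creates columns for values present in the input, so a single timestamp can never yield 12 month or 24 hour columns. Sketch:

import pandas as pd

dates = pd.to_datetime(pd.Series(["2000-01-01 12:30:59"]))
enc = pd.get_dummies(dates.dt.month, prefix="MonthOfYear")
print(list(enc.columns))  # ['MonthOfYear_1'] only, not 12 columns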

View file

@@ -0,0 +1,100 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
import datetime
import pytest
from fclib.feature_engineering.feature_utils import (
df_from_cartesian_product,
lagged_features,
moving_averages,
combine_features,
gen_sequence_array,
static_feature_array,
normalize_columns,
get_datetime_col,
get_month_day_range,
add_datetime,
)
# misc utilities
def test_df_from_cartesian_product():
d = {"x1": [1, 2, 3], "x2": [4, 5, 6], "x3": ["a", "b", "c"]}
df = df_from_cartesian_product(d)
assert len(df) == 27
assert list(df.columns) == ["x1", "x2", "x3"]
def test_lagged_features():
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "x3": ["a", "b", "c"]})
dflag = lagged_features(df, [1, 2])
assert dflag.shape == (3, 6)
assert all(pd.isna(dflag.iloc[0, :]))
def test_moving_averages():
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6]})
dfma = moving_averages(df, 1, 2)
assert dfma.shape == (3, 2)
assert all(pd.isna(dfma.iloc[0, :]))
def test_combine_features():
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6]})
dfcomb = combine_features(df, ["x1", "x2"], [1, 2], 2, ["x1", "x2"])
assert dfcomb.shape == (3, 8)
def test_gen_sequence_array():
val = pd.Series(x for x in range(8))
df0 = df_from_cartesian_product({"x1": [1, 2], "x2": [1, 2, 3, 4]})
df = pd.concat([val.to_frame("y"), df0], axis=1)
arr = gen_sequence_array(df, 2, ["y"], "x1", "x2")
assert len(arr) == 8
def test_static_feature_array():
val = pd.Series(x for x in range(8))
df0 = df_from_cartesian_product({"x1": [1, 2], "x2": [1, 2, 3, 4]})
df = pd.concat([val.to_frame("y"), df0], axis=1)
arr = static_feature_array(df, 8, ["x1", "x2"], "x1", "x2")
assert len(arr) == 8
def test_normalize_columns():
df = pd.Series((x * 1.0) for x in range(20)).to_frame("x")
(sc, _) = normalize_columns(df, ["x"])
assert len(sc) == len(df)
assert all(sc["x"] >= 0) and all(sc["x"] <= 1)
def test_get_datetime_col():
df = pd.DataFrame({"x1": ["2001-01-01", "2001-01-02", "2001-01-03"], "x2": [1, 2, 3], "x3": ["a", "b", "c"]})
dt1 = get_datetime_col(df, "x1")
assert len(dt1) == 3
with pytest.raises(Exception):
get_datetime_col(df, "x3")
def test_get_month_day_range():
x = datetime.datetime(2000, 1, 15)
(first, last) = get_month_day_range(x)
assert first == datetime.datetime(2000, 1, 1, 0, 0)
assert last == datetime.datetime(2000, 1, 31, 23, 0)
def test_add_datetime():
x = datetime.datetime(2000, 1, 1)
xy = add_datetime(x, "year", 1)
assert xy == datetime.datetime(2001, 1, 1)
xm = add_datetime(x, "month", 1)
assert xm == datetime.datetime(2000, 2, 1)
xd = add_datetime(x, "day", 1)
assert xd == datetime.datetime(2000, 1, 2)

View file

@@ -0,0 +1,63 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
import pytest
from fclib.feature_engineering.feature_utils import (
normalized_current_year,
normalized_current_date,
normalized_current_datehour,
normalized_columns,
)
# normalization functions
sample_date = pd.to_datetime(pd.Series(["2000-01-01 12:30:59"]))
def test_normalized_current_year():
dates = sample_date
nyr = normalized_current_year(dates, 1980, 2020)
assert all(nyr >= 0) and all(nyr <= 1)
bad = normalized_current_year(dates, 2000, 2000)
assert len(bad) == len(dates)
def test_normalized_current_date():
dates = sample_date
span = pd.to_datetime(pd.Series(["1980-01-01 00:00:00", "2020-01-01 23:59:59"]))
ndt = normalized_current_date(dates, span[0], span[1])
assert all(ndt >= 0) and all(ndt <= 1)
badspan = pd.to_datetime(pd.Series(["2000-01-01 00:00:00", "2000-01-01 00:00:00"]))
bad = normalized_current_date(dates, badspan[0], badspan[1])
assert len(bad) == len(dates)
def test_normalized_current_datehour():
dates = sample_date
span = pd.to_datetime(pd.Series(["1980-01-01 00:00:00", "2020-01-01 23:59:59"]))
ndt = normalized_current_datehour(dates, span[0], span[1])
assert all(ndt >= 0) and all(ndt <= 1)
badspan = pd.to_datetime(pd.Series(["2000-01-01 00:00:00", "2000-01-01 00:00:00"]))
bad = normalized_current_datehour(dates, badspan[0], badspan[1])
assert len(bad) == len(dates)
def test_normalized_columns():
dates = pd.to_datetime(pd.Series(["2000-01-01", "2000-01-02", "2000-01-03"]))
vals = pd.Series([1, 2, 3])
nc1 = normalized_columns(dates, vals, mode="log")
assert type(nc1).__name__ == "DataFrame"
assert nc1.columns[0] == "normalized_columns"
nc2 = normalized_columns(dates, vals, mode="minmax")
assert all(nc2["normalized_columns"] >= 0) and all(nc2["normalized_columns"] <= 1)
with pytest.raises(Exception):
normalized_columns(dates, vals, mode="foo")

View file

@@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
import datetime
from fclib.feature_engineering.feature_utils import annual_fourier, weekly_fourier, daily_fourier, fourier_approximation
# Fourier stuff
def test_fourier_approximation():
dates = pd.Series([x for x in range(1, 366)])
(fsin, fcos) = fourier_approximation(dates, 1, 365.24)
assert len(fsin) == len(dates)
assert len(fcos) == len(dates)
assert all(abs(fsin) <= 1) and all(abs(fcos) <= 1)
def test_annual_fourier():
dates = pd.to_datetime(pd.Series([datetime.date(2000, 1, 1) + datetime.timedelta(days=x) for x in range(365)]))
fa = annual_fourier(dates, 5)
assert len(fa) == 10
def test_weekly_fourier():
dates = pd.to_datetime(pd.Series([datetime.date(2000, 1, 1) + datetime.timedelta(days=x) for x in range(365)]))
fw = weekly_fourier(dates, 5)
assert len(fw) == 10
def test_daily_fourier():
dates = pd.to_datetime(pd.Series([datetime.date(2000, 1, 1) + datetime.timedelta(days=x) for x in range(365)]))
fd = daily_fourier(dates, 5)
assert len(fd) == 10
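Each helper returns a sine and a cosine term per harmonic, hence 5 harmonics giving 10 outputs. A minimal sketch of the standard construction these tests are consistent with (the exact fclib return format is assumed, not shown here):

import numpy as np

t = np.arange(1, 366)  # mirrors the day-of-year input in test_fourier_approximation
period = 365.24
terms = {}
for k in range(1, 6):  # 5 harmonics -> 5 sin + 5 cos = 10 terms
    terms[f"sin{k}"] = np.sin(2 * np.pi * k * t / period)
    terms[f"cos{k}"] = np.cos(2 * np.pi * k * t / period)
assert len(terms) == 10
assert all(np.all(np.abs(v) <= 1) for v in terms.values())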

View file

@@ -0,0 +1,23 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
import lightgbm as lgb
from fclib.models.lightgbm import predict
def test_predict(generate_ojdata, generate_data):
data = pd.read_csv("fclib/tests/resources/ojdatagen.csv")
newdata = generate_data.ojdata(61, 70)
params = {"objective": "mape"}
target = "logmove"
lgb_data = lgb.Dataset(data.drop(columns=[target]), label=data[target])
lgb_model = lgb.train(params, lgb_data, valid_sets=[lgb_data])
predint = predict(newdata, lgb_model, target, ["store", "brand"], True)
assert predint.logmove.dtype.name == "int64"
predfloat = predict(newdata.drop(columns=[target]), lgb_model, "logmove", ["store", "brand"], False)
assert predfloat.logmove.dtype.name == "float64"

fclib/tests/test_ojdata.py (new file, 116 lines)
View file

@@ -0,0 +1,116 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import shutil
import pandas as pd
import numpy as np
from tempfile import TemporaryDirectory
from fclib.dataset.ojdata import download_ojdata, complete_and_fill_df, _gen_split_indices, split_train_test
# data file that will be created and deleted each time test is run
ojdata_csv = "fclib/tests/resources/ojdatagen.csv"
def test_download_retail_data():
DATA_FILE_LIST = ["yx.csv", "storedemo.csv"]
with TemporaryDirectory() as tmpdirname:
print("Created temporary directory", tmpdirname)
# Download the data to the temp directory
download_ojdata(tmpdirname)
# Check downloaded data
DATA_DIM_LIST = [(106139, 19), (83, 12)]
COLUMN_NAME_LIST = [
[
"store",
"brand",
"week",
"logmove",
"constant",
"price1",
"price2",
"price3",
"price4",
"price5",
"price6",
"price7",
"price8",
"price9",
"price10",
"price11",
"deal",
"feat",
"profit",
],
[
"STORE",
"AGE60",
"EDUC",
"ETHNIC",
"INCOME",
"HHLARGE",
"WORKWOM",
"HVAL150",
"SSTRDIST",
"SSTRVOL",
"CPDIST5",
"CPWVOL5",
],
]
for idx, f in enumerate(DATA_FILE_LIST):
file_path = os.path.join(tmpdirname, f)
assert os.path.exists(file_path)
df = pd.read_csv(file_path, index_col=None)
assert df.shape == DATA_DIM_LIST[idx]
assert list(df) == COLUMN_NAME_LIST[idx]
def test_complete_and_fill_df(generate_ojdata):
ojdata = pd.read_csv(ojdata_csv, index_col=False)
base_out = complete_and_fill_df(ojdata, stores=[1, 2], brands=[1, 2, 3], weeks=list(range(50, 61)))
assert ojdata.equals(base_out)
# remove cells
cell_na = ojdata
cell_na.loc[3:5, "logmove"] = np.nan
cell_out = complete_and_fill_df(cell_na, stores=[1, 2], brands=[1, 2, 3], weeks=list(range(50, 61)))
assert not any(pd.isna(cell_out["logmove"]))
# remove rows
row_na = ojdata
row_na.drop(3, axis=0)
row_out = complete_and_fill_df(row_na, stores=[1, 2], brands=[1, 2, 3], weeks=list(range(50, 61)))
assert len(row_out) == len(ojdata)
def test_gen_split_indices(generate_ojdata):
base = _gen_split_indices()
assert len(base) == 3
assert all([len(i) == 12 for i in base])
small = _gen_split_indices(3, 2, 0, 50, 60)
assert all([len(i) == 3 for i in small])
def test_split_train_test(generate_ojdata):
resdir = os.path.dirname(ojdata_csv)
shutil.copyfile(ojdata_csv, resdir + "/yx.csv")
(traindf, testdf, auxdf) = split_train_test(resdir, 1, 2, 1, 50, 60)
assert len(traindf) == 1
assert len(testdf) == 1
assert len(auxdf) == 1
(traindf, testdf, auxdf) = split_train_test(resdir, 3, 2, 1, 50, 60)
assert len(traindf) == 3
assert len(testdf) == 3
assert len(auxdf) == 3
for i in list(range(3)):
assert max(traindf[i].week) < min(testdf[i].week)

fclib/tests/test_plot.py (new file, 24 lines)
View file

@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
from fclib.common.plot import plot_predictions_with_history
def test_plot_predictions_with_history(generate_ojdata, generate_data):
data = pd.read_csv("fclib/tests/resources/ojdatagen.csv")
pred = generate_data.ojdata(61, 70)
# implicit assert no-exceptions
plot_predictions_with_history(
data,
pred,
grain1_unique_vals=[1, 2],
grain2_unique_vals=[1, 2, 3],
time_col_name="week",
target_col_name="logmove",
grain1_name="store",
grain2_name="brand",
min_timestep=min(data.week),
num_samples=4,
)

View file

@@ -0,0 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
from fclib.models.multiple_linear_regression import fit, predict
def test_fit_and_predict(generate_ojdata, generate_data):
data = pd.read_csv("fclib/tests/resources/ojdatagen.csv")
newdata = generate_data.ojdata(61, 70)
keyvars = ["store", "brand"]
xvars = ["price1", "price2", "price3"]
target = "logmove"
mods = fit(data, keyvars, xvars, target)
predint = predict(newdata, mods, "week", keyvars, xvars, False, True)
assert predint.prediction.dtype.name == "int64"
predfloat = predict(newdata, mods, "week", keyvars, xvars, False, False)
assert predfloat.prediction.dtype.name == "float64"

fclib/tests/test_utils.py (new file, 24 lines)
View file

@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from fclib.common.utils import git_repo_path, module_exists, system_type, module_path
def test_git_repo_path():
# implicitly test for no exception
assert git_repo_path() is not None
def test_module_exists():
assert module_exists("numpy")
assert not module_exists("fakepkgxyz")
def test_system_type():
assert system_type() in ["linux", "mac", "win"]
def test_module_path():
# look for binaries we use in this repo
assert module_path("forecasting_env", "python") != ""
assert module_path("forecasting_env", "tensorboard") != ""

View file

@@ -41,7 +41,7 @@ jobs:
yes | conda env create -n forecasting_env -f tools/environment.yml
eval "$(conda shell.bash hook)" && conda activate forecasting_env
pip install -e fclib
pip install ray>=0.8.2
pip install "ray>=0.8.2"
echo "Conda env installed."
displayName: 'Creating conda environment with dependencies'

View file

@@ -41,7 +41,8 @@ jobs:
yes | conda env create -n forecasting_env -f tools/environment.yml
eval "$(conda shell.bash hook)" && conda activate forecasting_env
pip install -e fclib
pip install ray>=0.8.2
pip install "ray>=0.8.2"
pip install pytest-cov
echo "Conda env installed."
displayName: 'Creating Conda Environment with dependencies'
@@ -49,9 +50,20 @@
sudo ln -sf /usr/lib/R/modules/lapack.so /usr/lib/libRlapack.so
eval "$(conda shell.bash hook)" && conda activate forecasting_env
python -m ipykernel install --user --name forecasting_env
pytest --durations=0 fclib/tests -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml
pytest --durations=0 fclib/tests -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml --cov=fclib --cov-report=xml
displayName: 'Run Unit tests'
# required for Cobertura
- task: UseDotNet@2
displayName: 'Use .NET Core sdk'
inputs:
version: 2.x
- task: PublishCodeCoverageResults@1
inputs:
codeCoverageTool: Cobertura
summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
- task: PublishTestResults@2
inputs:
testResultsFiles: '**/test-unitttest.xml'

View file

@@ -28,7 +28,7 @@ dependencies:
- pytest>=3.6.4
- pylint
- papermill>=1.0.1
- matplotlib>=3.1.2
- matplotlib=3.1.2
- r-base>=3.3.0
- pip:
- black>=18.6b4
@@ -39,5 +39,5 @@ dependencies:
- tensorboard==2.1.0
- nteract-scrapbook==0.3.1
- statsmodels>=0.11.1
- pmdarima>=1.1.1
- pmdarima==1.1.1
- azureml-sdk[automl,notebooks]==1.0.85

View file

@@ -17,7 +17,7 @@ eval "$(conda shell.bash hook)" && conda activate forecasting_env
pip install -e fclib
# Install ray (available only on Linux and MacOS)
pip install ray>=0.8.2
pip install 'ray>=0.8.2'
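# Quotes matter: unquoted, the shell reads ">" in ray>=0.8.2 as output
# redirection, so pip installs unpinned "ray" and a stray file "=0.8.2"
# is created instead of enforcing the version bound.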
# Register conda environment in Jupyter
python -m ipykernel install --user --name forecasting_env