This commit is contained in:
Mike Tokic 2024-07-25 09:46:12 -07:00
Родитель 31f60badf8
Коммит bc88b7b512
32 изменённых файлов: 404 добавлений и 452 удалений

Просмотреть файл

@ -181,27 +181,29 @@ ensemble_models <- function(run_info,
# model forecasts
single_model_tbl <- NULL
if (run_local_models) {
suppressWarnings(try(single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
suppressWarnings(try(
single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}
global_model_tbl <- NULL
if (run_global_models) {
suppressWarnings(try(global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
suppressWarnings(try(
global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}
@ -336,7 +338,6 @@ ensemble_models <- function(run_info,
.multicombine = TRUE,
.noexport = NULL
) %do% {
# get initial run info
model <- model_run %>%
dplyr::pull(Model_Name)

Просмотреть файл

@ -23,10 +23,8 @@ run_feature_selection <- function(input_data,
forecast_horizon,
external_regressors,
multistep_horizon = FALSE) {
# check for more than one unique target value
if (input_data %>% tidyr::drop_na(Target) %>% dplyr::pull(Target) %>% unique() %>% length() < 2) {
# just return the date features
fs_list <- input_data %>%
dplyr::select(tidyselect::contains("Date"))
@ -83,7 +81,6 @@ run_feature_selection <- function(input_data,
# run feature selection
if (date_type %in% c("day", "week")) {
# number of votes needed for feature to be selected
votes_needed <- 3
@ -410,7 +407,6 @@ lofo_fn <- function(run_info,
parallel_processing,
pca = FALSE,
seed = 123) {
# parallel run info
par_info <- par_start(
run_info = run_info,

Просмотреть файл

@ -141,7 +141,6 @@ final_models <- function(run_info,
run_ensemble_models <- prev_log_df$run_ensemble_models
if (sum(colnames(prev_log_df) %in% "weighted_mape")) {
# check if input values have changed
current_log_df <- tibble::tibble(
average_models = average_models,
@ -294,7 +293,6 @@ final_models <- function(run_info,
# simple model averaging
if (average_models & length(final_model_list) > 1) {
# create model combinations list
model_combinations <- tibble::tibble()
@ -338,7 +336,6 @@ final_models <- function(run_info,
.noexport = NULL
) %op%
{
# get list of models to average
model_list <- strsplit(x, "_")[[1]]
@ -362,9 +359,9 @@ final_models <- function(run_info,
} else {
averages_tbl <- NULL
}
# choose best average model
if(!is.null(averages_tbl)) {
if (!is.null(averages_tbl)) {
avg_back_test_mape <- averages_tbl %>%
dplyr::mutate(
Train_Test_ID = as.numeric(Train_Test_ID),
@ -372,7 +369,7 @@ final_models <- function(run_info,
) %>%
dplyr::filter(Train_Test_ID != 1) %>%
dplyr::mutate(MAPE = round(abs((Forecast - Target) / Target), digits = 4))
avg_best_model_mape <- avg_back_test_mape %>%
dplyr::group_by(Model_ID, Combo) %>%
dplyr::mutate(
@ -385,7 +382,7 @@ final_models <- function(run_info,
dplyr::group_by(Combo) %>%
dplyr::slice(1) %>%
dplyr::ungroup()
avg_best_model_tbl <- avg_best_model_mape %>%
dplyr::select(Combo, Model_ID)
}
@ -525,11 +522,11 @@ final_models <- function(run_info,
by = "Model_ID"
) %>%
dplyr::mutate(Best_Model = ifelse(!is.na(Best_Model), "Yes", "No"))
if(!is.null(averages_tbl)) {
if (!is.null(averages_tbl)) {
avg_model_final_tbl <- averages_tbl %>%
dplyr::right_join(avg_best_model_tbl,
by = c("Combo", "Model_ID")
by = c("Combo", "Model_ID")
) %>%
dplyr::mutate(
Combo_ID = Combo,
@ -619,14 +616,16 @@ final_models <- function(run_info,
# clean up any parallel run process
par_end(cl)
# condense outputs into less files for larger runs
if(length(combo_list) > 10000) {
if (length(combo_list) > 10000) {
cli::cli_progress_step("Condensing Forecasts")
condense_data(run_info,
parallel_processing,
num_cores)
condense_data(
run_info,
parallel_processing,
num_cores
)
}
# reconcile hierarchical forecasts
@ -641,17 +640,21 @@ final_models <- function(run_info,
num_cores
)
}
# calculate weighted mape
weighted_mape <- get_forecast_data(run_info = run_info) %>%
dplyr::filter(Run_Type == "Back_Test",
Best_Model == "Yes") %>%
dplyr::filter(
Run_Type == "Back_Test",
Best_Model == "Yes"
) %>%
dplyr::mutate(
Target = ifelse(Target == 0, 0.1, Target)
) %>%
dplyr::mutate(MAPE = round(abs((Forecast - Target) / Target), digits = 4),
Total = sum(Target, na.rm = TRUE),
Weight = (MAPE*Target)/Total) %>%
dplyr::mutate(
MAPE = round(abs((Forecast - Target) / Target), digits = 4),
Total = sum(Target, na.rm = TRUE),
Weight = (MAPE * Target) / Total
) %>%
dplyr::pull(Weight) %>%
sum() %>%
round(digits = 4)

Просмотреть файл

@ -323,7 +323,6 @@ forecast_backwards_compatibility <- function(run_info,
dplyr::select(Combo, Model, Best_Model) %>%
dplyr::distinct()
} else {
# read in unreconciled results
best_model_tbl <- read_file(run_info,
path = paste0(

Просмотреть файл

@ -82,7 +82,6 @@ prep_hierarchical_data <- function(input_data,
hierarchical_tbl <- hierarchical_tbl %>%
dplyr::left_join(temp_tbl, by = c("Date"))
} else if (value_level != "All") {
# agg by lowest level
bottom_tbl <- input_data_adj %>%
tidyr::unite("Combo",
@ -400,7 +399,6 @@ reconcile_hierarchical_data <- function(run_info,
forecast_approach,
negative_forecast = FALSE,
num_cores) {
# get run splits
model_train_test_tbl <- read_file(run_info,
path = paste0(
@ -420,15 +418,15 @@ reconcile_hierarchical_data <- function(run_info,
hts_nodes <- hts_list$nodes
original_combo_list <- hts_list$original_combos
hts_combo_list <- hts_list$hts_combos
# check if data has been condensed
cond_path <- paste0(
run_info$path, "/forecasts/*", hash_data(run_info$experiment_name), "-",
hash_data(run_info$run_name), "*condensed", ".", run_info$data_output
)
condensed_files <- list_files(run_info$storage_object, fs::path(cond_path))
if (length(condensed_files) > 0) {
condensed <- TRUE
} else {
@ -444,7 +442,7 @@ reconcile_hierarchical_data <- function(run_info,
return_type <- "df"
}
if(condensed) {
if (condensed) {
fcst_path <- paste0(
"/forecasts/*", hash_data(run_info$experiment_name), "-",
hash_data(run_info$run_name), "*condensed", ".", run_info$data_output
@ -453,7 +451,7 @@ reconcile_hierarchical_data <- function(run_info,
fcst_path <- paste0(
"/forecasts/*", hash_data(run_info$experiment_name), "-",
hash_data(run_info$run_name), "*models", ".", run_info$data_output
)
)
}
unreconciled_tbl <- read_file(run_info,
@ -889,7 +887,6 @@ reconcile_hierarchical_data <- function(run_info,
external_regressor_mapping <- function(data,
combo_variables,
external_regressors) {
# create var combinations list
var_combinations <- tibble::tibble()
@ -918,7 +915,6 @@ external_regressor_mapping <- function(data,
.multicombine = TRUE,
.noexport = NULL
) %do% {
# get unique values of regressor per combo variable iteration
var_unique_tbl <- foreach::foreach(
var = iter_list,
@ -1000,7 +996,6 @@ sum_hts_data <- function(bottom_level_tbl,
forecast_approach,
frequency_number,
return_type = "data") {
# create aggregations for target variable
Date <- bottom_level_tbl$Date

Просмотреть файл

@ -1,4 +1,3 @@
#' Check input values
#'
#' @param input_name input name
@ -13,21 +12,23 @@ check_input_type <- function(input_name,
type,
expected_value = NULL) {
if (!inherits(input_value, type)) {
stop(paste0(
"invalid type for input name '", input_name, "', needs to be of type ",
glue::glue_collapse(type, " or ")
),
call. = FALSE
stop(
paste0(
"invalid type for input name '", input_name, "', needs to be of type ",
glue::glue_collapse(type, " or ")
),
call. = FALSE
)
}
if (!is.null(expected_value) & !is.null(input_value)) {
if (!sum(input_value %in% expected_value)) {
stop(paste0(
"invalid value for input name '", input_name, "', value needs to equal ",
glue::glue_collapse(expected_value, " or ")
),
call. = FALSE
stop(
paste0(
"invalid value for input name '", input_name, "', value needs to equal ",
glue::glue_collapse(expected_value, " or ")
),
call. = FALSE
)
}
}
@ -52,7 +53,6 @@ check_input_data <- function(input_data,
date_type,
fiscal_year_start,
parallel_processing) {
# data combo names match the input data
if (sum(combo_variables %in% colnames(input_data)) != length(combo_variables)) {
stop("combo variables do not match column headers in input data")
@ -103,7 +103,6 @@ check_input_data <- function(input_data,
# input_data is correct type for parallel processing
if (inherits(input_data, c("data.frame", "tbl")) & is.null(parallel_processing)) {
# do nothing
} else if (inherits(input_data, "tbl_spark") & is.null(parallel_processing)) {
stop("spark data frames should run with spark parallel processing",
@ -148,7 +147,6 @@ check_input_data <- function(input_data,
check_parallel_processing <- function(run_info,
parallel_processing,
inner_parallel = FALSE) {
# parallel processing formatting
if (is.null(parallel_processing)) {
return()

Просмотреть файл

@ -701,7 +701,6 @@ glmnet <- function(train_data,
horizon,
external_regressors,
frequency) {
# create model recipe and spec
if (multistep) {
recipe_spec_glmnet <- train_data %>%
@ -1328,7 +1327,6 @@ xgboost <- function(train_data,
horizon,
external_regressors,
frequency) {
# create model recipe and spec
if (multistep) {
recipe_spec_xgboost <- train_data %>%

Просмотреть файл

@ -1,4 +1,3 @@
# CUBIST Multistep ----
#' Initialize custom cubist parsnip model
@ -298,7 +297,6 @@ cubist_multistep_fit_impl <- function(x, y,
external_regressors = NULL,
forecast_horizon = NULL,
selected_features = NULL) {
# X & Y
# Expect outcomes = vector
# Expect predictor = data.frame
@ -321,7 +319,6 @@ cubist_multistep_fit_impl <- function(x, y,
model_predictions <- list()
for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
# get final features based on lag
xreg_tbl_final <- multi_feature_selection(
xreg_tbl,
@ -438,7 +435,6 @@ predict.cubist_multistep_fit_impl <- function(object, new_data, ...) {
#' @keywords internal
#' @export
cubist_multistep_predict_impl <- function(object, new_data, ...) {
# Date Mapping Table
date_tbl <- new_data %>%
dplyr::select(Date, Date_index.num) %>%

Просмотреть файл

@ -1,4 +1,3 @@
# GLMNET Multistep ----
#' Initialize custom glmnet parsnip model
@ -282,7 +281,6 @@ glmnet_multistep_fit_impl <- function(x, y,
external_regressors = NULL,
forecast_horizon = NULL,
selected_features = NULL) {
# X & Y
# Expect outcomes = vector
# Expect predictor = data.frame
@ -311,7 +309,6 @@ glmnet_multistep_fit_impl <- function(x, y,
parsnip::set_engine("glmnet")
for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
# get final features based on lag
xreg_tbl_final <- multi_feature_selection(
xreg_tbl,
@ -426,7 +423,6 @@ predict.glmnet_multistep_fit_impl <- function(object, new_data, ...) {
#' @keywords internal
#' @export
glmnet_multistep_predict_impl <- function(object, new_data, ...) {
# Date Mapping Table
date_tbl <- new_data %>%
dplyr::select(Date, Date_index.num) %>%

Просмотреть файл

@ -1,4 +1,3 @@
# Helper Functions ----
#' Return xregs that contain future values for multistep horizon forecast

Просмотреть файл

@ -1,4 +1,3 @@
# MARS Multistep ----
#' Initialize custom mars parsnip model
@ -303,7 +302,6 @@ mars_multistep_fit_impl <- function(x, y,
external_regressors = NULL,
forecast_horizon = NULL,
selected_features = NULL) {
# X & Y
# Expect outcomes = vector
# Expect predictor = data.frame
@ -334,7 +332,6 @@ mars_multistep_fit_impl <- function(x, y,
parsnip::set_engine("earth")
for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
# get final features based on lag
xreg_tbl_final <- multi_feature_selection(
xreg_tbl,
@ -449,7 +446,6 @@ predict.mars_multistep_fit_impl <- function(object, new_data, ...) {
#' @keywords internal
#' @export
mars_multistep_predict_impl <- function(object, new_data, ...) {
# Date Mapping Table
date_tbl <- new_data %>%
dplyr::select(Date, Date_index.num) %>%

Просмотреть файл

@ -1,4 +1,3 @@
# SVM-POLY Multistep ----
#' Initialize custom svm-poly parsnip model
@ -325,7 +324,6 @@ svm_poly_multistep_fit_impl <- function(x, y,
external_regressors = NULL,
forecast_horizon = NULL,
selected_features = NULL) {
# X & Y
# Expect outcomes = vector
# Expect predictor = data.frame
@ -357,7 +355,6 @@ svm_poly_multistep_fit_impl <- function(x, y,
parsnip::set_engine("kernlab")
for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
# get final features based on lag
xreg_tbl_final <- multi_feature_selection(
xreg_tbl,
@ -475,7 +472,6 @@ predict.svm_poly_multistep_fit_impl <- function(object, new_data, ...) {
#' @keywords internal
#' @export
svm_poly_multistep_predict_impl <- function(object, new_data, ...) {
# Date Mapping Table
date_tbl <- new_data %>%
dplyr::select(Date, Date_index.num) %>%

Просмотреть файл

@ -1,4 +1,3 @@
# SVM-RBF Multistep ----
#' Initialize custom svm-rbf parsnip model
@ -306,7 +305,6 @@ svm_rbf_multistep_fit_impl <- function(x, y,
external_regressors = NULL,
forecast_horizon = NULL,
selected_features = NULL) {
# X & Y
# Expect outcomes = vector
# Expect predictor = data.frame
@ -337,7 +335,6 @@ svm_rbf_multistep_fit_impl <- function(x, y,
parsnip::set_engine("kernlab")
for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
# get final features based on lag
xreg_tbl_final <- multi_feature_selection(
xreg_tbl,
@ -455,7 +452,6 @@ predict.svm_rbf_multistep_fit_impl <- function(object, new_data, ...) {
#' @keywords internal
#' @export
svm_rbf_multistep_predict_impl <- function(object, new_data, ...) {
# Date Mapping Table
date_tbl <- new_data %>%
dplyr::select(Date, Date_index.num) %>%

Просмотреть файл

@ -1,4 +1,3 @@
# XGBOOST Multistep ----
#' Initialize custom xgboost parsnip model
@ -389,7 +388,6 @@ xgboost_multistep_fit_impl <- function(x, y,
forecast_horizon = NULL,
selected_features = NULL,
...) {
# X & Y
# Expect outcomes = vector
# Expect predictor = data.frame
@ -412,7 +410,6 @@ xgboost_multistep_fit_impl <- function(x, y,
model_predictions <- list()
for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
# get final features based on lag
xreg_tbl_final <- multi_feature_selection(
xreg_tbl,
@ -437,7 +434,7 @@ xgboost_multistep_fit_impl <- function(x, y,
y = outcome,
max_depth = max_depth,
nrounds = nrounds,
eta = eta,
eta = eta,
colsample_bytree = colsample_bytree,
colsample_bynode = colsample_bynode,
min_child_weight = min_child_weight,
@ -537,7 +534,6 @@ predict.xgboost_multistep_fit_impl <- function(object, new_data, ...) {
#' @keywords internal
#' @export
xgboost_multistep_predict_impl <- function(object, new_data, ...) {
# Date Mapping Table
date_tbl <- new_data %>%
dplyr::select(Date, Date_index.num) %>%

Просмотреть файл

@ -227,7 +227,6 @@ prep_data <- function(run_info,
dplyr::filter(Combo %in% current_combo_list_final)
if (length(combo_diff) == 0 & length(prev_combo_list) > 0) {
# check if input values have changed
current_log_df <- tibble::tibble(
combo_variables = paste(combo_variables, collapse = "---"),
@ -466,204 +465,205 @@ prep_data <- function(run_info,
} else if (parallel_processing == "spark") {
final_data <- filtered_initial_prep_tbl %>%
adjust_df(return_type = "sdf") %>%
sparklyr::spark_apply(function(df, context) {
# update objects
fn_env <- .GlobalEnv
sparklyr::spark_apply(
function(df, context) {
# update objects
fn_env <- .GlobalEnv
for (name in names(context)) {
assign(name, context[[name]], envir = fn_env)
}
for (name in names(context)) {
assign(name, context[[name]], envir = fn_env)
}
# get specific time series
combo <- unique(df$Combo)
# get specific time series
combo <- unique(df$Combo)
return_tbl <- tibble::tibble(
Combo = combo,
Combo_Hash = hash_data(combo)
)
# handle external regressors
xregs_future_tbl <- get_xregs_future_values_tbl(
df,
external_regressors,
hist_end_date
)
if (length(colnames(xregs_future_tbl)) > 2) {
xregs_future_list <- xregs_future_tbl %>%
dplyr::select(-Date, -Combo) %>%
colnames()
} else {
xregs_future_list <- NULL
}
# initial data prep
initial_tbl <- df %>%
dplyr::filter(Combo == combo) %>%
dplyr::select(
Combo,
Date,
Target,
tidyselect::all_of(external_regressors)
) %>%
dplyr::group_by(Combo) %>%
timetk::pad_by_time(Date,
.by = date_type,
.pad_value = ifelse(clean_missing_values, NA, 0),
.end_date = hist_end_date
) %>% # fill in missing values in between existing data points
timetk::pad_by_time(Date,
.by = date_type,
.pad_value = 0,
.start_date = hist_start_date,
.end_date = hist_end_date
) %>% # fill in missing values at beginning of time series with zero
timetk::future_frame(Date,
.length_out = forecast_horizon,
.bind_data = TRUE
) %>% # add future data
dplyr::ungroup() %>%
dplyr::left_join(xregs_future_tbl,
by = c("Combo", "Date")
) %>% # join xregs that contain values given by user
clean_outliers_missing_values(
clean_outliers,
clean_missing_values,
get_frequency_number(date_type),
external_regressors
) %>% # clean outliers and missing values
dplyr::mutate_if(is.numeric, list(~ replace(., is.infinite(.), NA))) %>% # replace infinite values
dplyr::mutate_if(is.numeric, list(~ replace(., is.nan(.), NA))) %>% # replace NaN values
dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% # replace NA values
dplyr::mutate(Target = ifelse(Date > hist_end_date,
NA,
Target
))
# box-cox transformation
if (box_cox) {
box_cox_tbl <- initial_tbl %>%
apply_box_cox()
initial_tbl <- box_cox_tbl$data
return_tbl <- return_tbl %>%
dplyr::left_join(box_cox_tbl$diff_info, by = "Combo")
}
# make stationary
if (stationary) {
stationary_tbl <- initial_tbl %>%
make_stationary()
initial_tbl <- stationary_tbl$data
return_tbl <- return_tbl %>%
dplyr::left_join(stationary_tbl$diff_info, by = "Combo")
}
# create date features
date_features <- initial_tbl %>%
dplyr::select(Date) %>%
dplyr::mutate(
Date_Adj = Date %m+% months(fiscal_year_start - 1),
Date_day_month_end = ifelse(lubridate::day(Date_Adj) == lubridate::days_in_month(Date_Adj), 1, 0)
) %>%
timetk::tk_augment_timeseries_signature(Date_Adj) %>%
dplyr::select(!tidyselect::matches(get_date_regex(date_type)), -Date_Adj, -Date)
names(date_features) <- stringr::str_c("Date_", names(date_features))
initial_tbl <- initial_tbl %>%
cbind(date_features)
# Run Recipes
if (is.null(recipes_to_run)) {
run_all_recipes_override <- FALSE
} else if (recipes_to_run == "all") {
run_all_recipes_override <- TRUE
} else {
run_all_recipes_override <- FALSE
}
if (is.null(recipes_to_run) | "R1" %in% recipes_to_run | run_all_recipes_override) {
R1 <- initial_tbl %>%
multivariate_prep_recipe_1(external_regressors,
xregs_future_values_list = xregs_future_list,
get_fourier_periods(fourier_periods, date_type),
get_lag_periods(lag_periods, date_type, forecast_horizon, multistep_horizon, TRUE),
get_rolling_window_periods(rolling_window_periods, date_type)
) %>%
dplyr::mutate(Target = base::ifelse(Date > hist_end_date, NA, Target))
write_data(
x = R1,
combo = combo,
run_info = run_info,
output_type = "data",
folder = "prep_data",
suffix = "-R1"
return_tbl <- tibble::tibble(
Combo = combo,
Combo_Hash = hash_data(combo)
)
}
if ((is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")) | "R2" %in% recipes_to_run | run_all_recipes_override) {
R2 <- initial_tbl %>%
multivariate_prep_recipe_2(external_regressors,
xregs_future_values_list = xregs_future_list,
get_fourier_periods(fourier_periods, date_type),
get_lag_periods(lag_periods, date_type, forecast_horizon),
get_rolling_window_periods(rolling_window_periods, date_type),
date_type,
forecast_horizon
) %>%
dplyr::mutate(Target = base::ifelse(Date > hist_end_date, NA, Target))
write_data(
x = R2,
combo = combo,
run_info = run_info,
output_type = "data",
folder = "prep_data",
suffix = "-R2"
# handle external regressors
xregs_future_tbl <- get_xregs_future_values_tbl(
df,
external_regressors,
hist_end_date
)
}
return(data.frame(return_tbl))
},
group_by = "Combo",
context = list(
get_xregs_future_values_tbl = get_xregs_future_values_tbl,
external_regressors = external_regressors,
clean_missing_values = clean_missing_values,
clean_outliers_missing_values = clean_outliers_missing_values,
hash_data = hash_data,
hist_end_date = hist_end_date,
hist_start_date = hist_start_date,
forecast_approach = forecast_approach,
forecast_horizon = forecast_horizon,
clean_outliers = clean_outliers,
get_frequency_number = get_frequency_number,
date_type = date_type,
fiscal_year_start = fiscal_year_start,
get_date_regex = get_date_regex,
recipes_to_run = recipes_to_run,
multivariate_prep_recipe_1 = multivariate_prep_recipe_1,
multivariate_prep_recipe_2 = multivariate_prep_recipe_2,
run_info = run_info,
get_fourier_periods = get_fourier_periods,
fourier_periods = fourier_periods,
get_lag_periods = get_lag_periods,
lag_periods = lag_periods,
get_rolling_window_periods = get_rolling_window_periods,
rolling_window_periods = rolling_window_periods,
write_data = write_data,
write_data_folder = write_data_folder,
write_data_type = write_data_type,
box_cox = box_cox,
stationary = stationary,
make_stationary = make_stationary,
apply_box_cox = apply_box_cox
)
if (length(colnames(xregs_future_tbl)) > 2) {
xregs_future_list <- xregs_future_tbl %>%
dplyr::select(-Date, -Combo) %>%
colnames()
} else {
xregs_future_list <- NULL
}
# initial data prep
initial_tbl <- df %>%
dplyr::filter(Combo == combo) %>%
dplyr::select(
Combo,
Date,
Target,
tidyselect::all_of(external_regressors)
) %>%
dplyr::group_by(Combo) %>%
timetk::pad_by_time(Date,
.by = date_type,
.pad_value = ifelse(clean_missing_values, NA, 0),
.end_date = hist_end_date
) %>% # fill in missing values in between existing data points
timetk::pad_by_time(Date,
.by = date_type,
.pad_value = 0,
.start_date = hist_start_date,
.end_date = hist_end_date
) %>% # fill in missing values at beginning of time series with zero
timetk::future_frame(Date,
.length_out = forecast_horizon,
.bind_data = TRUE
) %>% # add future data
dplyr::ungroup() %>%
dplyr::left_join(xregs_future_tbl,
by = c("Combo", "Date")
) %>% # join xregs that contain values given by user
clean_outliers_missing_values(
clean_outliers,
clean_missing_values,
get_frequency_number(date_type),
external_regressors
) %>% # clean outliers and missing values
dplyr::mutate_if(is.numeric, list(~ replace(., is.infinite(.), NA))) %>% # replace infinite values
dplyr::mutate_if(is.numeric, list(~ replace(., is.nan(.), NA))) %>% # replace NaN values
dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% # replace NA values
dplyr::mutate(Target = ifelse(Date > hist_end_date,
NA,
Target
))
# box-cox transformation
if (box_cox) {
box_cox_tbl <- initial_tbl %>%
apply_box_cox()
initial_tbl <- box_cox_tbl$data
return_tbl <- return_tbl %>%
dplyr::left_join(box_cox_tbl$diff_info, by = "Combo")
}
# make stationary
if (stationary) {
stationary_tbl <- initial_tbl %>%
make_stationary()
initial_tbl <- stationary_tbl$data
return_tbl <- return_tbl %>%
dplyr::left_join(stationary_tbl$diff_info, by = "Combo")
}
# create date features
date_features <- initial_tbl %>%
dplyr::select(Date) %>%
dplyr::mutate(
Date_Adj = Date %m+% months(fiscal_year_start - 1),
Date_day_month_end = ifelse(lubridate::day(Date_Adj) == lubridate::days_in_month(Date_Adj), 1, 0)
) %>%
timetk::tk_augment_timeseries_signature(Date_Adj) %>%
dplyr::select(!tidyselect::matches(get_date_regex(date_type)), -Date_Adj, -Date)
names(date_features) <- stringr::str_c("Date_", names(date_features))
initial_tbl <- initial_tbl %>%
cbind(date_features)
# Run Recipes
if (is.null(recipes_to_run)) {
run_all_recipes_override <- FALSE
} else if (recipes_to_run == "all") {
run_all_recipes_override <- TRUE
} else {
run_all_recipes_override <- FALSE
}
if (is.null(recipes_to_run) | "R1" %in% recipes_to_run | run_all_recipes_override) {
R1 <- initial_tbl %>%
multivariate_prep_recipe_1(external_regressors,
xregs_future_values_list = xregs_future_list,
get_fourier_periods(fourier_periods, date_type),
get_lag_periods(lag_periods, date_type, forecast_horizon, multistep_horizon, TRUE),
get_rolling_window_periods(rolling_window_periods, date_type)
) %>%
dplyr::mutate(Target = base::ifelse(Date > hist_end_date, NA, Target))
write_data(
x = R1,
combo = combo,
run_info = run_info,
output_type = "data",
folder = "prep_data",
suffix = "-R1"
)
}
if ((is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")) | "R2" %in% recipes_to_run | run_all_recipes_override) {
R2 <- initial_tbl %>%
multivariate_prep_recipe_2(external_regressors,
xregs_future_values_list = xregs_future_list,
get_fourier_periods(fourier_periods, date_type),
get_lag_periods(lag_periods, date_type, forecast_horizon),
get_rolling_window_periods(rolling_window_periods, date_type),
date_type,
forecast_horizon
) %>%
dplyr::mutate(Target = base::ifelse(Date > hist_end_date, NA, Target))
write_data(
x = R2,
combo = combo,
run_info = run_info,
output_type = "data",
folder = "prep_data",
suffix = "-R2"
)
}
return(data.frame(return_tbl))
},
group_by = "Combo",
context = list(
get_xregs_future_values_tbl = get_xregs_future_values_tbl,
external_regressors = external_regressors,
clean_missing_values = clean_missing_values,
clean_outliers_missing_values = clean_outliers_missing_values,
hash_data = hash_data,
hist_end_date = hist_end_date,
hist_start_date = hist_start_date,
forecast_approach = forecast_approach,
forecast_horizon = forecast_horizon,
clean_outliers = clean_outliers,
get_frequency_number = get_frequency_number,
date_type = date_type,
fiscal_year_start = fiscal_year_start,
get_date_regex = get_date_regex,
recipes_to_run = recipes_to_run,
multivariate_prep_recipe_1 = multivariate_prep_recipe_1,
multivariate_prep_recipe_2 = multivariate_prep_recipe_2,
run_info = run_info,
get_fourier_periods = get_fourier_periods,
fourier_periods = fourier_periods,
get_lag_periods = get_lag_periods,
lag_periods = lag_periods,
get_rolling_window_periods = get_rolling_window_periods,
rolling_window_periods = rolling_window_periods,
write_data = write_data,
write_data_folder = write_data_folder,
write_data_type = write_data_type,
box_cox = box_cox,
stationary = stationary,
make_stationary = make_stationary,
apply_box_cox = apply_box_cox
)
)
}
@ -693,12 +693,13 @@ prep_data <- function(run_info,
length()
if (successful_combos != total_combos) {
stop(paste0(
"Not all time series were prepped within 'prep_data', expected ",
total_combos, " time series but only ", successful_combos,
" time series are prepped. ", "Please run 'prep_data' again."
),
call. = FALSE
stop(
paste0(
"Not all time series were prepped within 'prep_data', expected ",
total_combos, " time series but only ", successful_combos,
" time series are prepped. ", "Please run 'prep_data' again."
),
call. = FALSE
)
}
@ -1067,7 +1068,6 @@ apply_box_cox <- function(df) {
)
for (column_name in names(df)) {
# Only check numeric columns with more than 2 unique values
if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) {
temp_tbl <- df %>%
@ -1119,7 +1119,6 @@ make_stationary <- function(df) {
)
for (column_name in names(df)) {
# Only check numeric columns with more than 2 unique values
if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) {
temp_tbl <- df %>%
@ -1183,7 +1182,6 @@ multivariate_prep_recipe_1 <- function(data,
rolling_window_periods,
hist_end_date,
date_type) {
# apply polynomial transformations
numeric_xregs <- c()
@ -1345,7 +1343,6 @@ multivariate_prep_recipe_2 <- function(data,
}
for (period in 1:forecast_horizon) {
# add horizon and origin components
data_lag_window <- df_poly %>%
dplyr::mutate(

Просмотреть файл

@ -1,4 +1,3 @@
#' Prep Models
#'
#' Preps various aspects of run before training models. Things like train/test
@ -60,7 +59,6 @@ prep_models <- function(run_info,
pca = NULL,
num_hyperparameters = 10,
seed = 123) {
# check input values
check_input_type("run_info", run_info, "list")
check_input_type("back_test_scenarios", back_test_scenarios, c("NULL", "numeric"))
@ -514,7 +512,6 @@ model_workflows <- function(run_info,
ml_models <- list_models()
if (is.null(models_to_run) & is.null(models_not_to_run)) {
# do nothing, using existing ml_models list
} else if (is.null(models_to_run) & !is.null(models_not_to_run)) {
ml_models <- setdiff(ml_models, models_not_to_run)

Просмотреть файл

@ -1,4 +1,3 @@
#' Get Final Forecast Data
#'
#' @param run_info run info using the [set_run_info()] function
@ -46,7 +45,6 @@
#' @export
get_forecast_data <- function(run_info,
return_type = "df") {
# check input values
check_input_type("run_info", run_info, "list")
check_input_type("return_type", return_type, "character", c("df", "sdf"))
@ -76,17 +74,17 @@ get_forecast_data <- function(run_info,
run_info$path, "/forecasts/*", hash_data(run_info$experiment_name), "-",
hash_data(run_info$run_name), "*condensed", ".", run_info$data_output
)
condensed_files <- list_files(run_info$storage_object, fs::path(cond_path))
if (length(condensed_files) > 0) {
condensed <- TRUE
} else {
condensed <- FALSE
}
# get forecast data
if(forecast_approach != "bottoms_up") {
if (forecast_approach != "bottoms_up") {
fcst_path <- paste0(
"/forecasts/*", hash_data(run_info$experiment_name), "-",
hash_data(run_info$run_name), "*reconciled", ".", run_info$data_output
@ -172,7 +170,6 @@ get_forecast_data <- function(run_info,
#' }
#' @export
get_trained_models <- function(run_info) {
# check input values
check_input_type("run_info", run_info, "list")
@ -228,7 +225,6 @@ get_trained_models <- function(run_info) {
get_prepped_data <- function(run_info,
recipe,
return_type = "df") {
# check input values
check_input_type("run_info", run_info, "list")
check_input_type("recipe", recipe, "character", c("R1", "R2"))
@ -299,7 +295,6 @@ get_prepped_data <- function(run_info,
#' }
#' @export
get_prepped_models <- function(run_info) {
# check input values
check_input_type("run_info", run_info, "list")
@ -527,12 +522,12 @@ download_file <- function(storage_object,
#' @noRd
read_file <- function(run_info,
path = NULL,
file_list = NULL,
file_list = NULL,
return_type = "df",
schema = NULL) {
storage_object <- run_info$storage_object
if(!is.null(path)) {
if (!is.null(path)) {
folder <- fs::path_dir(path)
initial_path <- run_info$path
file <- fs::path_file(path)
@ -551,7 +546,7 @@ read_file <- function(run_info,
files <- list_files(storage_object, fs::path(initial_path, path))
}
if(!is.null(file_list)) {
if (!is.null(file_list)) {
file_temp <- files[[1]]
file_ext <- fs::path_ext(file_temp)
} else if (fs::path_ext(file) == "*") {
@ -698,10 +693,9 @@ get_recipe_data <- function(run_info,
#'
#' @return nothing
#' @noRd
condense_data <- function(run_info,
condense_data <- function(run_info,
parallel_processing = NULL,
num_cores = NULL) {
# get initial list of files to condense
initial_file_list <- list_files(
run_info$storage_object,
@ -710,16 +704,16 @@ condense_data <- function(run_info,
hash_data(run_info$run_name), "*_models.", run_info$data_output
)
)
# Initialize an empty list to store the batches
list_of_batches <- list()
# Define the batch size
batch_size <- 10000
# Calculate the number of batches needed
num_batches <- ceiling(length(initial_file_list) / batch_size)
# Loop through the large list and create batches
for (i in 1:num_batches) {
start_index <- (i - 1) * batch_size + 1
@ -727,7 +721,7 @@ condense_data <- function(run_info,
batch_name <- paste0("batch_", i)
list_of_batches[[batch_name]] <- initial_file_list[start_index:end_index]
}
# parallel run info
par_info <- par_start(
run_info = run_info,
@ -735,11 +729,11 @@ condense_data <- function(run_info,
num_cores = min(length(names(list_of_batches)), num_cores),
task_length = length(names(list_of_batches))
)
cl <- par_info$cl
packages <- par_info$packages
`%op%` <- par_info$foreach_operator
# submit tasks
condense_data_tbl <- foreach::foreach(
batch = names(list_of_batches),
@ -750,23 +744,23 @@ condense_data <- function(run_info,
.inorder = FALSE,
.multicombine = TRUE,
.noexport = NULL
) %op%
{
files <- list_of_batches[[batch]]
) %op% {
files <- list_of_batches[[batch]]
data <- read_file(run_info,
file_list = files,
return_type = "df")
write_data(
x = data,
combo = batch,
run_info = run_info,
output_type = "data",
folder = "forecasts",
suffix = "-condensed"
)
return(batch)
}
data <- read_file(run_info,
file_list = files,
return_type = "df"
)
write_data(
x = data,
combo = batch,
run_info = run_info,
output_type = "data",
folder = "forecasts",
suffix = "-condensed"
)
return(batch)
}
}

Просмотреть файл

@ -44,7 +44,6 @@ set_run_info <- function(experiment_name = "finn_fcst",
data_output = "csv",
object_output = "rds",
add_unique_id = TRUE) {
# initial input checks
if (!inherits(run_name, c("NULL", "character"))) {
stop("`run_name` must either be a NULL or a string")
@ -151,7 +150,6 @@ set_run_info <- function(experiment_name = "finn_fcst",
base::suppressWarnings()
if (nrow(log_df) > 0 & add_unique_id == FALSE) {
# check if input values have changed
current_log_df <- tibble::tibble(
experiment_name = experiment_name,
@ -266,7 +264,6 @@ get_run_info <- function(experiment_name = NULL,
run_name = NULL,
storage_object = NULL,
path = NULL) {
# input checks
if (!inherits(run_name, c("NULL", "character"))) {
stop("`run_name` must either be a NULL or a string")

Просмотреть файл

@ -1,4 +1,3 @@
#' Train Individual Models
#'
#' @param run_info run info using the [set_run_info()] function
@ -220,7 +219,6 @@ train_models <- function(run_info,
stringr::str_replace(hash_data("All-Data"), "All-Data")
if (length(combo_diff) == 0 & length(prev_combo_list) > 0) {
# check if input values have changed
current_log_df <- tibble::tibble(
run_global_models = run_global_models,
@ -270,7 +268,6 @@ train_models <- function(run_info,
.noexport = NULL
) %op%
{
# get time series
combo_hash <- x
@ -402,7 +399,6 @@ train_models <- function(run_info,
.multicombine = TRUE,
.noexport = NULL
) %do% {
# get initial run info
model <- model_run %>%
dplyr::pull(Model_Name)
@ -719,12 +715,13 @@ train_models <- function(run_info,
length()
if (successful_combos != total_combos) {
stop(paste0(
"Not all time series were completed within 'train_models', expected ",
total_combos, " time series but only ", successful_combos,
" time series were ran. ", "Please run 'train_models' again."
),
call. = FALSE
stop(
paste0(
"Not all time series were completed within 'train_models', expected ",
total_combos, " time series but only ", successful_combos,
" time series were ran. ", "Please run 'train_models' again."
),
call. = FALSE
)
}
@ -784,7 +781,6 @@ negative_fcst_adj <- function(data,
#' @return tbl with train test splits
#' @noRd
create_splits <- function(data, train_test_splits) {
# Create the rsplit object
analysis_split <- function(data, train_indices, test_indices) {
rsplit_object <- rsample::make_splits(
@ -846,7 +842,6 @@ create_splits <- function(data, train_test_splits) {
undifference_forecast <- function(forecast_data,
recipe_data,
diff_tbl) {
# check if data needs to be undifferenced
diff1 <- diff_tbl$Diff_Value1
diff2 <- diff_tbl$Diff_Value2
@ -863,10 +858,8 @@ undifference_forecast <- function(forecast_data,
# non seasonal differencing
if (!is.na(diff1)) {
# loop through each back test split
for (id in train_test_id) {
# get specific train test split
fcst_temp_tbl <- forecast_data %>%
dplyr::filter(Train_Test_ID == id)
@ -950,7 +943,6 @@ undifference_forecast <- function(forecast_data,
undifference_recipe <- function(recipe_data,
diff_tbl,
hist_end_date) {
# check if data needs to be undifferenced
diff1 <- diff_tbl$Diff_Value1
diff2 <- diff_tbl$Diff_Value2

Просмотреть файл

@ -15,7 +15,7 @@ utils::globalVariables(c(
"Auto_Accept", "Feature", "Imp", "Importance", "LOFO_Var", "Var_RMSE", "Vote", "Votes", "desc",
"term", "Column", "Box_Cox_Lambda", "get_recipie_configurable", "Agg", "Unique", "Var",
"Var_Combo", "regressor", "regressor_tbl", "value_level_iter", ".actual", ".fitted",
"forecast_horizon", "lag", "new_data", "object", "fit", "Row_Num", "Run_Number", "weight",
"forecast_horizon", "lag", "new_data", "object", "fit", "Row_Num", "Run_Number", "weight",
"Total", "Weight", "batch"
))
@ -89,7 +89,6 @@ cbind.fill <- function(..., fill = NA) {
# been loaded.
.onLoad <- function(libname, pkgname) {
# CRAN OMP THREAD LIMIT
Sys.setenv("OMP_THREAD_LIMIT" = 1)

Просмотреть файл

@ -1,4 +1,3 @@
# * custom test functions ----
check_exist <- function(to_check, ret) {

Просмотреть файл

@ -1,6 +1,4 @@
test_that("prep_hierarchical_data returns correct grouped hierarchies", {
# Mock data setup
data <- tibble::tibble(
Segment = as.character(c(
@ -77,7 +75,6 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", {
})
test_that("prep_hierarchical_data returns correct standard hierarchies", {
# Mock data setup
data <- tibble::tibble(
Area = as.character(c("EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "United States", "United States", "United States", "United States")),

Просмотреть файл

@ -1,5 +1,4 @@
test_that("multistep_horizon monthly data", {
# Mock data setup
data <- timetk::m4_monthly %>%
dplyr::mutate(id = as.character(id)) %>%

Просмотреть файл

@ -22,27 +22,33 @@ When `prep_models()` is ran, hyperparameters and back test splits are calculated
library(finnts)
hist_data <- timetk::m4_monthly %>%
dplyr::filter(date >= "2012-01-01",
id == "M2") %>%
dplyr::filter(
date >= "2012-01-01",
id == "M2"
) %>%
dplyr::rename(Date = date) %>%
dplyr::mutate(id = as.character(id))
run_info <- set_run_info(
experiment_name = "finnts_fcst",
experiment_name = "finnts_fcst",
run_name = "get_prepped_models"
)
prep_data(run_info = run_info,
input_data = hist_data,
combo_variables = c("id"),
target_variable = "value",
date_type = "month",
recipes_to_run = "R1",
forecast_horizon = 6)
prep_data(
run_info = run_info,
input_data = hist_data,
combo_variables = c("id"),
target_variable = "value",
date_type = "month",
recipes_to_run = "R1",
forecast_horizon = 6
)
prep_models(run_info = run_info,
models_to_run = c("arima", "ets", "xgboost"),
num_hyperparameters = 10)
prep_models(
run_info = run_info,
models_to_run = c("arima", "ets", "xgboost"),
num_hyperparameters = 10
)
model_info <- get_prepped_models(run_info = run_info)
@ -67,7 +73,6 @@ model_hyperparameter_info <- model_info %>%
tidyr::unnest(Data)
print(model_hyperparameter_info)
```
The above outputs allow a Finn user to understand what hyperparameters are chosen for tuning and how the model refitting process will work. When tuning hyperparameters, Finn uses the "Validation" train/test splits, with the final parameters chosen using RMSE. For some models like ARIMA that don't follow a traditional hyperparameter tuning process, the model is fit from scratch across all train/test splits. After hyperparameters are chosen, the model is refit across the "Back_Test" and "Future_Forecast" splits. The "Back_Test" splits are the true testing data that will be used when selecting the final "Best-Model". "Ensemble" splits are also created as ensemble training data if ensemble models are chosen to run. Ensemble models follow a similar tuning process.

Просмотреть файл

@ -23,17 +23,21 @@ suppressMessages(library(dplyr))
message("Simple Back Test Results")
back_test_tbl <- tibble(
Combo = c("Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01"),
Model = c("arima", "arima", "arima", "arima", "arima", "ets", "ets", "ets", "ets", "ets"),
FCST = c(9, 23, 35, 41, 48, 7, 22, 29, 42, 53),
Combo = c("Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01"),
Model = c("arima", "arima", "arima", "arima", "arima", "ets", "ets", "ets", "ets", "ets"),
FCST = c(9, 23, 35, 41, 48, 7, 22, 29, 42, 53),
Target = c(10, 20, 30, 40, 50, 10, 20, 30, 40, 50)
) %>%
dplyr::mutate(MAPE = abs(Target-FCST)/Target,
Date = as.Date(Date)) %>%
dplyr::mutate(
MAPE = abs(Target - FCST) / Target,
Date = as.Date(Date)
) %>%
dplyr::group_by(Combo, Model) %>%
dplyr::mutate(Target_Total = sum(Target),
Percent_Total = Target/Target_Total) %>%
dplyr::mutate(
Target_Total = sum(Target),
Percent_Total = Target / Target_Total
) %>%
dplyr::ungroup()
print(back_test_tbl)
@ -44,12 +48,13 @@ message("Overall Model Accuracy by Combo")
suppressMessages(best_model <- back_test_tbl %>%
dplyr::group_by(Combo, Model) %>%
dplyr::mutate(Weighted_MAPE = MAPE * Percent_Total) %>%
dplyr::summarise(MAPE = mean(MAPE),
Weighted_MAPE = sum(Weighted_MAPE)) %>%
dplyr::summarise(
MAPE = mean(MAPE),
Weighted_MAPE = sum(Weighted_MAPE)
) %>%
dplyr::ungroup())
print(best_model)
```
During the simple back test process above, arima seems to be the better model from a pure MAPE perspective, but ETS ends up being the winner when using weighted MAPE. The benefits of weighted MAPE allow finnts to find the optimal model that performs the best on the biggest components of a forecast, which comes with the added benefit of putting more weight on more recent observations since those are more likely to have larger target values then ones further into the past. Another way of putting more weight on more recent observations is how Finn overlaps its back testing scenarios. This means the most recent observations are tested for accuracy in different forecast horizons (H=1, H=2, etc). More info on this in the back testing vignette.

Просмотреть файл

@ -42,11 +42,11 @@ Below is an example of how you can set up your input_data to leverage future val
library(dplyr)
tibble(
Combo = c("Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01", "2020-07-01", "2020-08-01", "2020-09-01", "2020-10-01", "2020-11-01", "2020-12-01", "2021-01-01", "2021-02-01", "2021-03-01"),
Target = c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, NA, NA, NA),
Holiday = c("New Years", "Valentines Day", "None", "Easter", "None", "None", "4th of July", "None", "Labor Day", "Halloween", "Thanksgiving", "Christmas", "New Years", "Valentines Day", "None"),
GDP = c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, NA, NA, NA),
Combo = c("Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01", "2020-07-01", "2020-08-01", "2020-09-01", "2020-10-01", "2020-11-01", "2020-12-01", "2021-01-01", "2021-02-01", "2021-03-01"),
Target = c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, NA, NA, NA),
Holiday = c("New Years", "Valentines Day", "None", "Easter", "None", "None", "4th of July", "None", "Labor Day", "Halloween", "Thanksgiving", "Christmas", "New Years", "Valentines Day", "None"),
GDP = c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, NA, NA, NA),
Sales_Pipeline = c(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240)
) %>%
dplyr::mutate(Date = as.Date(Date))

Просмотреть файл

@ -74,7 +74,6 @@ In addition to the standard approaches above, finnts also does two different way
In the first recipe, referred to as "R1" in default finnts models, by default takes a single step horizon approach. Meaning all of the engineered target and external regressor features are used but the lags cannot be less than the forecast horizon. For example, a monthly data set with a forecast horizon of 3, finnts will take engineered features like lags and rolling window features but only use those lags that are for periods equal to or greater than 3. You can also run a multistep horizon approach by setting `multistep_horizon` to TRUE in `prep_models()`. The multistep approach will create features that can be used by specific multivariate models that optimize for each period in a forecast horizon. More on this in the "models used in finnts" vignette. Recursive forecasting is not supported in finnts multivariate machine learning models, since feeding forecast outputs as features to create another forecast adds complex layers of uncertainty that can easily spiral out of control and produce poor forecasts. NA values created by generating lag features are filled "up". This results in the first initial periods of a time series having some data leakage but the effect should be small if the time series is long enough.
```{r, message = FALSE}
library(finnts)
hist_data <- timetk::m4_monthly %>%
@ -112,7 +111,6 @@ print(R1_prepped_data_tbl)
The second recipe is referred to as "R2" in default finnts models. It takes a very different approach than the "R1" recipe. For a 3 month forecast horizon on a monthly dataset, target and rolling window features are created depending on the horizon period. They are also constrained to be equal or less than the forecast horizon. In the below example, "Origin" and "Horizon" features are created for each time period. This results in duplicating rows in the original data set to create new features that are now specific to each horizon period. This helps the default finnts models find new unique relationships to model, when compared to a more formal approach in "R1". NA values created by generating lag features are filled "up".
```{r, message = FALSE}
library(finnts)
hist_data <- timetk::m4_monthly %>%

Просмотреть файл

@ -10,8 +10,8 @@ vignette: >
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
warning = FALSE,
comment = "#>",
warning = FALSE,
message = FALSE
)
library(finnts)
@ -62,18 +62,15 @@ The above data set contains 4 individual time series, identified using the "id"
Before we call the Finn forecast function. Let's first set up some run information using `set_run_info()`, this helps log all components of our Finn forecast successfully.
```{r, message = FALSE, eval = hist_data, error=FALSE, warning = FALSE, echo=T, eval = TRUE}
run_info <- set_run_info(
experiment_name = "finn_forecast",
experiment_name = "finn_forecast",
run_name = "test_run"
)
```
Calling the "forecast_time_series" function is the easiest part. In this example we will be running just two models.
```{r, message = FALSE, eval = hist_data, error=FALSE, warning = FALSE, echo=T, eval = TRUE}
# no need to assign it to a variable, since all of the outputs are written to disk :)
forecast_time_series(
run_info = run_info,
@ -82,8 +79,8 @@ forecast_time_series(
target_variable = "value",
date_type = "month",
forecast_horizon = 3,
back_test_scenarios = 6,
models_to_run = c("arima", "ets"),
back_test_scenarios = 6,
models_to_run = c("arima", "ets"),
return_data = FALSE
)
```
@ -93,7 +90,6 @@ forecast_time_series(
### Initial Finn Outputs
```{r, message = FALSE, eval = finn_output, message = FALSE, eval = FALSE, echo=T}
finn_output_tbl <- get_forecast_data(run_info = run_info)
print(finn_output_tbl)
@ -102,7 +98,6 @@ print(finn_output_tbl)
### Future Forecast
```{r, message = FALSE, eval = finn_output, message = FALSE, eval = FALSE, echo=T}
future_forecast_tbl <- finn_output_tbl %>%
dplyr::filter(Run_Type == "Future_Forecast")
@ -142,13 +137,17 @@ print(trained_model_tbl)
### Initial Prepped Data
```{r, message = FALSE, eval = finn_output, eval = FALSE, echo=T}
R1_prepped_data_tbl <- get_prepped_data(run_info = run_info,
recipe = "R1")
R1_prepped_data_tbl <- get_prepped_data(
run_info = run_info,
recipe = "R1"
)
print(R1_prepped_data_tbl)
R2_prepped_data_tbl <- get_prepped_data(run_info = run_info,
recipe = "R2")
R2_prepped_data_tbl <- get_prepped_data(
run_info = run_info,
recipe = "R2"
)
print(R2_prepped_data_tbl)
```

Просмотреть файл

@ -10,8 +10,8 @@ vignette: >
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
warning = FALSE,
comment = "#>",
warning = FALSE,
message = FALSE
)
library(finnts)
@ -29,13 +29,15 @@ Let's get some example data and then set our Finn run info.
library(finnts)
hist_data <- timetk::m4_monthly %>%
dplyr::filter(date >= "2013-01-01",
id == "M2") %>%
dplyr::filter(
date >= "2013-01-01",
id == "M2"
) %>%
dplyr::rename(Date = date) %>%
dplyr::mutate(id = as.character(id))
run_info <- set_run_info(
experiment_name = "finnts_fcst",
experiment_name = "finnts_fcst",
run_name = "finn_sub_component_run"
)
```
@ -45,20 +47,26 @@ run_info <- set_run_info(
Clean and prepare our data before training models. We can even pull out our prepped data to see the features and transformations applied before models are trained.
```{r message=FALSE}
prep_data(run_info = run_info,
input_data = hist_data,
combo_variables = c("id"),
target_variable = "value",
date_type = "month",
forecast_horizon = 6)
prep_data(
run_info = run_info,
input_data = hist_data,
combo_variables = c("id"),
target_variable = "value",
date_type = "month",
forecast_horizon = 6
)
R1_prepped_data_tbl <- get_prepped_data(run_info = run_info,
recipe = "R1")
R1_prepped_data_tbl <- get_prepped_data(
run_info = run_info,
recipe = "R1"
)
print(R1_prepped_data_tbl)
R2_prepped_data_tbl <- get_prepped_data(run_info = run_info,
recipe = "R2")
R2_prepped_data_tbl <- get_prepped_data(
run_info = run_info,
recipe = "R2"
)
print(R2_prepped_data_tbl)
```
@ -70,12 +78,16 @@ Now that our data is prepared for modeling, let's now train some models. First w
Then we can kick off training each model on our data.
```{r, message = FALSE}
prep_models(run_info = run_info,
models_to_run = c("arima", "ets", "glmnet"),
num_hyperparameters = 2)
prep_models(
run_info = run_info,
models_to_run = c("arima", "ets", "glmnet"),
num_hyperparameters = 2
)
train_models(run_info = run_info,
run_global_models = FALSE)
train_models(
run_info = run_info,
run_global_models = FALSE
)
```
## Train Ensemble Models
@ -99,7 +111,6 @@ final_models(run_info = run_info)
Finally we can now retrieve the forecast results from this Finn run.
```{r, message = FALSE}
finn_output_tbl <- get_forecast_data(run_info = run_info)
print(finn_output_tbl)

Просмотреть файл

@ -27,15 +27,14 @@ message("Standard Hierarchical Time Series Data")
hts <- tibble(
Continent = c("North America", "North America", "North America", "North America", "North America", "North America", "North America", "North America", "North America"),
Country = c("United States", "United States", "United States", "United States", "United States", "United States", "Mexico", "Mexico", "Mexico"),
City = c("Kansas City", "Kansas City", "Kansas City", "Seattle", "Seattle", "Seattle", "Mexico City", "Mexico City", "Mexico City"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01"),
Country = c("United States", "United States", "United States", "United States", "United States", "United States", "Mexico", "Mexico", "Mexico"),
City = c("Kansas City", "Kansas City", "Kansas City", "Seattle", "Seattle", "Seattle", "Mexico City", "Mexico City", "Mexico City"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01"),
Target = c(100, 250, 320, 80, 200, 270, 50, 80, 120)
) %>%
dplyr::mutate(Date = as.Date(Date))
print(hts)
```
In the above example, "City" was the lowest level of the hierarchy, which feeds into "Country", which then feeds into "Continent". Finn will take this data and will forecast by City, total Country, and total Continent. After each model is ran for every level in the hierarchy, the best model is chosen at each level, then the "Best Model" and every other model is reconciled back down to the lowest level.
@ -50,16 +49,15 @@ suppressMessages(library(dplyr))
message("Grouped Hierarchical Time Series Data")
gts <- tibble(
Country = c("United States", "United States", "United States", "United States", "United States", "United States", "Mexico", "Mexico", "Mexico", "Mexico", "Mexico", "Mexico"),
Segment = c("Enterprise", "Enterprise", "Enterprise", "Public Sector", "Public Sector", "Public Sector", "Enterprise", "Enterprise", "Enterprise", "Enterprise", "Enterprise", "Enterprise"),
Product = c("Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Tea", "Tea", "Tea"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01"),
Country = c("United States", "United States", "United States", "United States", "United States", "United States", "Mexico", "Mexico", "Mexico", "Mexico", "Mexico", "Mexico"),
Segment = c("Enterprise", "Enterprise", "Enterprise", "Public Sector", "Public Sector", "Public Sector", "Enterprise", "Enterprise", "Enterprise", "Enterprise", "Enterprise", "Enterprise"),
Product = c("Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Tea", "Tea", "Tea"),
Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01"),
Target = c(10, 20, 30, 5, 8, 11, 20, 23, 27, 50, 55, 60)
) %>%
dplyr::mutate(Date = as.Date(Date))
print(gts)
```
It would be hard to aggregate the above data in a traditional hierarchy. The same products are found in different segments and countries, also the same segments are found in multiple countries. Finn will follow a similar modeling process as the one described for a traditional hierarchy, but instead will create forecasts at the below levels.

Просмотреть файл

@ -22,7 +22,7 @@ reactable::reactable(
data.frame() %>%
rbind(data.frame(Model = "arima", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Regression model that is based on finding relationships between lagged values of the target variable you are trying to forecast.")) %>%
rbind(data.frame(Model = "arima-boost", Type = "multivariate, local", Underlying.Package = "modeltime, forecast, xgboost", Description = "Arima model (refer to arima) that models the trend compoent of target variable, then uses xgboost model (refer to xgboost) to train on the remaining residuals.")) %>%
rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "ARIMA model that incorporates external regressors and other engineered features.")) %>%
rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "ARIMA model that incorporates external regressors and other engineered features.")) %>%
rbind(data.frame(Model = "cubist", Type = "multivariate, local, global, ensemble", Underlying.Package = "rules", Description = "Hybrid of tree based and linear regression approach. Many decision trees are built, but regression coefficients are used at each terminal node instead of averging values in other tree based approaches.")) %>%
rbind(data.frame(Model = "croston", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Useful for intermittent demand forecasting, aka when there are a lot of periods of zero values. Involves simple exponential smoothing on non-zero values of target variable and another application of seasonal exponential smoothing on periods between non-zero elements of the target variable. Refer to ets for more details on exponential smoothing.")) %>%
rbind(data.frame(Model = "ets", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Forecasts produced using exponential smoothing methods are weighted averages of past observations, with the weights decaying exponentially as the observations get older. Exponential smoothing models try to forecast the components of a time series which can be broken down in to error, trend, and seasonality. These components can be forecasted separately then either added or multiplied together to get the final forecast output.")) %>%
@ -41,8 +41,7 @@ reactable::reactable(
rbind(data.frame(Model = "svm-rbf", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, kernlab", Description = "Uses a nonlinear function, specifically a radial basis function, to create a regression line of the target variable.")) %>%
rbind(data.frame(Model = "tbats", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "A spin off of the traditional ets model (refer to ets), with some additional components to capture multiple seasonalities.")) %>%
rbind(data.frame(Model = "theta", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Theta is similar to exponential smoothing (refer to ets) but with another component called drift. Adding drift to exponential smoothing allows the forecast to increase or decrease over time, where the amount of change over time (called the drift) is set to be the average change seen within the historical data.")) %>%
rbind(data.frame(Model = "xgboost", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, xgboost", Description = "Builds many decision trees (similar to random forests), but predictions that are initially inaccurate are applied more weight in subsequent training rounds to increase accuracy across all predictions."))
,
rbind(data.frame(Model = "xgboost", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, xgboost", Description = "Builds many decision trees (similar to random forests), but predictions that are initially inaccurate are applied more weight in subsequent training rounds to increase accuracy across all predictions.")),
defaultColDef = colDef(
header = function(value) gsub(".", " ", value, fixed = TRUE),
cell = function(value) format(value, nsmall = 1),
@ -51,12 +50,11 @@ reactable::reactable(
headerStyle = list(background = "#f7f7f8")
),
columns = list(
Description = colDef(minWidth = 140, align = "left") # overrides the default
Description = colDef(minWidth = 140, align = "left") # overrides the default
),
bordered = TRUE,
highlight = TRUE
)
```
### Univariate vs Multivariate Models

Просмотреть файл

@ -49,8 +49,8 @@ hist_data <- timetk::m4_monthly %>%
data_sdf <- sparklyr::copy_to(sc, hist_data, "data_sdf", overwrite = TRUE)
run_info <- set_run_info(
experiment_name = "finn_fcst",
run_name = "spark_run_1",
experiment_name = "finn_fcst",
run_name = "spark_run_1",
path = "/dbfs/mnt/example/folder" # important that you mount an ADLS path
)
@ -60,16 +60,18 @@ forecast_time_series(
combo_variables = c("id"),
target_variable = "value",
date_type = "month",
forecast_horizon = 3,
parallel_processing = "spark",
forecast_horizon = 3,
parallel_processing = "spark",
return_data = FALSE
)
# return the outputs as a spark data frame
finn_output_tbl <- get_forecast_data(run_info = run_info,
return_type = "sdf")
finn_output_tbl <- get_forecast_data(
run_info = run_info,
return_type = "sdf"
)
```
The above example runs each time series on a separate core on a spark cluster. You can also submit multiple time series where each time series runs on a separate spark executor (VM) and then leverage all of the cores on that executor to run things like hyperparameter tuning or model refitting in parallel. This creates two levels of parallelization. One at the time series level, then another when doing things like hyperparameter tuning within a specific time series. To do that set `inner_parallel` to TRUE in `forecast_time_series()`. Also make sure that you adjust the number of spark executor cores to 1, that ensures that only 1 time series runs on an executor at a time. Leverage the "spark.executor.cores" argument when configuring your spark connection. This can be done using [sparklyr](https://spark.rstudio.com/guides/connections#:~:text=In%20sparklyr%2C%20Spark%20properties%20can%20be%20set%20by,customized%20as%20shown%20in%20the%20example%20code%20below.) or within the cluster manager itself within the Azure resource. Use the "num_cores" argument in the "forecast_time_series" function to control how many cores should be used within an executor when running things like hyperparameter tuning.
`forecast_time_series()` will be looking for a variable called "sc" to use when submitting tasks to the spark cluster, so make sure you use that as the variable name when connecting to spark. Also it's important that you mount your spark session to an Azure Data Lake Storage (ADLS) account, and provide the mounted path to where you'd like your Finn results to be written to within `set_run_info()`.
`forecast_time_series()` will be looking for a variable called "sc" to use when submitting tasks to the spark cluster, so make sure you use that as the variable name when connecting to spark. Also it's important that you mount your spark session to an Azure Data Lake Storage (ADLS) account, and provide the mounted path to where you'd like your Finn results to be written to within `set_run_info()`.