code formatting
This commit is contained in:
Parent 31f60badf8
Commit bc88b7b512
@@ -181,27 +181,29 @@ ensemble_models <- function(run_info,

   # model forecasts
   single_model_tbl <- NULL
   if (run_local_models) {
-    suppressWarnings(try(single_model_tbl <- read_file(run_info,
-      path = paste0(
-        "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
-        "-", combo, "-single_models.", run_info$data_output
-      ),
-      return_type = "df"
-    ),
-    silent = TRUE
-    ))
+    suppressWarnings(try(
+      single_model_tbl <- read_file(run_info,
+        path = paste0(
+          "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
+          "-", combo, "-single_models.", run_info$data_output
+        ),
+        return_type = "df"
+      ),
+      silent = TRUE
+    ))
   }

   global_model_tbl <- NULL
   if (run_global_models) {
-    suppressWarnings(try(global_model_tbl <- read_file(run_info,
-      path = paste0(
-        "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
-        "-", combo, "-global_models.", run_info$data_output
-      ),
-      return_type = "df"
-    ),
-    silent = TRUE
-    ))
+    suppressWarnings(try(
+      global_model_tbl <- read_file(run_info,
+        path = paste0(
+          "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
+          "-", combo, "-global_models.", run_info$data_output
+        ),
+        return_type = "df"
+      ),
+      silent = TRUE
+    ))
   }

@@ -336,7 +338,6 @@ ensemble_models <- function(run_info,
     .multicombine = TRUE,
     .noexport = NULL
   ) %do% {
-
     # get initial run info
     model <- model_run %>%
       dplyr::pull(Model_Name)

@@ -23,10 +23,8 @@ run_feature_selection <- function(input_data,
                                   forecast_horizon,
                                   external_regressors,
                                   multistep_horizon = FALSE) {
-
   # check for more than one unique target value
   if (input_data %>% tidyr::drop_na(Target) %>% dplyr::pull(Target) %>% unique() %>% length() < 2) {
-
     # just return the date features
     fs_list <- input_data %>%
       dplyr::select(tidyselect::contains("Date"))

@@ -83,7 +81,6 @@ run_feature_selection <- function(input_data,

   # run feature selection
   if (date_type %in% c("day", "week")) {
-
     # number of votes needed for feature to be selected
     votes_needed <- 3

@@ -410,7 +407,6 @@ lofo_fn <- function(run_info,
                    parallel_processing,
                    pca = FALSE,
                    seed = 123) {
-
   # parallel run info
   par_info <- par_start(
     run_info = run_info,

@@ -141,7 +141,6 @@ final_models <- function(run_info,
   run_ensemble_models <- prev_log_df$run_ensemble_models

   if (sum(colnames(prev_log_df) %in% "weighted_mape")) {
-
     # check if input values have changed
     current_log_df <- tibble::tibble(
       average_models = average_models,

@@ -294,7 +293,6 @@ final_models <- function(run_info,

   # simple model averaging
   if (average_models & length(final_model_list) > 1) {
-
     # create model combinations list
     model_combinations <- tibble::tibble()

@@ -338,7 +336,6 @@ final_models <- function(run_info,
       .noexport = NULL
     ) %op%
       {
-
         # get list of models to average
         model_list <- strsplit(x, "_")[[1]]

@@ -362,9 +359,9 @@ final_models <- function(run_info,
   } else {
     averages_tbl <- NULL
   }

   # choose best average model
-  if(!is.null(averages_tbl)) {
+  if (!is.null(averages_tbl)) {
     avg_back_test_mape <- averages_tbl %>%
       dplyr::mutate(
         Train_Test_ID = as.numeric(Train_Test_ID),

@@ -372,7 +369,7 @@ final_models <- function(run_info,
       ) %>%
       dplyr::filter(Train_Test_ID != 1) %>%
       dplyr::mutate(MAPE = round(abs((Forecast - Target) / Target), digits = 4))

     avg_best_model_mape <- avg_back_test_mape %>%
       dplyr::group_by(Model_ID, Combo) %>%
       dplyr::mutate(

@@ -385,7 +382,7 @@ final_models <- function(run_info,
       dplyr::group_by(Combo) %>%
       dplyr::slice(1) %>%
       dplyr::ungroup()

     avg_best_model_tbl <- avg_best_model_mape %>%
       dplyr::select(Combo, Model_ID)
   }

@@ -525,11 +522,11 @@ final_models <- function(run_info,
       by = "Model_ID"
     ) %>%
     dplyr::mutate(Best_Model = ifelse(!is.na(Best_Model), "Yes", "No"))

-  if(!is.null(averages_tbl)) {
+  if (!is.null(averages_tbl)) {
     avg_model_final_tbl <- averages_tbl %>%
       dplyr::right_join(avg_best_model_tbl,
         by = c("Combo", "Model_ID")
       ) %>%
       dplyr::mutate(
         Combo_ID = Combo,

@@ -619,14 +616,16 @@ final_models <- function(run_info,

   # clean up any parallel run process
   par_end(cl)

   # condense outputs into less files for larger runs
-  if(length(combo_list) > 10000) {
+  if (length(combo_list) > 10000) {
     cli::cli_progress_step("Condensing Forecasts")

-    condense_data(run_info,
-      parallel_processing,
-      num_cores)
+    condense_data(
+      run_info,
+      parallel_processing,
+      num_cores
+    )
   }

   # reconcile hierarchical forecasts

@@ -641,17 +640,21 @@ final_models <- function(run_info,
       num_cores
     )
   }

   # calculate weighted mape
   weighted_mape <- get_forecast_data(run_info = run_info) %>%
-    dplyr::filter(Run_Type == "Back_Test",
-      Best_Model == "Yes") %>%
+    dplyr::filter(
+      Run_Type == "Back_Test",
+      Best_Model == "Yes"
+    ) %>%
     dplyr::mutate(
       Target = ifelse(Target == 0, 0.1, Target)
     ) %>%
-    dplyr::mutate(MAPE = round(abs((Forecast - Target) / Target), digits = 4),
-      Total = sum(Target, na.rm = TRUE),
-      Weight = (MAPE*Target)/Total) %>%
+    dplyr::mutate(
+      MAPE = round(abs((Forecast - Target) / Target), digits = 4),
+      Total = sum(Target, na.rm = TRUE),
+      Weight = (MAPE * Target) / Total
+    ) %>%
     dplyr::pull(Weight) %>%
     sum() %>%
     round(digits = 4)

@@ -323,7 +323,6 @@ forecast_backwards_compatibility <- function(run_info,
     dplyr::select(Combo, Model, Best_Model) %>%
     dplyr::distinct()
   } else {
-
     # read in unreconciled results
     best_model_tbl <- read_file(run_info,
       path = paste0(

@@ -82,7 +82,6 @@ prep_hierarchical_data <- function(input_data,
     hierarchical_tbl <- hierarchical_tbl %>%
       dplyr::left_join(temp_tbl, by = c("Date"))
   } else if (value_level != "All") {
-
     # agg by lowest level
     bottom_tbl <- input_data_adj %>%
       tidyr::unite("Combo",

@@ -400,7 +399,6 @@ reconcile_hierarchical_data <- function(run_info,
                                        forecast_approach,
                                        negative_forecast = FALSE,
                                        num_cores) {
-
   # get run splits
   model_train_test_tbl <- read_file(run_info,
     path = paste0(

@@ -420,15 +418,15 @@ reconcile_hierarchical_data <- function(run_info,
   hts_nodes <- hts_list$nodes
   original_combo_list <- hts_list$original_combos
   hts_combo_list <- hts_list$hts_combos

   # check if data has been condensed
   cond_path <- paste0(
     run_info$path, "/forecasts/*", hash_data(run_info$experiment_name), "-",
     hash_data(run_info$run_name), "*condensed", ".", run_info$data_output
   )

   condensed_files <- list_files(run_info$storage_object, fs::path(cond_path))

   if (length(condensed_files) > 0) {
     condensed <- TRUE
   } else {

@@ -444,7 +442,7 @@ reconcile_hierarchical_data <- function(run_info,
     return_type <- "df"
   }

-  if(condensed) {
+  if (condensed) {
     fcst_path <- paste0(
       "/forecasts/*", hash_data(run_info$experiment_name), "-",
       hash_data(run_info$run_name), "*condensed", ".", run_info$data_output

@@ -453,7 +451,7 @@ reconcile_hierarchical_data <- function(run_info,
     fcst_path <- paste0(
       "/forecasts/*", hash_data(run_info$experiment_name), "-",
       hash_data(run_info$run_name), "*models", ".", run_info$data_output
     )
   }

   unreconciled_tbl <- read_file(run_info,

@@ -889,7 +887,6 @@ reconcile_hierarchical_data <- function(run_info,
 external_regressor_mapping <- function(data,
                                        combo_variables,
                                        external_regressors) {
-
   # create var combinations list
   var_combinations <- tibble::tibble()

@@ -918,7 +915,6 @@ external_regressor_mapping <- function(data,
     .multicombine = TRUE,
     .noexport = NULL
   ) %do% {
-
     # get unique values of regressor per combo variable iteration
     var_unique_tbl <- foreach::foreach(
       var = iter_list,

@@ -1000,7 +996,6 @@ sum_hts_data <- function(bottom_level_tbl,
                         forecast_approach,
                         frequency_number,
                         return_type = "data") {
-
   # create aggregations for target variable
   Date <- bottom_level_tbl$Date

@@ -1,4 +1,3 @@
-
 #' Check input values
 #'
 #' @param input_name input name

@@ -13,21 +12,23 @@ check_input_type <- function(input_name,
                             type,
                             expected_value = NULL) {
   if (!inherits(input_value, type)) {
-    stop(paste0(
-      "invalid type for input name '", input_name, "', needs to be of type ",
-      glue::glue_collapse(type, " or ")
-    ),
-    call. = FALSE
+    stop(
+      paste0(
+        "invalid type for input name '", input_name, "', needs to be of type ",
+        glue::glue_collapse(type, " or ")
+      ),
+      call. = FALSE
     )
   }

   if (!is.null(expected_value) & !is.null(input_value)) {
     if (!sum(input_value %in% expected_value)) {
-      stop(paste0(
-        "invalid value for input name '", input_name, "', value needs to equal ",
-        glue::glue_collapse(expected_value, " or ")
-      ),
-      call. = FALSE
+      stop(
+        paste0(
+          "invalid value for input name '", input_name, "', value needs to equal ",
+          glue::glue_collapse(expected_value, " or ")
+        ),
+        call. = FALSE
       )
     }
   }

@@ -52,7 +53,6 @@ check_input_data <- function(input_data,
                             date_type,
                             fiscal_year_start,
                             parallel_processing) {
-
   # data combo names match the input data
   if (sum(combo_variables %in% colnames(input_data)) != length(combo_variables)) {
     stop("combo variables do not match column headers in input data")

@@ -103,7 +103,6 @@ check_input_data <- function(input_data,

   # input_data is correct type for parallel processing
   if (inherits(input_data, c("data.frame", "tbl")) & is.null(parallel_processing)) {
-
     # do nothing
   } else if (inherits(input_data, "tbl_spark") & is.null(parallel_processing)) {
     stop("spark data frames should run with spark parallel processing",

@@ -148,7 +147,6 @@ check_input_data <- function(input_data,
 check_parallel_processing <- function(run_info,
                                      parallel_processing,
                                      inner_parallel = FALSE) {
-
   # parallel processing formatting
   if (is.null(parallel_processing)) {
     return()

@@ -701,7 +701,6 @@ glmnet <- function(train_data,
                   horizon,
                   external_regressors,
                   frequency) {
-
   # create model recipe and spec
   if (multistep) {
     recipe_spec_glmnet <- train_data %>%

@@ -1328,7 +1327,6 @@ xgboost <- function(train_data,
                    horizon,
                    external_regressors,
                    frequency) {
-
   # create model recipe and spec
   if (multistep) {
     recipe_spec_xgboost <- train_data %>%

@@ -1,4 +1,3 @@
-
 # CUBIST Multistep ----

 #' Initialize custom cubist parsnip model

@@ -298,7 +297,6 @@ cubist_multistep_fit_impl <- function(x, y,
                                      external_regressors = NULL,
                                      forecast_horizon = NULL,
                                      selected_features = NULL) {
-
   # X & Y
   # Expect outcomes = vector
   # Expect predictor = data.frame

@@ -321,7 +319,6 @@ cubist_multistep_fit_impl <- function(x, y,
   model_predictions <- list()

   for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
-
     # get final features based on lag
     xreg_tbl_final <- multi_feature_selection(
       xreg_tbl,

@@ -438,7 +435,6 @@ predict.cubist_multistep_fit_impl <- function(object, new_data, ...) {
 #' @keywords internal
 #' @export
 cubist_multistep_predict_impl <- function(object, new_data, ...) {
-
   # Date Mapping Table
   date_tbl <- new_data %>%
     dplyr::select(Date, Date_index.num) %>%

@@ -1,4 +1,3 @@
-
 # GLMNET Multistep ----

 #' Initialize custom glmnet parsnip model

@@ -282,7 +281,6 @@ glmnet_multistep_fit_impl <- function(x, y,
                                      external_regressors = NULL,
                                      forecast_horizon = NULL,
                                      selected_features = NULL) {
-
   # X & Y
   # Expect outcomes = vector
   # Expect predictor = data.frame

@@ -311,7 +309,6 @@ glmnet_multistep_fit_impl <- function(x, y,
     parsnip::set_engine("glmnet")

   for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
-
     # get final features based on lag
     xreg_tbl_final <- multi_feature_selection(
       xreg_tbl,

@@ -426,7 +423,6 @@ predict.glmnet_multistep_fit_impl <- function(object, new_data, ...) {
 #' @keywords internal
 #' @export
 glmnet_multistep_predict_impl <- function(object, new_data, ...) {
-
   # Date Mapping Table
   date_tbl <- new_data %>%
     dplyr::select(Date, Date_index.num) %>%

@@ -1,4 +1,3 @@
-
 # Helper Functions ----

 #' Return xregs that contain future values for multistep horizon forecast

@@ -1,4 +1,3 @@
-
 # MARS Multistep ----

 #' Initialize custom mars parsnip model

@@ -303,7 +302,6 @@ mars_multistep_fit_impl <- function(x, y,
                                    external_regressors = NULL,
                                    forecast_horizon = NULL,
                                    selected_features = NULL) {
-
   # X & Y
   # Expect outcomes = vector
   # Expect predictor = data.frame

@@ -334,7 +332,6 @@ mars_multistep_fit_impl <- function(x, y,
     parsnip::set_engine("earth")

   for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
-
     # get final features based on lag
     xreg_tbl_final <- multi_feature_selection(
       xreg_tbl,

@@ -449,7 +446,6 @@ predict.mars_multistep_fit_impl <- function(object, new_data, ...) {
 #' @keywords internal
 #' @export
 mars_multistep_predict_impl <- function(object, new_data, ...) {
-
   # Date Mapping Table
   date_tbl <- new_data %>%
     dplyr::select(Date, Date_index.num) %>%

@@ -1,4 +1,3 @@
-
 # SVM-POLY Multistep ----

 #' Initialize custom svm-poly parsnip model

@@ -325,7 +324,6 @@ svm_poly_multistep_fit_impl <- function(x, y,
                                        external_regressors = NULL,
                                        forecast_horizon = NULL,
                                        selected_features = NULL) {
-
   # X & Y
   # Expect outcomes = vector
   # Expect predictor = data.frame

@@ -357,7 +355,6 @@ svm_poly_multistep_fit_impl <- function(x, y,
     parsnip::set_engine("kernlab")

   for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
-
     # get final features based on lag
     xreg_tbl_final <- multi_feature_selection(
       xreg_tbl,

@@ -475,7 +472,6 @@ predict.svm_poly_multistep_fit_impl <- function(object, new_data, ...) {
 #' @keywords internal
 #' @export
 svm_poly_multistep_predict_impl <- function(object, new_data, ...) {
-
   # Date Mapping Table
   date_tbl <- new_data %>%
     dplyr::select(Date, Date_index.num) %>%

@@ -1,4 +1,3 @@
-
 # SVM-RBF Multistep ----

 #' Initialize custom svm-rbf parsnip model

@@ -306,7 +305,6 @@ svm_rbf_multistep_fit_impl <- function(x, y,
                                       external_regressors = NULL,
                                       forecast_horizon = NULL,
                                       selected_features = NULL) {
-
   # X & Y
   # Expect outcomes = vector
   # Expect predictor = data.frame

@@ -337,7 +335,6 @@ svm_rbf_multistep_fit_impl <- function(x, y,
     parsnip::set_engine("kernlab")

   for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
-
     # get final features based on lag
     xreg_tbl_final <- multi_feature_selection(
       xreg_tbl,

@@ -455,7 +452,6 @@ predict.svm_rbf_multistep_fit_impl <- function(object, new_data, ...) {
 #' @keywords internal
 #' @export
 svm_rbf_multistep_predict_impl <- function(object, new_data, ...) {
-
   # Date Mapping Table
   date_tbl <- new_data %>%
     dplyr::select(Date, Date_index.num) %>%

@@ -1,4 +1,3 @@
-
 # XGBOOST Multistep ----

 #' Initialize custom xgboost parsnip model

@@ -389,7 +388,6 @@ xgboost_multistep_fit_impl <- function(x, y,
                                       forecast_horizon = NULL,
                                       selected_features = NULL,
                                       ...) {
-
   # X & Y
   # Expect outcomes = vector
   # Expect predictor = data.frame

@@ -412,7 +410,6 @@ xgboost_multistep_fit_impl <- function(x, y,
   model_predictions <- list()

   for (lag in get_multi_lags(lag_periods, forecast_horizon)) {
-
     # get final features based on lag
     xreg_tbl_final <- multi_feature_selection(
       xreg_tbl,

@@ -437,7 +434,7 @@ xgboost_multistep_fit_impl <- function(x, y,
       y = outcome,
       max_depth = max_depth,
       nrounds = nrounds,
-      eta = eta,
+      eta = eta,
       colsample_bytree = colsample_bytree,
       colsample_bynode = colsample_bynode,
       min_child_weight = min_child_weight,

@@ -537,7 +534,6 @@ predict.xgboost_multistep_fit_impl <- function(object, new_data, ...) {
 #' @keywords internal
 #' @export
 xgboost_multistep_predict_impl <- function(object, new_data, ...) {
-
   # Date Mapping Table
   date_tbl <- new_data %>%
     dplyr::select(Date, Date_index.num) %>%

R/prep_data.R

@@ -227,7 +227,6 @@ prep_data <- function(run_info,
     dplyr::filter(Combo %in% current_combo_list_final)

   if (length(combo_diff) == 0 & length(prev_combo_list) > 0) {
-
     # check if input values have changed
     current_log_df <- tibble::tibble(
       combo_variables = paste(combo_variables, collapse = "---"),

@@ -466,204 +465,205 @@ prep_data <- function(run_info,
   } else if (parallel_processing == "spark") {
     final_data <- filtered_initial_prep_tbl %>%
       adjust_df(return_type = "sdf") %>%
-      sparklyr::spark_apply(function(df, context) {
-        # update objects
-        fn_env <- .GlobalEnv
+      sparklyr::spark_apply(
+        function(df, context) {
+          # update objects
+          fn_env <- .GlobalEnv

          for (name in names(context)) {
            assign(name, context[[name]], envir = fn_env)
          }

          # get specific time series
          combo <- unique(df$Combo)

          return_tbl <- tibble::tibble(
            Combo = combo,
            Combo_Hash = hash_data(combo)
          )

          # handle external regressors
          xregs_future_tbl <- get_xregs_future_values_tbl(
            df,
            external_regressors,
            hist_end_date
          )

          if (length(colnames(xregs_future_tbl)) > 2) {
            xregs_future_list <- xregs_future_tbl %>%
              dplyr::select(-Date, -Combo) %>%
              colnames()
          } else {
            xregs_future_list <- NULL
          }

          # initial data prep
          initial_tbl <- df %>%
            dplyr::filter(Combo == combo) %>%
            dplyr::select(
              Combo,
              Date,
              Target,
              tidyselect::all_of(external_regressors)
            ) %>%
            dplyr::group_by(Combo) %>%
            timetk::pad_by_time(Date,
              .by = date_type,
              .pad_value = ifelse(clean_missing_values, NA, 0),
              .end_date = hist_end_date
            ) %>% # fill in missing values in between existing data points
            timetk::pad_by_time(Date,
              .by = date_type,
              .pad_value = 0,
              .start_date = hist_start_date,
              .end_date = hist_end_date
            ) %>% # fill in missing values at beginning of time series with zero
            timetk::future_frame(Date,
              .length_out = forecast_horizon,
              .bind_data = TRUE
            ) %>% # add future data
            dplyr::ungroup() %>%
            dplyr::left_join(xregs_future_tbl,
              by = c("Combo", "Date")
            ) %>% # join xregs that contain values given by user
            clean_outliers_missing_values(
              clean_outliers,
              clean_missing_values,
              get_frequency_number(date_type),
              external_regressors
            ) %>% # clean outliers and missing values
            dplyr::mutate_if(is.numeric, list(~ replace(., is.infinite(.), NA))) %>% # replace infinite values
            dplyr::mutate_if(is.numeric, list(~ replace(., is.nan(.), NA))) %>% # replace NaN values
            dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% # replace NA values
            dplyr::mutate(Target = ifelse(Date > hist_end_date,
              NA,
              Target
            ))

          # box-cox transformation
          if (box_cox) {
            box_cox_tbl <- initial_tbl %>%
              apply_box_cox()

            initial_tbl <- box_cox_tbl$data

            return_tbl <- return_tbl %>%
              dplyr::left_join(box_cox_tbl$diff_info, by = "Combo")
          }

          # make stationary
          if (stationary) {
            stationary_tbl <- initial_tbl %>%
              make_stationary()

            initial_tbl <- stationary_tbl$data

            return_tbl <- return_tbl %>%
              dplyr::left_join(stationary_tbl$diff_info, by = "Combo")
          }

          # create date features
          date_features <- initial_tbl %>%
            dplyr::select(Date) %>%
            dplyr::mutate(
              Date_Adj = Date %m+% months(fiscal_year_start - 1),
              Date_day_month_end = ifelse(lubridate::day(Date_Adj) == lubridate::days_in_month(Date_Adj), 1, 0)
            ) %>%
            timetk::tk_augment_timeseries_signature(Date_Adj) %>%
            dplyr::select(!tidyselect::matches(get_date_regex(date_type)), -Date_Adj, -Date)

          names(date_features) <- stringr::str_c("Date_", names(date_features))

          initial_tbl <- initial_tbl %>%
            cbind(date_features)

          # Run Recipes
          if (is.null(recipes_to_run)) {
            run_all_recipes_override <- FALSE
          } else if (recipes_to_run == "all") {
            run_all_recipes_override <- TRUE
          } else {
            run_all_recipes_override <- FALSE
          }

          if (is.null(recipes_to_run) | "R1" %in% recipes_to_run | run_all_recipes_override) {
            R1 <- initial_tbl %>%
              multivariate_prep_recipe_1(external_regressors,
                xregs_future_values_list = xregs_future_list,
                get_fourier_periods(fourier_periods, date_type),
                get_lag_periods(lag_periods, date_type, forecast_horizon, multistep_horizon, TRUE),
                get_rolling_window_periods(rolling_window_periods, date_type)
              ) %>%
              dplyr::mutate(Target = base::ifelse(Date > hist_end_date, NA, Target))

            write_data(
              x = R1,
              combo = combo,
              run_info = run_info,
              output_type = "data",
              folder = "prep_data",
              suffix = "-R1"
            )
          }

          if ((is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")) | "R2" %in% recipes_to_run | run_all_recipes_override) {
            R2 <- initial_tbl %>%
              multivariate_prep_recipe_2(external_regressors,
                xregs_future_values_list = xregs_future_list,
                get_fourier_periods(fourier_periods, date_type),
                get_lag_periods(lag_periods, date_type, forecast_horizon),
                get_rolling_window_periods(rolling_window_periods, date_type),
                date_type,
                forecast_horizon
              ) %>%
              dplyr::mutate(Target = base::ifelse(Date > hist_end_date, NA, Target))

            write_data(
              x = R2,
              combo = combo,
              run_info = run_info,
              output_type = "data",
              folder = "prep_data",
              suffix = "-R2"
            )
          }

          return(data.frame(return_tbl))
        },
        group_by = "Combo",
        context = list(
          get_xregs_future_values_tbl = get_xregs_future_values_tbl,
          external_regressors = external_regressors,
          clean_missing_values = clean_missing_values,
          clean_outliers_missing_values = clean_outliers_missing_values,
          hash_data = hash_data,
          hist_end_date = hist_end_date,
          hist_start_date = hist_start_date,
          forecast_approach = forecast_approach,
          forecast_horizon = forecast_horizon,
          clean_outliers = clean_outliers,
          get_frequency_number = get_frequency_number,
          date_type = date_type,
          fiscal_year_start = fiscal_year_start,
          get_date_regex = get_date_regex,
          recipes_to_run = recipes_to_run,
          multivariate_prep_recipe_1 = multivariate_prep_recipe_1,
          multivariate_prep_recipe_2 = multivariate_prep_recipe_2,
          run_info = run_info,
          get_fourier_periods = get_fourier_periods,
          fourier_periods = fourier_periods,
          get_lag_periods = get_lag_periods,
          lag_periods = lag_periods,
          get_rolling_window_periods = get_rolling_window_periods,
          rolling_window_periods = rolling_window_periods,
          write_data = write_data,
          write_data_folder = write_data_folder,
          write_data_type = write_data_type,
          box_cox = box_cox,
          stationary = stationary,
          make_stationary = make_stationary,
          apply_box_cox = apply_box_cox
        )
+      )
   }

@@ -693,12 +693,13 @@ prep_data <- function(run_info,
     length()

   if (successful_combos != total_combos) {
-    stop(paste0(
-      "Not all time series were prepped within 'prep_data', expected ",
-      total_combos, " time series but only ", successful_combos,
-      " time series are prepped. ", "Please run 'prep_data' again."
-    ),
-    call. = FALSE
+    stop(
+      paste0(
+        "Not all time series were prepped within 'prep_data', expected ",
+        total_combos, " time series but only ", successful_combos,
+        " time series are prepped. ", "Please run 'prep_data' again."
+      ),
+      call. = FALSE
     )
   }

@@ -1067,7 +1068,6 @@ apply_box_cox <- function(df) {
   )

   for (column_name in names(df)) {
-
     # Only check numeric columns with more than 2 unique values
     if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) {
       temp_tbl <- df %>%

@@ -1119,7 +1119,6 @@ make_stationary <- function(df) {
   )

   for (column_name in names(df)) {
-
     # Only check numeric columns with more than 2 unique values
     if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) {
       temp_tbl <- df %>%

@@ -1183,7 +1182,6 @@ multivariate_prep_recipe_1 <- function(data,
                                       rolling_window_periods,
                                       hist_end_date,
                                       date_type) {
-
   # apply polynomial transformations
   numeric_xregs <- c()

@@ -1345,7 +1343,6 @@ multivariate_prep_recipe_2 <- function(data,
   }

   for (period in 1:forecast_horizon) {
-
     # add horizon and origin components
     data_lag_window <- df_poly %>%
       dplyr::mutate(

@@ -1,4 +1,3 @@
-
 #' Prep Models
 #'
 #' Preps various aspects of run before training models. Things like train/test

@@ -60,7 +59,6 @@ prep_models <- function(run_info,
                        pca = NULL,
                        num_hyperparameters = 10,
                        seed = 123) {
-
   # check input values
   check_input_type("run_info", run_info, "list")
   check_input_type("back_test_scenarios", back_test_scenarios, c("NULL", "numeric"))

@@ -514,7 +512,6 @@ model_workflows <- function(run_info,
   ml_models <- list_models()

   if (is.null(models_to_run) & is.null(models_not_to_run)) {
-
     # do nothing, using existing ml_models list
   } else if (is.null(models_to_run) & !is.null(models_not_to_run)) {
     ml_models <- setdiff(ml_models, models_not_to_run)

@@ -1,4 +1,3 @@
-
 #' Get Final Forecast Data
 #'
 #' @param run_info run info using the [set_run_info()] function

@@ -46,7 +45,6 @@
 #' @export
 get_forecast_data <- function(run_info,
                              return_type = "df") {
-
   # check input values
   check_input_type("run_info", run_info, "list")
   check_input_type("return_type", return_type, "character", c("df", "sdf"))

@@ -76,17 +74,17 @@ get_forecast_data <- function(run_info,
     run_info$path, "/forecasts/*", hash_data(run_info$experiment_name), "-",
     hash_data(run_info$run_name), "*condensed", ".", run_info$data_output
   )

   condensed_files <- list_files(run_info$storage_object, fs::path(cond_path))

   if (length(condensed_files) > 0) {
     condensed <- TRUE
   } else {
     condensed <- FALSE
   }

   # get forecast data
-  if(forecast_approach != "bottoms_up") {
+  if (forecast_approach != "bottoms_up") {
     fcst_path <- paste0(
       "/forecasts/*", hash_data(run_info$experiment_name), "-",
       hash_data(run_info$run_name), "*reconciled", ".", run_info$data_output

@@ -172,7 +170,6 @@ get_forecast_data <- function(run_info,
 #' }
 #' @export
 get_trained_models <- function(run_info) {
-
   # check input values
   check_input_type("run_info", run_info, "list")

@@ -228,7 +225,6 @@ get_trained_models <- function(run_info) {
 get_prepped_data <- function(run_info,
                             recipe,
                             return_type = "df") {
-
   # check input values
   check_input_type("run_info", run_info, "list")
   check_input_type("recipe", recipe, "character", c("R1", "R2"))

@@ -299,7 +295,6 @@ get_prepped_data <- function(run_info,
 #' }
 #' @export
 get_prepped_models <- function(run_info) {
-
   # check input values
   check_input_type("run_info", run_info, "list")

@@ -527,12 +522,12 @@ download_file <- function(storage_object,
 #' @noRd
 read_file <- function(run_info,
                      path = NULL,
                      file_list = NULL,
                      return_type = "df",
                      schema = NULL) {
   storage_object <- run_info$storage_object

-  if(!is.null(path)) {
+  if (!is.null(path)) {
     folder <- fs::path_dir(path)
     initial_path <- run_info$path
     file <- fs::path_file(path)

@@ -551,7 +546,7 @@ read_file <- function(run_info,
     files <- list_files(storage_object, fs::path(initial_path, path))
   }

-  if(!is.null(file_list)) {
+  if (!is.null(file_list)) {
     file_temp <- files[[1]]
     file_ext <- fs::path_ext(file_temp)
   } else if (fs::path_ext(file) == "*") {

@@ -698,10 +693,9 @@ get_recipe_data <- function(run_info,
 #'
 #' @return nothing
 #' @noRd
 condense_data <- function(run_info,
                          parallel_processing = NULL,
                          num_cores = NULL) {
-
   # get initial list of files to condense
   initial_file_list <- list_files(
     run_info$storage_object,

@@ -710,16 +704,16 @@ condense_data <- function(run_info,
       hash_data(run_info$run_name), "*_models.", run_info$data_output
     )
   )

   # Initialize an empty list to store the batches
   list_of_batches <- list()

   # Define the batch size
   batch_size <- 10000

   # Calculate the number of batches needed
   num_batches <- ceiling(length(initial_file_list) / batch_size)

   # Loop through the large list and create batches
   for (i in 1:num_batches) {
     start_index <- (i - 1) * batch_size + 1

@@ -727,7 +721,7 @@ condense_data <- function(run_info,
     batch_name <- paste0("batch_", i)
     list_of_batches[[batch_name]] <- initial_file_list[start_index:end_index]
   }

   # parallel run info
   par_info <- par_start(
     run_info = run_info,

@@ -735,11 +729,11 @@ condense_data <- function(run_info,
     num_cores = min(length(names(list_of_batches)), num_cores),
     task_length = length(names(list_of_batches))
   )

   cl <- par_info$cl
   packages <- par_info$packages
   `%op%` <- par_info$foreach_operator

   # submit tasks
   condense_data_tbl <- foreach::foreach(
     batch = names(list_of_batches),

@@ -750,23 +744,23 @@ condense_data <- function(run_info,
     .inorder = FALSE,
     .multicombine = TRUE,
     .noexport = NULL
-  ) %op%
-    {
-      files <- list_of_batches[[batch]]
+  ) %op% {
+    files <- list_of_batches[[batch]]

-      data <- read_file(run_info,
-        file_list = files,
-        return_type = "df")
+    data <- read_file(run_info,
+      file_list = files,
+      return_type = "df"
+    )

     write_data(
       x = data,
       combo = batch,
       run_info = run_info,
       output_type = "data",
       folder = "forecasts",
       suffix = "-condensed"
     )

     return(batch)
   }
 }

@@ -44,7 +44,6 @@ set_run_info <- function(experiment_name = "finn_fcst",
                         data_output = "csv",
                         object_output = "rds",
                         add_unique_id = TRUE) {
-
   # initial input checks
   if (!inherits(run_name, c("NULL", "character"))) {
     stop("`run_name` must either be a NULL or a string")

@@ -151,7 +150,6 @@ set_run_info <- function(experiment_name = "finn_fcst",
     base::suppressWarnings()

   if (nrow(log_df) > 0 & add_unique_id == FALSE) {
-
     # check if input values have changed
     current_log_df <- tibble::tibble(
       experiment_name = experiment_name,

@@ -266,7 +264,6 @@ get_run_info <- function(experiment_name = NULL,
                         run_name = NULL,
                         storage_object = NULL,
                         path = NULL) {
-
   # input checks
   if (!inherits(run_name, c("NULL", "character"))) {
     stop("`run_name` must either be a NULL or a string")

@@ -1,4 +1,3 @@
-
 #' Train Individual Models
 #'
 #' @param run_info run info using the [set_run_info()] function

@@ -220,7 +219,6 @@ train_models <- function(run_info,
     stringr::str_replace(hash_data("All-Data"), "All-Data")

   if (length(combo_diff) == 0 & length(prev_combo_list) > 0) {
-
     # check if input values have changed
     current_log_df <- tibble::tibble(
       run_global_models = run_global_models,

@@ -270,7 +268,6 @@ train_models <- function(run_info,
       .noexport = NULL
     ) %op%
       {
-
         # get time series
         combo_hash <- x

@@ -402,7 +399,6 @@ train_models <- function(run_info,
     .multicombine = TRUE,
     .noexport = NULL
   ) %do% {
-
     # get initial run info
     model <- model_run %>%
       dplyr::pull(Model_Name)

@@ -719,12 +715,13 @@ train_models <- function(run_info,
     length()

   if (successful_combos != total_combos) {
-    stop(paste0(
-      "Not all time series were completed within 'train_models', expected ",
-      total_combos, " time series but only ", successful_combos,
-      " time series were ran. ", "Please run 'train_models' again."
-    ),
-    call. = FALSE
+    stop(
+      paste0(
+        "Not all time series were completed within 'train_models', expected ",
+        total_combos, " time series but only ", successful_combos,
+        " time series were ran. ", "Please run 'train_models' again."
+      ),
+      call. = FALSE
     )
   }

@@ -784,7 +781,6 @@ negative_fcst_adj <- function(data,
 #' @return tbl with train test splits
 #' @noRd
 create_splits <- function(data, train_test_splits) {
-
   # Create the rsplit object
   analysis_split <- function(data, train_indices, test_indices) {
     rsplit_object <- rsample::make_splits(

@@ -846,7 +842,6 @@ create_splits <- function(data, train_test_splits) {
 undifference_forecast <- function(forecast_data,
                                  recipe_data,
                                  diff_tbl) {
-
   # check if data needs to be undifferenced
   diff1 <- diff_tbl$Diff_Value1
   diff2 <- diff_tbl$Diff_Value2

@@ -863,10 +858,8 @@ undifference_forecast <- function(forecast_data,

   # non seasonal differencing
   if (!is.na(diff1)) {
-
     # loop through each back test split
     for (id in train_test_id) {
-
       # get specific train test split
       fcst_temp_tbl <- forecast_data %>%
         dplyr::filter(Train_Test_ID == id)

@@ -950,7 +943,6 @@ undifference_forecast <- function(forecast_data,
 undifference_recipe <- function(recipe_data,
                                diff_tbl,
                                hist_end_date) {
-
   # check if data needs to be undifferenced
   diff1 <- diff_tbl$Diff_Value1
   diff2 <- diff_tbl$Diff_Value2

@@ -15,7 +15,7 @@ utils::globalVariables(c(
   "Auto_Accept", "Feature", "Imp", "Importance", "LOFO_Var", "Var_RMSE", "Vote", "Votes", "desc",
   "term", "Column", "Box_Cox_Lambda", "get_recipie_configurable", "Agg", "Unique", "Var",
   "Var_Combo", "regressor", "regressor_tbl", "value_level_iter", ".actual", ".fitted",
-  "forecast_horizon", "lag", "new_data", "object", "fit", "Row_Num", "Run_Number", "weight",
+  "forecast_horizon", "lag", "new_data", "object", "fit", "Row_Num", "Run_Number", "weight",
   "Total", "Weight", "batch"
 ))

@@ -89,7 +89,6 @@ cbind.fill <- function(..., fill = NA) {
 # been loaded.

 .onLoad <- function(libname, pkgname) {
-
   # CRAN OMP THREAD LIMIT
   Sys.setenv("OMP_THREAD_LIMIT" = 1)

@@ -1,4 +1,3 @@
-
 # * custom test functions ----

 check_exist <- function(to_check, ret) {

@@ -1,6 +1,4 @@
-
 test_that("prep_hierarchical_data returns correct grouped hierarchies", {
-
   # Mock data setup
   data <- tibble::tibble(
     Segment = as.character(c(

@@ -77,7 +75,6 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", {
 })

 test_that("prep_hierarchical_data returns correct standard hierarchies", {
-
   # Mock data setup
   data <- tibble::tibble(
     Area = as.character(c("EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "United States", "United States", "United States", "United States")),

@@ -1,5 +1,4 @@
 test_that("multistep_horizon monthly data", {
-
   # Mock data setup
   data <- timetk::m4_monthly %>%
     dplyr::mutate(id = as.character(id)) %>%

@@ -22,27 +22,33 @@ When `prep_models()` is run, hyperparameters and back test splits are calculated
 library(finnts)

 hist_data <- timetk::m4_monthly %>%
-  dplyr::filter(date >= "2012-01-01",
-                id == "M2") %>%
+  dplyr::filter(
+    date >= "2012-01-01",
+    id == "M2"
+  ) %>%
   dplyr::rename(Date = date) %>%
   dplyr::mutate(id = as.character(id))

 run_info <- set_run_info(
   experiment_name = "finnts_fcst",
   run_name = "get_prepped_models"
 )

-prep_data(run_info = run_info,
-  input_data = hist_data,
-  combo_variables = c("id"),
-  target_variable = "value",
-  date_type = "month",
-  recipes_to_run = "R1",
-  forecast_horizon = 6)
+prep_data(
+  run_info = run_info,
+  input_data = hist_data,
+  combo_variables = c("id"),
+  target_variable = "value",
+  date_type = "month",
+  recipes_to_run = "R1",
+  forecast_horizon = 6
+)

-prep_models(run_info = run_info,
-  models_to_run = c("arima", "ets", "xgboost"),
-  num_hyperparameters = 10)
+prep_models(
+  run_info = run_info,
+  models_to_run = c("arima", "ets", "xgboost"),
+  num_hyperparameters = 10
+)

 model_info <- get_prepped_models(run_info = run_info)

@@ -67,7 +73,6 @@ model_hyperparameter_info <- model_info %>%
   tidyr::unnest(Data)

 print(model_hyperparameter_info)
-
 ```

 The above outputs allow a Finn user to understand what hyperparameters are chosen for tuning and how the model refitting process will work. When tuning hyperparameters, Finn uses the "Validation" train/test splits, with the final parameters chosen using RMSE. For some models like ARIMA that don't follow a traditional hyperparameter tuning process, the model is fit from scratch across all train/test splits. After hyperparameters are chosen, the model is refit across the "Back_Test" and "Future_Forecast" splits. The "Back_Test" splits are the true testing data that will be used when selecting the final "Best-Model". "Ensemble" splits are also created as ensemble training data if ensemble models are chosen to run. Ensemble models follow a similar tuning process.
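As a rough sketch of the selection rule described above — using a hypothetical `tune_results` table, not the package's internal format — hyperparameter sets are compared by their average RMSE across the "Validation" splits, and the lowest-error set per model wins:

```r
# Illustrative only: average RMSE across validation splits, then keep
# the lowest-error hyperparameter set per model.
library(dplyr)

tune_results <- tibble::tibble(
  Model = c("xgboost", "xgboost", "xgboost", "xgboost"),
  Hyperparameter_ID = c(1, 1, 2, 2),
  Validation_Split = c(1, 2, 1, 2),
  RMSE = c(10.2, 11.0, 9.1, 9.8)
)

best_params <- tune_results %>%
  group_by(Model, Hyperparameter_ID) %>%
  summarise(Mean_RMSE = mean(RMSE), .groups = "drop") %>%
  group_by(Model) %>%
  slice_min(Mean_RMSE, n = 1) %>%
  ungroup()

print(best_params) # Hyperparameter_ID 2 wins with the lower average RMSE
```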
@@ -23,17 +23,21 @@ suppressMessages(library(dplyr))

 message("Simple Back Test Results")
 back_test_tbl <- tibble(
   Combo = c("Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1"),
   Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01"),
   Model = c("arima", "arima", "arima", "arima", "arima", "ets", "ets", "ets", "ets", "ets"),
   FCST = c(9, 23, 35, 41, 48, 7, 22, 29, 42, 53),
   Target = c(10, 20, 30, 40, 50, 10, 20, 30, 40, 50)
 ) %>%
-  dplyr::mutate(MAPE = abs(Target-FCST)/Target,
-    Date = as.Date(Date)) %>%
+  dplyr::mutate(
+    MAPE = abs(Target - FCST) / Target,
+    Date = as.Date(Date)
+  ) %>%
   dplyr::group_by(Combo, Model) %>%
-  dplyr::mutate(Target_Total = sum(Target),
-    Percent_Total = Target/Target_Total) %>%
+  dplyr::mutate(
+    Target_Total = sum(Target),
+    Percent_Total = Target / Target_Total
+  ) %>%
   dplyr::ungroup()

 print(back_test_tbl)

@@ -44,12 +48,13 @@ message("Overall Model Accuracy by Combo")
 suppressMessages(best_model <- back_test_tbl %>%
   dplyr::group_by(Combo, Model) %>%
   dplyr::mutate(Weighted_MAPE = MAPE * Percent_Total) %>%
-  dplyr::summarise(MAPE = mean(MAPE),
-    Weighted_MAPE = sum(Weighted_MAPE)) %>%
+  dplyr::summarise(
+    MAPE = mean(MAPE),
+    Weighted_MAPE = sum(Weighted_MAPE)
+  ) %>%
   dplyr::ungroup())

 print(best_model)
 ```

 During the simple back test process above, arima seems to be the better model from a pure MAPE perspective, but ETS ends up being the winner when using weighted MAPE. The benefit of weighted MAPE is that it lets finnts find the model that performs best on the biggest components of a forecast, which also puts more weight on more recent observations, since those are more likely to have larger target values than ones further in the past. Another way of putting more weight on recent observations is how Finn overlaps its back testing scenarios: the most recent observations are tested for accuracy across different forecast horizons (H=1, H=2, etc.). More info on this is in the back testing vignette.
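A toy illustration of why the weighting matters (made-up numbers, independent of the table above): two forecasts with identical plain MAPE can have very different weighted MAPE depending on whether they miss the small or the large observation.

```r
# Toy example (not from the package): identical plain MAPE,
# very different weighted MAPE.
target <- c(10, 100)
fcst_a <- c(12, 100) # misses the small observation
fcst_b <- c(10, 120) # misses the large observation

mape  <- function(f, t) mean(abs(f - t) / t)
wmape <- function(f, t) sum((abs(f - t) / t) * (t / sum(t)))

mape(fcst_a, target)  # 0.1
mape(fcst_b, target)  # 0.1
wmape(fcst_a, target) # ~0.018: the miss falls on a small share of the total
wmape(fcst_b, target) # ~0.182: the miss falls on a large share of the total
```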
@@ -42,11 +42,11 @@ Below is an example of how you can set up your input_data to leverage future values of external regressors
 library(dplyr)

 tibble(
   Combo = c("Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1", "Country_1"),
   Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01", "2020-07-01", "2020-08-01", "2020-09-01", "2020-10-01", "2020-11-01", "2020-12-01", "2021-01-01", "2021-02-01", "2021-03-01"),
   Target = c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, NA, NA, NA),
   Holiday = c("New Years", "Valentines Day", "None", "Easter", "None", "None", "4th of July", "None", "Labor Day", "Halloween", "Thanksgiving", "Christmas", "New Years", "Valentines Day", "None"),
   GDP = c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, NA, NA, NA),
   Sales_Pipeline = c(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240)
 ) %>%
   dplyr::mutate(Date = as.Date(Date))

@@ -74,7 +74,6 @@ In addition to the standard approaches above, finnts also does two different ways
 The first recipe, referred to as "R1" in default finnts models, by default takes a single-step horizon approach, meaning all of the engineered target and external regressor features are used, but the lags cannot be less than the forecast horizon. For example, with a monthly data set and a forecast horizon of 3, finnts will take engineered features like lags and rolling window features but only use those lags that are for periods equal to or greater than 3 (see the short sketch below). You can also run a multistep horizon approach by setting `multistep_horizon` to TRUE in `prep_models()`. The multistep approach will create features that can be used by specific multivariate models that optimize for each period in a forecast horizon. More on this in the "models used in finnts" vignette. Recursive forecasting is not supported in finnts multivariate machine learning models, since feeding forecast outputs as features to create another forecast adds complex layers of uncertainty that can easily spiral out of control and produce poor forecasts. NA values created by generating lag features are filled "up". This results in the first initial periods of a time series having some data leakage, but the effect should be small if the time series is long enough.

 ```{r, message = FALSE}
-
 library(finnts)

 hist_data <- timetk::m4_monthly %>%
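A minimal sketch of that lag constraint (illustrative numbers, not package internals):

```r
# With a 3-period horizon, lags shorter than the horizon would require
# future values that are unknown at prediction time, so they are dropped.
forecast_horizon <- 3
candidate_lags <- c(1, 2, 3, 6, 9, 12)
r1_lags <- candidate_lags[candidate_lags >= forecast_horizon]
r1_lags # 3 6 9 12
```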
@@ -112,7 +111,6 @@ print(R1_prepped_data_tbl)

 The second recipe is referred to as "R2" in default finnts models. It takes a very different approach than the "R1" recipe. For a 3 month forecast horizon on a monthly dataset, target and rolling window features are created depending on the horizon period. They are also constrained to be equal to or less than the forecast horizon. In the below example, "Origin" and "Horizon" features are created for each time period. This results in duplicating rows in the original data set to create new features that are now specific to each horizon period (see the rough sketch after this chunk). This helps the default finnts models find new unique relationships to model, when compared to a more formal approach in "R1". NA values created by generating lag features are filled "up".

 ```{r, message = FALSE}
-
 library(finnts)

 hist_data <- timetk::m4_monthly %>%
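A rough sketch of that R2-style row duplication (illustrative only; the actual recipe builds additional origin and lag features on top of this):

```r
# Each historical row is repeated once per horizon period and tagged with a
# Horizon column, so features can then be engineered per horizon.
library(dplyr)
library(tidyr)

monthly_tbl <- tibble(
  Date = seq.Date(as.Date("2020-01-01"), by = "month", length.out = 4),
  Target = c(10, 20, 30, 40)
)

r2_style_tbl <- monthly_tbl %>%
  tidyr::crossing(Horizon = 1:3) %>%
  dplyr::arrange(Date, Horizon)

r2_style_tbl # 12 rows: every original row now appears once per horizon
```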
@ -10,8 +10,8 @@ vignette: >
|
|||
```{r, include = FALSE}
|
||||
knitr::opts_chunk$set(
|
||||
collapse = TRUE,
|
||||
comment = "#>",
|
||||
warning = FALSE,
|
||||
comment = "#>",
|
||||
warning = FALSE,
|
||||
message = FALSE
|
||||
)
|
||||
library(finnts)
|
||||
|
@ -62,18 +62,15 @@ The above data set contains 4 individual time series, identified using the "id"
|
|||
Before we call the Finn forecast function. Let's first set up some run information using `set_run_info()`, this helps log all components of our Finn forecast successfully.

```{r, message = FALSE, error = FALSE, warning = FALSE, echo = TRUE, eval = TRUE}
run_info <- set_run_info(
  experiment_name = "finn_forecast",
  run_name = "test_run"
)
```

Calling the `forecast_time_series()` function is the easiest part. In this example we will be running just two models.

```{r, message = FALSE, error = FALSE, warning = FALSE, echo = TRUE, eval = TRUE}
# no need to assign it to a variable, since all of the outputs are written to disk :)
forecast_time_series(
  run_info = run_info,

@@ -82,8 +79,8 @@ forecast_time_series(
  target_variable = "value",
  date_type = "month",
  forecast_horizon = 3,
  back_test_scenarios = 6,
  models_to_run = c("arima", "ets"),
  return_data = FALSE
)
```


@@ -93,7 +90,6 @@ forecast_time_series(

### Initial Finn Outputs

```{r, message = FALSE, eval = FALSE, echo = TRUE}
finn_output_tbl <- get_forecast_data(run_info = run_info)

print(finn_output_tbl)

@@ -102,7 +98,6 @@ print(finn_output_tbl)

### Future Forecast

```{r, message = FALSE, eval = FALSE, echo = TRUE}
future_forecast_tbl <- finn_output_tbl %>%
  dplyr::filter(Run_Type == "Future_Forecast")

@@ -142,13 +137,17 @@ print(trained_model_tbl)

### Initial Prepped Data

```{r, message = FALSE, eval = FALSE, echo = TRUE}
R1_prepped_data_tbl <- get_prepped_data(run_info = run_info,
                                        recipe = "R1")
R1_prepped_data_tbl <- get_prepped_data(
  run_info = run_info,
  recipe = "R1"
)

print(R1_prepped_data_tbl)

R2_prepped_data_tbl <- get_prepped_data(run_info = run_info,
                                        recipe = "R2")
R2_prepped_data_tbl <- get_prepped_data(
  run_info = run_info,
  recipe = "R2"
)

print(R2_prepped_data_tbl)
```

@@ -10,8 +10,8 @@ vignette: >

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE
)
library(finnts)

@@ -29,13 +29,15 @@ Let's get some example data and then set our Finn run info.

library(finnts)

hist_data <- timetk::m4_monthly %>%
  dplyr::filter(date >= "2013-01-01",
                id == "M2") %>%
  dplyr::filter(
    date >= "2013-01-01",
    id == "M2"
  ) %>%
  dplyr::rename(Date = date) %>%
  dplyr::mutate(id = as.character(id))

run_info <- set_run_info(
  experiment_name = "finnts_fcst",
  run_name = "finn_sub_component_run"
)
```

@@ -45,20 +47,26 @@ run_info <- set_run_info(

Clean and prepare our data before training models. We can even pull out our prepped data to see the features and transformations applied before models are trained.

```{r message=FALSE}
prep_data(run_info = run_info,
          input_data = hist_data,
          combo_variables = c("id"),
          target_variable = "value",
          date_type = "month",
          forecast_horizon = 6)
prep_data(
  run_info = run_info,
  input_data = hist_data,
  combo_variables = c("id"),
  target_variable = "value",
  date_type = "month",
  forecast_horizon = 6
)

R1_prepped_data_tbl <- get_prepped_data(run_info = run_info,
                                        recipe = "R1")
R1_prepped_data_tbl <- get_prepped_data(
  run_info = run_info,
  recipe = "R1"
)

print(R1_prepped_data_tbl)

R2_prepped_data_tbl <- get_prepped_data(run_info = run_info,
                                        recipe = "R2")
R2_prepped_data_tbl <- get_prepped_data(
  run_info = run_info,
  recipe = "R2"
)

print(R2_prepped_data_tbl)
```

@@ -70,12 +78,16 @@ Now that our data is prepared for modeling, let's now train some models. First w

Then we can kick off training each model on our data.

```{r, message = FALSE}
prep_models(run_info = run_info,
            models_to_run = c("arima", "ets", "glmnet"),
            num_hyperparameters = 2)
prep_models(
  run_info = run_info,
  models_to_run = c("arima", "ets", "glmnet"),
  num_hyperparameters = 2
)

train_models(run_info = run_info,
             run_global_models = FALSE)
train_models(
  run_info = run_info,
  run_global_models = FALSE
)
```

## Train Ensemble Models

@@ -99,7 +111,6 @@ final_models(run_info = run_info)

Finally, we can retrieve the forecast results from this Finn run.

```{r, message = FALSE}
finn_output_tbl <- get_forecast_data(run_info = run_info)

print(finn_output_tbl)

@@ -27,15 +27,14 @@ message("Standard Hierarchical Time Series Data")

hts <- tibble(
  Continent = c("North America", "North America", "North America", "North America", "North America", "North America", "North America", "North America", "North America"),
  Country = c("United States", "United States", "United States", "United States", "United States", "United States", "Mexico", "Mexico", "Mexico"),
  City = c("Kansas City", "Kansas City", "Kansas City", "Seattle", "Seattle", "Seattle", "Mexico City", "Mexico City", "Mexico City"),
  Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01"),
  Target = c(100, 250, 320, 80, 200, 270, 50, 80, 120)
) %>%
  dplyr::mutate(Date = as.Date(Date))

print(hts)
```

In the above example, "City" is the lowest level of the hierarchy, which feeds into "Country", which then feeds into "Continent". Finn will take this data and forecast by City, total Country, and total Continent. After each model is run for every level in the hierarchy, the best model is chosen at each level, then the "Best Model" and every other model are reconciled back down to the lowest level.
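
As a rough illustration of the roll-up involved (a toy sketch built on the `hts` tibble above, not Finn's internal reconciliation code):

```{r, eval = FALSE}
# Toy sketch: the kind of aggregation implied when City level data is
# rolled up to total Country; Finn builds these levels and reconciles
# the results back down internally.
library(dplyr)

country_totals <- hts %>%
  dplyr::group_by(Continent, Country, Date) %>%
  dplyr::summarise(Target = sum(Target), .groups = "drop")

print(country_totals)
```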

@@ -50,16 +49,15 @@ suppressMessages(library(dplyr))

message("Grouped Hierarchical Time Series Data")

gts <- tibble(
  Country = c("United States", "United States", "United States", "United States", "United States", "United States", "Mexico", "Mexico", "Mexico", "Mexico", "Mexico", "Mexico"),
  Segment = c("Enterprise", "Enterprise", "Enterprise", "Public Sector", "Public Sector", "Public Sector", "Enterprise", "Enterprise", "Enterprise", "Enterprise", "Enterprise", "Enterprise"),
  Product = c("Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Coffee", "Tea", "Tea", "Tea"),
  Date = c("2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-01-01", "2020-02-01", "2020-03-01"),
  Target = c(10, 20, 30, 5, 8, 11, 20, 23, 27, 50, 55, 60)
) %>%
  dplyr::mutate(Date = as.Date(Date))

print(gts)
```

It would be hard to aggregate the above data in a traditional hierarchy. The same products are found in different segments and countries, and the same segments are found in multiple countries. Finn will follow a modeling process similar to the one described for a traditional hierarchy, but will instead create forecasts at the levels below.
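
For intuition, one of those grouped aggregates can be sketched directly from the `gts` tibble above (illustration only; Finn derives the groupings itself):

```{r, eval = FALSE}
# Toy sketch: total Target by Product across all countries and segments,
# one of the grouped aggregates Finn can forecast and reconcile.
library(dplyr)

product_totals <- gts %>%
  dplyr::group_by(Product, Date) %>%
  dplyr::summarise(Target = sum(Target), .groups = "drop")

print(product_totals)
```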

@@ -22,7 +22,7 @@ reactable::reactable(

data.frame() %>%
  rbind(data.frame(Model = "arima", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Regression model that is based on finding relationships between lagged values of the target variable you are trying to forecast.")) %>%
  rbind(data.frame(Model = "arima-boost", Type = "multivariate, local", Underlying.Package = "modeltime, forecast, xgboost", Description = "Arima model (refer to arima) that models the trend component of the target variable, then uses an xgboost model (refer to xgboost) to train on the remaining residuals.")) %>%
  rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "ARIMA model that incorporates external regressors and other engineered features.")) %>%
  rbind(data.frame(Model = "cubist", Type = "multivariate, local, global, ensemble", Underlying.Package = "rules", Description = "Hybrid of tree-based and linear regression approaches. Many decision trees are built, but regression coefficients are used at each terminal node instead of averaging values as in other tree-based approaches.")) %>%
  rbind(data.frame(Model = "croston", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Useful for intermittent demand forecasting, aka when there are a lot of periods of zero values. Involves simple exponential smoothing on non-zero values of the target variable and another application of seasonal exponential smoothing on periods between non-zero elements of the target variable. Refer to ets for more details on exponential smoothing.")) %>%
  rbind(data.frame(Model = "ets", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Forecasts produced using exponential smoothing methods are weighted averages of past observations, with the weights decaying exponentially as the observations get older. Exponential smoothing models try to forecast the components of a time series, which can be broken down into error, trend, and seasonality. These components can be forecasted separately then either added or multiplied together to get the final forecast output.")) %>%

|
|||
rbind(data.frame(Model = "svm-rbf", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, kernlab", Description = "Uses a nonlinear function, specifically a radial basis function, to create a regression line of the target variable.")) %>%
|
||||
rbind(data.frame(Model = "tbats", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "A spin off of the traditional ets model (refer to ets), with some additional components to capture multiple seasonalities.")) %>%
|
||||
rbind(data.frame(Model = "theta", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Theta is similar to exponential smoothing (refer to ets) but with another component called drift. Adding drift to exponential smoothing allows the forecast to increase or decrease over time, where the amount of change over time (called the drift) is set to be the average change seen within the historical data.")) %>%
|
||||
rbind(data.frame(Model = "xgboost", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, xgboost", Description = "Builds many decision trees (similar to random forests), but predictions that are initially inaccurate are applied more weight in subsequent training rounds to increase accuracy across all predictions."))
|
||||
,
|
||||
rbind(data.frame(Model = "xgboost", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, xgboost", Description = "Builds many decision trees (similar to random forests), but predictions that are initially inaccurate are applied more weight in subsequent training rounds to increase accuracy across all predictions.")),
|
||||
defaultColDef = colDef(
|
||||
header = function(value) gsub(".", " ", value, fixed = TRUE),
|
||||
cell = function(value) format(value, nsmall = 1),
|
||||
|
@ -51,12 +50,11 @@ reactable::reactable(
|
|||
headerStyle = list(background = "#f7f7f8")
|
||||
),
|
||||
columns = list(
|
||||
Description = colDef(minWidth = 140, align = "left") # overrides the default
|
||||
Description = colDef(minWidth = 140, align = "left") # overrides the default
|
||||
),
|
||||
bordered = TRUE,
|
||||
highlight = TRUE
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
### Univariate vs Multivariate Models

@@ -49,8 +49,8 @@ hist_data <- timetk::m4_monthly %>%

data_sdf <- sparklyr::copy_to(sc, hist_data, "data_sdf", overwrite = TRUE)

run_info <- set_run_info(
  experiment_name = "finn_fcst",
  run_name = "spark_run_1",
  path = "/dbfs/mnt/example/folder" # important that you mount an ADLS path
)

@@ -60,16 +60,18 @@ forecast_time_series(

  combo_variables = c("id"),
  target_variable = "value",
  date_type = "month",
  forecast_horizon = 3,
  parallel_processing = "spark",
  return_data = FALSE
)

# return the outputs as a spark data frame
finn_output_tbl <- get_forecast_data(run_info = run_info,
                                     return_type = "sdf")
finn_output_tbl <- get_forecast_data(
  run_info = run_info,
  return_type = "sdf"
)
```

The above example runs each time series on a separate core on a spark cluster. You can also submit multiple time series where each time series runs on a separate spark executor (VM) and then leverage all of the cores on that executor to run things like hyperparameter tuning or model refitting in parallel. This creates two levels of parallelization: one at the time series level, then another when doing things like hyperparameter tuning within a specific time series. To do that, set `inner_parallel` to TRUE in `forecast_time_series()`. Also make sure that you adjust the number of spark executor cores to 1, which ensures that only one time series runs on an executor at a time. Leverage the "spark.executor.cores" argument when configuring your spark connection. This can be done using [sparklyr](https://spark.rstudio.com/guides/connections#:~:text=In%20sparklyr%2C%20Spark%20properties%20can%20be%20set%20by,customized%20as%20shown%20in%20the%20example%20code%20below.) or within the cluster manager itself within the Azure resource. Use the `num_cores` argument in `forecast_time_series()` to control how many cores should be used within an executor when running things like hyperparameter tuning.

`forecast_time_series()` will be looking for a variable called "sc" to use when submitting tasks to the spark cluster, so make sure you use that as the variable name when connecting to spark. It's also important that you mount your spark session to an Azure Data Lake Storage (ADLS) account and provide the mounted path where you'd like your Finn results to be written within `set_run_info()`.
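
As a rough sketch of that connection setup (the master URL below is a placeholder, not a finnts requirement; adjust it for your own cluster or Databricks environment):

```{r, eval = FALSE}
# Hedged sketch: request one core per executor so each executor picks up
# a single time series, leaving hyperparameter tuning and refitting
# parallelism to the num_cores argument in forecast_time_series().
library(sparklyr)

conf <- spark_config()
conf$spark.executor.cores <- 1

sc <- spark_connect(
  master = "spark://your-cluster:7077", # placeholder cluster URL
  config = conf
)
```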