model function updates
This commit is contained in:
Родитель
6e0fa2467c
Коммит
3f1214f391
|
@ -62,7 +62,8 @@ Suggests:
|
|||
reactable,
|
||||
rmarkdown,
|
||||
sparklyr,
|
||||
testthat (>= 3.0.0)
|
||||
testthat (>= 3.0.0),
|
||||
qs
|
||||
Config/testthat/edition: 3
|
||||
Depends:
|
||||
R (>= 3.6.0),
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
export(average_models)
|
||||
export(best_models)
|
||||
export(ensemble_models)
|
||||
export(forecast_time_series)
|
||||
export(model_hyperparameters)
|
||||
|
@ -7,7 +9,7 @@ export(model_workflows)
|
|||
export(prep_data)
|
||||
export(refit_models)
|
||||
export(train_test_split)
|
||||
export(tune_hyperparameters)
|
||||
export(tune_models)
|
||||
import(modeltime)
|
||||
importFrom(Cubist,cubist)
|
||||
importFrom(Cubist,cubistControl)
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
#' Average Models
#'
#' Builds every combination of 2 up to `max_model_average` models from the
#' individual (and optionally ensemble) model predictions, then averages the
#' forecasts of each combination to create simple model-average forecasts.
#'
#' @param refit_predictions_tbl individual model predictions
#' @param ensemble_predictions_tbl ensemble model predictions (optional)
#' @param model_train_test_tbl train test splits
#' @param max_model_average max number of models to average
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#'
#' @return tbl of averaged predictions (Combo, Model, Train_Test_ID, Date,
#'   Target, Forecast), one set of rows per model combination
#' @keywords internal
#' @export
average_models <- function(refit_predictions_tbl,
                           ensemble_predictions_tbl = NULL,
                           model_train_test_tbl,
                           max_model_average = 3,
                           parallel_processing = NULL,
                           num_cores = NULL) {

  # get model list: "<Model>-<Recipe_ID>-<global|local>" keys for each
  # individual model; "All-Data" combos are global models
  ind_model_list <- refit_predictions_tbl %>%
    dplyr::mutate(Combo = ifelse(Combo == "All-Data", "global", "local")) %>%
    tidyr::unite(col = "Model_Name", c("Model", "Recipe_ID", "Combo"), sep = "-") %>%
    dplyr::pull(Model_Name) %>%
    unique()

  if (!is.null(ensemble_predictions_tbl)) {
    # ensemble models always get the "Ensemble-local" recipe suffix
    ensemble_model_list <- ensemble_predictions_tbl %>%
      dplyr::mutate(Recipe_ID = "Ensemble-local") %>%
      tidyr::unite(col = "Model_Name", c("Model", "Recipe_ID"), sep = "-") %>%
      dplyr::pull(Model_Name) %>%
      unique()
  } else {
    ensemble_model_list <- NULL
  }

  final_model_list <- c(ind_model_list, ensemble_model_list)

  # create model combinations list ("_"-joined model keys)
  model_combinations <- tibble::tibble()

  # BUG FIX: the previous `2:min(length(final_model_list), max_model_average)`
  # counts DOWN (2:1) when fewer than two models exist (or max_model_average
  # < 2), making gtools::combinations() error on r > n. Guard explicitly.
  max_combo_size <- min(length(final_model_list), max_model_average)

  if (max_combo_size >= 2) {
    for (number in 2:max_combo_size) {

      temp <- data.frame(gtools::combinations(v = final_model_list,
                                              n = length(final_model_list),
                                              r = number))

      temp <- temp %>%
        tidyr::unite(Model_Combo, colnames(temp)) %>%
        dplyr::select(Model_Combo) %>%
        tibble::tibble()

      model_combinations <- rbind(model_combinations, temp)
    }
  }

  # prep prediction data: keep every non-ensemble train/test run
  train_test_id_list <- model_train_test_tbl %>%
    dplyr::filter(Run_Type != "Ensemble") %>%
    dplyr::pull(Run_ID) %>%
    unique()

  predictions_tbl <- refit_predictions_tbl %>%
    dplyr::mutate(Model_Suffix = ifelse(Combo == "All-Data", "global", "local")) %>%
    dplyr::select(Model_Suffix, Model, Recipe_ID, Train_Test_ID, Prediction) %>%
    tidyr::unite(col = 'Model_Name', c("Model", "Recipe_ID", "Model_Suffix"), sep= "-", remove = FALSE) %>%
    dplyr::select(-Model_Suffix) %>%
    dplyr::filter(Train_Test_ID %in% train_test_id_list) %>%
    tidyr::unnest(Prediction) %>%
    rbind(
      if (!is.null(ensemble_predictions_tbl)) {
        ensemble_predictions_tbl %>%
          dplyr::mutate(Model_Suffix = ifelse(Combo == "All-Data", "global", "local")) %>%
          dplyr::select(Model_Suffix, Model, Recipe_ID, Train_Test_ID, Prediction) %>%
          tidyr::unite(col = 'Model_Name', c("Model", "Recipe_ID", "Model_Suffix"), sep= "-", remove = FALSE) %>%
          dplyr::select(-Model_Suffix) %>%
          dplyr::filter(Train_Test_ID %in% train_test_id_list) %>%
          tidyr::unnest(Prediction)
      } else {
        tibble::tibble()
      }
    )

  # model average function: mean Target/Forecast across the models named in
  # one "_"-joined combination key
  model_average <- function(model) {

    # get list of models to average (keys never contain "_", only "-")
    model_list <- strsplit(model, "_")[[1]]

    # create model average
    temp <- predictions_tbl %>%
      dplyr::filter(Model_Name %in% model_list) %>%
      dplyr::group_by(Combo, Train_Test_ID, Date) %>%
      dplyr::summarise(Target = mean(Target, na.rm = TRUE),
                       Forecast = mean(Forecast, na.rm = TRUE)) %>%
      dplyr::mutate(Model = model) %>%
      dplyr::select(Combo, Model, Train_Test_ID, Date, Target, Forecast) %>%
      dplyr::ungroup()

    return(temp)
  }

  # run each combination (optionally in parallel via project helper submit_fn)
  model_avg_tbl <- submit_fn(predictions_tbl,
                             parallel_processing,
                             model_combinations %>%
                               dplyr::pull(Model_Combo),
                             model_average,
                             num_cores,
                             package_exports = c("tibble", "dplyr", "tidyselect", "stringr", "foreach", 'doParallel', 'parallel'),
                             function_exports = NULL)

  return(model_avg_tbl)
}
|
|
@ -0,0 +1,60 @@
|
|||
#' Select Best Models
#'
#' Combines individual, ensemble, and model-average predictions into one
#' table keyed by Model_Name.
#'
#' NOTE(review): this function appears unfinished — `mape_tbl` is computed
#' but never used, and the combined `predictions_tbl` is returned directly,
#' so no "best model flag" is actually produced despite the @return text.
#' Confirm intended accuracy-ranking logic before relying on the output.
#'
#' @param refit_predictions_tbl individual model predictions
#' @param ensemble_predictions_tbl ensemble model predictions
#' @param average_predictions_tbl model average predictions
#' @param model_train_test_tbl train test splits
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#'
#' @return tbl with best model flag
#' @keywords internal
#' @export
best_models <- function(refit_predictions_tbl,
                        ensemble_predictions_tbl = NULL,
                        average_predictions_tbl = NULL,
                        model_train_test_tbl,
                        parallel_processing = NULL,
                        num_cores = NULL) {

  # get model predictions: keep every non-ensemble train/test run
  train_test_id_list <- model_train_test_tbl %>%
    dplyr::filter(Run_Type != "Ensemble") %>%
    dplyr::pull(Run_ID) %>%
    unique()

  # individual predictions keyed as "<Model>-<Recipe_ID>-<global|local>",
  # then stacked with ensemble and average predictions (when supplied)
  predictions_tbl <- refit_predictions_tbl %>%
    dplyr::mutate(Model_Suffix = ifelse(Combo == "All-Data", "global", "local")) %>%
    dplyr::select(Model, Recipe_ID, Model_Suffix, Train_Test_ID, Prediction) %>%
    tidyr::unite(col = 'Model_Name', c("Model", "Recipe_ID", "Model_Suffix"), sep= "-", remove = FALSE) %>%
    dplyr::select(-Model_Suffix, -Model, -Recipe_ID) %>%
    dplyr::filter(Train_Test_ID %in% train_test_id_list) %>%
    tidyr::unnest(Prediction) %>%
    rbind(
      if(!is.null(ensemble_predictions_tbl)) {
        ensemble_predictions_tbl %>%
          dplyr::mutate(Model_Suffix = ifelse(Combo == "All-Data", "global", "local")) %>%
          dplyr::select(Model, Recipe_ID, Model_Suffix, Train_Test_ID, Prediction) %>%
          tidyr::unite(col = 'Model_Name', c("Model", "Recipe_ID", "Model_Suffix"), sep= "-", remove = FALSE) %>%
          dplyr::select(-Model_Suffix, -Model, -Recipe_ID) %>%
          dplyr::filter(Train_Test_ID %in% train_test_id_list) %>%
          tidyr::unnest(Prediction)
      } else {
        # empty tibble keeps rbind a no-op when no ensembles exist
        tibble::tibble()
      }
    ) %>%
    rbind(
      if(!is.null(average_predictions_tbl)) {
        # average predictions already carry a combined key in Model
        average_predictions_tbl %>%
          dplyr::rename(Model_Name = Model)
      } else {
        tibble::tibble()
      }
    )

  # calculate model accuracy
  # NOTE(review): Train_Test_ID "01" (presumably the future-forecast run) is
  # excluded here, but mape_tbl is currently dead code — nothing reads it.
  mape_tbl <- predictions_tbl %>%
    dplyr::filter(Train_Test_ID != "01")

  return(predictions_tbl)
}
|
|
@ -0,0 +1,413 @@
|
|||
#' Ensemble Models
#'
#' Trains second-stage (stacking) models whose features are the first-stage
#' model forecasts: pivots per-combo forecasts wide, tunes each ensemble
#' model's hyperparameters on the validation splits, picks the best set by
#' RMSE, then refits on back-test and future-forecast splits.
#'
#' @param tuning_tbl hyperparameter tuning predictions
#' @param refit_tbl individual model predictions
#' @param model_train_test_tbl model train test split table
#' @param date_type date type (currently unused — kept for interface
#'   compatibility; TODO confirm whether it should drive CV settings)
#' @param num_hyperparameters number of hyperparameter combinations per model
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#' @param seed seed number
#'
#' @return list with ensemble predictions (Model_Predictions) and fitted
#'   models (Model_Fit)
#' @keywords internal
#' @export
ensemble_models <- function(tuning_tbl,
                            refit_tbl,
                            model_train_test_tbl,
                            date_type,
                            num_hyperparameters = 5,
                            parallel_processing = NULL,
                            num_cores = NULL,
                            seed = 123) {

  # get individual prediction data (tuning + refit runs stacked)
  initial_results_tbl <- tuning_tbl %>%
    dplyr::select(Combo, Model, Recipe_ID, Prediction) %>%
    dplyr::rename(Combo_Key = Combo) %>%
    tidyr::unnest(Prediction) %>%
    rbind(
      refit_tbl %>%
        dplyr::select(Combo, Model, Recipe_ID, Train_Test_ID, Prediction) %>%
        dplyr::rename(Combo_Key = Combo) %>%
        tidyr::unnest(Prediction)
    )

  combo_iter_list <- unique(initial_results_tbl$Combo)

  # pivot one combo's forecasts wide: one column per
  # "<Model>-<Recipe_ID>-<Global|Local>" key, to use as ensemble features
  prep_ensemble_fn <- function(combo) {

    initial_results_tbl %>%
      dplyr::filter(Combo == combo) %>%
      dplyr::mutate(Suffix = ifelse(Combo_Key == "All-Data", "Global", "Local")) %>%
      tidyr::unite(col= "Model_Key",
                   c("Model", "Recipe_ID", "Suffix"),
                   sep="-",
                   remove=F) %>%
      tidyr::pivot_wider(names_from = Model_Key, values_from = Forecast,
                         id_cols = c("Combo", "Date", "Train_Test_ID", "Target"), values_fill = 0)
  }

  prep_ensemble_tbl <- submit_fn(tuning_tbl,
                                 parallel_processing,
                                 combo_iter_list,
                                 prep_ensemble_fn,
                                 num_cores,
                                 package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
                                                     'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
                                                     'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
                                                     'recipes', 'rules', 'modeltime'),
                                 function_exports = NULL)

  # ensemble models to run
  # (renamed from `refit_models` to avoid shadowing the exported
  # refit_models() function defined elsewhere in this package)
  refit_model_names <- unique(refit_tbl$Model)

  # BUG FIX: allow-list previously contained "sv-poly" (typo), so svm-poly
  # models were silently never ensembled; matches r2_models elsewhere
  ensemble_model_list <- refit_model_names[refit_model_names %in% c("cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost")]

  if(length(ensemble_model_list) < 1) {
    stop("no ensemble models chosen to run")
  }

  # build one workflow per ensemble model via its spec function
  model_workflow_tbl <- tibble::tibble()

  for(model in ensemble_model_list) {

    avail_arg_list <- list('train_data' = prep_ensemble_tbl %>% dplyr::select(-Train_Test_ID),
                           'model_type' = "ensemble",
                           'pca' = FALSE)

    # get specific model spec function, e.g. "svm-rbf" -> svm_rbf()
    fn_to_invoke <- get(gsub('-', '_', model))

    exp_arg_list <- formalArgs(fn_to_invoke)

    avail_names <- names(avail_arg_list)

    # only pass the arguments the spec function actually declares
    inp_arg_list <- list()

    for(arg_name in avail_names){

      if(arg_name %in% exp_arg_list){
        inp_arg_list[arg_name] <- avail_arg_list[arg_name]
      }
    }

    model_workflow <- do.call(fn_to_invoke, inp_arg_list, quote=TRUE)

    workflow_tbl <- tibble::tibble(Model_Name = model,
                                   Model_Workflow = list(model_workflow))

    model_workflow_tbl <- rbind(model_workflow_tbl, workflow_tbl)
  }

  # get hyperparameters: latin hypercube grid per model spec
  hyperparameters_tbl <- tibble::tibble()

  for(x in model_workflow_tbl %>% dplyr::group_split(dplyr::row_number(), .keep = FALSE)) {

    model <- x %>%
      dplyr::pull(Model_Name)

    temp_tbl <- model_workflow_tbl %>%
      dplyr::filter(Model_Name == model)

    model_workflow <- temp_tbl$Model_Workflow[[1]]

    model_spec <- model_workflow %>%
      workflows::extract_spec_parsnip()

    recipe_features <- prep_ensemble_tbl

    # svm-rbf parameters need no data-driven finalization; the rest may have
    # data-dependent ranges (e.g. mtry) that dials::finalize() resolves
    if(model=="svm-rbf") {
      parameters <- model_spec %>%
        workflows::extract_parameter_set_dials()
    } else {
      parameters <- model_spec %>%
        workflows::extract_parameter_set_dials() %>%
        dials::finalize(recipe_features, force = FALSE)
    }

    grid <- dials::grid_latin_hypercube(parameters, size = num_hyperparameters)

    # nest each grid row so one row == one hyperparameter combination
    hyperparameters_temp <- grid %>%
      dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
      purrr::map_df(tidyr::nest, data=tidyselect::everything()) %>%
      dplyr::rename(Hyperparameters = data) %>%
      tibble::rowid_to_column("Hyperparameter_Combo") %>%
      dplyr::mutate(Model = model)

    hyperparameters_tbl <- rbind(hyperparameters_tbl, hyperparameters_temp)
  }

  # fit models by hyperparameter: cross every combo's validation splits with
  # every (model, hyperparameter) pair (misspelling "hyperparmaeter" fixed)
  hyperparameter_iter_list <- purrr::map(combo_iter_list, .f = function(x) {
    model_train_test_tbl %>%
      dplyr::mutate(Combo = x) %>%
      dplyr::rename(Train_Test_ID = Run_ID) %>%
      dplyr::filter(Run_Type == "Validation") %>%
      dplyr::select(Combo, Train_Test_ID)
  }) %>%
    dplyr::bind_rows() %>%
    dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
    purrr::map(.f = function(x) {
      hyperparameters_tbl %>%
        dplyr::select(Hyperparameter_Combo, Model) %>%
        dplyr::rename(Hyperparameter_ID = Hyperparameter_Combo) %>%
        dplyr::mutate(Combo = x$Combo,
                      Train_Test_ID = x$Train_Test_ID)
    }) %>%
    dplyr::bind_rows() %>%
    dplyr::select(Combo, Model, Train_Test_ID, Hyperparameter_ID)

  # fit one (combo, model, split, hyperparameter) cell and predict the split
  initial_tune_fn <- function(x) {

    # run input values
    param_combo <- x %>%
      dplyr::pull(Hyperparameter_ID)

    model <- x %>%
      dplyr::pull(Model)

    data_split <- x %>%
      dplyr::pull(Train_Test_ID)

    combo <- x %>%
      dplyr::pull(Combo)

    train_end_date <- model_train_test_tbl %>%
      dplyr::filter(Run_ID == data_split) %>%
      dplyr::pull(Train_End)

    test_end_date <- model_train_test_tbl %>%
      dplyr::filter(Run_ID == data_split) %>%
      dplyr::pull(Test_End)

    # get train/test data (Date_index.num zeroed so it cannot leak the date)
    full_data <- prep_ensemble_tbl %>%
      dplyr::filter(Combo == combo) %>%
      dplyr::mutate(Date_index.num = 0)

    training <- full_data %>%
      dplyr::filter(Date <= train_end_date) %>%
      dplyr::select(-Train_Test_ID)

    testing <- full_data %>%
      dplyr::filter(Date > train_end_date,
                    Date <= test_end_date,
                    Train_Test_ID == data_split)

    # get workflow
    workflow <- model_workflow_tbl %>%
      dplyr::filter(Model_Name == model)

    workflow_final <- workflow$Model_Workflow[[1]]

    # get hyperparameters
    hyperparameters <- hyperparameters_tbl %>%
      dplyr::filter(Model == model,
                    Hyperparameter_Combo == param_combo) %>%
      dplyr::select(Hyperparameters) %>%
      tidyr::unnest(Hyperparameters)

    # fit model (seeded for reproducibility across workers)
    set.seed(seed)

    model_fit <- workflow_final %>%
      tune::finalize_workflow(parameters = hyperparameters) %>%
      generics::fit(data = training)

    # create prediction
    model_prediction <- testing %>%
      dplyr::bind_cols(
        predict(model_fit, new_data = testing)
      ) %>%
      dplyr::select(Combo, Date, Target, .pred) %>%
      dplyr::rename(Forecast = .pred)

    # finalize output tbl
    final_tbl <- tibble::tibble(
      Combo = combo,
      Model = model,
      Train_Test_ID = data_split,
      Hyperparameter_ID = param_combo,
      Model_Fit = list(model_fit),
      Prediction = list(model_prediction)
    )

    return(final_tbl)
  }

  initial_tuning_tbl <- submit_fn(model_workflow_tbl,
                                  parallel_processing,
                                  hyperparameter_iter_list %>%
                                    dplyr::group_split(dplyr::row_number(), .keep = FALSE),
                                  initial_tune_fn,
                                  num_cores,
                                  package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
                                                      'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
                                                      'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
                                                      'recipes', 'rules', 'modeltime'),
                                  function_exports = NULL)

  hyperparameter_iter_list2 <- hyperparameter_iter_list %>%
    dplyr::select(Combo, Model) %>%
    dplyr::distinct()

  # pick the hyperparameter set with lowest RMSE across validation splits
  choose_hyperparameters_fn <- function(x) {

    combo <- x %>%
      dplyr::pull(Combo)

    model <- x %>%
      dplyr::pull(Model)

    test_tbl <- initial_tuning_tbl %>%
      dplyr::filter(Combo == combo,
                    Model == model) %>%
      dplyr::select(Model, Hyperparameter_ID, Train_Test_ID, Prediction, Model_Fit)

    best_param <- test_tbl %>%
      dplyr::select(-Model_Fit) %>%
      tidyr::unnest(Prediction) %>%
      dplyr::mutate(Combo = combo) %>%
      dplyr::group_by(Combo, Model, Hyperparameter_ID) %>%
      yardstick::rmse(truth = Target,
                      estimate = Forecast,
                      na_rm = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::arrange(.estimate) %>%
      dplyr::slice(1) %>%
      dplyr::pull(Hyperparameter_ID)

    # keep one fitted model for the winning hyperparameters
    best_model_fit <- test_tbl %>%
      dplyr::filter(Hyperparameter_ID == best_param) %>%
      dplyr::slice(1)

    best_model_fit <- best_model_fit$Model_Fit[[1]]

    final_predictions <- test_tbl %>%
      dplyr::filter(Hyperparameter_ID == best_param) %>%
      dplyr::select(-Model_Fit) %>%
      tidyr::unnest(Prediction) %>%
      dplyr::select(Combo, Date, Train_Test_ID, Target, Forecast)

    return(tibble::tibble(Combo = combo,
                          Model = model,
                          Hyperparameter_ID = best_param,
                          Model_Fit = list(best_model_fit),
                          Prediction = list(final_predictions)))
  }

  final_tuning_tbl <- submit_fn(model_workflow_tbl,
                                parallel_processing,
                                hyperparameter_iter_list2 %>%
                                  dplyr::group_split(dplyr::row_number(), .keep = FALSE),
                                choose_hyperparameters_fn,
                                num_cores,
                                package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
                                                    'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
                                                    'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
                                                    'recipes', 'rules', 'modeltime', 'yardstick'),
                                function_exports = NULL)

  # refit ensemble models: one task per (tuned model, back-test/forecast run)
  refit_iter_list <- model_train_test_tbl %>%
    dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) %>%
    dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
    purrr::map(.f = function(x) {
      final_tuning_tbl %>%
        dplyr::mutate(Run_Type = x %>% dplyr::pull(Run_Type),
                      Run_ID = x %>% dplyr::pull(Run_ID),
                      Train_End = x %>% dplyr::pull(Train_End),
                      Test_End = x %>% dplyr::pull(Test_End)) %>%
        dplyr::select(-Model_Fit, -Prediction)}) %>%
    dplyr::bind_rows()

  # refit one tuned workflow on a single train/test run and predict it
  fit_model <- function(x) {

    combo <- x %>%
      dplyr::pull(Combo)

    model <- x %>%
      dplyr::pull(Model)

    model_fit <- final_tuning_tbl %>%
      dplyr::filter(Model == model,
                    Combo == combo)

    model_fit <- model_fit$Model_Fit[[1]]

    run_type <- x %>%
      dplyr::pull(Run_Type)

    run_id <- x %>%
      dplyr::pull(Run_ID)

    train_end <- x %>%
      dplyr::pull(Train_End)

    test_end <- x %>%
      dplyr::pull(Test_End)

    full_data <- prep_ensemble_tbl %>%
      dplyr::filter(Combo == combo) %>%
      dplyr::mutate(Date_index.num = 0)

    training <- full_data %>%
      dplyr::filter(Date <= train_end) %>%
      dplyr::select(-Train_Test_ID)

    testing <- full_data %>%
      dplyr::filter(Date > train_end,
                    Date <= test_end,
                    Train_Test_ID == run_id)

    # fit model (seeded for reproducibility across workers)
    set.seed(seed)

    model_fit <- model_fit %>%
      generics::fit(data = training)

    # create prediction
    model_prediction <- testing %>%
      dplyr::bind_cols(
        predict(model_fit, new_data = testing)
      ) %>%
      dplyr::select(Combo, Date, Target, .pred) %>%
      dplyr::rename(Forecast = .pred)

    # finalize output tbl
    final_tbl <- tibble::tibble(
      Combo = combo,
      Model = model,
      Recipe_ID = "Ensemble",
      Train_Test_ID = run_id,
      Model_Fit = list(model_fit),
      Prediction = list(model_prediction)
    )

    return(final_tbl)
  }

  model_refit_final_tbl <- submit_fn(final_tuning_tbl,
                                     parallel_processing,
                                     refit_iter_list %>%
                                       dplyr::group_split(dplyr::row_number(), .keep = FALSE),
                                     fit_model,
                                     num_cores,
                                     package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
                                                         'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
                                                         'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
                                                         'recipes', 'rules', 'modeltime'),
                                     function_exports = NULL)

  # get final combined results and return final fitted models
  # (Train_Test_ID "01" is the future-forecast run — keep its fits)
  final_model_fit_tbl <- model_refit_final_tbl %>%
    dplyr::filter(Train_Test_ID == "01") %>%
    dplyr::select(Combo, Model, Recipe_ID, Model_Fit)

  final_ensemble_results_tbl <- model_refit_final_tbl %>%
    dplyr::select(-Model_Fit)

  return(list(Model_Predictions = final_ensemble_results_tbl, Model_Fit = final_model_fit_tbl))
}
|
|
@ -0,0 +1,394 @@
|
|||
#' Gets the back testing spacing
#'
#' Resolves the back test spacing: an explicit override is returned as-is,
#' otherwise a default is derived from the date granularity (7 for daily,
#' 4 for weekly, 1 for anything else).
#'
#' @param back_test_spacing back_test_spacing override (NULL = derive it)
#' @param date_type year, quarter, month, week, day
#'
#' @return Returns back_test_spacing
#' @noRd
get_back_test_spacing <- function(back_test_spacing,
                                  date_type){

  # caller-supplied value always wins
  if (is.null(back_test_spacing)) {
    if (date_type == "day") {
      back_test_spacing <- 7
    } else if (date_type == "week") {
      back_test_spacing <- 4
    } else {
      # year / quarter / month all default to 1
      back_test_spacing <- 1
    }
  }

  back_test_spacing
}
|
||||
|
||||
#' Gets the back testing scenarios
#'
#' Derives the number of back test scenarios (when not supplied) from the
#' amount of history available, and computes the 80%-of-history window used
#' to initialize time series CV in multivariate models.
#'
#' @param input_tbl full data table
#' @param hist_end_date historical end date
#' @param forecast_horizon forecast horizon
#' @param back_test_scenarios back test scenarios (NULL = derive)
#' @param back_test_spacing back test spacing
#'
#' @return Returns back_test_scenarios and hist_periods_80
#' @noRd
get_back_test_scenario_hist_periods <- function(input_tbl,
                                                hist_end_date,
                                                forecast_horizon,
                                                back_test_scenarios,
                                                back_test_spacing){

  # count distinct historical dates on or before the cutoff
  hist_dates <- input_tbl %>%
    dplyr::filter(Date <= hist_end_date) %>%
    dplyr::pull(Date)

  historical_periods <- as.numeric(length(unique(hist_dates)))

  # 80% of history: initial window for time series CV in multivariate models
  hist_periods_80 <- floor(historical_periods * 0.8)

  if (is.null(back_test_scenarios)) {

    historical_periods_20 <- floor(historical_periods * 0.2)

    # with short histories, fall back to the forecast horizon so the initial
    # back tests (1, 2, 3, ... up to the horizon) are still covered
    if (historical_periods_20 > forecast_horizon) {
      back_test_scenarios <- floor(historical_periods_20 / back_test_spacing)
    } else {
      back_test_scenarios <- floor(forecast_horizon / back_test_spacing)
    }
  }

  list(hist_periods_80 = hist_periods_80,
       back_test_scenarios = back_test_scenarios + 1)
}
|
||||
|
||||
#' Gets the train test splits
#'
#' Builds the table of train/test windows used downstream: one
#' "Future_Forecast" run, then "Back_Test", "Validation", and "Ensemble"
#' runs derived from rolling time series CV slices.
#'
#' @param input_tbl full data table
#' @param hist_end_date historical end date
#' @param date_type date type
#' @param forecast_horizon forecast horizon
#' @param back_test_scenarios back test scenarios
#' @param back_test_spacing back test spacing
#'
#' @return Returns table of train test splits (Run_Type, Run_ID, Train_End,
#'   Test_End)
#' @keywords internal
#' @export
train_test_split <- function(input_tbl,
                             hist_end_date,
                             date_type,
                             forecast_horizon,
                             back_test_scenarios = NULL,
                             back_test_spacing = NULL){

  back_test_spacing_final <- get_back_test_spacing(back_test_spacing,
                                                   date_type)

  # pull out first recipe data (splits only depend on the date index)
  temp_tbl <- input_tbl %>%
    dplyr::slice(1) %>%
    dplyr::select(Data) %>%
    tidyr::unnest(Data)

  # get back test info
  bt <- temp_tbl %>%
    get_back_test_scenario_hist_periods(hist_end_date,
                                        forecast_horizon,
                                        back_test_scenarios,
                                        back_test_spacing_final)

  back_test_scenarios_final <- bt$back_test_scenarios
  back_test_initial <- bt$hist_periods_80

  # create train/test split info
  # FIX: spell out `assess` — the original `asses =` only worked through
  # R's partial argument matching of timetk::time_series_cv()
  train_test_initial <- temp_tbl %>%
    timetk::time_series_cv(
      initial = "1 year",
      assess = forecast_horizon,
      skip = back_test_spacing_final,
      cumulative = TRUE,
      slice_limit = 100) %>%
    timetk::tk_time_series_cv_plan() %>%
    tidyr::separate(col = .id, into = c(NA, "Slice_ID"), sep = "Slice")

  train_test_final <- tibble::tibble()

  for(id in unique(train_test_initial$Slice_ID)) {

    # one CV slice (renamed from temp_tbl: no longer shadows the recipe data)
    slice_tbl <- train_test_initial %>%
      dplyr::filter(Slice_ID == id)

    train_tbl <- slice_tbl %>%
      dplyr::filter(.key == "training") %>%
      dplyr::select(Date)

    if(as.numeric(id) == 1) {

      # newest slice: the future forecast window
      run_type <- "Future_Forecast"

      test_tbl <- slice_tbl %>%
        dplyr::filter(.key == "testing") %>%
        dplyr::select(Date)

    } else if(as.numeric(id) > 1 && as.numeric(id) <= back_test_scenarios_final){

      run_type <- "Back_Test"

      # back tests only score dates inside history
      test_tbl <- slice_tbl %>%
        dplyr::filter(Date <= hist_end_date)

    } else if (as.numeric(id) > back_test_scenarios_final && as.numeric(id) < max(back_test_scenarios_final + (forecast_horizon/back_test_spacing_final) + 1, back_test_scenarios_final*1.5)) {
      # FIX: scalar condition now uses && (was elementwise &), matching the
      # branch above; &&/|| error on length > 1 inputs since R 4.3

      run_type <- "Validation"

      # validation windows end before the earliest back test's train end
      back_test_date <- train_test_final %>%
        dplyr::filter(Run_Type == "Back_Test")

      test_tbl <- slice_tbl %>%
        dplyr::filter(Date <= min(back_test_date$Train_End))

    } else {
      run_type <- "Ensemble"

      test_tbl <- slice_tbl %>%
        dplyr::filter(.key == "testing") %>%
        dplyr::select(Date)
    }

    train_test_tbl <- tibble::tibble(Run_Type = run_type,
                                     Run_ID = id,
                                     Train_End = max(train_tbl$Date),
                                     Test_End = max(test_tbl$Date))

    train_test_final <- rbind(train_test_final, train_test_tbl)
  }

  # check for back test and validation data
  if(!("Validation" %in% unique(train_test_final$Run_Type))) {
    stop("No validation data produced. Add more historical data, shorten the forecast horizon, or shorten the number of back test scenarios")
  } else if(!("Back_Test" %in% unique(train_test_final$Run_Type))) {
    stop("No back testing data produced. Shorten the forecast horizon, or shorten the number of back test scenarios or back test spacing")
  }

  return(train_test_final)
}
|
||||
|
||||
#' Gets model workflows
#'
#' Builds one tidymodels workflow per (model, recipe) pair. Models in the
#' R2-only exclusion set are skipped for the R2 recipe.
#'
#' @param model_recipe_tbl model recipe table
#' @param models_to_run models to run (overrides models_not_to_run)
#' @param models_not_to_run models not to run
#' @param run_deep_learning run deep learning models
#' @param pca pca
#'
#' @return Returns table of model workflows (Model_Name, Model_Recipe,
#'   Model_Workflow), sorted by model name
#' @keywords internal
#' @export
model_workflows <- function(model_recipe_tbl,
                            models_to_run = NULL,
                            models_not_to_run = NULL,
                            run_deep_learning = FALSE,
                            pca = FALSE) {

  # get recipe input data (workflows only depend on one combo's features)
  combos <- unique(model_recipe_tbl$Combo)

  input_tbl <- model_recipe_tbl %>%
    dplyr::filter(Combo == combos[[1]])

  # tibble to add model workflows to
  model_workflow_tbl <- tibble::tibble()

  # models to run
  ml_models <- c("arima", "arima-boost", "cubist", "croston", "ets", "glmnet", "mars", "meanf",
                 "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive",
                 "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost")

  # FIX: scalar conditions use && (was elementwise &)
  if(is.null(models_to_run) && is.null(models_not_to_run)) {

    # do nothing, using existing ml_models list

  } else if(is.null(models_to_run) && !is.null(models_not_to_run)) {

    ml_models <- setdiff(ml_models, models_not_to_run)

  } else {

    if(!is.null(models_not_to_run)) {
      cli::cli_alert_warning("Note: 'models_to_run' argument overrides the 'models_not_to_run' argument")
    }

    ml_models <- models_to_run

  }

  if(run_deep_learning) {
    # FIX: deduplicate — "nnetar" is already in the default list, and the
    # plain concat produced duplicate workflow rows for it
    ml_models <- unique(c(ml_models, "nnetar", "nbeats"))
  }

  # models that also get the R2 recipe; everything else is R1-only
  r2_models <- c("cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost")

  iter_tbl <- tibble::tibble()

  for(recipe in unique(input_tbl$Recipe)) {
    iter_tbl <- rbind(iter_tbl,
                      tibble::tibble(Model = ml_models,
                                     Recipe = recipe))
  }

  for(x in iter_tbl %>% dplyr::group_split(dplyr::row_number(), .keep = FALSE)) {

    model <- x %>%
      dplyr::pull(Model)

    recipe <- x %>%
      dplyr::pull(Recipe)

    recipe_tbl <- input_tbl %>%
      dplyr::filter(Recipe == recipe) %>%
      dplyr::select(Data) %>%
      tidyr::unnest(Data)

    # get args to feed into model spec functions
    avail_arg_list <- list('train_data' = recipe_tbl,
                           'frequency' = 12,
                           'horizon' = 12,
                           'seasonal_period' = c(3,6,12),
                           'model_type' = "single",
                           "pca" = TRUE)

    # don't create workflows for models that only use R1 recipe
    if(recipe == "R2" && !(model %in% r2_models)) {
      next
    }

    # get specific model spec function, e.g. "arima-boost" -> arima_boost()
    fn_to_invoke <- get(gsub('-', '_', model))

    exp_arg_list <- formalArgs(fn_to_invoke)

    avail_names <- names(avail_arg_list)

    # only pass arguments the spec function declares
    # (loop variable renamed: `x` previously shadowed the outer loop's `x`)
    inp_arg_list <- list()

    for(arg_name in avail_names){

      if(arg_name %in% exp_arg_list){
        inp_arg_list[arg_name] <- avail_arg_list[arg_name]
      }
    }

    model_workflow <- do.call(fn_to_invoke, inp_arg_list, quote=TRUE)

    workflow_tbl <- tibble::tibble(Model_Name = model,
                                   Model_Recipe = recipe,
                                   Model_Workflow = list(model_workflow))

    model_workflow_tbl <- rbind(model_workflow_tbl, workflow_tbl)
  }

  return(model_workflow_tbl %>% dplyr::arrange(Model_Name))
}
|
||||
|
||||
#' Get model hyperparameters
#'
#' Builds a latin hypercube hyperparameter grid for every (model, recipe)
#' workflow. Models whose spec declares no tunable parameters get a single
#' empty-hyperparameter row so downstream tuning still iterates them once.
#'
#' @param model_workflow_tbl model workflow table
#' @param model_recipe_tbl model recipe table
#' @param num_hyperparameters number of hyperparameter combinations
#'
#' @return table of model hyperparameters (Model, Recipe,
#'   Hyperparameter_Combo, Hyperparameters)
#' @keywords internal
#' @export
model_hyperparameters <- function(model_workflow_tbl,
                                  model_recipe_tbl,
                                  num_hyperparameters = 5) {

  # get recipe input data — only the first combo's features are used to
  # finalize data-dependent parameter ranges (assumes all combos share the
  # same feature columns; TODO confirm)
  combos <- unique(model_recipe_tbl$Combo)

  input_tbl <- model_recipe_tbl %>%
    dplyr::filter(Combo == combos[[1]])

  # one row per (model, recipe) workflow to iterate over
  iter_tbl <- model_workflow_tbl %>%
    dplyr::select(Model_Name, Model_Recipe)

  # get hyperparameters
  hyperparameters_tbl <- tibble::tibble()

  for(x in iter_tbl %>% dplyr::group_split(dplyr::row_number(), .keep = FALSE)) {

    model <- x %>%
      dplyr::pull(Model_Name)

    recipe <- x %>%
      dplyr::pull(Model_Recipe)

    temp_tbl <- model_workflow_tbl %>%
      dplyr::filter(Model_Name == model,
                    Model_Recipe == recipe)

    model_workflow <- temp_tbl$Model_Workflow[[1]]

    model_spec <- model_workflow %>%
      workflows::extract_spec_parsnip()

    # features this recipe produces, used by dials::finalize() below
    recipe_features <- input_tbl %>%
      dplyr::filter(Recipe == recipe) %>%
      dplyr::select(Data) %>%
      tidyr::unnest(Data)

    # only build a grid when the spec has tunable parameters
    if(workflows::extract_parameter_set_dials(model_spec) %>% nrow() > 0) {

      # svm-rbf needs no data-driven finalization; other specs may have
      # data-dependent ranges (e.g. mtry) resolved against recipe_features
      if(model=="svm-rbf") {
        parameters <- model_spec %>%
          workflows::extract_parameter_set_dials()
      } else {
        parameters <- model_spec %>%
          workflows::extract_parameter_set_dials() %>%
          dials::finalize(recipe_features, force = FALSE)
      }

      grid <- dials::grid_latin_hypercube(parameters, size = num_hyperparameters)

      # nest each grid row so one output row == one hyperparameter combo
      hyperparameters_temp <- grid %>%
        dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
        purrr::map_df(tidyr::nest, data=tidyselect::everything()) %>%
        dplyr::rename(Hyperparameters = data) %>%
        tibble::rowid_to_column("Hyperparameter_Combo") %>%
        dplyr::mutate(Model = model,
                      Recipe = recipe)

    } else{

      # no tunable parameters: emit a single placeholder combo
      hyperparameters_temp <- tibble::tibble(Hyperparameter_Combo = 1,
                                             Hyperparameters = list(tibble::tibble()),
                                             Model = model,
                                             Recipe = recipe)
    }

    hyperparameters_tbl <- rbind(hyperparameters_tbl, hyperparameters_temp)
  }
  return(hyperparameters_tbl %>% dplyr::select(Model, Recipe, Hyperparameter_Combo, Hyperparameters))
}
|
||||
|
||||
# To Do
|
||||
# [ ] should the recipe column be removed from the output of model_hyperparameters?
|
||||
# this would be important to have if the same models with two recipes had diff hyperparameters
|
||||
# but models with params that change based on predictor column number are only R1 recipes (boost and mars models)
|
||||
# [ ] allow user to select accuracy metric for hyperparameter selection
|
||||
# [x] allow users to turn models on/off in workflow function
|
||||
# [ ] fix long running parallel process for 2nd function call in parameter tuning
|
||||
# [x] filter test data for R2 recipes
|
||||
# [ ] adjust tuning process if only models with no hyperparameters are selected
|
||||
# [x] ensure tuning function can work in spark
|
|
@ -0,0 +1,146 @@
|
|||
#' Refit Models
#'
#' Refits every trained model on each requested train/test split
#' (future forecast, back test, and ensemble runs) and produces
#' out-of-sample predictions for each refit.
#'
#' @param model_fit_tbl model fit table
#' @param model_recipe_tbl model recipe table
#' @param model_train_test_tbl model train test split table (required)
#' @param combo_variables combo variables
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#' @param seed seed number
#'
#' @return list of individual model predictions and fitted models
#' @keywords internal
#' @export
refit_models <- function(model_fit_tbl,
                         model_recipe_tbl,
                         model_train_test_tbl = NULL,
                         combo_variables,
                         parallel_processing = NULL,
                         num_cores = NULL,
                         seed = 123) {

  # model_train_test_tbl is used unconditionally below; fail fast with a
  # clear message instead of an obscure dplyr error when it is missing
  if (is.null(model_train_test_tbl)) {
    stop("'model_train_test_tbl' must be provided", call. = FALSE)
  }

  # build one task row per (trained model) x (train/test split):
  # cross each relevant split with every model, dropping the fitted
  # objects/predictions so only lightweight metadata is shipped to workers
  iter_list <- model_train_test_tbl %>%
    dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test", "Ensemble")) %>%
    dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
    purrr::map(.f = function(x) {
      model_fit_tbl %>%
        dplyr::mutate(Run_Type = x %>% dplyr::pull(Run_Type),
                      Run_ID = x %>% dplyr::pull(Run_ID),
                      Train_End = x %>% dplyr::pull(Train_End),
                      Test_End = x %>% dplyr::pull(Test_End)) %>%
        dplyr::select(-Model_Fit, -Prediction)}) %>%
    dplyr::bind_rows()

  # refit one model on one split and predict on that split's test window
  fit_model <- function(x) {

    # unpack the single task row
    combo <- x %>%
      dplyr::pull(Combo)

    model <- x %>%
      dplyr::pull(Model)

    recipe <- x %>%
      dplyr::pull(Recipe_ID)

    # look up the previously trained workflow for this combo/model/recipe
    model_fit <- model_fit_tbl %>%
      dplyr::filter(Combo == combo,
                    Model == model,
                    Recipe_ID == recipe)

    model_fit <- model_fit$Model_Fit[[1]]

    run_type <- x %>%
      dplyr::pull(Run_Type)

    run_id <- x %>%
      dplyr::pull(Run_ID)

    train_end <- x %>%
      dplyr::pull(Train_End)

    test_end <- x %>%
      dplyr::pull(Test_End)

    # assemble the prepped data: a single combo for local models, or the
    # full pooled data set for the global "All-Data" model
    if(combo != 'All-Data') {

      recipe_data <- model_recipe_tbl %>%
        dplyr::filter(Recipe == recipe,
                      Combo == combo) %>%
        dplyr::select(Data) %>%
        tidyr::unnest(Data)

    } else {

      # NOTE(review): the columns created by separate() are dropped by the
      # following select(Data); confirm whether this separate() call is
      # still needed or is leftover validation
      recipe_data <- model_recipe_tbl %>%
        dplyr::filter(Recipe == recipe) %>%
        tidyr::separate(col = Combo,
                        into = combo_variables,
                        sep = "---",
                        remove = FALSE) %>%
        dplyr::select(Data) %>%
        tidyr::unnest(Data)
    }

    # split into train (up to Train_End) and test (Train_End, Test_End]
    training <- recipe_data %>%
      dplyr::filter(Date <= train_end)

    testing <- recipe_data %>%
      dplyr::filter(Date > train_end,
                    Date <= test_end)

    if(recipe == "R2") {

      # R2 recipes carry multiple forecast origins; keep only the test rows
      # from the first origin after the training window
      train_origin_max <- training %>%
        dplyr::filter(Horizon == 1)

      testing <- testing %>%
        dplyr::filter(Origin == max(train_origin_max$Origin) + 1)
    }

    # fit model (seeded for reproducibility across workers)
    set.seed(seed)

    model_fit <- model_fit %>%
      generics::fit(data = training)

    # create prediction on the held-out test window
    model_prediction <- testing %>%
      dplyr::bind_cols(
        predict(model_fit, new_data = testing)
      ) %>%
      dplyr::select(Combo, Date, Target, .pred) %>%
      dplyr::rename(Forecast = .pred)

    # finalize output tbl: one row holding the refit and its predictions
    final_tbl <- tibble::tibble(
      Combo = combo,
      Model = model,
      Recipe_ID = recipe,
      Train_Test_ID = run_id,
      Model_Fit = list(model_fit),
      Prediction = list(model_prediction)
    )

    return(final_tbl)
  }

  model_recipe_tbl <- model_recipe_tbl # prevent error in exporting tbl to compute cluster

  # run all refit tasks, locally or on the configured parallel backend
  model_refit_final_tbl <- submit_fn(model_fit_tbl,
                                     parallel_processing,
                                     iter_list %>%
                                       dplyr::group_split(dplyr::row_number(), .keep = FALSE),
                                     fit_model,
                                     num_cores,
                                     package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
                                                         'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
                                                         'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
                                                         'recipes', 'rules', 'modeltime'),
                                     function_exports = NULL)

  # keep fitted model objects only for the future-forecast split ("01");
  # predictions are returned for every split
  fitted_models <- model_refit_final_tbl %>%
    dplyr::filter(Train_Test_ID == "01") %>%
    dplyr::select(Combo, Model, Recipe_ID, Model_Fit)

  return(list(Model_Predictions = model_refit_final_tbl %>% dplyr::select(-Model_Fit), Model_Fit = fitted_models))
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,280 @@
|
|||
#' Tune model hyperparameters
#'
#' Trains every hyperparameter combination of every model on each
#' validation split, then keeps the best combination per
#' combo/model/recipe by out-of-sample RMSE.
#'
#' @param model_recipe_tbl model recipe table
#' @param model_workflow_tbl model workflow table
#' @param model_hyperparameter_tbl model hyperparameter table
#' @param model_train_test_tbl model train test split table
#' @param run_global_models run global models
#' @param run_local_models run local models
#' @param global_model_recipes global model recipes
#' @param combo_variables combo variables
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#' @param seed seed number
#'
#' @return table
#' @keywords internal
#' @export
tune_models <- function(model_recipe_tbl,
                        model_workflow_tbl,
                        model_hyperparameter_tbl,
                        model_train_test_tbl,
                        run_global_models,
                        run_local_models,
                        global_model_recipes,
                        combo_variables,
                        parallel_processing,
                        num_cores,
                        seed = 123) {

  # get list of tasks to run: each local combo and/or the pooled
  # "All-Data" global run
  combo_list <- c()

  # only these model types support global (pooled, multi-series) training
  global_model_list <- c("cubist", "glmnet", "mars", "svm-poly", "svm-rbf", "xgboost")

  if(run_local_models) {
    combo_list <- c(combo_list, unique(model_recipe_tbl$Combo))
  }

  if(run_global_models) {
    combo_list <- c(combo_list, "All-Data")
  }

  # expand to one task row per combo x validation split x model x
  # hyperparameter combo; global runs are restricted to supported
  # models/recipes
  iter_list <- purrr::map(combo_list, .f = function(x) {
    model_train_test_tbl %>%
      dplyr::mutate(Combo = x) %>%
      dplyr::rename(Train_Test_ID = Run_ID) %>%
      dplyr::filter(Run_Type == "Validation") %>%
      dplyr::select(Combo, Train_Test_ID)
  }) %>%
    dplyr::bind_rows() %>%
    dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
    purrr::map(.f = function(x) {
      temp <- model_hyperparameter_tbl %>%
        dplyr::select(Hyperparameter_Combo, Model, Recipe) %>%
        dplyr::rename(Hyperparameter_ID = Hyperparameter_Combo,
                      Recipe_ID = Recipe) %>%
        dplyr::mutate(Combo = x$Combo,
                      Train_Test_ID = x$Train_Test_ID)

      if(x$Combo == 'All-Data') {
        temp <- temp %>%
          dplyr::filter(Model %in% global_model_list,
                        Recipe_ID %in% global_model_recipes)
      }

      return(temp)
    }) %>%
    dplyr::bind_rows() %>%
    dplyr::select(Combo, Model, Recipe_ID, Train_Test_ID, Hyperparameter_ID)

  # task run function
  model_workflow_tbl <- model_workflow_tbl # prevent error in exporting tbl to compute cluster

  # fit one model/hyperparameter combo on one validation split and
  # predict on that split's test window
  initial_tune_fn <- function(x) {

    # run input values
    param_combo <- x %>%
      dplyr::pull(Hyperparameter_ID)

    model <- x %>%
      dplyr::pull(Model)

    data_split <- x %>%
      dplyr::pull(Train_Test_ID)

    data_prep_recipe <- x %>%
      dplyr::pull(Recipe_ID)

    combo <- x %>%
      dplyr::pull(Combo)

    train_end_date <- model_train_test_tbl %>%
      dplyr::filter(Run_ID == data_split) %>%
      dplyr::pull(Train_End)

    test_end_date <- model_train_test_tbl %>%
      dplyr::filter(Run_ID == data_split) %>%
      dplyr::pull(Test_End)

    # get train/test data
    full_data <- model_recipe_tbl %>%
      dplyr::filter(Recipe == data_prep_recipe) %>%
      dplyr::select(Data) %>%
      tidyr::unnest(Data)

    if(combo != "All-Data") {

      # local run: restrict to this combo's series
      full_data <- full_data %>%
        dplyr::filter(Combo == combo)

    } else {

      # global run: keep all series, splitting the combo key into its
      # component variables so they are available as features
      full_data <- full_data %>%
        tidyr::separate(col = Combo,
                        into = combo_variables,
                        sep = "---",
                        remove = FALSE)
    }

    # split into train (up to Train_End) and test (Train_End, Test_End]
    training <- full_data %>%
      dplyr::filter(Date <= train_end_date)

    testing <- full_data %>%
      dplyr::filter(Date > train_end_date,
                    Date <= test_end_date)

    if(data_prep_recipe == "R2") {

      # R2 recipes carry multiple forecast origins; keep only the test
      # rows from the first origin after the training window
      train_origin_max <- training %>%
        dplyr::filter(Horizon == 1)

      testing <- testing %>%
        dplyr::filter(Origin == max(train_origin_max$Origin) + 1)
    }

    # get workflow
    workflow <- model_workflow_tbl %>%
      dplyr::filter(Model_Name == model,
                    Model_Recipe == data_prep_recipe)

    workflow_final <- workflow$Model_Workflow[[1]]

    # get hyperparameters
    hyperparameters <- model_hyperparameter_tbl %>%
      dplyr::filter(Model == model,
                    Recipe == data_prep_recipe,
                    Hyperparameter_Combo == param_combo) %>%
      dplyr::select(Hyperparameters) %>%
      tidyr::unnest(Hyperparameters)

    # fit model (seeded for reproducibility across workers); models with
    # no tunable hyperparameters are fit as-is
    set.seed(seed)

    if(nrow(hyperparameters) > 0) {
      model_fit <- workflow_final %>%
        tune::finalize_workflow(parameters = hyperparameters) %>%
        generics::fit(data = training)
    } else {
      model_fit <- workflow_final %>%
        generics::fit(data = training)
    }

    # create prediction
    model_prediction <- testing %>%
      dplyr::bind_cols(
        predict(model_fit, new_data = testing)
      ) %>%
      dplyr::select(Combo, Date, Target, .pred) %>%
      dplyr::rename(Forecast = .pred)

    # finalize output tbl
    final_tbl <- tibble::tibble(
      Combo = combo,
      Model = model,
      Recipe_ID = data_prep_recipe,
      Train_Test_ID = data_split,
      Hyperparameter_ID = param_combo,
      Model_Fit = list(model_fit),
      Prediction = list(model_prediction)
    )

    return(final_tbl)
  }

  initial_tuning_tbl <- submit_fn(model_workflow_tbl,
                                  parallel_processing,
                                  iter_list %>%
                                    dplyr::group_split(dplyr::row_number(), .keep = FALSE),
                                  initial_tune_fn,
                                  num_cores,
                                  package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
                                                      'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
                                                      'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
                                                      'recipes', 'rules', 'modeltime'),
                                  function_exports = NULL)

  # select the best combination of hyperparameters
  iter_list2 <- iter_list %>%
    dplyr::select(Combo, Model, Recipe_ID) %>%
    dplyr::distinct()

  # for one combo/model/recipe, pick the hyperparameter combo with the
  # lowest RMSE across all validation splits and keep its predictions
  # plus one representative fitted model
  choose_hyperparameters_fn <- function(x) {

    combo <- x %>%
      dplyr::pull(Combo)

    model <- x %>%
      dplyr::pull(Model)

    recipe <- x %>%
      dplyr::pull(Recipe_ID)

    test_tbl <- initial_tuning_tbl %>%
      dplyr::filter(Combo == combo,
                    Recipe_ID == recipe,
                    Model == model) %>%
      dplyr::select(Model, Recipe_ID, Hyperparameter_ID, Train_Test_ID, Prediction, Model_Fit)

    # rank hyperparameter combos by pooled out-of-sample RMSE
    best_param <- test_tbl %>%
      dplyr::select(-Model_Fit) %>%
      tidyr::unnest(Prediction) %>%
      dplyr::mutate(Combo = combo) %>%
      dplyr::group_by(Combo, Model, Recipe_ID, Hyperparameter_ID) %>%
      yardstick::rmse(truth = Target,
                      estimate = Forecast,
                      na_rm = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::arrange(.estimate) %>%
      dplyr::slice(1) %>%
      dplyr::pull(Hyperparameter_ID)

    # keep the first fitted model for the winning hyperparameter combo
    best_model_fit <- test_tbl %>%
      dplyr::filter(Hyperparameter_ID == best_param) %>%
      dplyr::slice(1)

    best_model_fit <- best_model_fit$Model_Fit[[1]]

    final_predictions <- test_tbl %>%
      dplyr::filter(Hyperparameter_ID == best_param) %>%
      dplyr::select(-Model_Fit) %>%
      tidyr::unnest(Prediction) %>%
      dplyr::select(Combo, Date, Train_Test_ID, Target, Forecast)

    return(tibble::tibble(Combo = combo,
                          Model = model,
                          Recipe_ID = recipe,
                          Hyperparameter_ID = best_param,
                          Model_Fit = list(best_model_fit),
                          Prediction = list(final_predictions)))
  }

  final_tuning_tbl <- submit_fn(model_workflow_tbl,
                                parallel_processing,
                                iter_list2 %>%
                                  dplyr::group_split(dplyr::row_number(), .keep = FALSE),
                                choose_hyperparameters_fn,
                                num_cores,
                                package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
                                                    'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
                                                    'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
                                                    'recipes', 'rules', 'modeltime', 'yardstick'),
                                function_exports = NULL)

  return(final_tuning_tbl)
}
|
|
@ -0,0 +1,35 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_average_models.R
|
||||
\name{average_models}
|
||||
\alias{average_models}
|
||||
\title{Average Models}
|
||||
\usage{
|
||||
average_models(
|
||||
refit_predictions_tbl,
|
||||
ensemble_predictions_tbl = NULL,
|
||||
model_train_test_tbl,
|
||||
max_model_average = 3,
|
||||
parallel_processing = NULL,
|
||||
num_cores = NULL
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{refit_predictions_tbl}{individual model predictions}
|
||||
|
||||
\item{ensemble_predictions_tbl}{ensemble model predictions}
|
||||
|
||||
\item{model_train_test_tbl}{train test splits}
|
||||
|
||||
\item{max_model_average}{max number of models to average}
|
||||
|
||||
\item{parallel_processing}{parallel processing}
|
||||
|
||||
\item{num_cores}{number of cores}
|
||||
}
|
||||
\value{
|
||||
list with average predictions and fitted models
|
||||
}
|
||||
\description{
|
||||
Average Models
|
||||
}
|
||||
\keyword{internal}
|
|
@ -0,0 +1,35 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_best_models.R
|
||||
\name{best_models}
|
||||
\alias{best_models}
|
||||
\title{Select Best Models}
|
||||
\usage{
|
||||
best_models(
|
||||
refit_predictions_tbl,
|
||||
ensemble_predictions_tbl = NULL,
|
||||
average_predictions_tbl = NULL,
|
||||
model_train_test_tbl,
|
||||
parallel_processing = NULL,
|
||||
num_cores = NULL
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{refit_predictions_tbl}{individual model predictions}
|
||||
|
||||
\item{ensemble_predictions_tbl}{ensemble model predictions}
|
||||
|
||||
\item{average_predictions_tbl}{model average predictions}
|
||||
|
||||
\item{model_train_test_tbl}{train test splits}
|
||||
|
||||
\item{parallel_processing}{parallel processing}
|
||||
|
||||
\item{num_cores}{number of cores}
|
||||
}
|
||||
\value{
|
||||
tbl with best model flag
|
||||
}
|
||||
\description{
|
||||
Select Best Models
|
||||
}
|
||||
\keyword{internal}
|
|
@ -1,5 +1,5 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_train_models.R
|
||||
% Please edit documentation in R/finnts_0.2.0_ensemble_models.R
|
||||
\name{ensemble_models}
|
||||
\alias{ensemble_models}
|
||||
\title{Ensemble Models}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_train_models.R
|
||||
% Please edit documentation in R/finnts_0.2.0_model_utility.R
|
||||
\name{model_hyperparameters}
|
||||
\alias{model_hyperparameters}
|
||||
\title{Get model hyperparameters}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_train_models.R
|
||||
% Please edit documentation in R/finnts_0.2.0_model_utility.R
|
||||
\name{model_workflows}
|
||||
\alias{model_workflows}
|
||||
\title{Gets model workflows}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_train_models.R
|
||||
% Please edit documentation in R/finnts_0.2.0_refit_models.R
|
||||
\name{refit_models}
|
||||
\alias{refit_models}
|
||||
\title{Refit Models}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_train_models.R
|
||||
% Please edit documentation in R/finnts_0.2.0_model_utility.R
|
||||
\name{train_test_split}
|
||||
\alias{train_test_split}
|
||||
\title{Gets the train test splits}
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/finnts_0.2.0_train_models.R
|
||||
\name{tune_hyperparameters}
|
||||
\alias{tune_hyperparameters}
|
||||
\title{Tune hyperparameters}
|
||||
% Please edit documentation in R/finnts_0.2.0_tune_models.R
|
||||
\name{tune_models}
|
||||
\alias{tune_models}
|
||||
\title{Tune model hyperparameters}
|
||||
\usage{
|
||||
tune_hyperparameters(
|
||||
tune_models(
|
||||
model_recipe_tbl,
|
||||
model_workflow_tbl,
|
||||
model_hyperparameter_tbl,
|
||||
|
@ -45,6 +45,6 @@ tune_hyperparameters(
|
|||
table
|
||||
}
|
||||
\description{
|
||||
Tune hyperparameters
|
||||
Tune model hyperparameters
|
||||
}
|
||||
\keyword{internal}
|
Загрузка…
Ссылка в новой задаче