This commit is contained in:
Mike Tokic 2022-03-19 19:23:40 -07:00
Parent a3e5a07c41
Commit dd79d517e1
8 changed files with 453 additions and 70 deletions

View file

@@ -1,6 +1,6 @@
Package: finnts
Title: Microsoft Finance Time Series Forecasting Framework
Version: 0.1.0
Version: 0.2.0
Authors@R:
c(person(given = "Mike",
family = "Tokic",

View file

@@ -60,7 +60,8 @@ get_recipie_configurable <- function(train_data,
recipes::step_rm(Date),
"with_adj_index" = df %>%
recipes::step_rm(Date, Date_index.num),
df)
df,
"none" = df)
}
@@ -436,7 +437,7 @@ cubist <- function(train_data,
if(model_type=="ensemble"){
recipe_spec_cubist <- train_data %>%
get_recipie_configurable(rm_date = "with_adj_index",
get_recipie_configurable(rm_date = "with_adj",
step_nzv = "nzv",
one_hot = FALSE,
pca = pca)
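Editor's note: for orientation, here is a minimal hedged sketch of how the rm_date options in get_recipie_configurable() appear to resolve after this change. The enclosing switch() call and the recipe object df are assumptions inferred from the diff, which only shows the tail of the call; this is not the package's exact code.

# Hypothetical standalone sketch, not the package's implementation.
# Assumes df is a recipes::recipe() object and that the %>% pipe is available.
library(recipes)
library(magrittr)

apply_rm_date <- function(df, rm_date = "with_adj") {
  switch(rm_date,
         "with_adj"       = df %>% recipes::step_rm(Date),
         "with_adj_index" = df %>% recipes::step_rm(Date, Date_index.num),
         "none"           = df,  # new in this commit: keep all date features
         df)                     # default: return the recipe unchanged
}

The cubist hunk above then asks for "with_adj" (drop Date only) instead of "with_adj_index" when building ensemble recipes.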

View file

@@ -72,12 +72,12 @@ get_back_test_scenario_hist_periods <- function(input_tbl,
#'
#' @param input_tbl full data table
#' @param hist_end_date historical end date
#' @param date_type
#' @param date_type date type
#' @param forecast_horizon forecast horizon
#' @param back_test_scenarios back test scenarios
#' @param back_test_spacing back test spacing
#'
#' @return Returns back_test_scenarios and hist_periods_80
#' @return Returns table of train test splits
#' @keywords internal
#' @export
train_test_split <- function(input_tbl,
@@ -182,13 +182,13 @@ train_test_split <- function(input_tbl,
#' Gets model workflows
#'
#' @param model_recipe_tbl
#' @param models_to_run
#' @param models_not_to_run
#' @param run_deep_learning
#' @param pca
#' @param model_recipe_tbl model recipe table
#' @param models_to_run models to run
#' @param models_not_to_run models not to run
#' @param run_deep_learning run deep learning models
#' @param pca pca
#'
#' @return Returns back_test_scenarios and hist_periods_80
#' @return Returns table of model workflows
#' @keywords internal
#' @export
model_workflows <- function(model_recipe_tbl,
@@ -392,8 +392,8 @@ model_hyperparameters <- function(model_workflow_tbl,
#' @param run_local_models run local models
#' @param global_model_recipes global model recipes
#' @param combo_variables combo variables
#' @param parallel_processing
#' @param num_cores
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#' @param seed seed number
#'
#' @return table
@@ -579,12 +579,12 @@ tune_hyperparameters <- function(model_recipe_tbl,
'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
'recipes', 'rules', 'modeltime'),
function_exports = NULL)
# select the best combination of hyperparameters
iter_list2 <- iter_list %>%
dplyr::select(Combo, Model, Recipe_ID) %>%
dplyr::distinct()
choose_hyperparameters_fn <- function(x) {
combo <- x %>%
@@ -596,18 +596,24 @@ tune_hyperparameters <- function(model_recipe_tbl,
recipe <- x %>%
dplyr::pull(Recipe_ID)
if(combo != "All-Data") {
test_tbl <- initial_tuning_tbl %>%
dplyr::filter(Combo == combo,
Recipe_ID == recipe,
Model == model) %>%
dplyr::select(Model, Recipe_ID, Hyperparameter_ID, Prediction, Model_Fit)
} else{
test_tbl <- initial_tuning_tbl %>%
dplyr::filter(Recipe_ID == recipe,
Model == model) %>%
dplyr::select(Model, Recipe_ID, Hyperparameter_ID, Prediction, Model_Fit)
}
test_tbl <- initial_tuning_tbl %>%
dplyr::filter(Combo == combo,
Recipe_ID == recipe,
Model == model) %>%
dplyr::select(Model, Recipe_ID, Hyperparameter_ID, Train_Test_ID, Prediction, Model_Fit)
# if(combo != "All-Data") {
# test_tbl <- initial_tuning_tbl %>%
# dplyr::filter(Combo == combo,
# Recipe_ID == recipe,
# Model == model) %>%
# dplyr::select(Model, Recipe_ID, Hyperparameter_ID, Train_Test_ID, Prediction, Model_Fit)
# } else{
# test_tbl <- initial_tuning_tbl %>%
# dplyr::filter(Recipe_ID == recipe,
# Model == model) %>%
# dplyr::select(Model, Recipe_ID, Hyperparameter_ID, Train_Test_ID, Prediction, Model_Fit)
# }
best_param <- test_tbl %>%
dplyr::select(-Model_Fit) %>%
@@ -631,7 +637,8 @@ tune_hyperparameters <- function(model_recipe_tbl,
final_predictions <- test_tbl %>%
dplyr::filter(Hyperparameter_ID == best_param) %>%
dplyr::select(-Model_Fit) %>%
tidyr::unnest(Prediction)
tidyr::unnest(Prediction) %>%
dplyr::select(Combo, Date, Train_Test_ID, Target, Forecast)
return(tibble::tibble(Combo = combo,
Model = model,
@@ -662,11 +669,11 @@ tune_hyperparameters <- function(model_recipe_tbl,
#' @param model_fit_tbl model fit table
#' @param model_train_test_tbl model train test split table
#' @param combo_variables combo variables
#' @param parallel_processing
#' @param num_cores
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#' @param seed seed number
#'
#' @return table
#' @return list of individual model predictions and fitted models
#' @keywords internal
#' @export
refit_models <- function(model_fit_tbl,
@@ -678,7 +685,7 @@ refit_models <- function(model_fit_tbl,
seed = 123) {
iter_list <- model_train_test_tbl %>%
dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) %>%
dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test", "Ensemble")) %>%
dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
purrr::map(.f = function(x) {
model_fit_tbl %>%
@@ -690,7 +697,7 @@ refit_models <- function(model_fit_tbl,
dplyr::bind_rows()
fit_model <- function(x) {
print(x)
combo <- x %>%
dplyr::pull(Combo)
@@ -796,32 +803,74 @@ refit_models <- function(model_fit_tbl,
'recipes', 'rules', 'modeltime'),
function_exports = NULL)
return(model_refit_final_tbl)
fitted_models <- model_refit_final_tbl %>%
dplyr::filter(Train_Test_ID == "01") %>%
dplyr::select(Combo, Model, Recipe_ID, Model_Fit)
return(list(Model_Predictions = model_refit_final_tbl %>% dplyr::select(-Model_Fit), Model_Fit = fitted_models))
}
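Editor's note: refit_models() now returns a two-element list instead of a single table. A hedged sketch of the shape follows; the element and column names come from the diff above, while the column types are assumptions.

# Illustrative only; not produced by the package verbatim.
refit_output <- list(
  # one row per combo / model / train-test split, with nested predictions
  Model_Predictions = tibble::tibble(
    Combo = character(), Model = character(), Recipe_ID = character(),
    Train_Test_ID = character(), Prediction = list()
  ),
  # fitted workflows kept only for the future-forecast split (Train_Test_ID == "01")
  Model_Fit = tibble::tibble(
    Combo = character(), Model = character(), Recipe_ID = character(),
    Model_Fit = list()
  )
)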
#' Ensemble Models
#'
#' @param model_recipe_tbl model recipe table
#' @param model_fit_tbl model fit table
#' @param tuning_tbl hyperparameter tuning predictions
#' @param refit_tbl individual model predictions
#' @param model_train_test_tbl model train test split table
#' @param combo_variables combo variables
#' @param parallel_processing
#' @param num_cores
#' @param date_type date type
#' @param num_hyperparameters number of hyperparameters
#' @param parallel_processing parallel processing
#' @param num_cores number of cores
#' @param seed seed number
#'
#' @return table
#' @return list with ensemble predictions and fitted models
#' @keywords internal
#' @export
ensemble_models <- function(tuning_tbl,
refit_tbl,
model_train_test_tbl,
date_type,
num_hyperparameters = 5,
parallel_processing = NULL,
num_cores = NULL,
seed = 123) {
# example data
#tuning_results %>% dplyr::select(Prediction) %>% tidyr::unnest(Prediction)
# get individual prediction data
initial_results_tbl <- tuning_tbl %>%
dplyr::select(Combo, Model, Recipe_ID, Prediction) %>%
dplyr::rename(Combo_Key = Combo) %>%
tidyr::unnest(Prediction) %>%
rbind(
refit_tbl %>%
dplyr::select(Combo, Model, Recipe_ID, Train_Test_ID, Prediction) %>%
dplyr::rename(Combo_Key = Combo) %>%
tidyr::unnest(Prediction)
)
combo_iter_list <- unique(initial_results_tbl$Combo)
prep_ensemble_fn <- function(combo) {
initial_results_tbl %>%
dplyr::filter(Combo == combo) %>%
dplyr::mutate(Suffix = ifelse(Combo_Key == "All-Data", "Global", "Local")) %>%
tidyr::unite(col= "Model_Key",
c("Model", "Recipe_ID", "Suffix"),
sep="-",
remove=F) %>%
tidyr::pivot_wider(names_from = Model_Key, values_from = Forecast,
id_cols = c("Combo", "Date", "Train_Test_ID", "Target"), values_fill = 0)
}
prep_ensemble_tbl <- submit_fn(tuning_tbl,
parallel_processing,
combo_iter_list,
prep_ensemble_fn,
num_cores,
package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
'recipes', 'rules', 'modeltime'),
function_exports = NULL)
# ensemble models to run
refit_models <- unique(refit_tbl$Model)
@@ -830,40 +879,349 @@ ensemble_models <- function(tuning_tbl,
if(length(ensemble_model_list) < 1) {
stop("no ensemble models chosen to run")
}
model_workflow_tbl <- tibble::tibble()
for(model in ensemble_model_list) {
avail_arg_list <- list('train_data' = tibble::tibble(Combo = 1,
Date = as),
'model_type' = "ensemble",
avail_arg_list <- list('train_data' = prep_ensemble_tbl %>% dplyr::select(-Train_Test_ID),
'model_type' = "ensemble",
'pca' = FALSE)
# get specific model spec
fn_to_invoke <- get(gsub('-', '_', model))
exp_arg_list <- formalArgs(fn_to_invoke)
avail_names <- names(avail_arg_list)
inp_arg_list <- list()
for(x in avail_names){
if(x %in% exp_arg_list){
inp_arg_list[x] <- avail_arg_list[x]
}
}
model_workflow <- do.call(fn_to_invoke,inp_arg_list, quote=TRUE)
workflow_tbl <- tibble::tibble(Model_Name = model,
workflow_tbl <- tibble::tibble(Model_Name = model,
Model_Workflow = list(model_workflow))
model_workflow_tbl <- rbind(model_workflow_tbl, workflow_tbl)
}
return(model_workflow_tbl)
# get hyperparameters
hyperparameters_tbl <- tibble::tibble()
for(x in model_workflow_tbl %>% dplyr::group_split(dplyr::row_number(), .keep = FALSE)) {
model <- x %>%
dplyr::pull(Model_Name)
temp_tbl <- model_workflow_tbl %>%
dplyr::filter(Model_Name == model)
model_workflow <- temp_tbl$Model_Workflow[[1]]
model_spec <- model_workflow %>%
workflows::extract_spec_parsnip()
recipe_features <- prep_ensemble_tbl
if(model=="svm-rbf") {
parameters <- model_spec %>%
dials::parameters()
} else {
parameters <- model_spec %>%
dials::parameters() %>%
dials::finalize(recipe_features, force = FALSE)
}
grid <- dials::grid_latin_hypercube(parameters, size = num_hyperparameters)
hyperparameters_temp <- grid %>%
dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
purrr::map_df(tidyr::nest, data=tidyselect::everything()) %>%
dplyr::rename(Hyperparameters = data) %>%
tibble::rowid_to_column("Hyperparameter_Combo") %>%
dplyr::mutate(Model = model)
hyperparameters_tbl <- rbind(hyperparameters_tbl, hyperparameters_temp)
}
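Editor's note: the latin-hypercube step in the loop above can be reproduced on a standalone parsnip spec. A hedged sketch with an illustrative model follows; the engine and tunable parameters here are assumptions, since the package builds its specs through model_workflows() instead.

library(parsnip)
library(dials)
library(tune)      # for tune() placeholders
library(magrittr)  # for %>%

# Illustrative spec only; finnts constructs these from its own model functions.
spec <- boost_tree(trees = tune(), min_n = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Mirrors the loop above: pull the tunable parameters from the spec, then draw
# a latin-hypercube sample of candidate hyperparameter combinations.
params <- spec %>% dials::parameters()
grid <- dials::grid_latin_hypercube(params, size = 5)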
# fit models by hyperparameter
hyperparmaeter_iter_list <- purrr::map(combo_iter_list, .f = function(x) {
model_train_test_tbl %>%
dplyr::mutate(Combo = x) %>%
dplyr::rename(Train_Test_ID = Run_ID) %>%
dplyr::filter(Run_Type == "Validation") %>%
dplyr::select(Combo, Train_Test_ID)
}) %>%
dplyr::bind_rows() %>%
dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
purrr::map(.f = function(x) {
hyperparameters_tbl %>%
dplyr::select(Hyperparameter_Combo, Model) %>%
dplyr::rename(Hyperparameter_ID = Hyperparameter_Combo) %>%
dplyr::mutate(Combo = x$Combo,
Train_Test_ID = x$Train_Test_ID)
}) %>%
dplyr::bind_rows() %>%
dplyr::select(Combo, Model, Train_Test_ID, Hyperparameter_ID)
initial_tune_fn <- function(x) {
# run input values
param_combo <- x %>%
dplyr::pull(Hyperparameter_ID)
model <- x %>%
dplyr::pull(Model)
data_split <- x %>%
dplyr::pull(Train_Test_ID)
combo <- x %>%
dplyr::pull(Combo)
train_end_date <- model_train_test_tbl %>%
dplyr::filter(Run_ID == data_split) %>%
dplyr::pull(Train_End)
test_end_date <- model_train_test_tbl %>%
dplyr::filter(Run_ID == data_split) %>%
dplyr::pull(Test_End)
# get train/test data
full_data <- prep_ensemble_tbl %>%
dplyr::filter(Combo == combo) %>%
dplyr::mutate(Date_index.num = 0)
training <- full_data %>%
dplyr::filter(Date <= train_end_date) %>%
dplyr::select(-Train_Test_ID)
testing <- full_data %>%
dplyr::filter(Date > train_end_date,
Date <= test_end_date,
Train_Test_ID == data_split)
# get workflow
workflow <- model_workflow_tbl %>%
dplyr::filter(Model_Name == model)
workflow_final <- workflow$Model_Workflow[[1]]
# get hyperparameters
hyperparameters <- hyperparameters_tbl %>%
dplyr::filter(Model == model,
Hyperparameter_Combo == param_combo) %>%
dplyr::select(Hyperparameters) %>%
tidyr::unnest(Hyperparameters)
# fit model
set.seed(seed)
model_fit <- workflow_final %>%
tune::finalize_workflow(parameters = hyperparameters) %>%
generics::fit(data = training)
# create prediction
model_prediction <- testing %>%
dplyr::bind_cols(
predict(model_fit, new_data = testing)
) %>%
dplyr::select(Combo, Date, Target, .pred) %>%
dplyr::rename(Forecast = .pred)
# finalize output tbl
final_tbl <- tibble::tibble(
Combo = combo,
Model = model,
Train_Test_ID = data_split,
Hyperparameter_ID = param_combo,
Model_Fit = list(model_fit),
Prediction = list(model_prediction)
)
return(final_tbl)
}
initial_tuning_tbl <- submit_fn(model_workflow_tbl,
parallel_processing,
hyperparmaeter_iter_list %>%
dplyr::group_split(dplyr::row_number(), .keep = FALSE),
initial_tune_fn,
num_cores,
package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
'recipes', 'rules', 'modeltime'),
function_exports = NULL)
hyperparmaeter_iter_list2 <- hyperparmaeter_iter_list %>%
dplyr::select(Combo, Model) %>%
dplyr::distinct()
choose_hyperparameters_fn <- function(x) {
combo <- x %>%
dplyr::pull(Combo)
model <- x %>%
dplyr::pull(Model)
test_tbl <- initial_tuning_tbl %>%
dplyr::filter(Combo == combo,
Model == model) %>%
dplyr::select(Model, Hyperparameter_ID, Train_Test_ID, Prediction, Model_Fit)
best_param <- test_tbl %>%
dplyr::select(-Model_Fit) %>%
tidyr::unnest(Prediction) %>%
dplyr::mutate(Combo = combo) %>%
dplyr::group_by(Combo, Model, Hyperparameter_ID) %>%
yardstick::rmse(truth = Target,
estimate = Forecast,
na_rm = TRUE) %>%
dplyr::ungroup() %>%
dplyr::arrange(.estimate) %>%
dplyr::slice(1) %>%
dplyr::pull(Hyperparameter_ID)
best_model_fit <- test_tbl %>%
dplyr::filter(Hyperparameter_ID == best_param) %>%
dplyr::slice(1)
best_model_fit <- best_model_fit$Model_Fit[[1]]
final_predictions <- test_tbl %>%
dplyr::filter(Hyperparameter_ID == best_param) %>%
dplyr::select(-Model_Fit) %>%
tidyr::unnest(Prediction) %>%
dplyr::select(Combo, Date, Train_Test_ID, Target, Forecast)
return(tibble::tibble(Combo = combo,
Model = model,
Hyperparameter_ID = best_param,
Model_Fit = list(best_model_fit),
Prediction = list(final_predictions)))
}
final_tuning_tbl <- submit_fn(model_workflow_tbl,
parallel_processing,
hyperparmaeter_iter_list2 %>%
dplyr::group_split(dplyr::row_number(), .keep = FALSE),
choose_hyperparameters_fn,
num_cores,
package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
'recipes', 'rules', 'modeltime', 'yardstick'),
function_exports = NULL)
# refit ensemble models
refit_iter_list <- model_train_test_tbl %>%
dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) %>%
dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>%
purrr::map(.f = function(x) {
final_tuning_tbl %>%
dplyr::mutate(Run_Type = x %>% dplyr::pull(Run_Type),
Run_ID = x %>% dplyr::pull(Run_ID),
Train_End = x %>% dplyr::pull(Train_End),
Test_End = x %>% dplyr::pull(Test_End)) %>%
dplyr::select(-Model_Fit, -Prediction)}) %>%
dplyr::bind_rows()
fit_model <- function(x) {
combo <- x %>%
dplyr::pull(Combo)
model <- x %>%
dplyr::pull(Model)
model_fit <- final_tuning_tbl %>%
dplyr::filter(Model == model,
Combo == combo)
model_fit <- model_fit$Model_Fit[[1]]
run_type <- x %>%
dplyr::pull(Run_Type)
run_id <- x %>%
dplyr::pull(Run_ID)
train_end <- x %>%
dplyr::pull(Train_End)
test_end <- x %>%
dplyr::pull(Test_End)
full_data <- prep_ensemble_tbl %>%
dplyr::filter(Combo == combo) %>%
dplyr::mutate(Date_index.num = 0)
training <- full_data %>%
dplyr::filter(Date <= train_end) %>%
dplyr::select(-Train_Test_ID)
testing <- full_data %>%
dplyr::filter(Date > train_end,
Date <= test_end,
Train_Test_ID == run_id)
# fit model
set.seed(seed)
model_fit <- model_fit %>%
generics::fit(data = training)
# create prediction
model_prediction <- testing %>%
dplyr::bind_cols(
predict(model_fit, new_data = testing)
) %>%
dplyr::select(Combo, Date, Target, .pred) %>%
dplyr::rename(Forecast = .pred)
# finalize output tbl
final_tbl <- tibble::tibble(
Combo = combo,
Model = model,
Recipe_ID = "Ensemble",
Train_Test_ID = run_id,
Model_Fit = list(model_fit),
Prediction = list(model_prediction)
)
return(final_tbl)
}
model_refit_final_tbl <- submit_fn(final_tuning_tbl,
parallel_processing,
refit_iter_list %>%
dplyr::group_split(dplyr::row_number(), .keep = FALSE),
fit_model,
num_cores,
package_exports = c("tibble", "dplyr", "timetk", "hts", "tidyselect", "stringr", "foreach",
'doParallel', 'parallel', "lubridate", 'parsnip', 'tune', 'dials', 'workflows',
'Cubist', 'earth', 'glmnet', 'kernlab', 'modeltime.gluonts', 'purrr',
'recipes', 'rules', 'modeltime'),
function_exports = NULL)
#get final combined results and return final fitted models
final_model_fit_tbl <- model_refit_final_tbl %>%
dplyr::filter(Train_Test_ID == "01") %>%
dplyr::select(Combo, Model, Recipe_ID, Model_Fit)
final_ensemble_results_tbl <- model_refit_final_tbl %>%
dplyr::select(-Model_Fit)
return(list(Model_Predictions = final_ensemble_results_tbl, Model_Fit = final_model_fit_tbl))
}
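Editor's note: the heart of the new ensemble step is stacking. Each base model's forecast becomes a feature column, and a second-stage model is tuned on those columns against the actual target. Below is a hedged, self-contained sketch of the reshaping that prep_ensemble_fn() performs; the data are toy values, and real runs go through submit_fn() and the tuning loop above.

library(dplyr)
library(tidyr)

# Toy long-format predictions from two base models; all values are made up.
preds <- tibble(
  Combo = "Country_A",
  Date = rep(seq(as.Date("2022-01-01"), by = "month", length.out = 3), times = 2),
  Train_Test_ID = "01",
  Target = rep(c(100, 120, 110), times = 2),
  Model_Key = rep(c("xgboost-R1-Local", "glmnet-R1-Global"), each = 3),
  Forecast = c(98, 123, 108, 102, 117, 113)
)

# Pivot each base model's forecast into its own column, as prep_ensemble_fn()
# does above, so ensemble models can treat them as predictors of Target.
ensemble_features <- preds %>%
  pivot_wider(names_from = Model_Key, values_from = Forecast,
              id_cols = c(Combo, Date, Train_Test_ID, Target), values_fill = 0)

# A second-stage model (cubist, glmnet, svm, xgboost, ...) is then tuned on
# ensemble_features with Target as the outcome, exactly like an ordinary model.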

View file

@@ -7,25 +7,33 @@
ensemble_models(
tuning_tbl,
refit_tbl,
model_train_test_tbl,
date_type,
num_hyperparameters = 5,
parallel_processing = NULL,
num_cores = NULL,
seed = 123
)
}
\arguments{
\item{seed}{seed number}
\item{tuning_tbl}{hyperparameter tuning predictions}
\item{model_recipe_tbl}{model recipe table}
\item{model_fit_tbl}{model fit table}
\item{refit_tbl}{individual model predictions}
\item{model_train_test_tbl}{model train test split table}
\item{combo_variables}{combo variables}
\item{date_type}{date type}
\item{num_hyperparameters}{number of hyperparameters}
\item{parallel_processing}{parallel processing}
\item{num_cores}{number of cores}
\item{seed}{seed number}
}
\value{
table
list with ensemble predictions and fitted models
}
\description{
Ensemble Models

View file

@@ -13,12 +13,18 @@ model_workflows(
)
}
\arguments{
\item{model_recipe_tbl}{}
\item{model_recipe_tbl}{model recipe table}
\item{pca}{}
\item{models_to_run}{models to run}
\item{models_not_to_run}{models not to run}
\item{run_deep_learning}{run deep learning models}
\item{pca}{pca}
}
\value{
Returns back_test_scenarios and hist_periods_80
Returns table of model workflows
}
\description{
Gets model workflows

View file

@@ -23,10 +23,14 @@ refit_models(
\item{combo_variables}{combo variables}
\item{parallel_processing}{parallel processing}
\item{num_cores}{number of cores}
\item{seed}{seed number}
}
\value{
table
list of individual model predictions and fitted models
}
\description{
Refit Models

View file

@@ -18,6 +18,8 @@ train_test_split(
\item{hist_end_date}{historical end date}
\item{date_type}{date type}
\item{forecast_horizon}{forecast horizon}
\item{back_test_scenarios}{back test scenarios}
@@ -25,7 +27,7 @@ train_test_split(
\item{back_test_spacing}{back test spacing}
}
\value{
Returns back_test_scenarios and hist_periods_80
Returns table of train test splits
}
\description{
Gets the train test splits

View file

@@ -33,6 +33,10 @@ tune_hyperparameters(
\item{combo_variables}{combo variables}
\item{parallel_processing}{parallel processing}
\item{num_cores}{number of cores}
\item{seed}{seed number}
\item{model_hyparameter_tbl}{model hyperparameter table}