Merge pull request #60 from microsoft/hongooi/dev

R model init
2020-02-15 02:35:06 +11:00 · 2020-02-15 02:35:06 +11:00 · 512adf8220
--- a/.gitignore
+++ b/.gitignore
@ -4,3 +4,4 @@
 .vscode/

 ojdata/*
+*.Rdata
--- a/R/forecasting.Rproj
+++ b/R/forecasting.Rproj
@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: No
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 4
+Encoding: UTF-8
+
+RnwWeave: knitr
+
--- a/R/orange_juice/01_dataprep.Rmd
+++ b/R/orange_juice/01_dataprep.Rmd
@ -0,0 +1,53 @@
+---
+title: Data preparation
+output: html_notebook
+---
+
+```{r, echo=FALSE, results="hide", message=FALSE}
+library(tidyr)
+library(dplyr)
+library(tsibble)
+library(feasts)
+library(fable)
+```
+
+In this notebook, we generate the datasets that will be used for model training and validation. The experiment parameters are obtained from the file `ojdata_forecast_settings.json`; you can modify that file to vary the experimental setup, or just edit the values in this notebook.
+
+The orange juice dataset comes from the bayesm package, and gives pricing and sales figures over time for a variety of orange juice brands in several stores of the Dominick's Finer Foods chain in the Chicago area.
+
+```{r}
+# read the experiment parameters (training window bounds, step size, start date)
+settings <- jsonlite::fromJSON("ojdata_forecast_settings.json")
+
+# week numbers at which the successive training sets end;
+# spell out the full key name rather than relying on partial matching by `$`
+train_periods <- seq(settings$TRAIN_END_WEEK[1], settings$TRAIN_END_WEEK[2], settings$STEP)
+start_date <- as.Date(settings$START_DATE)
+
+data(orangeJuice, package="bayesm")
+
+# fill out missing weeks
+# use complete() to force all store/brand combinations to have the same no. of weeks
+# NOTE(review): fill() is called without grouping by key, so a gap at the start of a
+# series would presumably be filled from the preceding store/brand -- confirm this is intended
+oj_data <- orangeJuice$yx %>%
+    complete(store, brand, week) %>%
+    mutate(week=yearweek(start_date + week*7)) %>%
+    as_tsibble(index=week, key=c(store, brand)) %>%
+    fill(everything())
+
+# Return the rows of oj_data whose week lies between two week offsets (inclusive).
+# `start` and `end` are week numbers counted from the global `start_date`.
+subset_oj_data <- function(start, end)
+{
+    # translate the numeric week offsets into yearweek values comparable with oj_data$week
+    first_week <- yearweek(start_date + 7*start)
+    last_week <- yearweek(start_date + 7*end)
+
+    oj_data %>%
+        filter(week >= first_week & week <= last_week)
+}
+
+# one training set per cutoff week, always beginning at TRAIN_START_WEEK
+oj_train <- lapply(train_periods, function(cutoff)
+{
+    subset_oj_data(settings$TRAIN_START_WEEK, cutoff)
+})
+
+# the matching test set covers the STEP weeks immediately after each cutoff
+oj_test <- lapply(train_periods, function(cutoff)
+{
+    subset_oj_data(cutoff + 1, cutoff + settings$STEP)
+})
+
+# persist both lists to disk
+save(oj_train, oj_test, file="oj_data.Rdata")
+```
+
+Here are some glimpses of what the data looks like. The dependent variable is `logmove`, the logarithm of the total sales for a given brand and store, in a particular week. The variables starting with `price` are the sales prices for each brand, in that week.
+
+```{r}
+# first few rows of the first training set
+oj_train[[1]] %>% head()
+
+# first few rows of the first test set
+oj_test[[1]] %>% head()
+```
--- a/R/orange_juice/01_dataprep.nb.html
+++ b/R/orange_juice/01_dataprep.nb.html
--- a/R/orange_juice/02_simplemodels.Rmd
+++ b/R/orange_juice/02_simplemodels.Rmd
@ -0,0 +1,76 @@
+---
+title: Simple models
+output: html_notebook
+encoding: utf8
+---
+
+```{r, echo=FALSE, results="hide", message=FALSE}
+library(tidyr)
+library(dplyr)
+library(tsibble)
+library(feasts)
+library(fable)
+```
+
+We fit some simple models to the orange juice data. One model is fit for each combination of store and brand.
+
+- `mean`: This is just a simple mean.
+- `naive`: A random walk model without any other components. This amounts to setting all forecast values to the last observed value.
+- `drift`: This adjusts the `naive` model to incorporate a trend.
+- `arima`: An ARIMA model with the parameter values estimated from the data.
+- `ets`: An exponential smoothing (ETS) model with additive error and trend and no seasonal component, again with parameter values estimated from the data.
+
+```{r}
+load("oj_data.Rdata")
+
+# train the models in parallel
+# leave two physical cores free, but always start at least 2 workers;
+# detectCores() returns NA when the core count is unknown, so drop it with na.rm
+ncores <- max(2, parallel::detectCores(logical=FALSE) - 2, na.rm=TRUE)
+cl <- parallel::makeCluster(ncores)
+
+# attach the modelling packages on every worker
+invisible(parallel::clusterEvalQ(cl,
+{
+    library(feasts)
+    library(fable)
+    library(tsibble)
+}))
+
+# we have multiple training sets, so parallelise by dataset:
+# each call fits all five candidate models to one training set
+oj_modelset <- parallel::parLapply(cl, X=oj_train, fun=function(train_set)
+{
+    model(
+        train_set,
+        mean=MEAN(logmove),
+        naive=NAIVE(logmove),
+        drift=RW(logmove ~ drift()),
+        arima=ARIMA(logmove),
+        ets=ETS(logmove ~ error("A") + trend("A") + season("N"))
+    )
+})
+
+# peek at the fitted models for the first training set
+head(oj_modelset[[1]])
+```
+
+Having fit the models, let's examine their forecast accuracy on the held-out test sets, using the MAPE (mean absolute percentage error) metric.
+
+```{r}
+# compute forecasts, again parallelised by dataset:
+# each fitted model set is paired with the test set at the same position
+oj_fcast <- parallel::clusterMap(
+    cl,
+    function(models, test_set) forecast(models, test_set),
+    oj_modelset,
+    oj_test
+)
+
+# the workers are no longer needed
+parallel::stopCluster(cl)
+
+# compute GOF statistics
+# actual values from all test sets, stacked into one table
+orig <- do.call(rbind, oj_test) %>%
+    select(store, brand, week, logmove)
+
+# forecasts for all test sets, reshaped to one column per model
+fcast <- do.call(rbind, oj_fcast) %>%
+    as_tibble() %>%
+    select(store, brand, week, .model, logmove) %>%
+    pivot_wider(id_cols=c(store, brand, week), names_from=.model, values_from=logmove)
+
+# join forecasts to actuals on explicit keys, rather than on whatever columns
+# the two tables happen to share (which also emits a "Joining, by" message)
+full_join(fcast, orig, by=c("store", "brand", "week")) %>%
+    summarise(
+        # MAPE takes the forecast error first and the actual values second
+        mean=MAPE(mean - logmove, logmove),
+        naive=MAPE(naive - logmove, logmove),
+        drift=MAPE(drift - logmove, logmove),
+        arima=MAPE(arima - logmove, logmove),
+        ets=MAPE(ets - logmove, logmove)
+    )
+```
--- a/R/orange_juice/02_simplemodels.nb.html
+++ b/R/orange_juice/02_simplemodels.nb.html
--- a/R/orange_juice/README.md
+++ b/R/orange_juice/README.md
@ -0,0 +1,22 @@
+## Orange juice dataset
+
+You'll need the following packages to run the notebooks in this directory:
+
+- bayesm (the source of the data)
+- dplyr
+- tidyr
+- jsonlite
+- tsibble
+- fable
+- fabletools
+- feasts
+
+The easiest way to install them is to run
+
+```r
+install.packages("bayesm")
+install.packages("tidyverse") # installs all tidyverse packages
+install.packages("fable")     # installs other tidyverts packages as dependencies
+install.packages("feasts")
+```
+
--- a/R/orange_juice/ojdata_forecast_settings.json
+++ b/R/orange_juice/ojdata_forecast_settings.json
@ -0,0 +1,6 @@
+{
+    "STEP": 2,
+    "TRAIN_START_WEEK": 40,
+    "TRAIN_END_WEEK": [134, 158],
+    "START_DATE": "1989-09-14"
+}
--- a/examples/ojdata_forecast_settings.json
+++ b/examples/ojdata_forecast_settings.json
@ -1,10 +0,0 @@
-{
-    "NUM_ROUNDS": 1,
-    "PRED_HORIZON": 3,
-    "PRED_STEPS": 2,
-    "TRAIN_START_WEEK": 40,
-    "TRAIN_END_WEEK_LIST": [134, 158, 2],
-    "TEST_START_WEEK_LIST": [135, 159, 2],
-    "TEST_END_WEEK_LIST": [136, 160, 2],
-    "FIRST_WEEK_START": "1989-09-14"
-}