Merge pull request #60 from microsoft/hongooi/dev
R model init
Former-commit-id: 512adf8220
This commit is contained in:
Коммит
8146fe1701
|
@ -4,3 +4,4 @@
|
|||
.vscode/
|
||||
|
||||
ojdata/*
|
||||
*.Rdata
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: No
|
||||
SaveWorkspace: No
|
||||
AlwaysSaveHistory: No
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 4
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: knitr
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
---
|
||||
title: Data preparation
|
||||
output: html_notebook
|
||||
---
|
||||
|
||||
```{r, echo=FALSE, results="hide", message=FALSE}
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(tsibble)
|
||||
library(feasts)
|
||||
library(fable)
|
||||
```
|
||||
|
||||
In this notebook, we generate the datasets that will be used for model training and validation. The experiment parameters are read from the file `ojdata_forecast_settings.json`; you can modify that file to vary the experimental setup, or just edit the values in this notebook.
|
||||
|
||||
The orange juice dataset comes from the bayesm package, and gives pricing and sales figures over time for a variety of orange juice brands in several stores in Florida.
|
||||
|
||||
```{r}
|
||||
# Read the experiment parameters.
settings <- jsonlite::fromJSON("ojdata_forecast_settings.json")

# End-of-training week for each training set: from the first to the second
# value of TRAIN_END_WEEK, in increments of STEP.
# The key is spelled out in full: the abbreviated `settings$TRAIN_END` only
# resolved through `$` partial matching on lists, which stops resolving as soon
# as a second key starting with "TRAIN_END" appears in the settings file.
train_periods <- seq(settings$TRAIN_END_WEEK[1], settings$TRAIN_END_WEEK[2], settings$STEP)

# Date that the week offsets used below are counted from.
start_date <- as.Date(settings$START_DATE)
|
||||
|
||||
# Load the raw orange juice data shipped with bayesm.
data(orangeJuice, package="bayesm")

# Build the weekly series in explicit steps:
# 1. complete() adds a row for every store/brand/week combination, so that all
#    store/brand series cover the same set of weeks
oj_data <- complete(orangeJuice$yx, store, brand, week)
# 2. turn the numeric week offset into a calendar year-week
oj_data <- mutate(oj_data, week=yearweek(start_date + week*7))
# 3. declare the time index and the series keys
oj_data <- as_tsibble(oj_data, index=week, key=c(store, brand))
# 4. fill in the values that are missing in the rows added in step 1
oj_data <- fill(oj_data, everything())
|
||||
|
||||
# Return the rows of oj_data whose week lies between two week offsets (inclusive).
# start, end: numeric week offsets, counted in weeks from start_date.
subset_oj_data <- function(start, end)
{
    first_week <- yearweek(start_date + start*7)
    last_week <- yearweek(start_date + end*7)
    filter(oj_data, week >= first_week & week <= last_week)
}
|
||||
|
||||
# One training set per entry of train_periods: every set starts at
# TRAIN_START_WEEK and ends at that entry.
oj_train <- lapply(train_periods, function(end_week)
{
    subset_oj_data(settings$TRAIN_START_WEEK, end_week)
})

# The matching test set covers the STEP weeks that follow the end of training.
oj_test <- lapply(train_periods, function(end_week)
{
    subset_oj_data(end_week + 1, end_week + settings$STEP)
})

# Store both lists for use by the modelling notebooks.
save(oj_train, oj_test, file="oj_data.Rdata")
|
||||
```
|
||||
|
||||
Here are some glimpses of what the data looks like. The dependent variable is `logmove`, the logarithm of the total sales for a given brand and store, in a particular week. The variables starting with `price` are the sales prices of each brand in that week.
|
||||
|
||||
```{r}
|
||||
# first rows of the first training set
oj_train[[1]] %>% head()

# first rows of the first test set
oj_test[[1]] %>% head()
|
||||
```
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,76 @@
|
|||
---
|
||||
title: Simple models
|
||||
output: html_notebook
|
||||
encoding: utf8
|
||||
---
|
||||
|
||||
```{r, echo=FALSE, results="hide", message=FALSE}
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(tsibble)
|
||||
library(feasts)
|
||||
library(fable)
|
||||
```
|
||||
|
||||
We fit some simple models to the orange juice data. One model is fit for each combination of store and brand.
|
||||
|
||||
- `mean`: This is just a simple mean.
|
||||
- `naive`: A random walk model without any other components. This amounts to setting all forecast values to the last observed value.
|
||||
- `drift`: This adjusts the `naive` model to incorporate a trend.
|
||||
- `arima`: An ARIMA model with the parameter values estimated from the data.
|
||||
- `ets`: An exponential smoothing model, again with parameter values estimated from the data.
|
||||
|
||||
```{r}
|
||||
# Load the oj_train and oj_test lists saved by the data preparation notebook.
load("oj_data.Rdata")

# train the models in parallel
# Leave two physical cores free, but always use at least 2 workers.
# detectCores() returns NA when the core count cannot be determined; na.rm=TRUE
# makes that case fall back to 2 instead of passing NA on to makeCluster().
ncores <- max(2, parallel::detectCores(logical=FALSE) - 2, na.rm=TRUE)
cl <- parallel::makeCluster(ncores)

# Attach the modelling packages on every worker.
invisible(parallel::clusterEvalQ(cl,
{
    library(feasts)
    library(fable)
    library(tsibble)
}))
|
||||
|
||||
# There are several training sets, so the work is split by dataset: each call
# fits all five models to one training set.
oj_modelset <- parallel::parLapply(cl=cl, X=oj_train, fun=function(train_df)
{
    model(train_df,
        mean=MEAN(logmove),
        naive=NAIVE(logmove),
        drift=RW(logmove ~ drift()),
        arima=ARIMA(logmove),
        ets=ETS(logmove ~ error("A") + trend("A") + season("N"))
    )
})

# peek at the models fitted to the first training set
oj_modelset[[1]] %>% head()
|
||||
```
|
||||
|
||||
Having fit the models, let's examine how well they forecast the test data, using the MAPE (mean absolute percentage error) metric.
|
||||
|
||||
```{r}
|
||||
# Forecast each test set with the models fitted to the matching training set,
# again one dataset per parallel task.
oj_fcast <- parallel::clusterMap(cl,
    function(fit, test_df) forecast(fit, new_data=test_df),
    oj_modelset,
    oj_test)

# The workers are not needed after this point.
parallel::stopCluster(cl)
|
||||
|
||||
# compute GOF statistics

# Actual values for all test periods, stacked into a single table.
orig <- do.call(rbind, oj_test) %>%
    select(store, brand, week, logmove)

# Forecasts in wide form: one row per store/brand/week, one column per model.
# NOTE(review): this reads the forecast values from the `logmove` column of the
# forecast() output -- confirm this matches the installed fable version.
fcast <- do.call(rbind, oj_fcast) %>%
    as_tibble() %>%
    select(store, brand, week, .model, logmove) %>%
    pivot_wider(id_cols=c(store, brand, week), names_from=.model, values_from=logmove)

# Join forecasts to actuals on explicit keys (the columns the two tables share),
# so the join does not depend on implicit column matching and emits no
# "Joining, by = ..." message. Then compute one MAPE per model over all rows.
full_join(fcast, orig, by=c("store", "brand", "week")) %>%
    summarise(
        mean=MAPE(mean - logmove, logmove),
        naive=MAPE(naive - logmove, logmove),
        drift=MAPE(drift - logmove, logmove),
        arima=MAPE(arima - logmove, logmove),
        ets=MAPE(ets - logmove, logmove)
    )
|
||||
```
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,22 @@
|
|||
## Orange juice dataset
|
||||
|
||||
You'll need the following packages to run the notebooks in this directory:
|
||||
|
||||
- bayesm (the source of the data)
|
||||
- dplyr
|
||||
- tidyr
|
||||
- jsonlite
|
||||
- tsibble
|
||||
- fable
|
||||
- fabletools
|
||||
- feasts
|
||||
|
||||
The easiest way to install them is to run
|
||||
|
||||
```r
|
||||
install.packages("bayesm")
|
||||
install.packages("tidyverse") # installs all tidyverse packages
|
||||
install.packages("fable") # installs other tidyverts packages as dependencies
|
||||
install.packages("feasts")
|
||||
```
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"STEP": 2,
|
||||
"TRAIN_START_WEEK": 40,
|
||||
"TRAIN_END_WEEK": [134, 158],
|
||||
"START_DATE": "1989-09-14"
|
||||
}
|
|
@ -1,10 +0,0 @@
|
|||
{
|
||||
"NUM_ROUNDS": 1,
|
||||
"PRED_HORIZON": 3,
|
||||
"PRED_STEPS": 2,
|
||||
"TRAIN_START_WEEK": 40,
|
||||
"TRAIN_END_WEEK_LIST": [134, 158, 2],
|
||||
"TEST_START_WEEK_LIST": [135, 159, 2],
|
||||
"TEST_END_WEEK_LIST": [136, 160, 2],
|
||||
"FIRST_WEEK_START": "1989-09-14"
|
||||
}
|
Загрузка…
Ссылка в новой задаче