1 What drives you to build model?

All models are wrong, but some are useful.

  • George Box

2 What did you find from Homework 4?

2.1 Check raw data

if (!require("forecast")){install.packages("forecast")}
Walmart <- readxl::read_xlsx(path = "data/Walmart_2.xlsx")
Walmart
# Store the data in a conveniently named variable
Walmart_Sales <- ts(Walmart[,3], start = c(2003, 1), frequency = 4)

2.2 Explore the components

# First, plot the time series and a seasonal plot
plot(Walmart_Sales)

seasonplot(Walmart_Sales, year.labels = T)

2.3 Fit additive and multiplicative Holt-Winter models

# The time series is clearly trended and seasonal, and the seasonality is multiplicative
fitsample <- window(Walmart_Sales, end=c(2014,4))
holdout <- window(Walmart_Sales, start=c(2015,1))

# Develop seasonal models
fit <- vector("list",2)
# Addtive
fit[[1]] <- ets(fitsample,model="AAA",damped=FALSE)
# Multiplicative
fit[[2]] <- ets(fitsample,model="MAM",damped=FALSE)

2.4 Evaluate fitsample performance

# Get MSE of models 
# (note: they have the same number of parameters and 
# are estimated on the same sample,
# therefore using in-sample MSE to compare them is valid)
# One way:
# MSE <- unlist(lapply(fit,function(x){x$mse}))
# names(MSE) <- c("Additive","Multiplicative")
# print(MSE)

# Another way:
MSE1 <- fit[[1]]$mse
MSE2 <- fit[[2]]$mse
MSE <- data.frame(Additive=MSE1, Multiplicative=MSE2)
rownames(MSE) <- "MSE"
MSE
# MSE suggests to use the additive model, right?

2.5 Evaluate hold-out sample performance

# Let us validate that on holdout errors
frc <- array(NA,c(length(holdout),length(fit)),dimnames=list(time(holdout),names(MSE)))
for (i in 1:2){
  frc[,i] <- forecast(fit[[i]],h=length(holdout))$mean
}
# calculate errors
e <- matrix(rep(holdout,2),ncol=2) - frc
RMSE <- sqrt(apply(e^2,2,mean))
MAE <- apply(abs(e),2,mean)
RMSE_MAE <- data.frame(rbind(RMSE, MAE))
RMSE_MAE
# The MAE result agree, the additive model is best

2.6 Model selection

# Information Criteria
ICs <- data.frame(Additive=c(fit[[1]]$aic, fit[[1]]$aicc, fit[[1]]$bic),
           Multiplicative=c(fit[[2]]$aic, fit[[2]]$aicc, fit[[2]]$bic))
rownames(ICs) <- c("AIC", "AICc", "BIC")
ICs

2.7 Do you believe these two models are good enough?

2.7.1 YES!

Give me your reasons.

2.7.2 NO!

Why?

3 Validation of Model Assumptions

3.2 How about the hold-out sample?

e_Add <- ts(e[,1], start=c(2015,1), frequency = 4)
e_Mult <- ts(e[,2], start=c(2015,1), frequency = 4)
grid.draw(
  arrangeGrob(autoplot(e_Add)+ylab("Residuals(Additive)"), 
              autoplot(e_Mult)+ylab("Residuals(Multiplicative)"),
    nrow = 1, top = "Hold-out Residuals Diagnostics"
))

data.frame(Yt=e_Add[-1], Yt_1=e_Add[-length(e_Add)])
cor(e_Add[-1], e_Add[-length(e_Add)], use = "pairwise.complete.obs")
## [1] 0.9643207

4 Chapter 6

4.1 Check the Sample Autocorrelation Function(ACF) & PACF

# Some additional handy functions for building ARIMA models
# ACF and PACF plots can be produced using the functions acf and pacf
forecast::ggtsdisplay(Walmart_Sales,lag.max=30, plot.type = c("scatter"))

# We can calculate differences with the function diff
d1y <- diff(Walmart_Sales,1)
plot(d1y)

# forecast::ggtsdisplay(d1y,lag.max=30, plot.ty = c("partial"))
# fit_test <- auto.arima(d1y)

4.2 Let’s improve the forecasting by ARIMA

fit_ARIMA <- auto.arima(fitsample)

frc_ARIMA <- forecast(fit_ARIMA, h=length(holdout))$mean
cbind(holdout, frc, frc_ARIMA)
##         holdout frc.Additive frc.Multiplicative frc_ARIMA
## 2015 Q1   114.2     116.2300           116.0815  117.5340
## 2015 Q2   119.3     121.0544           119.4586  119.8878
## 2015 Q3   118.1     120.1729           119.2284  119.5271
## 2015 Q4   130.7     133.5632           134.4199  133.7297
## 2016 Q1   114.0     121.0584           121.0825  121.3568
## 2016 Q2   119.3     125.8829           124.5506  124.1712
## 2016 Q3   116.6     125.0014           124.2573  123.5463
## 2016 Q4   128.7     138.3916           140.0307  137.7462
## 2017 Q1   115.0     125.8869           126.0842  126.1064
## 2017 Q2   119.4     130.7114           129.6432  128.7893
## 2017 Q3   117.2     129.8298           129.2868  128.4064
## 2017 Q4   129.8     143.2201           145.6422  142.6927
## 2018 Q1   116.5     130.7154           131.0864  130.7305
## 2018 Q2   121.9     135.5399           134.7365  133.5460
## 2018 Q3   122.1     134.6583           134.3169  133.0856
## 2018 Q4   135.2     148.0486           151.2544  147.3703
## 2019 Q1   121.6     135.5438           136.0893  135.6202
## 2019 Q2   127.1     140.3683           139.8303  138.3971
# Calculate error metrics
e_ARIMA <- holdout - frc_ARIMA
# RMSE & MAE
RMSE_ARIMA <- sqrt(apply(e_ARIMA^2,2,mean))
MAE_ARIMA <- apply(abs(e_ARIMA),2,mean)
data.frame(RMSE_MAE, ARIMA=c(RMSE_ARIMA,MAE_ARIMA))
# Various model diagnostics can be easily done
# The function tsdiag is quite useful
tsdiag(fit_ARIMA)