All models are wrong, but some are useful.
- George Box
if (!require("forecast")){install.packages("forecast")}
Walmart <- readxl::read_xlsx(path = "data/Walmart_2.xlsx")
Walmart
# Store the data in a conveniently named variable
Walmart_Sales <- ts(Walmart[,3], start = c(2003, 1), frequency = 4)
# First, plot the time series and a seasonal plot
plot(Walmart_Sales)
seasonplot(Walmart_Sales, year.labels = T)
# The time series is clearly trended and seasonal, and the seasonality is multiplicative
fitsample <- window(Walmart_Sales, end=c(2014,4))
holdout <- window(Walmart_Sales, start=c(2015,1))
# Develop seasonal models
fit <- vector("list",2)
# Addtive
fit[[1]] <- ets(fitsample,model="AAA",damped=FALSE)
# Multiplicative
fit[[2]] <- ets(fitsample,model="MAM",damped=FALSE)
# Get MSE of models
# (note: they have the same number of parameters and
# are estimated on the same sample,
# therefore using in-sample MSE to compare them is valid)
# One way:
# MSE <- unlist(lapply(fit,function(x){x$mse}))
# names(MSE) <- c("Additive","Multiplicative")
# print(MSE)
# Another way:
MSE1 <- fit[[1]]$mse
MSE2 <- fit[[2]]$mse
MSE <- data.frame(Additive=MSE1, Multiplicative=MSE2)
rownames(MSE) <- "MSE"
MSE
# MSE suggests to use the additive model, right?
# Let us validate that on holdout errors
frc <- array(NA,c(length(holdout),length(fit)),dimnames=list(time(holdout),names(MSE)))
for (i in 1:2){
frc[,i] <- forecast(fit[[i]],h=length(holdout))$mean
}
# calculate errors
e <- matrix(rep(holdout,2),ncol=2) - frc
RMSE <- sqrt(apply(e^2,2,mean))
MAE <- apply(abs(e),2,mean)
RMSE_MAE <- data.frame(rbind(RMSE, MAE))
RMSE_MAE
# The MAE result agree, the additive model is best
# Information Criteria
ICs <- data.frame(Additive=c(fit[[1]]$aic, fit[[1]]$aicc, fit[[1]]$bic),
Multiplicative=c(fit[[2]]$aic, fit[[2]]$aicc, fit[[2]]$bic))
rownames(ICs) <- c("AIC", "AICc", "BIC")
ICs
Give me your reasons.
Why?
e_Add <- ts(e[,1], start=c(2015,1), frequency = 4)
e_Mult <- ts(e[,2], start=c(2015,1), frequency = 4)
grid.draw(
arrangeGrob(autoplot(e_Add)+ylab("Residuals(Additive)"),
autoplot(e_Mult)+ylab("Residuals(Multiplicative)"),
nrow = 1, top = "Hold-out Residuals Diagnostics"
))
data.frame(Yt=e_Add[-1], Yt_1=e_Add[-length(e_Add)])
cor(e_Add[-1], e_Add[-length(e_Add)], use = "pairwise.complete.obs")
## [1] 0.9643207
# Some additional handy functions for building ARIMA models
# ACF and PACF plots can be produced using the functions acf and pacf
forecast::ggtsdisplay(Walmart_Sales,lag.max=30, plot.type = c("scatter"))
# We can calculate differences with the function diff
d1y <- diff(Walmart_Sales,1)
plot(d1y)
# forecast::ggtsdisplay(d1y,lag.max=30, plot.ty = c("partial"))
# fit_test <- auto.arima(d1y)
fit_ARIMA <- auto.arima(fitsample)
frc_ARIMA <- forecast(fit_ARIMA, h=length(holdout))$mean
cbind(holdout, frc, frc_ARIMA)
## holdout frc.Additive frc.Multiplicative frc_ARIMA
## 2015 Q1 114.2 116.2300 116.0815 117.5340
## 2015 Q2 119.3 121.0544 119.4586 119.8878
## 2015 Q3 118.1 120.1729 119.2284 119.5271
## 2015 Q4 130.7 133.5632 134.4199 133.7297
## 2016 Q1 114.0 121.0584 121.0825 121.3568
## 2016 Q2 119.3 125.8829 124.5506 124.1712
## 2016 Q3 116.6 125.0014 124.2573 123.5463
## 2016 Q4 128.7 138.3916 140.0307 137.7462
## 2017 Q1 115.0 125.8869 126.0842 126.1064
## 2017 Q2 119.4 130.7114 129.6432 128.7893
## 2017 Q3 117.2 129.8298 129.2868 128.4064
## 2017 Q4 129.8 143.2201 145.6422 142.6927
## 2018 Q1 116.5 130.7154 131.0864 130.7305
## 2018 Q2 121.9 135.5399 134.7365 133.5460
## 2018 Q3 122.1 134.6583 134.3169 133.0856
## 2018 Q4 135.2 148.0486 151.2544 147.3703
## 2019 Q1 121.6 135.5438 136.0893 135.6202
## 2019 Q2 127.1 140.3683 139.8303 138.3971
# Calculate error metrics
e_ARIMA <- holdout - frc_ARIMA
# RMSE & MAE
RMSE_ARIMA <- sqrt(apply(e_ARIMA^2,2,mean))
MAE_ARIMA <- apply(abs(e_ARIMA),2,mean)
data.frame(RMSE_MAE, ARIMA=c(RMSE_ARIMA,MAE_ARIMA))
# Various model diagnostics can be easily done
# The function tsdiag is quite useful
tsdiag(fit_ARIMA)