1 What drives you to build model?

All models are wrong, but some are useful.

George Box

2 What did you find from Homework 4?

2.1 Check raw data

if (!require("forecast")){install.packages("forecast")}
Walmart <- readxl::read_xlsx(path = "data/Walmart_2.xlsx")
Walmart

# Store the data in a conveniently named variable
Walmart_Sales <- ts(Walmart[,3], start = c(2003, 1), frequency = 4)

2.2 Explore the components

# First, plot the time series and a seasonal plot
plot(Walmart_Sales)

seasonplot(Walmart_Sales, year.labels = T)

2.3 Fit additive and multiplicative Holt-Winter models

# The time series is clearly trended and seasonal, and the seasonality is multiplicative
fitsample <- window(Walmart_Sales, end=c(2014,4))
holdout <- window(Walmart_Sales, start=c(2015,1))

# Develop seasonal models
fit <- vector("list",2)
# Addtive
fit[[1]] <- ets(fitsample,model="AAA",damped=FALSE)
# Multiplicative
fit[[2]] <- ets(fitsample,model="MAM",damped=FALSE)

2.4 Evaluate fitsample performance

# Get MSE of models 
# (note: they have the same number of parameters and 
# are estimated on the same sample,
# therefore using in-sample MSE to compare them is valid)
# One way:
# MSE <- unlist(lapply(fit,function(x){x$mse}))
# names(MSE) <- c("Additive","Multiplicative")
# print(MSE)

# Another way:
MSE1 <- fit[[1]]$mse
MSE2 <- fit[[2]]$mse
MSE <- data.frame(Additive=MSE1, Multiplicative=MSE2)
rownames(MSE) <- "MSE"
MSE

# MSE suggests to use the additive model, right?

2.5 Evaluate hold-out sample performance

# Let us validate that on holdout errors
frc <- array(NA,c(length(holdout),length(fit)),dimnames=list(time(holdout),names(MSE)))
for (i in 1:2){
  frc[,i] <- forecast(fit[[i]],h=length(holdout))$mean
}
# calculate errors
e <- matrix(rep(holdout,2),ncol=2) - frc
RMSE <- sqrt(apply(e^2,2,mean))
MAE <- apply(abs(e),2,mean)
RMSE_MAE <- data.frame(rbind(RMSE, MAE))
RMSE_MAE

# The MAE result agree, the additive model is best

2.6 Model selection

# Information Criteria
ICs <- data.frame(Additive=c(fit[[1]]$aic, fit[[1]]$aicc, fit[[1]]$bic),
           Multiplicative=c(fit[[2]]$aic, fit[[2]]$aicc, fit[[2]]$bic))
rownames(ICs) <- c("AIC", "AICc", "BIC")
ICs

2.7 Do you believe these two models are good enough?

2.7.1 YES!

Give me your reasons.

2.7.2 NO!

Why?

3 Validation of Model Assumptions

3.1 Is the expected value of each error term is zero? (Are errors related? Constant Variance?)

p1 <- qplot(y=fit[[1]]$residuals, ylab = "Residuals(Additive)")
p2 <- p<-ggplot(data = fit[[1]]$residuals, aes(x=fit[[1]]$residuals)) +
  xlab("Residuals(Additive)") + geom_histogram()
p3 <- qplot(y=fit[[2]]$residuals, ylab = "Residuals(Multiplicative)")
p4 <- p<-ggplot(data = fit[[2]]$residuals, aes(x=fit[[2]]$residuals)) +
  xlab("Residuals(Multiplicative)") + geom_histogram()
grid.draw(
  arrangeGrob(p1, p2, p3, p4,
    nrow = 2, top = "Residuals Diagnostics"
))

3.2 How about the hold-out sample?

e_Add <- ts(e[,1], start=c(2015,1), frequency = 4)
e_Mult <- ts(e[,2], start=c(2015,1), frequency = 4)
grid.draw(
  arrangeGrob(autoplot(e_Add)+ylab("Residuals(Additive)"), 
              autoplot(e_Mult)+ylab("Residuals(Multiplicative)"),
    nrow = 1, top = "Hold-out Residuals Diagnostics"
))

data.frame(Yt=e_Add[-1], Yt_1=e_Add[-length(e_Add)])

cor(e_Add[-1], e_Add[-length(e_Add)], use = "pairwise.complete.obs")

## [1] 0.9643207

4 Chapter 6

4.1 Check the Sample Autocorrelation Function(ACF) & PACF

# Some additional handy functions for building ARIMA models
# ACF and PACF plots can be produced using the functions acf and pacf
forecast::ggtsdisplay(Walmart_Sales,lag.max=30, plot.type = c("scatter"))

# We can calculate differences with the function diff
d1y <- diff(Walmart_Sales,1)
plot(d1y)

# forecast::ggtsdisplay(d1y,lag.max=30, plot.ty = c("partial"))
# fit_test <- auto.arima(d1y)

4.2 Let’s improve the forecasting by ARIMA

fit_ARIMA <- auto.arima(fitsample)

frc_ARIMA <- forecast(fit_ARIMA, h=length(holdout))$mean
cbind(holdout, frc, frc_ARIMA)

##         holdout frc.Additive frc.Multiplicative frc_ARIMA
## 2015 Q1   114.2     116.2300           116.0815  117.5340
## 2015 Q2   119.3     121.0544           119.4586  119.8878
## 2015 Q3   118.1     120.1729           119.2284  119.5271
## 2015 Q4   130.7     133.5632           134.4199  133.7297
## 2016 Q1   114.0     121.0584           121.0825  121.3568
## 2016 Q2   119.3     125.8829           124.5506  124.1712
## 2016 Q3   116.6     125.0014           124.2573  123.5463
## 2016 Q4   128.7     138.3916           140.0307  137.7462
## 2017 Q1   115.0     125.8869           126.0842  126.1064
## 2017 Q2   119.4     130.7114           129.6432  128.7893
## 2017 Q3   117.2     129.8298           129.2868  128.4064
## 2017 Q4   129.8     143.2201           145.6422  142.6927
## 2018 Q1   116.5     130.7154           131.0864  130.7305
## 2018 Q2   121.9     135.5399           134.7365  133.5460
## 2018 Q3   122.1     134.6583           134.3169  133.0856
## 2018 Q4   135.2     148.0486           151.2544  147.3703
## 2019 Q1   121.6     135.5438           136.0893  135.6202
## 2019 Q2   127.1     140.3683           139.8303  138.3971

# Calculate error metrics
e_ARIMA <- holdout - frc_ARIMA
# RMSE & MAE
RMSE_ARIMA <- sqrt(apply(e_ARIMA^2,2,mean))
MAE_ARIMA <- apply(abs(e_ARIMA),2,mean)
data.frame(RMSE_MAE, ARIMA=c(RMSE_ARIMA,MAE_ARIMA))

# Various model diagnostics can be easily done
# The function tsdiag is quite useful
tsdiag(fit_ARIMA)

# To extract residuals from an ARIMA fit we can use the function residuals
resid_ARIMA <- residuals(fit_ARIMA)
# From this we can construct various graphs
hist(resid_ARIMA)

qqnorm(resid_ARIMA)
qqline(resid_ARIMA)

4.3 GDP Growth Rate Example

4.3.1 Load data

GDP_change_2.xlsx

GDP <- readxl::read_xlsx(path = "data/GDP_change_2.xlsx")
GDP_Change <- GDP$`GDP Change`

# Store the data in a conveniently named variable
y <- ts(GDP_Change, start=1963, frequency = 1)
n <- length(y)

# Let us find how long is the evaluation set
test <- window(y, start=2001)
m <- length(test)

# We will rely on the function auto,arima to specify the model automatically from the forecast package.
# auto.arima relies on testing the serues for differencing and then using information criteria to choose the model orders.
# For manual building the functions Arima (forecast package) and arima (stats package) are useful.
if (!require("forecast")){install.packages("forecast")}

# Produce forecasts
frc <- array(NA,c(m,1),dimnames=list(time(test),"ARIMA"))
for (i in 1:m){
  # We run a rolling origin evaluation. At each iteration (i) we need to provide the current fitting sample
  # This must be a ts class object for the forecast package functions to work properly
  fitsample <- y[1:(n-m+i-1)]
  fitsample <- ts(fitsample,frequency=frequency(y),start=start(y))
  # Fit ARIMA
  fit <- auto.arima(fitsample)
  # Produce the 1-step ahead forecast - the function and syntax is the same as for ets
  frc[i,1] <- forecast(fit,h=1)$mean
}

# To simplify plotting the results we convert frc into a ts object
frc <- ts(frc,frequency=frequency(test),start=start(test))

# Produce plot
if (!require("RColorBrewer")){install.packages("RColorBrewer")}
cmp <- brewer.pal(3,"Set1")

plot(y,ylab="GDP change")
lines(frc,col=cmp[1])
legend("topright",c("Data",colnames(frc)),col=c("black",cmp),lty=1)

# Calculate error metrics
e <- test - frc
# RMSE & MAE
RMSE <- sqrt(apply(e^2,2,mean))
MAE <- apply(abs(e),2,mean)
E <- rbind(RMSE,MAE)
print(round(E,3))

##        frc
## RMSE 2.242
## MAE  1.633

# Various model diagnostics can be easily done
# The function tsdiag is quite useful
tsdiag(fit)

# To extract residuals from an ARIMA fit we can use the function residuals
resid <- residuals(fit)
# From this we can construct various graphs
hist(resid)

qqnorm(resid)
qqline(resid)

5 Summary

go to top

Model Diagnostics and HW4 Discussion

1 What drives you to build model?

2 What did you find from Homework 4?

2.1 Check raw data

2.2 Explore the components

2.3 Fit additive and multiplicative Holt-Winter models

2.4 Evaluate fitsample performance

2.5 Evaluate hold-out sample performance

2.6 Model selection

2.7 Do you believe these two models are good enough?

2.7.1 YES!

2.7.2 NO!

3 Validation of Model Assumptions

3.1 Is the expected value of each error term is zero? (Are errors related? Constant Variance?)

3.2 How about the hold-out sample?

4 Chapter 6

4.1 Check the Sample Autocorrelation Function(ACF) & PACF

4.2 Let’s improve the forecasting by ARIMA

4.3 GDP Growth Rate Example

4.3.1 Load data

5 Summary