1 Generalized Additive Model

There are two common implementations of GAMs in R. The older version (originally made for S-PLUS) is available as the ‘gam’ package by Hastie and Tibshirani. The newer version that we will use below is the ‘mgcv’ package from Simon Wood. The basic modeling procedure for both packages is similar (the function is gam for both; be wary of having both libraries loaded at the same time), but the behind-the-scenes computational approaches differ, as do the arguments for optimization and the model output. Expect the results to be slightly different when used with the same model structure on the same dataset.

1.1 GAM on Boston Housing dataset

library(MASS)
set.seed(1234)
sample_index <- sample(nrow(Boston),nrow(Boston)*0.70)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
str(Boston_train)

Model and plots

library(mgcv)

#create gam model
Boston_gam <- gam(medv ~ s(crim)+s(zn)+s(indus)+chas+s(nox)
                 +s(rm)+s(age)+s(dis)+rad+s(tax)+s(ptratio)
                 +s(black)+s(lstat),data=Boston_train)

summary(Boston_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) + 
##     s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.9672     1.4638  12.958   <2e-16 ***
## chas          1.0098     0.7104   1.421   0.1562    
## rad           0.3454     0.1505   2.296   0.0224 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F  p-value    
## s(crim)    4.543  5.488  7.436 6.38e-07 ***
## s(zn)      1.000  1.000  0.973 0.324644    
## s(indus)   6.701  7.629  4.124 0.000171 ***
## s(nox)     8.946  8.995 17.931  < 2e-16 ***
## s(rm)      7.792  8.605 19.944  < 2e-16 ***
## s(age)     2.415  3.058  1.302 0.282947    
## s(dis)     8.742  8.975  8.895 4.36e-12 ***
## s(tax)     6.122  7.145  7.091 7.01e-08 ***
## s(ptratio) 1.000  1.000 21.257 5.90e-06 ***
## s(black)   1.000  1.000  0.002 0.963662    
## s(lstat)   6.362  7.484 18.551  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.893   Deviance explained =   91%
## GCV = 10.713  Scale est. = 8.9693    n = 354
plot(Boston_gam, pages=1)

Model AIC/BIC and mean residual deviance

AIC(Boston_gam)
BIC(Boston_gam)
Boston_gam$deviance

In-sample fit performance

#in-sample mse using df 
Boston_gam.mse.train <- Boston_gam$dev/Boston_gam$df.residual 
#Average Sum of Squared Error
Boston_gam.mse.train <- Boston_gam$dev/nrow(Boston_train) 

#using the predict() function
pi <- predict(Boston_gam,Boston_train)
mean((pi - Boston_train$medv)^2)
## [1] 7.509325

out of sample performance

pi.out <- predict(Boston_gam,Boston_test)
mean((pi.out - Boston_test$medv)^2)
## [1] 14.04571

go to top

1.2 GAM on Bankruptcy dataset

Bank_data <- read.csv(file = "https://xiaoruizhu.github.io/Data-Mining-R/lecture/data/bankruptcy.csv", header=T)
# summary(Bank_data)

sample_index <- sample(nrow(Bank_data),nrow(Bank_data)*0.70)
Bank_train <- Bank_data[sample_index,]
Bank_test <- Bank_data[-sample_index,]