There are two common implementations of GAMs in R. The older version (originally made for S-PLUS) is available as the ‘gam’ package by Hastie and Tibshirani. The newer version that we will use below is the ‘mgcv’ package from Simon Wood. The basic modeling procedure for both packages is similar (the function is gam for both; be wary of having both libraries loaded at the same time), but the behind-the-scenes computational approaches differ, as do the arguments for optimization and the model output. Expect the results to be slightly different when used with the same model structure on the same dataset.
library(MASS)
set.seed(1234)
sample_index <- sample(nrow(Boston),nrow(Boston)*0.70)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
str(Boston_train)
Model and plots
library(mgcv)
#create gam model
Boston_gam <- gam(medv ~ s(crim)+s(zn)+s(indus)+chas+s(nox)
+s(rm)+s(age)+s(dis)+rad+s(tax)+s(ptratio)
+s(black)+s(lstat),data=Boston_train)
summary(Boston_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) +
## s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.9672 1.4638 12.958 <2e-16 ***
## chas 1.0098 0.7104 1.421 0.1562
## rad 0.3454 0.1505 2.296 0.0224 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 4.543 5.488 7.436 6.38e-07 ***
## s(zn) 1.000 1.000 0.973 0.324644
## s(indus) 6.701 7.629 4.124 0.000171 ***
## s(nox) 8.946 8.995 17.931 < 2e-16 ***
## s(rm) 7.792 8.605 19.944 < 2e-16 ***
## s(age) 2.415 3.058 1.302 0.282947
## s(dis) 8.742 8.975 8.895 4.36e-12 ***
## s(tax) 6.122 7.145 7.091 7.01e-08 ***
## s(ptratio) 1.000 1.000 21.257 5.90e-06 ***
## s(black) 1.000 1.000 0.002 0.963662
## s(lstat) 6.362 7.484 18.551 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.893 Deviance explained = 91%
## GCV = 10.713 Scale est. = 8.9693 n = 354
plot(Boston_gam, pages=1)
Model AIC/BIC and mean residual deviance
AIC(Boston_gam)
BIC(Boston_gam)
Boston_gam$deviance
In-sample fit performance
#in-sample mse using df
Boston_gam.mse.train <- Boston_gam$dev/Boston_gam$df.residual
#Average Sum of Squared Error
Boston_gam.mse.train <- Boston_gam$dev/nrow(Boston_train)
#using the predict() function
pi <- predict(Boston_gam,Boston_train)
mean((pi - Boston_train$medv)^2)
## [1] 7.509325
out of sample performance
pi.out <- predict(Boston_gam,Boston_test)
mean((pi.out - Boston_test$medv)^2)
## [1] 14.04571
Bank_data <- read.csv(file = "https://xiaoruizhu.github.io/Data-Mining-R/lecture/data/bankruptcy.csv", header=T)
# summary(Bank_data)
sample_index <- sample(nrow(Bank_data),nrow(Bank_data)*0.70)
Bank_train <- Bank_data[sample_index,]
Bank_test <- Bank_data[-sample_index,]
Model and plots
Bank_gam <- gam(DLRSN ~ s(R1)+s(R2)+s(R3)+s(R4)+
s(R5)+s(R6)+s(R7)+s(R8)+s(R9)+s(R10), data=Bank_train)
summary(Bank_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DLRSN ~ s(R1) + s(R2) + s(R3) + s(R4) + s(R5) + s(R6) + s(R7) +
## s(R8) + s(R9) + s(R10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.139028 0.004601 30.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(R1) 1.000 1.000 0.553 0.4571
## s(R2) 4.227 5.204 25.941 < 2e-16 ***
## s(R3) 2.125 2.723 8.647 5.27e-05 ***
## s(R4) 1.000 1.000 5.893 0.0153 *
## s(R5) 6.630 7.743 2.379 0.0140 *
## s(R6) 4.501 5.568 22.453 < 2e-16 ***
## s(R7) 1.000 1.000 0.788 0.3749
## s(R8) 6.564 7.670 9.507 1.68e-12 ***
## s(R9) 4.046 5.069 14.335 5.31e-14 ***
## s(R10) 5.621 6.770 50.329 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.327 Deviance explained = 33.4%
## GCV = 0.08137 Scale est. = 0.080564 n = 3805
plot(Bank_gam, pages=1)
In-sample fit performance
The in-sample confusion matrix:
pcut_gam <- 1/36
prob_gam_in <-predict(Bank_gam, Bank_train, type="response")
pred_gam_in <- (prob_gam_in>=pcut_gam)*1
table(Bank_train$DLRSN, pred_gam_in, dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 1335 1941
## 1 17 512
The in-sample asymmetric cost is another thing you can check:
bankcost <- function(r, pi){
weight1 = 35
weight0 = 1
pcut <- weight0/(weight0+weight1)
c1 = (r==1)&(pi<pcut) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi>pcut) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
bankcost(Bank_train$DLRSN, pred_gam_in)
## [1] 0.6664915
ROC Curve:
library(ROCR)
pred <- prediction(c(prob_gam_in), Bank_train$DLRSN)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.8931203
Model AIC/BIC and mean residual deviance
AIC(Bank_gam)
## [1] 1253.97
BIC(Bank_gam)
## [1] 1495.704
#in-sample mean residual deviance using df
Bank_gam$dev/Bank_gam$df.residual
## [1] 0.08056376
Out-of-sample fit performance
The out-of-sample confusion matrix:
prob_gam_out <- predict(Bank_gam, Bank_test, type="response")
pred_gam_out <- (prob_gam_out>=pcut_gam)*1
table(Bank_test$DLRSN, pred_gam_out, dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 586 798
## 1 12 235
The asymmetric cost is:
bankcost(Bank_test$DLRSN, pred_gam_out)
## [1] 0.7467811
ROC Curve:
pred <- prediction(c(prob_gam_out), Bank_test$DLRSN)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.8753335
The Credit Card Default Data has 10800 observations and 23 predictive variables. The details of the data can be found at http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients. Think about what kind of factors could affect people to fail to pay their credit balance.
credit_data <- read.csv(file = "https://xiaoruizhu.github.io/Data-Mining-R/lecture/data/credit_default.csv", header=T)
# rename
library(dplyr)
credit_data<- rename(credit_data, default=default.payment.next.month)
# convert categorical data to factor
credit_data$SEX <- as.factor(credit_data$SEX)
credit_data$EDUCATION <- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE <- as.factor(credit_data$MARRIAGE)
Now split the data 90/10 as training/testing datasets:
index <- sample(nrow(credit_data),nrow(credit_data)*0.90)
credit_train = credit_data[index,]
credit_test = credit_data[-index,]
Some of these predictors are categorical variables and they will enter the gam()
model as partially linear terms. We only add flexible s()
function to those continuous predictor variables such as LIMIT_BAL, AGE etc. Here we will demonstrate using five continuous variables as smooth terms and three categorical variables SEX, EDUCATION, and MARRIAGE as partially linear terms to save the space of summary output.
## Create a formula for a model with a large number of variables:
gam_formula <- as.formula("default~s(LIMIT_BAL)+s(AGE)+s(PAY_0)+s(BILL_AMT1)+s(PAY_AMT1)+SEX+EDUCATION+MARRIAGE")
credit_gam <- gam(formula = gam_formula, family=binomial, data=credit_train);
summary(credit_gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## default ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) +
## SEX + EDUCATION + MARRIAGE
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.217750 0.067325 -18.088 < 2e-16 ***
## SEX2 -0.175662 0.053552 -3.280 0.00104 **
## EDUCATION2 -0.046877 0.061423 -0.763 0.44535
## EDUCATION3 -0.218309 0.083468 -2.615 0.00891 **
## EDUCATION4 -1.204359 0.306518 -3.929 8.52e-05 ***
## MARRIAGE2 -0.117430 0.060287 -1.948 0.05143 .
## MARRIAGE3 -0.007125 0.234567 -0.030 0.97577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(LIMIT_BAL) 5.382 6.283 133.792 < 2e-16 ***
## s(AGE) 1.330 1.595 3.418 0.094709 .
## s(PAY_0) 6.714 7.454 1207.921 < 2e-16 ***
## s(BILL_AMT1) 3.827 4.780 47.780 4.1e-09 ***
## s(PAY_AMT1) 5.199 6.402 26.454 0.000282 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.198 Deviance explained = 16.8%
## UBRE = -0.11427 Scale est. = 1 n = 10800
plot(credit_gam, shade=TRUE, seWithMean=TRUE, scale=0, pages = 1)
The function vis.gam()
can visualize the nonlinear relationship between two variables and the linear predictor in a 3D space as follows:
# vis.gam(credit_gam)
vis.gam(credit_gam, view=c("LIMIT_BAL","AGE"), theta= 140) # different view
In order to see the in-sample fit performance, you may look into the confusion matrix by using commands as following. We assume the cut-off probability as 1/6.
pcut_gam <- 1/6
prob_gam_in <-predict(credit_gam,credit_train,type="response")
pred_gam_in <- (prob_gam_in>=pcut_gam)*1
table(credit_train$default, pred_gam_in,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 5564 2841
## 1 694 1701
The asymmetric cost with 5:1 cost ratio is:
creditcost <- function(r, pi){
weight1 = 5
weight0 = 1
pcut <- weight0/(weight0+weight1)
c1 = (r==1)&(pi<pcut) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi>pcut) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
creditcost(credit_train$default, pred_gam_in)
## [1] 0.5843519
Training model AIC, BIC, and mean residual deviance:
AIC(credit_gam)
## [1] 9565.864
BIC(credit_gam)
## [1] 9780.494
# credit_gam$deviance
ROC Curve:
library(ROCR)
pred <- prediction(predictions = c(prob_gam_in), labels = credit_train$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7614636
pcut <- 1/6
prob_gam_out <- predict(credit_gam, credit_test,type="response")
pred_gam_out <- (prob_gam_out>=pcut)*1
table(credit_test$default, pred_gam_out,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 633 330
## 1 56 181
The asymmetric cost is
creditcost(credit_test$default, pred_gam_out)
## [1] 0.5083333
ROC Curve:
pred <- prediction(predictions = c(prob_gam_out), labels = credit_test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7919257
Finally, we can also apply gam()
for a univariate smoothing on the motorcycle data.
library(MASS)
data('mcycle')
str(mcycle)
summary(mcycle)
# Rename the variables for ease of usage
Y <- mcycle$accel
X <- mcycle$times
#Scatterplot
plot(Y~X, xlab="time",ylab="Acceleration", main="Scatterplot of Acceleration against Time")
library(mgcv)
s_gam <- gam(Y ~ s(X),data=mcycle)
summary(s_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Y ~ s(X)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -25.546 1.951 -13.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(X) 8.693 8.972 53.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.783 Deviance explained = 79.8%
## GCV = 545.78 Scale est. = 506 n = 133
#plot the model
plot(s_gam, residuals = TRUE, pch = 1)