In this lab, we cover some state-of-the-art techniques in the framework of tree models. We use the same datasets as in the previous lab: the Boston Housing data and the Credit Scoring data.
# load Boston data
library(MASS)
data(Boston)
index <- sample(nrow(Boston), nrow(Boston)*0.60)
boston_train <- Boston[index,]
boston_test <- Boston[-index,]
# load credit card data
credit_data <- read.csv(file = "https://xiaoruizhu.github.io/Data-Mining-R/lecture/data/credit_default.csv", header=T)
# convert categorical variables
credit_data$SEX <- as.factor(credit_data$SEX)
credit_data$EDUCATION <- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE <- as.factor(credit_data$MARRIAGE)
# random splitting
index <- sample(nrow(credit_data), nrow(credit_data)*0.60)
credit_train <- credit_data[index,]
credit_test <- credit_data[-index,]
Boosting builds a number of small trees sequentially: each tree is fit to the residuals left by the previous trees. We use the gbm package to build boosted trees. Note: the current gbm package does not handle asymmetric loss functions.
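To make the "fit to the residuals" idea concrete, here is a minimal illustrative sketch using rpart (this is not how gbm is implemented internally; nu, the number of rounds, and the variable names are our own choices for illustration):
library(rpart)
nu   <- 0.1                                                # shrinkage (learning rate)
pred <- rep(mean(boston_train$medv), nrow(boston_train))   # start from the overall mean
# Round 1: fit a shallow tree to the current residuals
resid_df <- data.frame(boston_train[, names(boston_train) != "medv"],
                       res = boston_train$medv - pred)
tree1 <- rpart(res ~ ., data = resid_df, maxdepth = 2)
pred  <- pred + nu * predict(tree1, resid_df)
# Round 2: update the residuals and fit the next small tree; gbm repeats this n.trees times
resid_df$res <- boston_train$medv - pred
tree2 <- rpart(res ~ ., data = resid_df, maxdepth = 2)
pred  <- pred + nu * predict(tree2, resid_df)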
library(gbm)
# ?gbm
boston_boost <- gbm(formula = medv ~ .,
                    data = boston_train,
                    distribution = "gaussian",
                    n.trees = 10000,
                    shrinkage = 0.01,
                    interaction.depth = 8)
summary(boston_boost)
## var rel.inf
## rm rm 41.8568725
## lstat lstat 30.5467507
## dis dis 8.4696084
## crim crim 4.5585922
## age age 2.9712051
## ptratio ptratio 2.6892714
## nox nox 2.6404223
## black black 2.3516301
## tax tax 1.1650961
## indus indus 1.0070304
## chas chas 0.8985881
## rad rad 0.7003911
## zn zn 0.1445417
Note that we need to specify distribution = "gaussian" when fitting a regression tree; the default is the Bernoulli distribution for binary classification problems. n.trees is the number of small trees we fit, and it needs to be chosen carefully because too large a value may result in overfitting. shrinkage is another tuning parameter that controls how much each tree contributes to the ensemble. interaction.depth is the number of splits we allow in each tree. All of these tuning parameters can be chosen via cross-validation; the goal is to avoid overfitting, as sketched below.
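One possible way to pick n.trees is gbm's built-in cross-validation. The sketch below assumes 5 folds (an arbitrary choice) and uses gbm.perf() to read off the CV-optimal number of trees; the other settings mirror the fit above.
boston_boost_cv <- gbm(formula = medv ~ .,
                       data = boston_train,
                       distribution = "gaussian",
                       n.trees = 10000,
                       shrinkage = 0.01,
                       interaction.depth = 8,
                       cv.folds = 5)          # 5-fold CV; an illustrative choice
# CV-optimal number of trees (also plots training vs. CV error)
best_ntrees <- gbm.perf(boston_boost_cv, method = "cv")
best_ntrees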
The fitted boosted tree also shows the marginal relationship between the response and each predictor through partial dependence plots.
par(mfrow=c(1,2))
plot(boston_boost, i="lstat")
plot(boston_boost, i="rm")
Prediction on the testing sample:
boston_boost_pred_test<- predict(boston_boost, boston_test, n.trees = 10000)
mean((boston_test$medv-boston_boost_pred_test)^2)
## [1] 14.537
We can investigate how the testing error changes with different numbers of trees.
ntree <- seq(100, 10000, 100)
predmat <- predict(boston_boost, newdata = boston_test, n.trees = ntree)
err <- apply((predmat - boston_test$medv)^2, 2, mean)
plot(ntree, err, type = 'l', col = 2, lwd = 2, xlab = "n.trees", ylab = "Test MSE")
# test.err holds the random forest test errors from the earlier section
abline(h = min(test.err), lty = 2)
The horizontal dashed line is the best prediction error from the random forests we obtained earlier.
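If test.err from the earlier random forest section is no longer in the workspace, it can be recreated along these lines (a sketch, assuming that section recorded the test MSE for mtry = 1 through 13):
library(randomForest)
test.err <- rep(0, 13)   # Boston has 13 predictors
for (i in 1:13) {
  rf_fit <- randomForest(medv ~ ., data = boston_train, mtry = i)
  test.err[i] <- mean((boston_test$medv - predict(rf_fit, boston_test))^2)
}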
We now apply boosting to the credit scoring data with the adabag package. Because this is a classification problem, the response needs to be converted to a factor first.
library(adabag)
credit_train$default.payment.next.month <- as.factor(credit_train$default.payment.next.month)
credit_boost <- boosting(default.payment.next.month ~ ., data = credit_train, boos = TRUE)
save(credit_boost, file = "credit_boost.Rdata")
# Training AUC
library(ROCR)
pred_credit_boost <- predict(credit_boost, newdata = credit_train)
pred <- prediction(pred_credit_boost$prob[,2], credit_train$default.payment.next.month)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE)
# Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
pred_credit_boost <- predict(credit_boost, newdata = credit_test)
# Testing AUC
pred <- prediction(pred_credit_boost$prob[,2], credit_test$default.payment.next.month)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE)
# Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
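Besides the predicted probabilities used for the ROC curve, the object returned by predict.boosting also contains a confusion matrix and the misclassification error on the supplied data, which give another quick check of out-of-sample performance:
pred_credit_boost$confusion
pred_credit_boost$error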