Outline

Machine Learning for Forecasting

Characteristics of Machine Learning Methods

Machine Learning Approaches

Trees

Fitting Trees

Regularization and Evaluation

Application: Mortgage Loan Approval

library(rpart) # Fit Classification and Regression Trees (could also use "tree")
library(randomForest) # Fit Random Forests
#library(ranger) # Also fit random forests, but using faster method
library(xgboost) # Fit Gradient Boosting
library(kernlab) # Fit Reproducing Kernel Methods
#library(KRLS) #Fit least squares regression with kernels
library(caret) #Train and validate machine learning models
library(plyr) # Data manipulation (loaded before dplyr so that dplyr functions are not masked)
library(dplyr) # Data manipulation


 #Load Data Set on Mortgage Loan Application Approvals
library(foreign) # Load data sets in Stata format
#Data on loan applications: see http://fmwww.bc.edu/ec-p/data/wooldridge/loanapp.des for descriptions
loanapps<-read.dta("http://fmwww.bc.edu/ec-p/data/wooldridge/loanapp.dta")

#Remove "action", which is more refined outcome variable, to predict binary yes-no loan approval
loanapp<-select(loanapps,-one_of("action","reject"))
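
As a quick look at the prepared data (added here for illustration, not part of the original analysis), one can check the balance of the outcome and which predictors have missing values, since the model fits below need na.action or imputation choices to handle them:

#Sketch: inspect outcome balance and missingness (illustrative)
table(loanapp$approve)    #Counts of denied (0) and approved (1) applications
colSums(is.na(loanapp))   #Number of missing values in each variable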

#Split data into train and test sets
set.seed(998) #Ensure randomness is reproducible
# Take 75% random sample of approved and non-approved each into training data, 25% into test data
inTraining <- createDataPartition(loanapp$approve, p = .75, list = FALSE) #Function in "caret" library
training <- loanapp[ inTraining,]
testing  <- loanapp[-inTraining,]
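
As a sanity check (illustrative, not in the original code), the stratified split should leave the approval rate roughly equal in the training and test sets:

#Sketch: verify the 75/25 split preserves the approval rate (illustrative)
mean(training$approve)            #Share approved in the training data
mean(testing$approve)             #Share approved in the test data
c(nrow(training), nrow(testing))  #Sizes of the two sets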

# Fit a decision tree to the mortgage loan data to predict approval
# Since "approve" is numeric 0/1, rpart fits a regression tree, so predictions are approval probabilities
rpfit <- rpart(approve ~ ., data = training)
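
The fitted tree's complexity table, shown here as an optional diagnostic, reports how rpart's internally cross-validated error changes with tree size, which motivates the pruning step below:

#Sketch: inspect rpart's complexity parameter table (illustrative)
printcp(rpfit)   #Cross-validated error at each value of the complexity parameter cp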


# Now try to simplify the tree by pruning
# Use 10-fold cross-validation to select the complexity parameter, using the caret library
fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)

#Cross-validate the complexity parameter cp using RMSE
rpfit1  <- train(approve ~ ., data = training, 
                 method = "rpart", 
                 trControl = fitControl,
                 na.action = na.exclude)
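
The complexity parameter selected by cross-validation, and the cross-validated RMSE at each candidate value, can be read off the fitted object (shown for illustration):

#Sketch: examine the cross-validation results (illustrative)
rpfit1$bestTune   #cp value with the lowest cross-validated RMSE
rpfit1$results    #Cross-validated RMSE for each candidate cp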

Decision Tree for Mortgage Approval, Before and After Pruning

#Plot sequence of partitions and outcomes
par(mfrow = c(1,2), xpd = NA) # otherwise on some devices the text is clipped
plot(rpfit)
text(rpfit, use.n = FALSE,digits=2)

#Plot pruned tree using CV-selected optimal tuning parameter
plot(rpfit1$finalModel)
text(rpfit1$finalModel, use.n = FALSE,digits=2)

Model Combination

Bagging

Random Forests

Bagging and Random Forests on Time Series Data

Exercise: Try it yourself

Boosting

Gradient Boosting

Feature Maps and Engineering

Kernel Methods

# Fit a Random Forest using the randomForest package, accessed through caret's train function
# caret provides additional diagnostics, but since no CV is used here, this is mostly for comparability
# We fit a regression forest even though the outcome is binary, because the goal is eliciting probabilities

forestControl <- trainControl(method = "none") ## Use Default Parameters all the Way

rffit1  <- train(approve ~ ., data = training, 
                 method = "rf", 
                 trControl = forestControl,
                 # With no resampling, caret requires a single-row tuning grid;
                 # use randomForest's regression default, mtry = floor(p/3)
                 tuneGrid = data.frame(mtry = max(floor((ncol(training) - 1)/3), 1)),
                 na.action = na.exclude)

splitspertree<-rffit1$finalModel$mtry
numbertrees<-rffit1$finalModel$ntree
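
As an optional diagnostic (not in the original code), the fitted forest's variable importance measures indicate which applicant characteristics the predictions rely on most:

#Sketch: variable importance from the fitted forest (illustrative)
head(importance(rffit1$finalModel))   #Increase in node purity attributable to each predictor
varImpPlot(rffit1$finalModel)         #The same information as a plot
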
## Cross-validation is commented out because it is very slow and we don't want it to run every time the document is recompiled
# # Use 10-fold cross-validation to fit parameters, using caret library
# xgbfitControl <- trainControl(## 10-fold CV
#                            method = "repeatedcv",
#                            number = 10,
#                            ## repeated ten times
#                            repeats = 10)
# 
# #Fit tuning parameters by cross-validation
# xgbfit1  <- train(approve ~ ., data = training, 
#                  method = "xgbTree", 
#                  trControl = xgbfitControl,
#                  na.action = na.exclude)


# Set parameters: learning rate eta, tree depth max_depth, squared error loss with RMSE evaluation, etc.
# Parameters are hard-coded at the values chosen by cross-validation
cvparam <- list(max_depth = 1, eta = 0.3, gamma=0, colsample_bytree=0.8,
                min_child_weight=1, subsample=1, objective = "reg:squarederror", eval_metric = "rmse")

## Syntax to train xgboost directly

# Extract features from training and test data
trainfeat<-select(training,-one_of("approve"))
testfeat<-select(testing,-one_of("approve"))
#Transform data into format needed for xgboost command
dtrain <- xgb.DMatrix(as.matrix(trainfeat), label=training$approve)
dtest <- xgb.DMatrix(as.matrix(testfeat), label = testing$approve)
#Set training and test sets for evaluation
watchlist <- list(train = dtrain, eval = dtest)

# Fit gradient boosted trees using xgboost
boostrounds<-50
bst <- xgb.train(params=cvparam, data=dtrain,verbose=0,nrounds=boostrounds,watchlist=watchlist)
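
Because the watchlist records train and test RMSE at every boosting round, the evaluation log can be plotted to see how quickly the fit improves and whether additional rounds begin to overfit (an optional diagnostic, not in the original code):

#Sketch: plot training and test RMSE over boosting rounds (illustrative)
evallog <- bst$evaluation_log
plot(evallog$iter, evallog$train_rmse, type = "l",
     xlab = "Boosting round", ylab = "RMSE")
lines(evallog$iter, evallog$eval_rmse, lty = 2)
legend("topright", legend = c("train", "eval (test)"), lty = c(1, 2))
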
#Fit a radial basis kernel regression using kernlab, again fit through caret

kernControl <- trainControl(## 10-fold CV
                            method = "repeatedcv",
                            number = 10,
                            ## repeated ten times
                            repeats = 10)

#Fit Support Vector Regression to centered and scaled data, and cross-validate the cost parameter C and radial basis kernel width sigma
kernfit  <- train(approve ~ ., data = training, 
                 method = "svmRadial", 
                 preProc = c("center", "scale"),
                 metric = "RMSE",
                 trControl = kernControl,
                 na.action = na.exclude)
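
The cross-validation chooses the kernel width sigma and the cost parameter C; both can be inspected on the fitted object (shown for illustration):

#Sketch: hyperparameters selected by cross-validation (illustrative)
kernfit$bestTune   #Selected sigma (kernel width) and C (cost)
kernfit$results    #Cross-validated RMSE at each candidate value of C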

Application: Mortgage Approval Prediction Again

#Construct test set predictions and RMSE for each
rppreds<-predict(rpfit1$finalModel, newdata = testing) #Decision Tree
#Use approximate imputation for missing values in random forest prediction, rather than excluding
rfpreds<-predict(rffit1$finalModel, newdata = na.roughfix(testing)) #Random Forest
bstpreds<-predict(bst,newdata=as.matrix(testfeat)) #Tree Boosting
kernpreds<-predict(kernfit, newdata = na.roughfix(testing)) #Kernel SVR: predict through the caret object so the centering/scaling preprocessing is applied

#Squared errors
prederrors<-data.frame((rppreds-testing$approve)^2,(rfpreds-testing$approve)^2,
                       (bstpreds-testing$approve)^2,(kernpreds-testing$approve)^2)

MSEvec<-colMeans(prederrors,na.rm=TRUE) #Mean Squared Errors


TestRMSE<-sqrt(MSEvec) #Test Set Root Mean Squared Errors

#Compute training-set RMSE (cross-validated or out-of-bag, as available) for each method
rprmse<-min(rpfit1$results$RMSE) #Cross-validated RMSE at the selected cp
rfrmse<-sqrt(rffit1$finalModel$mse[numbertrees]) #Out-of-bag MSE after the last tree of the random forest
bstrmse<-bst$evaluation_log$train_rmse[boostrounds] #Training RMSE at the last boosting iteration
kernrmse<-min(kernfit$results$RMSE) #Cross-validated RMSE at the selected sigma and C

TrainRMSE<-c(rprmse,rfrmse,bstrmse,kernrmse)
resmat<-as.matrix(data.frame(TestRMSE,TrainRMSE))
fitresults<-data.frame(t(resmat))

colnames(fitresults)<-c("Tree","Random Forest", "Boosting","Kernel SVM")


library(knitr)
library(kableExtra)
kable(fitresults,
  caption="Prediction Performance of ML Methods") %>%
  kable_styling(bootstrap_options = "striped")
Prediction Performance of ML Methods

            Tree        Random Forest   Boosting    Kernel SVM
TestRMSE    0.2397885   0.2252832       0.2211216   0.4324194
TrainRMSE   0.2533201   0.2344840       0.2189350   0.2770203

Conclusions

References