Outline

Online Learning: Review

General Purpose Algorithms

Online Convex Optimization Setting

Examples

Follow The Leader

Worst Case Behavior of Follow The Leader

Follow the Regularized Leader

FTRL Example 1: Online Linear Optimization

Linearization Trick

Example Continued: Online Gradient Descent

FTRL Example 2: Exponential Weights Methods

Try it yourself

General Results (Shalev-Shwartz Thm 2.11/Hazan Thm 5.1)

Revisiting Follow the Leader

Extensions and Adaptivity
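
As a warm-up for the electricity application below, here is a minimal from-scratch sketch of the exponential weights update from the "FTRL Example 2" and "Try it yourself" slides, run on simulated losses. All names and the simulated data are illustrative assumptions, not part of the opera package.

#Exponential weights on simulated [0,1] losses (illustrative sketch, not opera code)
K <- 3; TT <- 100; eta <- 0.5
losses <- matrix(runif(TT * K), TT, K) #loss of each of K experts in each of TT rounds
w <- rep(1 / K, K)                     #start from uniform weights on the simplex
alg.loss <- 0
for (t in 1:TT) {
  alg.loss <- alg.loss + sum(w * losses[t, ]) #loss of the weighted combination
  w <- w * exp(-eta * losses[t, ])            #multiplicative (exponential) update
  w <- w / sum(w)                             #renormalize onto the simplex
}
alg.loss / TT          #average loss of the algorithm
min(colMeans(losses))  #average loss of the best expert ex post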

library(opera) #Library for online learning / prediction with expert advice
library(mgcv)  #Library for additive models: used for two of the expert forecasts
library(caret) #Library for machine learning tasks: used for the gradient boosting expert

set.seed(1) #Fix the random seed so results are reproducible (randomness enters only through the GBM model)

data(electric_load)
idx_data_test <- 620:nrow(electric_load)
data_train <- electric_load[-idx_data_test, ] 
data_test <- electric_load[idx_data_test, ]  

#Expert 1: A generalized additive model in several predictors
gam.fit <- gam(Load ~ s(IPI) + s(Temp) + s(Time, k=3) + 
                s(Load1) + as.factor(NumWeek), data = data_train)
gam.fcst <- predict(gam.fit, newdata = data_test)

#Expert 2: "medium-term model", which adds an autoregressive correction to the residuals of an additive model
medium.fit <- gam(Load ~ s(Time,k=3) + s(NumWeek) + s(Temp) + s(IPI), data = data_train)
electric_load$Medium <- c(predict(medium.fit), predict(medium.fit, newdata = data_test))
electric_load$Residuals <- electric_load$Load - electric_load$Medium

# Autoregressive correction: at each test date, fit an AR model to all residuals
# observed so far and add its one-step-ahead forecast to the medium-term prediction
ar.fcst <- numeric(length(idx_data_test))
for (i in seq_along(idx_data_test)) {
  ar.fit <- ar(electric_load$Residuals[1:(idx_data_test[i] - 1)])
  ar.fcst[i] <- as.numeric(predict(ar.fit)$pred) + electric_load$Medium[idx_data_test[i]]
}

# Expert 3: A gradient boosting model (a tree-based machine learning method)
capture.output(gbm.fit <- train(Load ~ IPI + IPI_CVS + Temp + Temp1 + Time + Load1 + NumWeek,
                                data = data_train, method = "gbm"), file = "/dev/null")
#capture.output suppresses the hundreds of lines of text the training command would otherwise print
gbm.fcst <- predict(gbm.fit, newdata = data_test)


# Combine expert forecasts into sequences X along with observed outcomes Y
Y <- data_test$Load
X <- cbind(gam.fcst, ar.fcst, gbm.fcst)

#Find the loss of the best expert ex post under absolute loss, and compare the individual experts
oracle.expert <- oracle(Y = Y, experts = X, loss.type = "absolute", model = "expert")
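
#The returned oracle object has print and plot methods; printing it should report
#the average loss of each expert and of the best fixed expert ex post
print(oracle.expert)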

params <- list(alpha = 0.5, simplex = TRUE) #Learning rate eta_t = t^{-alpha}; weights projected onto the simplex
#Select weighting of experts by projected online gradient descent
ogdexperts <- mixture(Y = Y, experts = X, model = "OGD", loss.type = "absolute", parameters = params)

param2 <- list(eta = 0.5) #Fixed learning rate for the exponential weights update
#Select weighting of experts by exponentially weighted average
exponentialexperts <- mixture(Y = Y, experts = X, model = "EWA", loss.type = "absolute", parameters = param2)
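
#To compare the two aggregation rules numerically, one can compute mean absolute
#error directly from the prediction component of each mixture object (a minimal
#sketch, assuming the prediction component documented for opera mixture objects)
mean(abs(ogdexperts$prediction - Y))         #average absolute loss, projected OGD
mean(abs(exponentialexperts$prediction - Y)) #average absolute loss, exponential weights
colMeans(abs(X - Y))                         #average absolute loss of each individual expert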

Application: Electricity Forecasting

Projected Online Gradient Descent Results

#Plot results
plot(ogdexperts)

Exponential Weights Results

#Plot results
plot(exponentialexperts)
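
#Beyond the built-in plots, the weight paths can be extracted for custom analysis
#(a sketch assuming the weights component documented for opera mixture objects)
tail(ogdexperts$weights, 1)         #final weights on (gam, ar, gbm) under projected OGD
tail(exponentialexperts$weights, 1) #final weights under exponential weights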

Application: Ad-Click Prediction at Google

Conclusions

References