Empirical Risk Minimization

Linear Function Classes

Predictors

Nonlinear predictors

Linear regression

#Select coefficients (b0,b1,b2,b3) by minimizing square loss
regression<-lm(y~z1+z2+z3)
#Produce forecasts by evaluating regression function at latest values
regressionforecast<-forecast(regression,zt)
#AR(2) forecast
autoregression<-dynlm(y~L(y,1)+L(y,2)) 
autoregressionforecast<-predict(autoregression,zt)

Application: GDP Growth Forecasting

##Empirical example: GDP and Recession forecasting

library(fredr) #Access FRED, the Federal Reserve Economic Data
library(fpp2) #Analysis and manipulation functions for forecasting
library(dynlm) #Linear Model for Time Series Data
library(quantreg) #quantile Regression
fredr_set_key("8782f247febb41f291821950cf9118b6") #Key I obtained for this class

#US GDP components 
GDP<-fredr(series_id = "GDP",
           observation_start = as.Date("1947-01-01"),
           observation_end=as.Date("2021-02-27"),
           vintage_dates = as.Date("2021-02-27")) #Gross Domestic Product

#US GDP components from NIPA tables (cf http://www.bea.gov/national/pdf/nipaguid.pdf)
PCEC<-fredr(series_id = "PCEC",
           observation_start = as.Date("1947-01-01"),
           observation_end=as.Date("2021-02-27"),
           vintage_dates = as.Date("2021-02-27")) #Personal consumption expenditures
FPI<-fredr(series_id = "FPI",
           observation_start = as.Date("1947-01-01"),
           observation_end=as.Date("2021-02-27"),
           vintage_dates = as.Date("2021-02-27")) #Fixed Private Investment
CBI<-fredr(series_id = "CBI",
           observation_start = as.Date("1947-01-01"),
           observation_end=as.Date("2021-02-27"),
           vintage_dates = as.Date("2021-02-27")) #Change in Private Inventories
NETEXP<-fredr(series_id = "NETEXP",
           observation_start = as.Date("1947-01-01"),
           observation_end=as.Date("2021-02-27"),
           vintage_dates = as.Date("2021-02-27")) #Net Exports of Goods and Services
GCE<-fredr(series_id = "GCE",
           observation_start = as.Date("1947-01-01"),
           observation_end=as.Date("2021-02-27"),
           vintage_dates = as.Date("2021-02-27")) #Government Consumption Expenditures and Gross Investment

#Format the series as quarterly time series objects, starting at the first date
gdp<-ts(GDP$value,frequency = 4,start=c(1947,1),names="Gross Domestic Product") 
pcec<-ts(PCEC$value,frequency = 4,start=c(1947,1),names="Consumption")
fpi<-ts(FPI$value,frequency = 4,start=c(1947,1),names="Fixed Investment")
cbi<-ts(CBI$value,frequency = 4,start=c(1947,1),names="Inventory Growth")
invest<-fpi+cbi #Private Investment
netexp<-ts(NETEXP$value,frequency = 4,start=c(1947,1),names="Net Exports")
gce<-ts(GCE$value,frequency = 4,start=c(1947,1),names="Government Spending")

# ## Plot Data 
# 
# autoplot(gdp, series="Gross Domestic Product")+
#   autolayer(pcec,series="Consumption")+
#   autolayer(invest,series="Investment")+
#   autolayer(netexp,series="Net Exports")+
#   autolayer(gce,series="Government Spending")+
#   ggtitle("US GDP and NIPA Components") +
#   ylab("Billions of Dollars")+xlab("Date")+
#   guides(colour=guide_legend(title="NIPA Component"))

## Predict using OLS on lagged predictors
#gdpdynlm<-dynlm(d(gdp)~L(d(gdp))+L(d(pcec))+L(d(invest))+L(d(gce))+L(d(netexp)))

#Restate as aligned variables of same length
dgdp<-window(diff(log(gdp)),start=c(1947,3),end=c(2020,4))
ldgdp<-window(lag(diff(log(gdp)),-1),end=c(2020,4))
ldpcec<-window(lag(diff(log(pcec)),-1),end=c(2020,4))
ldinvest<-window(lag(diff(log(invest)),-1),end=c(2020,4))
ldgce<-window(lag(diff(log(gce)),-1),end=c(2020,4))
#ldnetexp<-window(lag(diff(netexp),-1),end=c(2020,4)) #Exclude net exports, which is sometimes negative

#Predict using OLS
gdpOLS<-lm(dgdp~ldgdp+ldpcec+ldinvest+ldgce)

#Collect Q42020 Data for Q12021 Forecast
last<-length(diff(pcec)) #Index of last observation for all series, Q4 2020
zt<-data.frame(diff(log(gdp))[last],diff(log(pcec))[last],diff(log(invest))[last],diff(log(gce))[last])
colnames(zt)<-c("ldgdp","ldpcec","ldinvest","ldgce")

#Predict 
gdpfcstOLS<-predict(gdpOLS,zt)
#Use OLS to predict GDP growth from lagged component growth
gdpOLS<-lm(dgdp~ldgdp+ldpcec+ldinvest+ldgce)
#Predict value for 2021Q1, using 2020Q4 data
gdpfcstOLS<-predict(gdpOLS,zt)

Loss Guarantees for OLS

Least Absolute Deviations regression

#Select coefficients (b0,b1,b2,b3,b4) by minimizing absolute loss
gdpLAD<-rq(dgdp~ldgdp+ldpcec+ldinvest+ldgce)
#Produce forecasts by evaluating regression function at latest values
gdpfcstLAD<-predict(gdpLAD,zt)

Application: Comparison of LAD and OLS Forecasts

library(knitr)
library(kableExtra)

#Table of results
payemerrors<-data.frame(Coefficient=c("Constant",
                  "Delta log(Y_{t-1})","Delta log(C_{t-1})","Delta log(I_{t-1})",
                  "Delta log(G_{t-1})"),
  OLScoeffs=gdpOLS$coefficients,
  LADcoeffs=gdpLAD$coefficients)
kable(payemerrors,
  col.names=c("Coefficient","OLS",
              "LAD"),
  caption="Coefficients for ERM in GDP Growth Forecast")
Coefficients for ERM in GDP Growth Forecast
Coefficient OLS LAD
(Intercept) Constant 0.0116165 0.0079664
ldgdp Delta log(Y_{t-1}) -0.6160785 -0.2348550
ldpcec Delta log(C_{t-1}) 0.5118518 0.5045953
ldinvest Delta log(I_{t-1}) 0.1389572 0.0713135
ldgce Delta log(G_{t-1}) 0.1860983 0.0860652

Log GDP Growth and OLS vs LAD Predictions

#Contemporaneous Predictions
LADfcsts<-ts(predict(gdpLAD),frequency=4,start=c(1947,3))
OLSfcsts<-ts(predict(gdpOLS),frequency=4,start=c(1947,3))

autoplot(dgdp,series="Log GDP Growth")+
  autolayer(LADfcsts,series="LAD Forecasts")+
  autolayer(OLSfcsts,series="OLS Forecasts")+
  ggtitle("Past GDP Growth Forecasts") +
   ylab("Log Change (Billions of Dollars)")+xlab("Date")+
   guides(colour=guide_legend(title="Series"))

Exercise

Binary Prediction

Logistic Regression

#Select coefficients (b0,b1,b2,b3) by minimizing categorical cross-entropy loss
logitregression<-glm(y~z1+z2+z3,family=binomial(link="logit"))
#Produce forecasts by evaluating regression function at latest values
logitregressionforecast<-predict(logitregression,zt,type="response")

Application: Recession Forecasting

#The NBER recession indicator has FRED ID "USREC"
#1 in NBER declared recession, 0 in any other date
USREC<-fredr(series_id = "USREC",
           vintage_dates = as.Date("2021-02-27"))
usrec<-ts(USREC$value,frequency=12,start=c(1854,12)) #Convert to time series
#Produce lagged values
l1usrec<-window(lag(usrec,-1),start=c(1855,2),end=c(2021,1))
l2usrec<-window(lag(usrec,-2),start=c(1855,2),end=c(2021,1))
recession<-window(usrec,start=c(1855,2),end=c(2021,1))
recessionlogit<-glm(recession~l1usrec+l2usrec,family=binomial(link="logit"))
recessionlogit$coefficients
## (Intercept)     l1usrec     l2usrec 
##   -3.672571   20.854996  -14.427029

Recessions, Predicted and Observed

probs<-predict(recessionlogit,type="response")
recessionprobs<-ts(probs,frequency=12,start=c(1855,2))
autoplot(usrec,series="NBER Recession Indicator")+
  autolayer(recessionprobs,series="Predicted Probability of Recession")+
  ggtitle("NBER Recessions and Logit Autoregression Forecasted Probabilities") +
   ylab("Probability of Recession")+xlab("Date")+
   guides(colour=guide_legend(title="Series"))

What if my data isn’t stationary or weak dependent?

Nonstationary probability models

Restoring stationarity

Applying ERM with detrending

What if source of nonstationarity not known?

Implementation

#OLS with prediction rule b_0+b1*z1_t+b2*y_{t-1}+c*t+sum(a_j*season_j)
regression<-dynlm(y~z1+L(y)+trend(y)+season(y))

When does this method work?

Conclusions

References

Bonus Slide: Risk From Approximate Detrending

\[\underset{f\in\mathcal{F}}{\min}\frac{1}{T-h}\sum_{t=1}^{T-h}\ell(y_{t+h},f(\{y_s-\widehat{T}(s)\}_{s=1}^{t})+\widehat{T}(t+h))-\underset{f\in\mathcal{F}}{\min}E_{p}[\ell(y_{T+h},f(\{y_s-T(s)\}_{s=1}^{T})+T(T+h))]\]

Bonus Slide 2: Proof of Bound