Problem Setup

library(dagitty) #Library to create and analyze causal graphs
library(ggplot2) #Plotting
library(npcausal) #Obtain from https://github.com/ehkennedy/npcausal ince not on CRAN 
suppressWarnings(suppressMessages(library(ggdag))) #library to plot causal graphs
yxzdag<-dagify(Y~X+Z,X~Z) #create graph with arrow from X to Y
#Set position of nodes so they lie on a straight line
  coords<-list(x=c(X = 0, Y = 2, Z=1),y=c(X = 0, Y = 0, Z=1))
  coords_df<-coords2df(coords)
  coordinates(yxzdag)<-coords2list(coords_df)
ggdag(yxzdag)+theme_dag_blank()+labs(title="Z confounds relationship of X to Y") #Plot causal graph

Identification: Assumptions

Identification: Derivation

When might we have ignorability?

The bad old days

Estimators for \(E[Y^x]\)

Inverse Propensity Lemma

Double robustness of AIPW

Choice of Estimators for \(\pi\), \(\mu\)

Handling nuisance error: Regularity or Sample Splitting

AIPW limit distribution

Interpreting conditions

Proof that \(\sqrt{n_1}(\widehat{\gamma}^{n_1}_x-\widehat{\gamma}^{exact})=o_p(1)\)

Communicating results

Software

set.seed(42) #Reproducible numbers
n <- 100;
z <- matrix(runif(n*5),nrow=n)
b <- as.vector(c(1,1,-1,1,-1))
g1 <- rnorm(5); g2 <- rnorm(5);
pi<-exp(z%*%b)/(1+exp(z%*%b))
x <- rbinom(n,1,pi)
y <- rnorm(n,mean=x+z%*%g1+sin(z%*%g2))
#ATE by cross-fit AIPW, with weighted average of ML algorithms
ate.res <- ate(y,x,z, sl.lib=c("SL.mean","SL.gam","SL.ranger","SL.glm"))
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |======================================================================| 100%
##      parameter       est        se      ci.ll       ci.ul  pval
## 1      E{Y(0)} -1.388809 0.1867523 -1.7548431 -1.02277420 0.000
## 2      E{Y(1)} -0.255938 0.1434104 -0.5370224  0.02514642 0.074
## 3 E{Y(1)-Y(0)}  1.132871 0.2008997  0.7391072  1.52663408 0.000
# Compare pure regression and IPW estimates from same data
#Regression
(EY0reg<-mean(ate.res$nuis[,3]))
## [1] -1.425214
(EY1reg<-mean(ate.res$nuis[,4]))
## [1] -0.1762365
(ATEreg<-EY1reg-EY0reg)
## [1] 1.248978
#IPW
(EY0ipw<-mean(y*(1-x)/ate.res$nuis[,1]))
## [1] -1.445764
(EY1ipw<-mean(y*x/ate.res$nuis[,2]))
## [1] -0.2120509
(ATEipw<-EY1ipw-EY0ipw)
## [1] 1.233713

Which to use?

What to do about non-ignorability

What else can go wrong? Overlap

When is overlap condition a problem?

set.seed(1234)
n<-10000
shift <- 2
za<-rnorm(n/2,0,1)
zb<-rnorm(n/2,shift,1)
z<-c(za,zb)
xa<-rep(0,n/2)
xb<-rep(1,n/2)
X<-factor(c(xa,xb))

#Apply Bayes rule: P(X=1|Z)=P(Z|X=1)P(X=1)/(P(Z|X=1)P(X=1)+P(Z|X=0)P(X=0))
#By P(X=1)=P(X=0)=0.5 and normality obtain
probXgivenZ= 1/(1+exp(0.5*shift-z))

dataf<-data.frame(z,X,probXgivenZ)

ggplot(data=dataf)+geom_density(aes(x=z,fill=X,color=X),alpha=0.5)+
    geom_line(aes(x=z,y=probXgivenZ))+
    ylab("P(X=1|Z)")+
    ggtitle("P(Z|X=1), P(Z|X=0), and P(X=1|Z)", subtitle = paste("Normal Distributions shifted by",shift))