因果推断学习笔记（二）：经典方法尝试之 Regression (Lalonde's Dataset)

Mostly Harmless Econometrics 的第三章是关于如何用 regression 来估算因果效应的。其中一节是用 Lalonde’s dataset (NSW + PSID + CPS Data) 这个数据集来尝试不同的 regression specifications。这篇博客的目标是复现一下课本儿里的结果，实战一下。

以下内容由Rmarkdown转来。Git: yishilin14/causal_playground

博客的md用render("./causality2_playaround_with_the_lalonde_dataset_regression.Rmd", md_document(variant = "markdown_github"))生成~

读数据

数据集

Paper: Dehejia R H, Wahba S. Causal effects in nonexperimental studies: Reevaluating the evaluation of training programs[J]. Journal of the American statistical Association, 1999, 94(448): 1053-1062. (http://www.uh.edu/~adkugler/Dehejia&Wahba_JASA.pdf)
下载地址：http://users.nber.org/~rdehejia/data/nswdata2.html

# Load the datsets
col.names=c('treat', 'age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75', 're78')
nsw_treated <- fread('./dataset/nswdata/nswre74_treated.txt', col.names = col.names)
nsw_control <- fread('./dataset/nswdata/nswre74_control.txt', col.names = col.names)
cps1_control <- fread('./dataset/nswdata/cps_controls.txt', col.names = col.names)
cps3_control <- fread('./dataset/nswdata/cps3_controls.txt', col.names = col.names)

# Combine all the Datasets
nsw <- rbind(nsw_treated, nsw_control)
cps1 <- rbind(nsw_treated, cps1_control)
cps3 <- rbind(nsw_treated, cps3_control)
nsw[, Dataset := 'NSW']
cps1[, Dataset := 'CPS-1']
cps3[, Dataset := 'CPS-3']
data <- rbind(nsw, cps1, cps3)
data[, Dataset := factor(Dataset, unique(data$Dataset))]

复现 Dehejia-Wahba (1999) 里的 Table 1，看看每一个分组的 sample mean 。均值和论文里的一致，standard error of sample mean 的数值和论文相似但是不完全一致，不知道论文中是用什么方式计算的。

get.mean.se <- function(x) {
  mean = round(mean(x), 2)
  se = round(sd(x)/sqrt(length(x)), 2)
  return(paste0(mean, '(', se, ')'))
}
results <-  merge(
  data[, .(`no. obs`=.N), by = .(Dataset, treat)][Dataset == 'NSW' | treat ==0, ],
  data[, lapply(.SD, get.mean.se), by = .(Dataset, treat)][Dataset == 'NSW' | treat ==0, ]
)
kable(results[order(Dataset, -treat)])# caption = 'Sample Means of Characteristics for NSW and Comparison Samples')

Dataset	treat	no. obs	age	educ	black	hispan	married	nodegree	re74	re75	re78
NSW	1	185	25.82(0.53)	10.35(0.15)	0.84(0.03)	0.06(0.02)	0.19(0.03)	0.71(0.03)	2095.57(359.27)	1532.06(236.68)	6349.14(578.42)
NSW	0	260	25.05(0.44)	10.09(0.1)	0.83(0.02)	0.11(0.02)	0.15(0.02)	0.83(0.02)	2107.03(352.75)	1266.91(192.44)	4554.8(340.09)
CPS-1	0	15992	33.23(0.09)	12.03(0.02)	0.07(0)	0.07(0)	0.71(0)	0.3(0)	14016.8(75.67)	13650.8(73.31)	14846.66(76.29)
CPS-3	0	429	28.03(0.52)	10.24(0.14)	0.2(0.02)	0.14(0.02)	0.51(0.02)	0.6(0.02)	5619.24(327.76)	2466.48(158.94)	6984.17(352.17)

Regression

目标是复现 Mostly Harmless Econometrics: Table 4.3.3 (page 68) 。

Raw Difference

书中的 standard error 用的是 pooled variance。计算SE的时候学习了 Comparing Two Population Means: Independent Samples:

raw.difference <- function(dt) {
  v1 <- dt[treat==1, ]$re78
  v0 <- dt[treat==0, ]$re78
  diff <- mean(v1) - mean(v0)
  n1 <- length(v1)
  n0 <- length(v0)
  sd1 <- sd(v1)
  sd0 <- sd(v0)
  var.pooled <- (n1 * sd1^2 + n0 * sd0^2) / (n1 + n0)
  se <- sqrt(var.pooled*(1/n1 + 1/n0))
  return(list('ATT'= round(diff, 0), 'SE' = round(se, 0), 'No. Obs.'= paste0(n1, '/', n0)))
}

results.rawdiff <- data[, raw.difference(.SD), by = .(Dataset)]
kable(results.rawdiff, escape = FALSE, caption = 'Raw Difference')

Dataset	ATT	SE	No. Obs.
NSW	1794	633	185/260
CPS-1	-8498	712	185/15992
CPS-3	-635	657	185/429

Demographic controls

这里复现的结果和书中数值不完全一致，但是趋势是差不多的。书里没有明确描述 regression 的时候具体的模型是如何的，例如有没有二次项。接下来几块利用了 demographic controls的结果都和书中数值不完全一致。

# P-score Screened Samples
add.pscore.screened.samples <- function(fml) {
  cps1_pscore <- glm(fml, data = cps1, family=binomial(logit))
  cps1_pscore_screened <- cps1[(cps1_pscore$fitted.values >= 0.1 & cps1_pscore$fitted.values <= 0.9), ]
  cps3_pscore <- glm(fml, data = cps3, family=binomial(logit))
  cps3_pscore_screened <- cps3[(cps3_pscore$fitted.values >= 0.1 & cps3_pscore$fitted.values <= 0.9), ]
  cps1_pscore_screened[, Dataset := 'CPS-1-Subset']
  cps3_pscore_screened[, Dataset := 'CPS-3-Subset']
  return (rbind(data, cps1_pscore_screened, cps3_pscore_screened))
}

# Estimate the traning effects
estimate.via.regression <- function(dt, fml) {
  fit_summary <- summary(lm(fml, data = dt))
  diff <- coef(fit_summary)[2, 1]
  se <- coef(fit_summary)[2, 2]
  return(list('ATT'= round(diff, 0), 'SE' = round(se, 0), 'No. Obs.'= paste0(dt[treat==1, .N], '/', dt[treat==0, .N])))
}

fml_att <- re78 ~ treat + age + educ + black + hispan + nodegree + married
fml_pscore <- treat ~ age + educ + black + hispan + nodegree + married

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.demographic <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.demographic, escape = FALSE, caption = 'Demographic controls')

Dataset	ATT	SE	No. Obs.
NSW	1671	638	185/260
CPS-1	-2973	716	185/15992
CPS-3	1164	812	185/429
CPS-1-Subset	-2675	826	123/415
CPS-3-Subset	1383	882	172/156

1975 Earnings

fml_att <- re78 ~ treat + re75
fml_pscore <- treat ~ re75

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.re75 <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.re75, escape = FALSE, caption = '1975 Earnings')

Dataset	ATT	SE	No. Obs.
NSW	1750	632	185/260
CPS-1	-78	537	185/15992
CPS-3	-91	641	185/429
CPS-3-Subset	162	643	183/426

Demographics, 1975 Earnings

fml_att <- re78 ~ treat + age + educ + black + hispan + nodegree + married + re75
fml_pscore <- treat ~ age + educ + black + hispan + nodegree + married + re75

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.demographic.re75 <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.demographic.re75, escape = FALSE, caption = 'Demographics, 1975 Earnings')

Dataset	ATT	SE	No. Obs.
NSW	1636	638	185/260
CPS-1	632	557	185/15992
CPS-3	1221	794	185/429
CPS-1-Subset	1421	675	147/360
CPS-3-Subset	1262	883	171/160

Demographics, 1974 and 1974 Earnings

fml_att <- re78 ~ treat + age + educ + black + hispan + nodegree + married + re74 + re75
fml_pscore <- treat ~ age + educ + black + hispan + nodegree + married + re74 + re75

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.demographic.re7475 <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.demographic.re7475, escape = FALSE, caption = 'Demographics, 1974 and 1974 Earnings')

Dataset	ATT	SE	No. Obs.
NSW	1676	639	185/260
CPS-1	699	548	185/15992
CPS-3	1548	781	185/429
CPS-1-Subset	1653	668	145/359
CPS-3-Subset	1230	849	175/166

汇总

results.all <- results.demographic[, .(Dataset)]
results.all <- merge(results.all, results.rawdiff, by = 'Dataset', all.x = T, suffixes = c('', 'rawdiff'))
results.all <- merge(results.all, results.demographic, by = 'Dataset', all.x = T, suffixes = c('', 'demo'))
results.all <- merge(results.all, results.re75, by = 'Dataset', all.x = T, suffixes = c('', 're75'))
results.all <- merge(results.all, results.demographic.re75, by = 'Dataset', all.x = T, suffixes = c('', 'demo_re75'))
results.all <- merge(results.all, results.demographic.re7475, by = 'Dataset', all.x = T, suffixes = c('', 'demo_re7475'))
results.all <- results.all[, lapply(.SD, as.character)]
results.all[, 2:4][is.na(results.all[, 2:4])] <- ''
results.all[, 5:ncol(results.all)][is.na(results.all[, 5:ncol(results.all)])] <- 'no obs.'
kable(results.all[, c(1,seq(2, 16, 3))],
      escape = FALSE,
      caption = 'Regression estimates of NSW training effects using alternate controls',
      col.names = c('Dataset', 'Raw Difference',
                    'Demographic controls', '1975 Earnings', 'Demographics, 1975 Earnings', 'Demographics, 1974 and 1974 Earnings'))

Dataset	Raw Difference	Demographic controls	1975 Earnings	Demographics, 1975 Earnings	Demographics, 1974 and 1974 Earnings
NSW	1794	1671	1750	1636	1676
CPS-1	-8498	-2973	-78	632	699
CPS-3	-635	1164	-91	1221	1548
CPS-1-Subset		-2675	no obs.	1421	1653
CPS-3-Subset		1383	162	1262	1230