因果推断学习笔记(二):经典方法尝试之 Regression (Lalonde's Dataset)

Mostly Harmless Econometrics 的第三章是关于如何用 regression 来估算因果效应的。其中一节是用 Lalonde’s dataset (NSW + PSID + CPS Data) 这个数据集来尝试不同的 regression specifications。这篇博客的目标是复现一下课本儿里的结果,实战一下。

以下内容由Rmarkdown转来。Git: yishilin14/causal_playground


博客的md用render("./nswdata_regression.Rmd", md_document(variant = "markdown_github"))生成~

读数据

数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Load the datsets
col.names=c('treat', 'age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75', 're78')
nsw_treated <- fread('./dataset/nswdata/nswre74_treated.txt', col.names = col.names)
nsw_control <- fread('./dataset/nswdata/nswre74_control.txt', col.names = col.names)
cps1_control <- fread('./dataset/nswdata/cps_controls.txt', col.names = col.names)
cps3_control <- fread('./dataset/nswdata/cps3_controls.txt', col.names = col.names)

# Combine all the Datasets
nsw <- rbind(nsw_treated, nsw_control)
cps1 <- rbind(nsw_treated, cps1_control)
cps3 <- rbind(nsw_treated, cps3_control)
nsw[, Dataset := 'NSW']
cps1[, Dataset := 'CPS-1']
cps3[, Dataset := 'CPS-3']
data <- rbind(nsw, cps1, cps3)
data[, Dataset := factor(Dataset, unique(data$Dataset))]

复现 Dehejia-Wahba (1999) 里的 Table 1,看看每一个分组的 sample mean 。 均值和论文里的一致,standard error of sample mean 的数值和论文相似但是不完全一致,不知道论文中是用什么方式计算的。

1
2
3
4
5
6
7
8
9
10
get.mean.se <- function(x) {
mean = round(mean(x), 2)
se = round(sd(x)/sqrt(length(x)), 2)
return(paste0(mean, '(', se, ')'))
}
results <- merge(
data[, .(`no. obs`=.N), by = .(Dataset, treat)][Dataset == 'NSW' | treat ==0, ],
data[, lapply(.SD, get.mean.se), by = .(Dataset, treat)][Dataset == 'NSW' | treat ==0, ]
)
kable(results[order(Dataset, -treat)])# caption = 'Sample Means of Characteristics for NSW and Comparison Samples')
Dataset treat no. obs age educ black hispan married nodegree re74 re75 re78
NSW 1 185 25.82(0.53) 10.35(0.15) 0.84(0.03) 0.06(0.02) 0.19(0.03) 0.71(0.03) 2095.57(359.27) 1532.06(236.68) 6349.14(578.42)
NSW 0 260 25.05(0.44) 10.09(0.1) 0.83(0.02) 0.11(0.02) 0.15(0.02) 0.83(0.02) 2107.03(352.75) 1266.91(192.44) 4554.8(340.09)
CPS-1 0 15992 33.23(0.09) 12.03(0.02) 0.07(0) 0.07(0) 0.71(0) 0.3(0) 14016.8(75.67) 13650.8(73.31) 14846.66(76.29)
CPS-3 0 429 28.03(0.52) 10.24(0.14) 0.2(0.02) 0.14(0.02) 0.51(0.02) 0.6(0.02) 5619.24(327.76) 2466.48(158.94) 6984.17(352.17)

Regression

目标是复现 Mostly Harmless Econometrics: Table 4.3.3 (page 68) 。

Raw Difference

书中的 standard error 用的是 pooled variance。计算SE的时候学习了 Comparing Two Population Means: Independent Samples:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
raw.difference <- function(dt) {
v1 <- dt[treat==1, ]$re78
v0 <- dt[treat==0, ]$re78
diff <- mean(v1) - mean(v0)
n1 <- length(v1)
n0 <- length(v0)
sd1 <- sd(v1)
sd0 <- sd(v0)
var.pooled <- (n1 * sd1^2 + n0 * sd0^2) / (n1 + n0)
se <- sqrt(var.pooled*(1/n1 + 1/n0))
return(list('ATT'= round(diff, 0), 'SE' = round(se, 0), 'No. Obs.'= paste0(n1, '/', n0)))
}

results.rawdiff <- data[, raw.difference(.SD), by = .(Dataset)]
kable(results.rawdiff, escape = FALSE, caption = 'Raw Difference')
Dataset ATT SE No. Obs.
NSW 1794 633 185/260
CPS-1 -8498 712 185/15992
CPS-3 -635 657 185/429

Demographic controls

这里复现的结果和书中数值不完全一致,但是趋势是差不多的。书里没有明确描述 regression 的时候具体的模型是如何的,例如有没有二次项。接下来几块利用了 demographic controls的结果都和书中数值不完全一致。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# P-score Screened Samples
add.pscore.screened.samples <- function(fml) {
cps1_pscore <- glm(fml, data = cps1, family=binomial(logit))
cps1_pscore_screened <- cps1[(cps1_pscore$fitted.values >= 0.1 & cps1_pscore$fitted.values <= 0.9), ]
cps3_pscore <- glm(fml, data = cps3, family=binomial(logit))
cps3_pscore_screened <- cps3[(cps3_pscore$fitted.values >= 0.1 & cps3_pscore$fitted.values <= 0.9), ]
cps1_pscore_screened[, Dataset := 'CPS-1-Subset']
cps3_pscore_screened[, Dataset := 'CPS-3-Subset']
return (rbind(data, cps1_pscore_screened, cps3_pscore_screened))
}

# Estimate the traning effects
estimate.via.regression <- function(dt, fml) {
fit_summary <- summary(lm(fml, data = dt))
diff <- coef(fit_summary)[2, 1]
se <- coef(fit_summary)[2, 2]
return(list('ATT'= round(diff, 0), 'SE' = round(se, 0), 'No. Obs.'= paste0(dt[treat==1, .N], '/', dt[treat==0, .N])))
}

fml_att <- re78 ~ treat + age + educ + black + hispan + nodegree + married
fml_pscore <- treat ~ age + educ + black + hispan + nodegree + married

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.demographic <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.demographic, escape = FALSE, caption = 'Demographic controls')
Dataset ATT SE No. Obs.
NSW 1671 638 185/260
CPS-1 -2973 716 185/15992
CPS-3 1164 812 185/429
CPS-1-Subset -2675 826 123/415
CPS-3-Subset 1383 882 172/156

1975 Earnings

1
2
3
4
5
6
fml_att <- re78 ~ treat + re75
fml_pscore <- treat ~ re75

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.re75 <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.re75, escape = FALSE, caption = '1975 Earnings')
Dataset ATT SE No. Obs.
NSW 1750 632 185/260
CPS-1 -78 537 185/15992
CPS-3 -91 641 185/429
CPS-3-Subset 162 643 183/426

Demographics, 1975 Earnings

1
2
3
4
5
6
fml_att <- re78 ~ treat + age + educ + black + hispan + nodegree + married + re75
fml_pscore <- treat ~ age + educ + black + hispan + nodegree + married + re75

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.demographic.re75 <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.demographic.re75, escape = FALSE, caption = 'Demographics, 1975 Earnings')
Dataset ATT SE No. Obs.
NSW 1636 638 185/260
CPS-1 632 557 185/15992
CPS-3 1221 794 185/429
CPS-1-Subset 1421 675 147/360
CPS-3-Subset 1262 883 171/160

Demographics, 1974 and 1974 Earnings

1
2
3
4
5
6
fml_att <- re78 ~ treat + age + educ + black + hispan + nodegree + married + re74 + re75
fml_pscore <- treat ~ age + educ + black + hispan + nodegree + married + re74 + re75

data.tmp <- add.pscore.screened.samples(fml_pscore)
results.demographic.re7475 <- data.tmp[, estimate.via.regression(.SD, fml_att), by = .(Dataset)]
kable(results.demographic.re7475, escape = FALSE, caption = 'Demographics, 1974 and 1974 Earnings')
Dataset ATT SE No. Obs.
NSW 1676 639 185/260
CPS-1 699 548 185/15992
CPS-3 1548 781 185/429
CPS-1-Subset 1653 668 145/359
CPS-3-Subset 1230 849 175/166

汇总

1
2
3
4
5
6
7
8
9
10
11
12
13
14
results.all <- results.demographic[, .(Dataset)]
results.all <- merge(results.all, results.rawdiff, by = 'Dataset', all.x = T, suffixes = c('', 'rawdiff'))
results.all <- merge(results.all, results.demographic, by = 'Dataset', all.x = T, suffixes = c('', 'demo'))
results.all <- merge(results.all, results.re75, by = 'Dataset', all.x = T, suffixes = c('', 're75'))
results.all <- merge(results.all, results.demographic.re75, by = 'Dataset', all.x = T, suffixes = c('', 'demo_re75'))
results.all <- merge(results.all, results.demographic.re7475, by = 'Dataset', all.x = T, suffixes = c('', 'demo_re7475'))
results.all <- results.all[, lapply(.SD, as.character)]
results.all[, 2:4][is.na(results.all[, 2:4])] <- ''
results.all[, 5:ncol(results.all)][is.na(results.all[, 5:ncol(results.all)])] <- 'no obs.'
kable(results.all[, c(1,seq(2, 16, 3))],
escape = FALSE,
caption = 'Regression estimates of NSW training effects using alternate controls',
col.names = c('Dataset', 'Raw Difference',
'Demographic controls', '1975 Earnings', 'Demographics, 1975 Earnings', 'Demographics, 1974 and 1974 Earnings'))
Dataset Raw Difference Demographic controls 1975 Earnings Demographics, 1975 Earnings Demographics, 1974 and 1974 Earnings
NSW 1794 1671 1750 1636 1676
CPS-1 -8498 -2973 -78 632 699
CPS-3 -635 1164 -91 1221 1548
CPS-1-Subset -2675 no obs. 1421 1653
CPS-3-Subset 1383 162 1262 1230