Include packages:
library(tidyverse)
library(did)
First setup the true scores for each individual:
set.seed(42)
true_outcomes <- data.frame(id = 1:8000) |>
mutate(true = rnorm(n(), 0, 1))
Look at a random sample:
true_outcomes |>
mutate(true = round(true, 2)) |>
slice_sample(n = 10)
Now the measured outcomes, which are noisy version of the true score:
gen_obs <- function(id, true, sd, n) {
data.frame(time = 1:n) |>
mutate(id = id, .before = time) |>
mutate(true = true,
y = true + rnorm(n, 0, sd))
}
sim_dat <- pmap_df(true_outcomes,
\(id, true) gen_obs(id = id, true = true, sd = 1, n = 4)) |>
mutate(y = scale(y) |> as.numeric())
Look at a random sample:
sim_dat |>
mutate(true = round(true, 2),
y = round(y, 2)) |>
slice_sample(n = 10)
We have standardised y:
mean(sim_dat$y) |> round(2)
## [1] 0
sd(sim_dat$y) |> round(2)
## [1] 1
The correlations:
sim_dat |>
pivot_wider(names_from = time, values_from = y) |>
select(-id) |>
cor() |>
round(1)
## true 1 2 3 4
## true 1.0 0.7 0.7 0.7 0.7
## 1 0.7 1.0 0.5 0.5 0.5
## 2 0.7 0.5 1.0 0.5 0.5
## 3 0.7 0.5 0.5 1.0 0.5
## 4 0.7 0.5 0.5 0.5 1.0
Each measurement is correlated about .7 with the (unmeasurable) true score and the measurements are correlated around .5 with each other.
Now, select people for the intervention if their scores 1 SD below the population mean or less for periods prior to the intervention period.
ids <- list()
ids[[3]] <- sim_dat |>
filter(time %in% 1:2 & y < -1) |>
pull(id) |>
unique() |>
sample(size = 500)
ids[[4]] <- sim_dat |>
filter(time %in% 1:3 & y < -1) |>
pull(id) |>
setdiff(ids[[3]]) |>
unique() |>
sample(size = 500)
ids[[5]] <- setdiff(sim_dat$id, c(ids[[3]], ids[[4]])) |>
sample(size = 500)
Build a dataset mapping ID to group. {did} treats group = 0 as never-treated.
group_ids <- list(
data.frame(id = ids[[3]], group = 3),
data.frame(id = ids[[4]], group = 4),
data.frame(id = ids[[5]], group = 0)) |>
bind_rows()
stopifnot((group_ids |> duplicated() |> sum()) == 0)
Join into the main dataset and drop missings:
sim_dat <- sim_dat |>
left_join(group_ids) |>
na.omit()
## Joining with `by = join_by(id)`
Here are the means by group:
sim_dat |>
group_by(group, time) |>
summarise(mean_y = mean(y) |> round(1),
n = n()) |>
mutate(treat = time >= group & group != 0, .after = time)
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
A couple of plots:
curvy_plot <- sim_dat |>
mutate(group = factor(group)) |>
ggplot() +
aes(x = time, y = y, colour = group) +
geom_smooth(method = "loess", formula = y ~ x)
curvy_plot +
geom_jitter(alpha = .5, width = .1, size = .1)
Zoom in and remove the points:
curvy_plot
The never treated group (0) cruise along a little above the population mean (0 by design) with no apparent change. The means of group 3 and 4 are much lower (as expected, since we select them to be 1 SD below the mean or less), apparently increasing as the intervention is introduced.
Let’s see what {did} makes of this:
did_mod <- att_gt(
yname = "y",
gname = "group",
control_group = "notyettreated",
idname = "id",
tname = "time",
data = sim_dat,
est_method = "reg",
base_period = "varying"
)
did_mod
##
## Call:
## att_gt(yname = "y", tname = "time", idname = "id", gname = "group",
## data = sim_dat, control_group = "notyettreated", est_method = "reg",
## base_period = "varying")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
## Group-Time Average Treatment Effects:
## Group Time ATT(g,t) Std. Error [95% Simult. Conf. Band]
## 3 2 -0.1112 0.0640 -0.2745 0.0521
## 3 3 0.4829 0.0637 0.3204 0.6454 *
## 3 4 0.4423 0.0655 0.2751 0.6095 *
## 4 2 0.0179 0.0619 -0.1402 0.1759
## 4 3 0.0368 0.0728 -0.1490 0.2226
## 4 4 0.1580 0.0638 -0.0048 0.3208
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## P-value for pre-test of parallel trends assumption: 0.28651
## Control Group: Not Yet Treated, Anticipation Periods: 0
## Estimation Method: Outcome Regression
did_mod |>
ggdid()
aggte(did_mod, type = "group") |>
ggdid()
aggte(did_mod, type = "dynamic") |>
ggdid()
Would we conclude that the intervention was effective if we didn’t know how the data were generated…?
What happens if we remove the never-treated group?
sim_dat_sometimes_treated <- sim_dat |>
filter(group != 0)
did_mod_2 <- att_gt(
yname = "y",
gname = "group",
control_group = "notyettreated",
idname = "id",
tname = "time",
data = sim_dat_sometimes_treated,
est_method = "reg",
base_period = "varying"
)
did_mod_2
##
## Call:
## att_gt(yname = "y", tname = "time", idname = "id", gname = "group",
## data = sim_dat_sometimes_treated, control_group = "notyettreated",
## est_method = "reg", base_period = "varying")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
## Group-Time Average Treatment Effects:
## Group Time ATT(g,t) Std. Error [95% Simult. Conf. Band]
## 3 2 -0.0861 0.0763 -0.2576 0.0854
## 3 3 0.4645 0.0685 0.3106 0.6184 *
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## P-value for pre-test of parallel trends assumption: 0.26747
## Control Group: Not Yet Treated, Anticipation Periods: 0
## Estimation Method: Outcome Regression
did_mod_2 |>
ggdid()