library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ------------------------ tidyverse 1.3.0 --
v ggplot2 3.3.3 v purrr 0.3.4
v tibble 3.1.0 v dplyr 1.0.5
v tidyr 1.1.3 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.1
-- Conflicts --------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(magrittr)
Attaching package: 㤼㸱magrittr㤼㸲
The following object is masked from 㤼㸱package:purrr㤼㸲:
set_names
The following object is masked from 㤼㸱package:tidyr㤼㸲:
extract
library(janitor)
Attaching package: 㤼㸱janitor㤼㸲
The following objects are masked from 㤼㸱package:stats㤼㸲:
chisq.test, fisher.test
gen_RCT_dat <-
function(total_N,
baseline_SD,
mean_diff,
baseline_outcome_slope,
outcome_resid) {
tibble(
treat = rbinom(total_N, 1, 0.5),
baseline = rnorm(total_N, 0, baseline_SD),
uncorcov = rnorm(total_N, 0, baseline_SD),
outcome = mean_diff * treat +
baseline_outcome_slope * baseline +
rnorm(total_N, 0, outcome_resid)
)
}
get_ps <- function(mod) {
(mod %>% summary())$coefficients[,"Pr(>|t|)"]
}
confint_includes <- function(interval, true_val) {
stopifnot(nrow(interval) == 1)
interval[1] <= true_val && true_val <= interval[2]
}
analyse_sim <- function(dat, true_mean_diff) {
mod0 <- lm(outcome ~ treat, data = dat)
mod1 <- lm(outcome ~ treat + baseline, data = dat)
mod2 <- lm(outcome ~ treat + uncorcov, data = dat)
mod3 <- lm(outcome ~ treat + baseline + uncorcov, data = dat)
descriptives <- dat %>%
group_by(treat) %>%
summarise(
mean_pre = mean(baseline),
mean_post = mean(outcome),
SD_pre = sd(baseline),
SD_post = sd(outcome),
n = n()
) %>%
pivot_wider(names_from = treat,
values_from = mean_pre:n)
mod_coefs <- tibble(
mod0_coef = coef(mod0)["treat"],
mod1_coef = coef(mod1)["treat"],
mod2_coef = coef(mod2)["treat"],
mod3_coef = coef(mod3)["treat"],
mod0_p = get_ps(mod0)["treat"],
mod1_p = get_ps(mod1)["treat"],
mod2_p = get_ps(mod2)["treat"],
mod3_p = get_ps(mod3)["treat"],
)
bind_cols(descriptives, mod_coefs) %>%
mutate(
baseline_diff = mean_pre_1 - mean_pre_0,
treat_group_prop = n_1 / (n_0 + n_1),
mod0_abs_error = abs(mod0_coef - true_mean_diff),
mod1_abs_error = abs(mod1_coef - true_mean_diff),
mod_error_diff = mod0_abs_error - mod1_abs_error,
mod0_includes_true = confint_includes(confint(mod0)["treat",],
true_mean_diff),
mod1_includes_true = confint_includes(confint(mod1)["treat",],
true_mean_diff),
mod2_includes_true = confint_includes(confint(mod2)["treat",],
true_mean_diff),
mod3_includes_true = confint_includes(confint(mod3)["treat",],
true_mean_diff)
)
}
run_sim <- function(total_N,
baseline_SD,
mean_diff,
baseline_outcome_slope,
outcome_resid,
sims = 1000) {
1:sims %>%
map_dfr(
~ gen_RCT_dat(
total_N,
baseline_SD,
mean_diff,
baseline_outcome_slope,
outcome_resid
) %>%
analyse_sim(mean_diff)
)
}
sim_results <-
run_sim(
total_N = 200,
baseline_SD = 1,
mean_diff = 0.5,
baseline_outcome_slope = 0.5,
outcome_resid = 1
)
sim_results %>%
summarise(
unadjusted_true_cov = mean(mod0_includes_true),
adjusted_true_cov = mean(mod1_includes_true),
unadjusted_significant = mean(mod0_p < .05),
adjusted_significant = mean(mod1_p < .05)
)
sim_results %>%
tabyl(mod0_includes_true, mod1_includes_true)
mod0_includes_true FALSE TRUE
FALSE 25 16
TRUE 23 936
sim_results %>%
ggplot(aes(baseline_diff, mod_error_diff)) +
geom_point(alpha = 0.5) +
labs(x = "Baseline imbalance",
y = "Unadjusted minus adjusted error")

do_one_sim <- function(baseline_slope, diff, total_n) {
run_sim(
total_N = total_n,
baseline_SD = 1,
mean_diff = diff,
baseline_outcome_slope = baseline_slope,
outcome_resid = 1
) %>%
summarise(
# 95% confidence interval coverage
unadjust_true_cov = mean(mod0_includes_true),
adjust_base_true_cov = mean(mod1_includes_true),
adjust_uncor_true_cov = mean(mod2_includes_true),
adjust_both_true_cov = mean(mod3_includes_true),
# % statistically significant
unadjust_significant = mean(mod0_p < .05),
adjust_base_significant = mean(mod1_p < .05),
adjust_uncor_significant = mean(mod2_p < .05),
adjust_both_significant = mean(mod3_p < .05),
)
}
sim_setup <- expand.grid(
baseline_slope = seq(0, 2, 0.5),
diff = c(0, 0.25, 0.5),
total_n = c(50, 100)
)
results <- pmap_dfr(sim_setup, do_one_sim)
results_by_param <- bind_cols(sim_setup, results) %>%
arrange(diff, total_n, baseline_slope)
results_by_param %>%
pivot_longer(cols = ends_with("significant")) %>%
mutate(name = recode(name,
unadjust_significant = "1. None",
adjust_base_significant = "2. Baseline",
adjust_uncor_significant = "3. Uncorrelated covariate",
adjust_both_significant = "4. Both",
)) %>%
ggplot(aes(baseline_slope, value, colour = name)) +
geom_line() +
geom_point() +
facet_grid(cols = vars(diff), rows = vars(total_n)) +
labs(x = "True baseline coefficient",
y = "Proportion",
title = "Effect of adjusting for covariates",
subtitle = "Proportion of simulated studies with p < .05",
colour = "Adjustment",
caption = "Note: True treatment effect in columns, sample size in rows") +
theme(panel.spacing = unit(1, "lines"),
legend.position = "bottom") +
ylim(0,1)

results_by_param %>%
pivot_longer(cols = ends_with("true_cov")) %>%
mutate(name = recode(name,
unadjust_true_cov = "1. None",
adjust_base_true_cov = "2. Baseline",
adjust_uncor_true_cov = "3. Uncorrelated covariate",
adjust_both_true_cov = "4. Both",
)) %>%
ggplot(aes(baseline_slope, value, colour = name)) +
geom_line() +
geom_point() +
facet_grid(cols = vars(diff), rows = vars(total_n)) +
labs(x = "True baseline coefficient",
y = "Proportion",
title = "Effect of adjusting for covariates",
subtitle = "Proportion of 95% CIs including true value",
colour = "Adjustment",
caption = "Note: True treatment effect in columns, sample size in rows") +
theme(panel.spacing = unit(1, "lines"),
legend.position = "bottom") +
ylim(0,1)

---
title: "Playing around with baseline adjusted vs. unadjusted RCT analysis"
author: Andi Fugard
output: html_notebook
---

```{r}
library(tidyverse)
library(magrittr)
library(janitor)
```


```{r}
gen_RCT_dat <-
  function(total_N,
           baseline_SD,
           mean_diff,
           baseline_outcome_slope,
           outcome_resid) {
    tibble(
      treat    = rbinom(total_N, 1, 0.5),
      baseline = rnorm(total_N, 0, baseline_SD),
      uncorcov = rnorm(total_N, 0, baseline_SD),
      outcome  = mean_diff * treat +
                 baseline_outcome_slope * baseline +
                 rnorm(total_N, 0, outcome_resid)
    )
  }
```



```{r}
get_ps <- function(mod) {
  (mod %>% summary())$coefficients[,"Pr(>|t|)"]
}

confint_includes <- function(interval, true_val) {
  stopifnot(nrow(interval) == 1)
  
  interval[1] <= true_val && true_val <= interval[2]
}
```


```{r}
analyse_sim <- function(dat, true_mean_diff) {
  mod0 <- lm(outcome ~ treat, data = dat)
  mod1 <- lm(outcome ~ treat + baseline, data = dat)
  mod2 <- lm(outcome ~ treat + uncorcov, data = dat)
  mod3 <- lm(outcome ~ treat + baseline + uncorcov, data = dat)
  
  descriptives <- dat %>%
    group_by(treat) %>%
    summarise(
      mean_pre = mean(baseline),
      mean_post = mean(outcome),
      SD_pre = sd(baseline),
      SD_post = sd(outcome),
      n = n()
    ) %>%
    pivot_wider(names_from = treat,
                values_from = mean_pre:n)
  
  mod_coefs <- tibble(
    mod0_coef = coef(mod0)["treat"],
    mod1_coef = coef(mod1)["treat"],
    mod2_coef = coef(mod2)["treat"],
    mod3_coef = coef(mod3)["treat"],
    mod0_p    = get_ps(mod0)["treat"],
    mod1_p    = get_ps(mod1)["treat"],
    mod2_p    = get_ps(mod2)["treat"],
    mod3_p    = get_ps(mod3)["treat"],
  )  
  
  bind_cols(descriptives, mod_coefs) %>%
    mutate(
      baseline_diff = mean_pre_1 - mean_pre_0,
      treat_group_prop = n_1 / (n_0 + n_1),
      mod0_abs_error = abs(mod0_coef - true_mean_diff),
      mod1_abs_error = abs(mod1_coef - true_mean_diff),
      mod_error_diff = mod0_abs_error - mod1_abs_error,
      mod0_includes_true = confint_includes(confint(mod0)["treat",],
                                            true_mean_diff),
      mod1_includes_true = confint_includes(confint(mod1)["treat",],
                                            true_mean_diff),
      mod2_includes_true = confint_includes(confint(mod2)["treat",],
                                            true_mean_diff),
      mod3_includes_true = confint_includes(confint(mod3)["treat",],
                                            true_mean_diff)
    )  
}
```


```{r}
run_sim <- function(total_N,
                    baseline_SD,
                    mean_diff,
                    baseline_outcome_slope,
                    outcome_resid,
                    sims = 1000) {
  1:sims %>%
    map_dfr(
      ~ gen_RCT_dat(
        total_N,
        baseline_SD,
        mean_diff,
        baseline_outcome_slope,
        outcome_resid
      ) %>%
        analyse_sim(mean_diff)
    )
}
```



```{r}
sim_results <-
  run_sim(
    total_N = 200,
    baseline_SD = 1,
    mean_diff = 0.5,
    baseline_outcome_slope = 0.5,
    outcome_resid = 1
  )
```


```{r}
sim_results %>%
  summarise(
    unadjusted_true_cov = mean(mod0_includes_true),
    adjusted_true_cov = mean(mod1_includes_true),
    unadjusted_significant = mean(mod0_p < .05),
    adjusted_significant = mean(mod1_p < .05)
  )
```

```{r}
sim_results %>%
  tabyl(mod0_includes_true, mod1_includes_true)
```



```{r dpi=300}
sim_results %>%
  ggplot(aes(baseline_diff, mod_error_diff)) +
  geom_point(alpha = 0.5) +
  labs(x = "Baseline imbalance",
       y = "Unadjusted minus adjusted error")
```







```{r}
do_one_sim <- function(baseline_slope, diff, total_n) {
  run_sim(
    total_N = total_n,
    baseline_SD = 1,
    mean_diff = diff,
    baseline_outcome_slope = baseline_slope,
    outcome_resid = 1
  ) %>%
    summarise(
      # 95% confidence interval coverage
      unadjust_true_cov        = mean(mod0_includes_true),
      adjust_base_true_cov     = mean(mod1_includes_true),
      adjust_uncor_true_cov    = mean(mod2_includes_true),
      adjust_both_true_cov     = mean(mod3_includes_true),
      # % statistically significant
      unadjust_significant     = mean(mod0_p < .05),
      adjust_base_significant  = mean(mod1_p < .05),
      adjust_uncor_significant = mean(mod2_p < .05),
      adjust_both_significant  = mean(mod3_p < .05),
    )
}
```

```{r}
sim_setup <- expand.grid(
  baseline_slope = seq(0, 2, 0.5),
  diff = c(0, 0.25, 0.5),
  total_n = c(50, 100)
)
```

```{r}
results <- pmap_dfr(sim_setup, do_one_sim)
results_by_param <- bind_cols(sim_setup, results) %>%
  arrange(diff, total_n, baseline_slope)
```


```{r fig.height=5, fig.width=6, dpi=300}
results_by_param %>%
  pivot_longer(cols = ends_with("significant")) %>%
  mutate(name = recode(name,
    unadjust_significant     = "1. None",
    adjust_base_significant  = "2. Baseline",
    adjust_uncor_significant = "3. Uncorrelated covariate",
    adjust_both_significant  = "4. Both",
    )) %>%
  ggplot(aes(baseline_slope, value, colour = name)) +
  geom_line() +
  geom_point() +
  facet_grid(cols = vars(diff), rows = vars(total_n)) +
  labs(x = "True baseline coefficient",
       y = "Proportion",
       title = "Effect of adjusting for covariates",
       subtitle = "Proportion of simulated studies with p < .05",
       colour = "Adjustment",
       caption = "Note: True treatment effect in columns, sample size in rows") +
  theme(panel.spacing = unit(1, "lines"),
        legend.position = "bottom") +
  ylim(0,1)
```


```{r fig.height=5, fig.width=6, dpi=300}
results_by_param %>%
  pivot_longer(cols = ends_with("true_cov")) %>%
  mutate(name = recode(name,
    unadjust_true_cov     = "1. None",
    adjust_base_true_cov  = "2. Baseline",
    adjust_uncor_true_cov = "3. Uncorrelated covariate",
    adjust_both_true_cov  = "4. Both",
    )) %>%
  ggplot(aes(baseline_slope, value, colour = name)) +
  geom_line() +
  geom_point() +
  facet_grid(cols = vars(diff), rows = vars(total_n)) +
  labs(x = "True baseline coefficient",
       y = "Proportion",
       title = "Effect of adjusting for covariates",
       subtitle = "Proportion of 95% CIs including true value",
       colour = "Adjustment",
       caption = "Note: True treatment effect in columns, sample size in rows") +
  theme(panel.spacing = unit(1, "lines"),
        legend.position = "bottom") +
  ylim(0,1)
```



