library(tidyverse)
library(marginaleffects)
library(sandwich)
library(cobalt)
library(tictoc)

Use the code below at your own risk! I primarily wrote this to try to get my head around what kmatch is doing in Stata.

Sample data:

data(lalonde)

Take a peek:

head(lalonde)

Homebaked kernel matching

First, get the propensity scores:

glm1 <- glm(
    treat ~ age + educ + race + married + nodegree + re74 + re75,
    data = lalonde,
    family = binomial
  )
lalonde$ps <- predict(glm1, type = "response")
lalonde %>%
  ggplot(aes(x = ps, fill = factor(treat))) +
  geom_density(alpha = .4) + 
  labs(x = "Propensity Scores", y = "Density", fill = "") 

The Epanechnikov kernel:

epan <- function(x) {
  (3/4)*(1-x^2)*(abs(x)<=1)
}
curve(epan(x), -4, 4, n = 1001)

I’ll want to rescale variables on an arbitrary range to [-1, 1]

rescale <- function(x, in_min, in_max, out_min, out_max) {
  out_min + ((x - in_min)*(out_max - out_min)/(in_max - in_min))
}
rescale(seq(-0.3, 0.3, .1), -0.3, 0.3, -1, 1) |> round(2)
## [1] -1.00 -0.67 -0.33  0.00  0.33  0.67  1.00

The tofu. This function produces a long dataset with control observations potentially repeating a large number of times:

kmatch <- function(dat, radius, treat_var, ps_var) {
  res <- data.frame()
  
  # Give each observation a unique ID, for the SE calc later
  temp <- dat |>
    mutate(.id = 1:n())
  
  treats <- temp |>
    filter(!!sym(treat_var) == 1)
  controls <- temp |>
    filter(!!sym(treat_var) == 0) |>
    arrange(!!sym(ps_var)) # hopefully speeds up filter
  
  # work through treat obs and find matches
  for (r in 1:nrow(treats)) {
    cur_row <- treats[r,]
    target_ps <- as.numeric(cur_row[ps_var])
    
    # note the strictly less/greater than
    matches <- controls |> 
      filter(!!sym(ps_var) > target_ps - radius &
             !!sym(ps_var) < target_ps + radius) |>
      mutate(.dist = abs(target_ps - !!sym(ps_var)),
             .distnorm = rescale(.dist, -radius, radius, -1, 1),
             .epan = epan(.distnorm),
             .wt = .epan/sum(.epan)) # weights sum to 1 within a class
    
    if (nrow(matches) >= 1) {
      cur_row$.class <- r
      cur_row$.wt    <- 1 # treat obs gets weight 1
      matches$.class <- r
    
      res <- bind_rows(res, cur_row, matches)
    }
  }
  
  res
}

Run it:

tic()
outdat <-
  kmatch(
    dat = lalonde,
    treat_var = "treat",
    radius = 0.03,
    ps_var = "ps"
  )
toc()
## 0.65 sec elapsed
nrow(lalonde)
## [1] 614
nrow(outdat)
## [1] 4696

Now aggregate the weights to make it easier to check for balance. This yields the same weights as those produced by psmatch2 and kmatch and each control observation appears only one.

outdat_agg <- outdat |>
  group_by(.id) |>
  summarise(.sumwt = sum(.wt))

widedat <- lalonde |>
  mutate(.id = 1:n()) |>
  right_join(outdat_agg)
## Joining with `by = join_by(.id)`

Count

All data

lalonde |>
  group_by(treat) |>
  tally()

Matched data

outdat |>
  group_by(treat) |>
  summarise(unique_n = unique(.id) |> length(),
            rep_n = .id |> length())

Check balance

bal_unadj <- bal.tab(
  treat ~ age + educ + race + married + nodegree + re74 + re75,
  data = lalonde,
  binary = "std",
  continuous = "std",
  s.d.denom = "pooled"
)
bal_adj <- bal.tab(
  treat ~ age + educ + race + married + nodegree + re74 + re75,
  data = widedat,
  weights = ".sumwt",
  binary = "std",
  continuous = "std",
  s.d.denom = "pooled"
)
summary_tab <- merge(bal_unadj$Balance |> select(-Diff.Adj,-Type),
      bal_adj$Balance   |> select(-Diff.Un,-Type), by = 0, all = TRUE) |>
  rename(Matched  = Diff.Adj,
         All      = Diff.Un,
         Variable = Row.names)
summary_tab
summary_tab |>
  pivot_longer(cols = -Variable) |>
  ggplot(aes(x = abs(value), y = factor(Variable), color = name)) +
  geom_point() +
  labs(y = "",
       x = "|SMD|",
       color = "")

Outcome model

The outcome model, as tentatively suggested by MatchIt authors for another approach using matching with replacement: “There is some evidence for an alternative approach that incorporates pair membership and adjusts for reuse of control units, though this has only been studied for survival outcomes.”

Note interaction between treat and covariates. This is a doubly-robust approach. Leave out the covariates if you don’t want that.

outfit <-
  lm(
    re78 ~ treat * (age + educ + race + married + nodegree + re74 + re75),
    data = outdat,
    weights = .wt
  )

Now estimate ATT with errors clustered by subclass and ID:

avg_comparisons(
  outfit,
  variables = "treat",
  vcov = ~ .class + .id,
  newdata = subset(outdat, treat == 1),
  wts = ".wt"
)

The following call replicates kmatch (no covariates, no clustering by subclass):

avg_comparisons(
  lm(re78 ~ treat, data = outdat, weights = .wt),
  variables = "treat",
  vcov = ~ .id,
  newdata = subset(outdat, treat == 1),
  wts = ".wt"
)