```
library(tidyverse)
library(marginaleffects)
library(sandwich)
library(cobalt)
library(tictoc)
```

Use the code below at your own risk! I primarily wrote this to try to
get my head around what `kmatch`

is doing in Stata.

Sample data:

`data(lalonde)`

Take a peek:

`head(lalonde)`

First, get the propensity scores:

```
glm1 <- glm(
treat ~ age + educ + race + married + nodegree + re74 + re75,
data = lalonde,
family = binomial
)
lalonde$ps <- predict(glm1, type = "response")
```

```
lalonde %>%
ggplot(aes(x = ps, fill = factor(treat))) +
geom_density(alpha = .4) +
labs(x = "Propensity Scores", y = "Density", fill = "")
```

The Epanechnikov kernel:

```
epan <- function(x) {
(3/4)*(1-x^2)*(abs(x)<=1)
}
```

`curve(epan(x), -4, 4, n = 1001)`

I’ll want to rescale variables on an arbitrary range to [-1, 1]

```
rescale <- function(x, in_min, in_max, out_min, out_max) {
out_min + ((x - in_min)*(out_max - out_min)/(in_max - in_min))
}
```

`rescale(seq(-0.3, 0.3, .1), -0.2, 0.2, -1, 1) |> round(2)`

`## [1] -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5`

The tofu. This function produces a LONG dataset with control observations potentially appearing a large number of times:

```
kmatch <- function(dat, radius, treat_var, ps_var) {
res <- data.frame()
# Give each observation a unique ID, for the SE calc later
temp <- dat |>
mutate(.id = 1:n())
treats <- temp |>
filter(!!sym(treat_var) == 1)
controls <- temp |>
filter(!!sym(treat_var) == 0) |>
arrange(!!sym(ps_var)) # hopefully speeds up filter
# work through treat obs and find matches
for (r in 1:nrow(treats)) {
cur_row <- treats[r,]
target_ps <- as.numeric(cur_row[ps_var])
# note the strictly less/greater than
matches <- controls |>
filter(!!sym(ps_var) > target_ps - radius &
!!sym(ps_var) < target_ps + radius) |>
mutate(.dist = abs(target_ps - !!sym(ps_var)),
.distnorm = rescale(.dist, -radius, radius, -1, 1),
.epan = epan(.distnorm),
.wt = .epan/sum(.epan)) # weights sum to 1 within a class
if (nrow(matches) >= 1) {
cur_row$.class <- r
cur_row$.wt <- 1 # treat obs gets weight 1
matches$.class <- r
res <- bind_rows(res, cur_row, matches)
}
}
res
}
```

Run it:

```
tic()
outdat <-
kmatch(
dat = lalonde,
treat_var = "treat",
radius = 0.03,
ps_var = "ps"
)
toc()
```

`## 0.57 sec elapsed`

`nrow(lalonde)`

`## [1] 614`

`nrow(outdat)`

`## [1] 4696`

Now aggregate the weights to make it easier to check for balance.
This yields the same weights as those produced by `psmatch2`

and `kmatch`

and each control observation appears only
one.

```
outdat_agg <- outdat |>
group_by(.id) |>
summarise(.sumwt = sum(.wt))
widedat <- lalonde |>
mutate(.id = 1:n()) |>
right_join(outdat_agg)
```

`## Joining with `by = join_by(.id)``

```
lalonde |>
group_by(treat) |>
tally()
```

```
outdat |>
group_by(treat) |>
summarise(unique_n = unique(.id) |> length(),
rep_n = .id |> length())
```

```
bal_unadj <- bal.tab(
treat ~ age + educ + race + married + nodegree + re74 + re75,
data = lalonde,
binary = "std",
continuous = "std",
s.d.denom = "pooled"
)
```

```
bal_adj <- bal.tab(
treat ~ age + educ + race + married + nodegree + re74 + re75,
data = widedat,
weights = ".sumwt",
binary = "std",
continuous = "std",
s.d.denom = "pooled"
)
```

```
summary_tab <- merge(bal_unadj$Balance |> select(-Diff.Adj,-Type),
bal_adj$Balance |> select(-Diff.Un,-Type),
by=0, all=TRUE) |>
rename(Matched = Diff.Adj,
All = Diff.Un,
Variable = Row.names)
summary_tab
```

```
summary_tab |>
pivot_longer(cols = -Variable) |>
ggplot(aes(x = abs(value), y = factor(Variable), color = name)) +
geom_point() +
labs(y = "",
x = "|SMD|",
color = "")
```

The outcome model, as tentatively
suggested by `MatchIt`

authors for another approach using
matching with replacement: “There is some evidence for an alternative
approach that incorporates pair membership and adjusts for reuse of
control units, though this has only been studied for survival
outcomes.”

Note interaction between treat and covariates. This is a doubly-robust approach. Leave out the covariates if you don’t want that.

```
outfit <-
lm(
re78 ~ treat * (age + educ + race + married + nodegree + re74 + re75),
data = outdat,
weights = .wt
)
```

Now estimate ATT with errors clustered by subclass and ID:

```
avg_comparisons(
outfit,
variables = "treat",
vcov = ~ .class + .id,
newdata = subset(outdat, treat == 1),
wts = ".wt"
)
```

The following call replicates `kmatch`

(no covariates, no
clustering by subclass):

```
avg_comparisons(
lm(re78 ~ treat, data = outdat, weights = .wt),
variables = "treat",
vcov = ~ .id,
newdata = subset(outdat, treat == 1),
wts = ".wt"
)
```