This is an excuse to try out the {compos} package, by David Firth and Fiona Sammut, using data from the House of Commons Library, published 18 July.
Load up some packages ({compos} isn’t on CRAN yet):
#devtools::install_github("DavidFirth/compos")
library(compos)
library(conflicted)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
library(ggmice)
library(mice)
library(table1)
library(marginaleffects)
Candidate-level data:
ge_dat <- read_csv("HoC-GE2024-results-by-candidate.csv")
## Rows: 4515 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): ONS ID, ONS region ID, Constituency name, Region name, Country nam...
## dbl (6): MNIS party ID, Member MNIS ID, Votes, Share, Change, DC Person ID
## lgl (1): County name
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Constituency-level data, used for the turnout figures:
ge_turnout <- read_csv("HoC-GE2024-results-by-constituency.csv") |>
mutate(turnout_percentage = 100*`Valid votes`/Electorate) |>
select(`Constituency name`, turnout_percentage) |>
rename(constituency = `Constituency name`)
## Rows: 650 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): ONS ID, ONS region ID, Constituency name, Region name, Country nam...
## dbl (18): Electorate, Valid votes, Invalid votes, Majority, Con, Lab, LD, RU...
## lgl (2): County name, Declaration time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ge_turnout
Filter so we only have Labour, Tory, and Reform, join on the turnout, and pivot wider with all the data we want (also fix one typo in the data, which I’ve notified Commons Library about):
parties <- c("Lab", "Con", "RUK")
simp_dat <- ge_dat |>
dplyr::filter(`Party abbreviation` %in% parties) |>
select(`Constituency name`, `Party abbreviation`, Votes) |>
mutate(
`Constituency name` = ifelse(
`Constituency name` == "YeoYeovil",
"Yeovil",
`Constituency name`
)
) |>
pivot_wider(names_from = `Party abbreviation`, values_from = "Votes") |>
rename(constituency = `Constituency name`) |>
left_join(ge_turnout)
## Joining with `by = join_by(constituency)`
simp_dat
table1( ~ ., data = simp_dat |> dplyr::select(-constituency))
Overall (N=636) |
|
---|---|
Con | |
Mean (SD) | 10800 (5580) |
Median [Min, Max] | 11400 [46.0, 25500] |
Missing | 1 (0.2%) |
RUK | |
Mean (SD) | 6760 (2750) |
Median [Min, Max] | 7050 [697, 21200] |
Missing | 27 (4.2%) |
Lab | |
Mean (SD) | 15400 (5670) |
Median [Min, Max] | 16200 [1490, 29200] |
Missing | 5 (0.8%) |
turnout_percentage | |
Mean (SD) | 59.8 (6.92) |
Median [Min, Max] | 60.1 [40.0, 79.2] |
Missingness:
plot_pattern(simp_dat, rotate = TRUE)
names(simp_dat)
## [1] "constituency" "Con" "RUK"
## [4] "Lab" "turnout_percentage"
Zap all the rows with any missing data (e.g., because Reform didn’t stand a candidate).
for_analy_counts <- na.omit(simp_dat)
table1( ~ ., data = for_analy_counts |> dplyr::select(-constituency))
Overall (N=608) |
|
---|---|
Con | |
Mean (SD) | 10800 (5460) |
Median [Min, Max] | 11500 [586, 25500] |
RUK | |
Mean (SD) | 6750 (2750) |
Median [Min, Max] | 7050 [697, 21200] |
Lab | |
Mean (SD) | 15400 (5660) |
Median [Min, Max] | 16100 [1490, 29200] |
turnout_percentage | |
Mean (SD) | 60.0 (6.84) |
Median [Min, Max] | 60.4 [40.0, 79.2] |
Create another dataframe with percentages, conditional on voting for one of these three parties.
perc <- for_analy_counts |>
pivot_longer(names_to = "Party",
values_to = "Votes",
cols = Con:Lab) |>
group_by(constituency) |>
mutate(Perc = 100 * Votes / sum(Votes))
perc
plot_votes <- perc |>
ggplot(aes(turnout_percentage, Perc, colour = Party)) +
geom_point(size = .7) +
labs(
x = "Turnout (%)",
y = "Votes (%)",
title = "UK general election 2024",
caption = "Percentage votes are conditional on voting Tory, Labour, or Reform"
) +
scale_color_manual(values = c("#0087dc", "#d50000", "#12b6cf"))
plot_votes +
geom_smooth(se = FALSE,
method = "loess",
formula = y ~ x)
Now, onto the modelling. These use the generalized Wedderburn logit model.
mod1_count <- colm(cbind(Con, Lab, RUK) ~ turnout_percentage, ref = 1, data = for_analy_counts)
mod2_count <- colm(cbind(Con, Lab, RUK) ~ turnout_percentage, ref = 2, data = for_analy_counts)
mod3_count <- colm(cbind(Con, Lab, RUK) ~ turnout_percentage, ref = 3, data = for_analy_counts)
short_sum <- function(mod) {
(mod |> summary())$Coefficients
}
mod1_count |> short_sum()
## Lab / Con :
## Estimate St. err
## (Intercept) 5.13500 0.243600
## turnout_percentage -0.07861 0.004036
##
## RUK / Con :
## Estimate St. err
## (Intercept) 2.93800 0.149500
## turnout_percentage -0.05564 0.002477
mod2_count |> short_sum()
## Con / Lab :
## Estimate St. err
## (Intercept) -5.13500 0.243600
## turnout_percentage 0.07861 0.004036
##
## RUK / Lab :
## Estimate St. err
## (Intercept) -2.19800 0.230800
## turnout_percentage 0.02297 0.003823
mod3_count |> short_sum()
## Con / RUK :
## Estimate St. err
## (Intercept) -2.93800 0.149500
## turnout_percentage 0.05564 0.002477
##
## Lab / RUK :
## Estimate St. err
## (Intercept) 2.19800 0.230800
## turnout_percentage -0.02297 0.003823
The estimates/SE for turnout are around 20 for the comparisons of Labour versus Conservative and Reform versus conservative. They are around 6 for Labour versus Reform. So, all “statistically significant”.
Now the coefficients. Choose the first model:
bs <- mod1_count |> coef()
bs
## Con Lab RUK
## (Intercept) 0 5.13542079 2.93753844
## turnout_percentage 0 -0.07860971 -0.05563858
These are as easy to interpret as logistic regression coefficients ;-) One trick is to exp them to get odds ratios.
bs |> exp() |> round(2)
## Con Lab RUK
## (Intercept) 1 169.94 18.87
## turnout_percentage 1 0.92 0.95
The intercepts give the odds of voting each party rather than Conservative when turnout is zero, which probably isn’t useful… Each percentage point increase in turnout reduces the odds of voting Labour by 8% compared to Conservative.
It’s easier to plot them:
lab_vs_con_pred <- function(t)
exp(bs["(Intercept)", "Lab"] + bs["turnout_percentage", "Lab"] * t)
ref_vs_con_pred <- function(t)
exp(bs["(Intercept)", "RUK"] + bs["turnout_percentage", "RUK"] * t)
con_vs_con_pred <- function(t)
exp(bs["(Intercept)", "Con"] + bs["turnout_percentage", "Con"] * t)
lab_pred <- function(turnout)
100 * lab_vs_con_pred(turnout) /
(lab_vs_con_pred(turnout) + con_vs_con_pred(turnout) + ref_vs_con_pred(turnout))
con_pred <- function(turnout)
100 * con_vs_con_pred(turnout) /
(lab_vs_con_pred(turnout) + con_vs_con_pred(turnout) + ref_vs_con_pred(turnout))
ref_pred <- function(turnout)
100 * ref_vs_con_pred(turnout) /
(lab_vs_con_pred(turnout) + con_vs_con_pred(turnout) + ref_vs_con_pred(turnout))
Note the Conservative predictions, since it is the base category with intercept and slope of zero:
con_vs_con_pred(seq(0, 100, 10))
## [1] 1 1 1 1 1 1 1 1 1 1 1
(I wanted to follow the recipe rather than just add a constant.)
plot_votes +
geom_function(
fun = lab_pred,
colour = "#d50000",
linewidth = 1,
linetype = "dashed"
) +
geom_function(
fun = con_pred,
colour = "#0087dc",
linewidth = 1,
linetype = "dashed"
) +
geom_function(
fun = ref_pred,
colour = "#12b6cf",
linewidth = 1,
linetype = "dashed"
) +
labs(title = "UK general election 2024 – colm model predictions",
caption = "Percentage votes are conditional on voting Tory, Labour, or Reform.\nDashed curves are colm model predictions; solid curves are loess.") +
geom_smooth(se = FALSE,
method = "loess",
formula = y ~ x)
The loess curve looks more sigmoid-shaped than the colm fit.
If I’ve understood the model correctly, then using the percentages as outcomes, rather than counts, should give the same model coefficients and standard errors:
for_analy_percs <- perc |>
select(-Votes) |>
pivot_wider(names_from = "Party",
values_from = "Perc")
for_analy_percs
mod1_perc <- colm(cbind(Con, Lab, RUK) ~ turnout_percentage,
ref = 1,
data = for_analy_percs)
mod2_perc <- colm(cbind(Con, Lab, RUK) ~ turnout_percentage,
ref = 2,
data = for_analy_percs)
mod3_perc <- colm(cbind(Con, Lab, RUK) ~ turnout_percentage,
ref = 3,
data = for_analy_percs)
Reference level Con:
mod1_count |> short_sum()
## Lab / Con :
## Estimate St. err
## (Intercept) 5.13500 0.243600
## turnout_percentage -0.07861 0.004036
##
## RUK / Con :
## Estimate St. err
## (Intercept) 2.93800 0.149500
## turnout_percentage -0.05564 0.002477
mod1_perc |> short_sum()
## Lab / Con :
## Estimate St. err
## (Intercept) 5.13500 0.243600
## turnout_percentage -0.07861 0.004036
##
## RUK / Con :
## Estimate St. err
## (Intercept) 2.93800 0.149500
## turnout_percentage -0.05564 0.002477
Reference level Lab:
mod2_count |> short_sum()
## Con / Lab :
## Estimate St. err
## (Intercept) -5.13500 0.243600
## turnout_percentage 0.07861 0.004036
##
## RUK / Lab :
## Estimate St. err
## (Intercept) -2.19800 0.230800
## turnout_percentage 0.02297 0.003823
mod2_perc |> short_sum()
## Con / Lab :
## Estimate St. err
## (Intercept) -5.13500 0.243600
## turnout_percentage 0.07861 0.004036
##
## RUK / Lab :
## Estimate St. err
## (Intercept) -2.19800 0.230800
## turnout_percentage 0.02297 0.003823
Reference level Reform:
mod3_count |> short_sum()
## Con / RUK :
## Estimate St. err
## (Intercept) -2.93800 0.149500
## turnout_percentage 0.05564 0.002477
##
## Lab / RUK :
## Estimate St. err
## (Intercept) 2.19800 0.230800
## turnout_percentage -0.02297 0.003823
mod3_perc |> short_sum()
## Con / RUK :
## Estimate St. err
## (Intercept) -2.93800 0.149500
## turnout_percentage 0.05564 0.002477
##
## Lab / RUK :
## Estimate St. err
## (Intercept) 2.19800 0.230800
## turnout_percentage -0.02297 0.003823
And indeed they do.
The fitted values are different, though – they are on the original scale:
mod1_perc$fitted.values |> head()
## Con Lab RUK
## 1 17.38262 61.45780 21.15958
## 2 24.76353 53.86305 21.37342
## 3 31.24491 47.74911 21.00598
## 4 23.53006 55.07815 21.39179
## 5 20.69681 57.94510 21.35809
## 6 34.17761 45.10923 20.71316
mod1_count$fitted.values |> head()
## Con Lab RUK
## 1 4906.245 17346.46 5972.291
## 2 5555.699 12084.17 4795.126
## 3 8109.304 12392.80 5451.893
## 4 5165.083 12090.21 4695.712
## 5 4871.615 13639.12 5027.268
## 6 14373.394 18970.69 8710.920
library(nnet)
mn_mod1 <- multinom(cbind(Con, Lab, RUK) ~ turnout_percentage,
Hess = TRUE,
data = for_analy_counts)
## # weights: 9 (4 variable)
## initial value 22016388.015121
## final value 20535471.109038
## converged
mn_mod1 |> summary()
## Call:
## multinom(formula = cbind(Con, Lab, RUK) ~ turnout_percentage,
## data = for_analy_counts, Hess = TRUE)
##
## Coefficients:
## (Intercept) turnout_percentage
## Lab 4.814745 -0.07338617
## RUK 2.963362 -0.05614951
##
## Std. Errors:
## (Intercept) turnout_percentage
## Lab 0.005086341 8.278916e-05
## RUK 0.006108462 9.967585e-05
##
## Residual Deviance: 41070942
## AIC: 41070950
Compare with colm:
mod1_count |> short_sum()
## Lab / Con :
## Estimate St. err
## (Intercept) 5.13500 0.243600
## turnout_percentage -0.07861 0.004036
##
## RUK / Con :
## Estimate St. err
## (Intercept) 2.93800 0.149500
## turnout_percentage -0.05564 0.002477
The estimates are similar. The multinomial regression SEs are much smaller, presumably because it can take advantage of the counts.
Plot predictions:
pred_mod <- Vectorize(function(turnout, party) {
pred_matrix <- predict(mn_mod1,
newdat = data.frame(turnout_percentage = turnout),
type = "probs") |> as.matrix()
pred_matrix[party, ] |>
as.vector() * 100
})
plot_votes +
geom_function(
fun = \(x) pred_mod(x, "Lab"),
colour = "#d50000",
linewidth = 1,
linetype = "dashed"
) +
geom_function(
fun = \(x) pred_mod(x, "Con"),
colour = "#0087dc",
linewidth = 1,
linetype = "dashed"
) +
geom_function(
fun = \(x) pred_mod(x, "RUK"),
colour = "#12b6cf",
linewidth = 1,
linetype = "dashed"
) +
labs(title = "UK general election 2024 – multinomial model predictions",
caption = "Percentage votes are conditional on voting Tory, Labour, or Reform.\nDashed curves are multinom model predictions; solid curves are loess.") +
geom_smooth(se = FALSE,
method = "loess",
formula = y ~ x)
Same issue as for the colm model that the predictions aren’t sufficiently s-shaped, which is unsurprising given how similar the coefficients are.
Try lobbing a third degree polynomial at it:
mn_mod_2 <- multinom(cbind(Con, Lab, RUK) ~ poly(turnout_percentage, 3),
Hess = TRUE,
data = for_analy_counts)
## # weights: 15 (8 variable)
## initial value 22016388.015121
## iter 10 value 20579651.427454
## final value 20524634.012745
## converged
pred_mod_2 <- Vectorize(function(turnout, party) {
pred_matrix <- predict(mn_mod_2,
newdat = data.frame(turnout_percentage = turnout),
type = "probs") |> as.matrix()
pred_matrix[party, ] |>
as.vector() * 100
})
plot_votes +
geom_function(
fun = \(x) pred_mod_2(x, "Lab"),
colour = "#d50000",
linewidth = 1,
linetype = "dashed"
) +
geom_function(
fun = \(x) pred_mod_2(x, "Con"),
colour = "#0087dc",
linewidth = 1,
linetype = "dashed"
) +
geom_function(
fun = \(x) pred_mod_2(x, "RUK"),
colour = "#12b6cf",
linewidth = 1,
linetype = "dashed"
) +
labs(title = "UK general election 2024 – multinomial model predictions", caption = "Percentage votes are conditional on voting Tory, Labour, or Reform.\nDashed curves are multinom model predictions; solid curves are loess.") +
geom_smooth(se = FALSE,
method = "loess",
formula = y ~ x)
mn_mod_2
## Call:
## multinom(formula = cbind(Con, Lab, RUK) ~ poly(turnout_percentage,
## 3), data = for_analy_counts, Hess = TRUE)
##
## Coefficients:
## (Intercept) poly(turnout_percentage, 3)1 poly(turnout_percentage, 3)2
## Lab 0.4148995 -12.329299 -0.1413748
## RUK -0.4008138 -9.463173 0.6312236
## poly(turnout_percentage, 3)3
## Lab 1.757868
## RUK 1.306942
##
## Residual Deviance: 41049268
## AIC: 41049284
They stood most places, so this is going to be tricky…
reform_ass <- simp_dat |>
drop_na(Lab, Con, turnout_percentage) |>
mutate(reform_stood = 1 - is.na(RUK),
Lab_perc = 100 * Lab / (Lab + Con))
table1(~ Lab + Con + RUK + reform_stood + turnout_percentage, data = reform_ass)
Overall (N=630) |
|
---|---|
Lab | |
Mean (SD) | 15400 (5680) |
Median [Min, Max] | 16200 [1490, 29200] |
Con | |
Mean (SD) | 10800 (5520) |
Median [Min, Max] | 11500 [586, 25500] |
RUK | |
Mean (SD) | 6750 (2750) |
Median [Min, Max] | 7050 [697, 21200] |
Missing | 22 (3.5%) |
reform_stood | |
Mean (SD) | 0.965 (0.184) |
Median [Min, Max] | 1.00 [0, 1.00] |
turnout_percentage | |
Mean (SD) | 59.9 (6.93) |
Median [Min, Max] | 60.3 [40.0, 79.2] |
Where they didn’t stand:
reform_ass |>
dplyr::filter(reform_stood == 0) |>
arrange(-Lab_perc) |>
select(-c(RUK, reform_stood)) |>
print(n = Inf)
## # A tibble: 22 × 5
## constituency Con Lab turnout_percentage Lab_perc
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Sheffield Central 2339 16569 52.5 87.6
## 2 Blaenau Gwent and Rhymney 3776 16027 42.7 80.9
## 3 Oxford East 4739 19541 54.8 80.5
## 4 Leeds South 4172 17117 41.7 80.4
## 5 Sheffield Heeley 5242 21230 52.3 80.2
## 6 Sheffield Brightside and Hillsborough 4069 16301 44.9 80.0
## 7 Cambridge 5073 19614 59.9 79.5
## 8 Sheffield Hallam 6205 23875 70.8 79.4
## 9 Bristol East 6435 20748 60.7 76.3
## 10 Sheffield South East 6252 18710 48.2 75.0
## 11 Doncaster North 7105 16231 44.4 69.6
## 12 Brighton Kemptown and Peacehaven 8230 17893 59.2 68.5
## 13 Hexham 20275 23988 67.8 54.2
## 14 Earley and Woodley 17361 18209 62.4 51.2
## 15 Middlesbrough South and East Clevela… 16254 16468 54.1 50.3
## 16 Stone, Great Wyrley and Penkridge 19880 14414 59.8 42.0
## 17 Epping Forest 18038 12356 57.8 40.7
## 18 East Grinstead and Uckfield 19319 10440 66.8 35.1
## 19 Maidenhead 18932 5766 66.5 23.3
## 20 Mid Dorset and North Poole 20090 4566 65.8 18.5
## 21 West Dorset 19210 3086 69.3 13.8
## 22 Cheltenham 17866 2665 65.1 13.0
reform_ass |>
ggplot(aes(
colour = factor(reform_stood, labels = c("No", "Yes")),
x = turnout_percentage,
y = Lab_perc
)) +
geom_point() +
geom_smooth(method = "loess",
formula = "y ~ x",
se = FALSE) +
labs(x = "Turnout (%)",
y = "Percentage voting Lab (versus Con)",
colour = "Reform stood",
title = "Percentage voting Lab (vs. Con)") +
ylim(0, NA)
Not enough data to have faith in this!
unemployment_june_dat <- read_csv("unemployment_benefit_june_2024.csv")
## Rows: 650 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Constituency
## dbl (1): Percent
## num (1): Number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unemployment_june_dat
summary(unemployment_june_dat)
## Constituency Number Percent
## Length:650 Min. : 275 Min. : 1.259
## Class :character 1st Qu.: 1455 1st Qu.: 2.443
## Mode :character Median : 2012 Median : 3.238
## Mean : 2560 Mean : 3.768
## 3rd Qu.: 3119 3rd Qu.: 4.546
## Max. :14150 Max. :14.919
vote_unemp <- left_join(simp_dat,
unemployment_june_dat |>
rename(constituency = "Constituency",
unemp_percent = "Percent") |>
select(-Number))
## Joining with `by = join_by(constituency)`
vote_unemp |>
ggplot(aes(y = turnout_percentage, x = unemp_percent)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ log(x), se = TRUE) +
labs(y = "Turnout (%)",
x = "Claimant rate (%)",
caption = "Claimant rate is the percentage of the population aged 16–64 who are claiming unemployment related benefits.",
title = "Turnout as a function of unemployment benefit claimant rates")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
left_join(
perc,
unemployment_june_dat |>
rename(constituency = "Constituency", unemp_percent = "Percent") |>
select(-Number)
) |>
ggplot(aes(unemp_percent, Perc, colour = Party)) +
geom_point(size = .7) +
labs(
x = "Claimant rate (%)",
y = "Votes (%)",
title = "UK general election 2024",
caption = "Percentage votes are conditional on voting Tory, Labour, or Reform"
) +
scale_color_manual(values = c("#0087dc", "#d50000", "#12b6cf")) +
geom_smooth(se = FALSE,
method = "loess",
formula = y ~ x,
span = .8)
## Joining with `by = join_by(constituency)`
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
mn_tout_unemp <- multinom(
cbind(Con, Lab, RUK) ~ turnout_percentage + log(unemp_percent) ,
Hess = TRUE,
data = vote_unemp
)
## # weights: 12 (6 variable)
## initial value 21961267.340762
## iter 10 value 20412550.701075
## iter 10 value 20412550.663830
## iter 10 value 20412550.663767
## final value 20412550.663767
## converged
mn_tout_unemp |> summary()
## Call:
## multinom(formula = cbind(Con, Lab, RUK) ~ turnout_percentage +
## log(unemp_percent), data = vote_unemp, Hess = TRUE)
##
## Coefficients:
## (Intercept) turnout_percentage log(unemp_percent)
## Lab 2.501812 -0.04629887 0.5595221
## RUK 3.666410 -0.06442263 -0.1713252
##
## Std. Errors:
## (Intercept) turnout_percentage log(unemp_percent)
## Lab 0.009638597 0.0001263663 0.002005744
## RUK 0.011763838 0.0001544506 0.002472342
##
## Residual Deviance: 40825101
## AIC: 40825113
mn_tout_unemp |> confint()
## , , Lab
##
## 2.5 % 97.5 %
## (Intercept) 2.48292082 2.5207034
## turnout_percentage -0.04654655 -0.0460512
## log(unemp_percent) 0.55559093 0.5634533
##
## , , RUK
##
## 2.5 % 97.5 %
## (Intercept) 3.64335379 3.68946719
## turnout_percentage -0.06472535 -0.06411992
## log(unemp_percent) -0.17617094 -0.16647954