Load in some packages:
library(conflicted)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
Download the digits (in a zip file) from https://www.mersenne.org/primes/?press=M136279841
temp_dir <- tempdir(check = TRUE)
temp_file <- tempfile(pattern = "primezip",
tmpdir = temp_dir)
download.file("https://www.mersenne.org/primes/digits/M136279841.zip",
temp_file)
Unzip:
unzip(temp_file, files = "m136279841.txt", exdir = temp_dir)
Load it into R:
filename <- paste(temp_dir, "m136279841.txt", sep = "\\")
digits_str <- readLines(filename) |> paste(collapse = "")
Transform into a 1D vector of digits:
digits <- digits_str |>
strsplit("") |>
pluck(1) |>
as.numeric()
Check the first and last three digits:
digits[1:3]
## [1] 8 8 1
digits[(length(digits) - 2):length(digits)]
## [1] 5 5 1
Count all the digits:
length(digits)
## [1] 41024320
Make a few tables and a pic:
dig_table <- table(digits) |> as.data.frame()
dig_table <- dig_table |>
mutate(perc = 100 * Freq / sum(Freq))
dig_table
dig_table |>
ggplot(aes(x = digits, y = Freq)) +
geom_point() +
labs(x = "Digit", y = "Frequency", title = "Digit frequency in M136279841") +
theme(legend.position = "none")
dig_table |> arrange(Freq)
Are the digits all equally likely?
chisq.test(dig_table$Freq)
##
## Chi-squared test for given probabilities
##
## data: dig_table$Freq
## X-squared = 5.2656, df = 9, p-value = 0.8106
chisq.test(dig_table$Freq)$stdres
## [1] 0.5459257 0.1014829 -0.9253154 1.6866970 -0.1790262 0.5141798
## [7] -1.1079846 0.0166536 -0.0983603 -0.5542525
No evidence they aren’t.
How much more frequent would the 3 need to be to get a statistically significant \(\chi^2\)?
sens_check <- dig_table$Freq
the_p <- chisq.test(sens_check)$p.value
add_on <- 0
while (the_p > .05) {
add_on <- add_on + 1
sens_check[4] <- dig_table$Freq[4] + add_on
the_p <- chisq.test(sens_check)$p.value
}
add_on
## [1] 4529
chisq.test(sens_check)
##
## Chi-squared test for given probabilities
##
## data: sens_check
## X-squared = 16.92, df = 9, p-value = 0.04999
chisq.test(sens_check)$stdres
## [1] 0.3102081 -0.1342102 -1.1609518 3.8077907 -0.4147037 0.2784640
## [7] -1.3436108 -0.2190347 -0.3340423 -0.7899093
dig_table |>
mutate(Freq = sens_check) |>
ggplot(aes(x = digits, y = Freq)) +
geom_point() +
labs(x = "Digit", y = "Frequency", title = "Mini freq for 3 needed to get a chi-square test with p < .05") +
theme(legend.position = "none")
twogram <- function(test_vec) {
res <- matrix(0, nrow = 10, ncol = 10)
rownames(res) <- 0:9
colnames(res) <- 0:9
for (i in 1:(length(test_vec) - 1)) {
res[test_vec[i] + 1,
test_vec[i + 1] + 1] <- res[test_vec[i] + 1,
test_vec[i + 1] + 1] + 1
}
res
}
two_grams <- twogram(digits)
two_grams - min(two_grams)
## 0 1 2 3 4 5 6 7 8 9
## 0 2477 1396 440 1619 2004 1535 1684 1376 1583 1317
## 1 1640 738 1958 1810 1698 1366 769 1925 1672 1000
## 2 1391 1378 1893 2165 827 123 88 1659 2009 1071
## 3 2527 1709 1791 2480 2501 2188 1238 576 733 1880
## 4 761 2450 533 1619 546 887 1029 2381 1200 2632
## 5 1516 1693 1902 1790 2541 1890 1466 766 1163 643
## 6 1216 1456 773 1036 664 987 1565 1472 1490 1594
## 7 1061 767 1716 1905 1074 2079 1348 875 2240 1349
## 8 951 2053 1598 1728 853 2858 1774 1329 659 390
## 9 1891 937 0 1471 1330 1457 1292 2055 1443 1441
chisq.test(two_grams)
##
## Pearson's Chi-squared test
##
## data: two_grams
## X-squared = 77.275, df = 81, p-value = 0.5966