library(conflicted)
library(rvest)
library(kableExtra)
library(stringdist)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
Scrape, with a little help from clues I’ve added to the page:
# Download the reading-list page and flatten it to one tibble row per line of
# text. `I()` marks the scraped string as literal data, so readr never tries
# to interpret it as a file path (which it does for a string with no newline).
web_lines <- read_html("https://andifugard.info/misc/books/") |>
  html_text() |>
  I() |>
  read_lines()
dat <- tibble(raw = web_lines)

# The book list is everything strictly between the two marker lines.
start_row <- which(dat$raw == "Note to self: start scraping here!") + 1
end_row <- which(dat$raw == "Note to self: stop scraping here!") - 1

# Fail with a readable message if the page layout changed: a missing or
# duplicated marker would otherwise surface as a cryptic error from `:`, and
# adjacent markers would make `start_row:end_row` count backwards.
stopifnot(
  "start marker must appear exactly once on the page" =
    length(start_row) == 1L,
  "stop marker must appear exactly once on the page" =
    length(end_row) == 1L,
  "no lines found between the start and stop markers" =
    start_row <= end_row
)

scraped <- dat |>
  slice(start_row:end_row)
Separate author and title into a tibble:
# Cut each scraped line into at most three pieces at ", by " or " – ".
# Piece 1 is used as the title and piece 2 as the author; piece 3 is not used.
title_authormess <- str_split_fixed(
  scraped$raw,
  pattern = ", by | – ",
  n = 3
)

# Author first, then title, with surrounding whitespace stripped from both.
my_books <- tibble(
  Author = trimws(title_authormess[, 2]),
  Title = trimws(title_authormess[, 1])
)
# Attach a Category to every book row.
#
# A row with a title but no author is a section heading; a row with both is a
# book, which belongs to the most recent heading above it. Everything else
# (headings themselves, blank lines, books that precede the first heading)
# ends up with a missing Category and is dropped.
#
# This is vectorised rather than looped over `1:nrow(my_books)`, which would
# iterate over c(1, 0) and fail on an NA condition when there are no rows.
my_books <- my_books |>
  # Seed Category with the heading text on heading rows only.
  mutate(
    Category = if_else(Title != "" & Author == "", Title, NA_character_)
  ) |>
  # Carry each heading down to the rows beneath it.
  fill(Category, .direction = "down") |>
  # Keep the carried heading on book rows; blank it everywhere else.
  mutate(
    Category = if_else(Title != "" & Author != "", Category, NA_character_)
  ) |>
  drop_na(Category)
Ta-da:
# Render the books, sorted by category, as a styled HTML table with a fixed
# width for each of the three columns.
my_books |>
  arrange(Category) |>
  kable(format = "html") |>
  kable_styling() |>
  column_spec(column = 1, width = "10em") |>
  column_spec(column = 2, width = "20em") |>
  column_spec(column = 3, width = "5em")
Author | Title | Category |
---|---|---|
Stella Rimington | Open Secret | (Auto)Biographies |
Geoffrey Robertson | Stephen Ward Was Innocent, OK | (Auto)Biographies |
McKenzie Wark | Reverse Cowgirl | (Auto)Biographies |
Ben Macintrye | The Spy and the Traitor | (Auto)Biographies |
Paul Feyerabend | Killing Time | (Auto)Biographies |
Tracey Emin | My life in a column | (Auto)Biographies |
Tracey Emin | Strangeland | (Auto)Biographies |
Nicholas Lezard | Bitter Experience Has Taught Me | (Auto)Biographies |
Naoki Higashida | The Reason I Jump | (Auto)Biographies |
Martin Green | Otto Gross: Freudian Psychoanalyst 1877-1920 | (Auto)Biographies |
Adrian Laing | R. D. Laing: a life | (Auto)Biographies |
Daniel Burston | The Wing of Madness: the Life and Work of R.D. Laing | (Auto)Biographies |
Anita Burdman Feferman & Solomon Feferman | Alfred Tarski: Life and Logic | (Auto)Biographies |
Ray Monk | Ludwig Wittgenstein: The Duty of Genius | (Auto)Biographies |
Andrew Hodges | Alan Turing: The Enigma | (Auto)Biographies |
Mick Herron | The Secret Hours | Fiction (including plays) |
Stella Rimington | The Hidden Hand | Fiction (including plays) |
Mick Herron | The Drop | Fiction (including plays) |
Mick Herron | Dead Lions | Fiction (including plays) |
Mick Herron | Slow Horses | Fiction (including plays) |
Frank Gardner | Invasion | Fiction (including plays) |
Robert Sheckley | Dimension of Miracles | Fiction (including plays) |
Elif Shafak | 10 Minutes 38 Seconds in this Strange World | Fiction (including plays) |
Alma Katsu | Red London | Fiction (including plays) |
Alma Katsu | Red Widow | Fiction (including plays) |
Stella Rimington | The Devil’s Bargain | Fiction (including plays) |
Stella Rimington | The Moscow Sleepers | Fiction (including plays) |
Stella Rimington | Breaking Cover | Fiction (including plays) |
Stella Rimington | Close Call | Fiction (including plays) |
Stella Rimington | The Geneva Trap | Fiction (including plays) |
Stella Rimington | Rip Tide | Fiction (including plays) |
Stella Rimington | Present Danger | Fiction (including plays) |
Stella Rimington | Dead Line | Fiction (including plays) |
Stella Rimington | Illegal Action | Fiction (including plays) |
Stella Rimington | Secret Asset | Fiction (including plays) |
Stella Rimington | At Risk | Fiction (including plays) |
Graham Greene | The Human Factor | Fiction (including plays) |
John le Carré | Call for the Dead | Fiction (including plays) |
Michèle Bernstein | All the King’s Horses | Fiction (including plays) |
Ian McEwan | The Comfort of Strangers | Fiction (including plays) |
John le Carré | Our Kind of Traitor | Fiction (including plays) |
Ian McEwan | On Chesil Beach | Fiction (including plays) |
Graeme Simsion | The Rosie Project | Fiction (including plays) |
John le Carré | The Spy Who Came in from the Cold | Fiction (including plays) |
Helen DeWitt | Lightning Rods | Fiction (including plays) |
John le Carré | A Delicate Truth | Fiction (including plays) |
Philippe Claudel | The Investigation | Fiction (including plays) |
Ian McEwan | Sweet Tooth | Fiction (including plays) |
John le Carré | Tinker Tailor Solider Spy | Fiction (including plays) |
Andrey Kurkov | Death and the Penguin | Fiction (including plays) |
N.H. Kleinbaum | Dead Poets Society | Fiction (including plays) |
Stieg Larsson | The Girl with the Dragon Tattoo | Fiction (including plays) |
Richard Milward | Apples | Fiction (including plays) |
Ian Rankin | Exit Music | Fiction (including plays) |
Charlotte Roche | Wetlands | Fiction (including plays) |
Nick Hornby | A Long Way Down | Fiction (including plays) |
Aifric Campbell | The Semantics of Murder | Fiction (including plays) |
Elfriede Jelinek | The Piano Teacher | Fiction (including plays) |
Sarah Kane | 4.48 Psychosis | Fiction (including plays) |
Joe Stretch | Friction | Fiction (including plays) |
Roald Dalh | Switch Bitch | Fiction (including plays) |
Bertold Brecht | Collected Plays: One | Fiction (including plays) |
Tom Stoppard | Arcadia | Fiction (including plays) |
Athol Fugard | Tsotsi | Fiction (including plays) |
Ken Kesey | One Flew Over the Cuckoo’s Nest | Fiction (including plays) |
J. M. Coetzee | Elizabeth Costello | Fiction (including plays) |
Paulo Coelho | Veronika Decides to Die | Fiction (including plays) |
Mark Haddon | The Curious Incident of the Dog in the Night-Time | Fiction (including plays) |
Gabriel Josipovici | The Big Glass | Fiction (including plays) |
Salman Rushdie | Fury | Fiction (including plays) |
J. M. Coetzee | Youth | Fiction (including plays) |
Douglas Adams | The Hitchhiker’s Guide to the Galaxy | Fiction (including plays) |
Douglas Adams | The Restaurant at the End of the Universe | Fiction (including plays) |
Douglas Adams | Life the Universe and Everything | Fiction (including plays) |
Douglas Adams | So Long, and Thanks for All the Fish | Fiction (including plays) |
Douglas Adams | Mostly Harmless | Fiction (including plays) |
Antoine de Saint Exupéry | The Little Prince | Fiction (including plays) |
Jens Christian Grøndahl | Silence in October | Fiction (including plays) |
Oscar Wilde | The Picture of Dorian Gray | Fiction (including plays) |
Oscar Wilde | The Happy Prince and Other Tales | Fiction (including plays) |
John Downham | British Market Research Bureau: The First Sixty Years, 1933–1993 | Non-fiction |
Robert Hannigan | Counter-intelligence: what the secret world can teach us about problem solving and creativity | Non-fiction |
Michael Bungay Stanier | The Coaching Habit | Non-fiction |
Alex Grant | Sex, Spies and Scandal | Non-fiction |
Peter Wright | Spycatcher | Non-fiction |
Rob Dover | Hacker, Influencer, Faker, Spy | Non-fiction |
Alice Flarend & Bob Hilborn | Quantum computing: from Alice to Bob | Non-fiction |
Legacy Russell | Glitch Feminism | Non-fiction |
Erik Olin Wright | Class Counts | Non-fiction |
Richard Norton-Taylor | The State of Secrecy | Non-fiction |
Andrew Wilson | Ukraine Crisis: What It Means for the West | Non-fiction |
Richard Layard & David Clark | Thrive | Non-fiction |
Leo Bersani & Adam Phillips | Intimacies | Non-fiction |
Hannah Fry | The Mathematics of Love | Non-fiction |
Peter Callero | The Myth of Individualism: How Social Forces Shape Our Lives | Non-fiction |
Paul Moloney | The Therapy Industry: the Irresistible Rise of the Talking Cure, and Why it Doesn’t Work | Non-fiction |
Penny Rimbaud | The Last of the Hippies | Non-fiction |
Richard Swedberg | The art of social theory | Non-fiction |
Laurie Penny | Unspeakable Things | Non-fiction |
David Smail | Power, Interest and Psychology | Non-fiction |
Adam Phillips | Monogamy | Non-fiction |
Meg-John Barker | Rewriting the Rules: An integrative guide to love, sex and relationships | Non-fiction |
Richard Bentall | Doctoring the Mind: Why Psychiatric Treatments Fail | Non-fiction |
Anna Funder | Stasiland | Non-fiction |
Nick Cohen | What’s Left | Non-fiction |
Ian Bone | Bash the Rich | Non-fiction |
Mark Thomas | The People’s Manifesto | Non-fiction |
Mick Power | Emotion Focused Cognitive Therapy | Non-fiction |
Richard J. Aldrich | GCHQ | Non-fiction |
Colin Ward | Anarchy in Action | Non-fiction |
The Invisible Committee | The Coming Insurrection | Non-fiction |
Robert Sutton | The No Asshole Rule | Non-fiction |
G. A. Cohen | Why Not Socialism? | Non-fiction |
Craig Murray | The Catholic Orangemen of Togo and Other Conflicts I have Known | Non-fiction |
Erich Fromm | The Art of Being | Non-fiction |
Jean Baudrillard | In the Shadow of the Silent Majorities | Non-fiction |
Jean Baudrillard | Fragments: Cool Memories III 1990-1995 | Non-fiction |
Gerd Gigerenzer | Gut Feelings | Non-fiction |
R. D. Laing | The Politics of Experience and The Bird of Paradise | Non-fiction |
Jean Baudrillard | Simulations | Non-fiction |
Daniel Dennett | Freedom Evolves | Non-fiction |
Darian Leader & David Corfield | Why Do People Get Ill? | Non-fiction |
R. D. Laing | Self and Others | Non-fiction |
R. D. Laing | Knots | Non-fiction |
Darian Leader | Why do women write more letters than they post? | Non-fiction |
Thomas Joiner | Why People Die by Suicide | Non-fiction |
Keith Stanovich | The Robot’s Rebellion | Non-fiction |
Richard Bentall | Madness Explained | Non-fiction |
Benjamin Hoff | The Tao of Pooh and the Te of Piglet | Non-fiction |
Mateja Jamnik | Mathematical Reasoning with Diagrams | Non-fiction |
Nigel Warburton | Philosophy: The Classics | Non-fiction |
Marie McGinn | Routledge Philosophy Guidebook to Wittgenstein and the Philosophical Investigations | Non-fiction |
Ludwig Wittgenstein | Tractatus Logico-philosophicus | Non-fiction |
A.J. Ayer | Language, Truth and Logic | Non-fiction |
Douglas R. Hofstadter | Godel, Escher, Bach: An Eternal Golden Braid | Non-fiction |
Alain De Botton | The Consolations of Philosophy | Non-fiction |
Arthur Schopenhauer | Essays and Aphorisms | Non-fiction |
Ludwig Wittgenstein | Blue & Brown Books | Non-fiction |
I used string distances (computed with the stringdist package) to look for mild one-off misspellings and inconsistent renderings of authors’ names that are mentioned twice or more, such as:
# Distance between two renderings of the same author's name, using
# stringdist's default method ("osa") written out explicitly.
stringdist(a = "JM Coetzee", b = "J. M. Coetzee", method = "osa")
## [1] 3
Anything obvious has been fixed, but for future use:
# NOTE(review): `my_books_coauth` is not created anywhere in the visible part
# of this file; it must already exist and have an `Author` column — confirm.

# Every unordered pair of distinct author strings, one pair per matrix row.
all_author_pairs <- t(combn(unique(my_books_coauth$Author), m = 2))
colnames(all_author_pairs) <- c("a1", "a2")

# Show the ten pairs with the smallest string distance, i.e. the most likely
# misspellings or inconsistent renderings of one name.
all_author_pairs |>
  as_tibble() |>
  mutate(dist = stringdist(a1, a2)) |>
  arrange(dist) |>
  head(n = 10)
For importing into a BookWyrm instance:
# One row per title: the authors of each title are joined into a single
# comma-separated string, and every row is placed on the "read" shelf.
# NOTE(review): `my_books_coauth` is not created anywhere in the visible part
# of this file; it must already exist with `Author` and `Title` columns —
# confirm.
for_csv <- my_books_coauth |>
  group_by(title = Title) |>
  summarise(author = paste(Author, collapse = ", ")) |>
  mutate(shelf = "read")
#write_csv(for_csv, "books.csv")