This page shows how to plot citation data using the `scholar` package by Guangchuang Yu et al. and the `tidyverse`.
Load the packages:
library(scholar)
library(tidyverse)
I’ll use my citations data:
# Scholar ID whose publication list is fetched via the scholar package
andi_id <- "xrY7bFYAAAAJ"
# Publication-level info (title, authors, journal, cites, year, ...)
pubs <- andi_id |> get_publications()
Now `pubs` has a bunch of info on each of my papers:
# List the columns available for each publication
colnames(pubs)
## [1] "title" "author" "journal" "number" "cites" "year" "cid"
## [8] "pubid"
Here’s the current year, which will come in handy later.
# Four-digit year of today's date, as a number
current_year <- Sys.Date() |>
  format("%Y") |>
  as.numeric()
It was 2023 when I ran this.
Now let’s look at citations each paper got by year. Get all citation data:
# Fetch the per-year citation history of every publication and stack the
# results into a single data frame.
all_cites <- map_dfr(
  pubs$pubid,
  \(id) get_article_cite_history(andi_id, id)
)
This has one row per year and original cited paper:
# Peek at the first six rows
head(all_cites, n = 6L)
## year cites pubid
## 1 2014 4 PaBasH6fAo0C
## 2 2015 8 PaBasH6fAo0C
## 3 2016 32 PaBasH6fAo0C
## 4 2017 50 PaBasH6fAo0C
## 5 2018 108 PaBasH6fAo0C
## 6 2019 131 PaBasH6fAo0C
Now glue the original paper info back on. First reduce it a bit:
# Keep only the publication columns needed later, renaming `year` and `cites`
# so they don't clash with the per-year columns in the citation history.
red_pubs <- select(
  pubs,
  title,
  author,
  paper_year = year,
  journal,
  pubid,
  total_cites = cites
)
Join:
# Attach the publication info to every citation-history row; the join key is
# left implicit (the columns the two tables share).
all_cites_pub <- left_join(all_cites, red_pubs)
## Joining with `by = join_by(pubid)`
Next, we need to add in implicit zero-cites where a year is missing, from the publication year up to current year.
This function does it for one paper’s cite info:
#' Add explicit zero-citation rows for the years a paper was not cited
#'
#' @param one_paper Citation history for a single paper: one row per `year`,
#'   with `cites` plus the paper-level columns `pubid`, `title`, `author`,
#'   `paper_year`, `journal` and `total_cites`.
#' @param end_year Last year to fill up to. Defaults to the script-level
#'   `current_year`.
#' @return The history sorted by `year`, with a `cites = 0` row added for
#'   every year from `paper_year` to `end_year` that was missing. If the
#'   publication year is missing, ambiguous, or later than `end_year`, the
#'   rows are returned sorted by `year` but otherwise unchanged.
complete_paper_cites <- function(one_paper, end_year = current_year) {
  start_year <- unique(one_paper$paper_year)

  # `start_year:end_year` errors on NA / empty input and counts downwards
  # when start_year > end_year, so there is no sensible range to fill here.
  if (length(start_year) != 1L || is.na(start_year) || start_year > end_year) {
    return(one_paper[order(one_paper$year), , drop = FALSE])
  }

  missing_years <- setdiff(start_year:end_year, one_paper$year)

  # complete() builds the missing year x paper combinations (with zero
  # cites) and merges them with the rows that already exist.
  complete(
    one_paper,
    year = missing_years,
    cites = 0,
    pubid,
    title,
    author,
    paper_year,
    journal,
    total_cites
  ) |>
    arrange(year)
}
Give it a quick test using this input:
# Citation history of a single paper to try the function on
test_input <- filter(all_cites_pub, pubid == "43bX7VzcjpAC")
# Show just the columns relevant to the year filling
select(test_input, paper_year, year, cites)
## paper_year year cites
## 1 2013 2016 1
## 2 2013 2017 0
## 3 2013 2018 0
## 4 2013 2019 1
## 5 2013 2020 0
## 6 2013 2021 0
## 7 2013 2022 1
Here’s the output:
# Run the completion on the test paper and show the same three columns
select(complete_paper_cites(test_input), paper_year, year, cites)
## # A tibble: 11 × 3
## paper_year year cites
## <dbl> <int> <dbl>
## 1 2013 2013 0
## 2 2013 2014 0
## 3 2013 2015 0
## 4 2013 2016 1
## 5 2013 2017 0
## 6 2013 2018 0
## 7 2013 2019 1
## 8 2013 2020 0
## 9 2013 2021 0
## 10 2013 2022 1
## 11 2013 2023 0
Looks good, so tidy up:
# Drop the temporary test data
rm(list = "test_input")
Now do this for all papers:
# Split the history into one data frame per paper, fill in each paper's
# missing years, stack the results, then add the paper's age in each
# citation year (years since publication).
all_cites_filled <- mutate(
  map_dfr(group_split(all_cites_pub, pubid), complete_paper_cites),
  age = year - paper_year
)
Calculate the cumulative citation sums:
# Running total of citations within each paper. Relies on the rows already
# being in year order within each paper. The grouping is dropped afterwards
# so it does not silently carry over into later filter/mutate steps.
all_cites_cum <- all_cites_filled |>
  group_by(pubid) |>
  mutate(cum_sum = cumsum(cites)) |>
  ungroup()
Plot for all papers…
# One line per paper showing cumulative citations by calendar year; the
# legend is hidden because pubid values are not meaningful labels.
ggplot(all_cites_cum, aes(x = year, y = cum_sum, colour = pubid)) +
  geom_line(linewidth = 1, alpha = 0.5) +
  labs(
    title = "All the papers",
    x = "Year",
    y = "Cumulative citations"
  ) +
  theme(legend.position = "none")
That’s quite busy so try again.
# Only plot papers whose total citation count reaches this threshold
min_cites <- 50
to_plot <- all_cites_cum |>
  filter(total_cites >= min_cites)

# Smoothed cumulative citations by calendar year. Papers are labelled by
# their (truncated) title and ordered in the legend from most to least cited.
to_plot |>
  ggplot(aes(x = year,
             y = cum_sum,
             colour = reorder(str_trunc(title, 20), desc(total_cites)))) +
  stat_smooth(geom = "line", linewidth = 1, alpha = 0.5) +
  theme(legend.position = "right",
        legend.direction = "vertical") +
  labs(x = "Year",
       y = "Cumulative citations",
       colour = "Paper",
       title = paste("Papers cited at least", min_cites, "times")) +
  # Breaks every 2 years on x and every 200 citations on y
  scale_x_continuous(
    breaks = seq(min(to_plot$year), max(to_plot$year) + 2, 2)
  ) +
  scale_y_continuous(
    breaks = seq(0, ceiling(max(to_plot$cum_sum) / 200) * 200, 200)
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Same subset of papers as the previous plot
to_plot <- all_cites_cum |>
  filter(total_cites >= min_cites)

# Smoothed cumulative citations against paper age. Rows before publication
# (negative age) and rows with no citations yet are dropped; a zero
# cumulative count cannot be shown on the log2 y axis.
to_plot |>
  filter(age >= 0 & cum_sum > 0) |>
  ggplot(aes(x = age,
             y = cum_sum,
             colour = reorder(str_trunc(title, 20), desc(total_cites)))) +
  stat_smooth(geom = "line", linewidth = 1, alpha = 0.5) +
  theme(legend.position = "right",
        legend.direction = "vertical") +
  labs(x = "Age (years)",
       y = "Cumulative citations",
       colour = "Paper",
       title = paste("Papers cited at least", min_cites, "times")) +
  # One break per year of age
  scale_x_continuous(
    breaks = seq(min(to_plot$age), max(to_plot$age) + 2, 1)
  ) +
  scale_y_continuous(trans = "log2")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'