This page shows how to plot citation data using the scholar package by Guangchuang Yu et al. and the tidyverse.
Load the packages:
library(scholar)
library(tidyverse)
I’ll use my citations data:
# Google Scholar profile ID whose publications are analysed below.
andi_id <- "xrY7bFYAAAAJ"
# Fetch the publication list for that profile.
pubs <- get_publications(id = andi_id)
Now `pubs` has a bunch of info on each of my papers:
# List the columns available for each publication.
colnames(pubs)
## [1] "title" "author" "journal" "number" "cites" "year" "cid"
## [8] "pubid"
Here’s the current year, which will come in handy later.
# Calendar year of today's date, as a plain number (POSIXlt counts years
# from 1900).
current_year <- as.POSIXlt(Sys.Date())$year + 1900
It was 2020 when I ran this.
Now let’s look at citations each paper got by year. Get all citation data:
# Fetch the per-year citation history of every paper and stack the results
# into a single data frame.
all_cites <- map_dfr(
  pubs$pubid,
  function(one_pubid) {
    get_article_cite_history(id = andi_id, article = one_pubid)
  }
)
This has one row per cited paper and year:
# Peek at the first six rows.
head(all_cites, n = 6L)
## year cites pubid
## 1 2015 8 PaBasH6fAo0C
## 2 2016 31 PaBasH6fAo0C
## 3 2017 53 PaBasH6fAo0C
## 4 2018 103 PaBasH6fAo0C
## 5 2019 120 PaBasH6fAo0C
## 6 2020 103 PaBasH6fAo0C
Now glue the original paper info back on. First reduce it a bit:
# Keep only the paper-level columns needed later. `year` and `cites` are
# renamed so they do not clash with the per-year columns of `all_cites`.
red_pubs <- select(
  pubs,
  title,
  author,
  paper_year = year,
  journal,
  pubid,
  total_cites = cites
)
Join:
# Attach the paper-level info to every citation-history row. No `by` is
# given, so the join key is taken from the shared column(s) and reported
# in a message.
all_cites_pub <- left_join(all_cites, red_pubs)
## Joining, by = "pubid"
Next, we need to add in implicit zero-cites where a year is missing, from the publication year up to current year.
This function does it for one paper’s cite info:
# Fill in implicit zero-citation years for a single paper.
#
# Arguments:
#   one_paper  Citation history of exactly one paper: one row per `year` with
#              its `cites`, plus the paper-level columns `pubid`, `title`,
#              `author`, `paper_year`, `journal` and `total_cites`.
#   end_year   Last year to fill up to. Defaults to the script-level
#              `current_year`, so existing one-argument calls are unchanged.
#
# Returns the input rows plus one `cites = 0` row for every year from the
# publication year to `end_year` that had no row, sorted by `year`.
complete_paper_cites <- function(one_paper, end_year = current_year) {
  # Nothing to complete for an empty history.
  if (nrow(one_paper) == 0) {
    return(one_paper)
  }

  # All rows belong to one paper, so the first publication year is the
  # publication year.
  start_year <- one_paper$paper_year[1]

  # A missing publication year would make `start_year:end_year` fail, so
  # fall back to the first year with a citation record.
  if (is.na(start_year)) {
    start_year <- min(one_paper$year)
  }

  # `a:b` counts downwards when a > b; guard so that a publication year
  # later than `end_year` adds no rows instead of spurious earlier years.
  candidate_years <- if (start_year <= end_year) {
    start_year:end_year
  } else {
    integer(0)
  }
  expand_years <- setdiff(candidate_years, one_paper$year)

  # Only the missing years are expanded, each paired with `cites = 0` and
  # the paper-level columns; rows already present are kept as they are.
  complete(one_paper,
           year = expand_years,
           cites = 0,
           pubid,
           title,
           author,
           paper_year,
           journal,
           total_cites) %>%
    arrange(year)
}
Give it a quick test using this input:
# Citation history of one paper with gaps in its years, used as a test case.
test_input <- filter(all_cites_pub, pubid == "43bX7VzcjpAC")
select(test_input, paper_year, year, cites)
## paper_year year cites
## 1 2013 2016 1
## 2 2013 2017 0
## 3 2013 2018 0
## 4 2013 2019 1
Here’s the output:
# Run the test case through the completion function and show the key columns.
test_input %>%
  complete_paper_cites() %>%
  select(paper_year, year, cites)
## # A tibble: 8 x 3
## paper_year year cites
## <dbl> <dbl> <dbl>
## 1 2013 2013 0
## 2 2013 2014 0
## 3 2013 2015 0
## 4 2013 2016 1
## 5 2013 2017 0
## 6 2013 2018 0
## 7 2013 2019 1
## 8 2013 2020 0
Looks good, so tidy up:
# Drop the temporary test data from the workspace.
rm(list = "test_input")
Now do this for all papers:
# Complete the citation history paper by paper, stack the results, and add
# each row's age: years elapsed since the paper was published.
all_cites_filled <- all_cites_pub %>%
  group_split(pubid) %>%
  map(complete_paper_cites) %>%
  bind_rows() %>%
  mutate(age = year - paper_year)
Calculate the cumulative citation sums:
# Running total of citations per paper. `cumsum()` follows row order, so this
# relies on each paper's rows already being sorted by year.
all_cites_cum <- all_cites_filled %>%
  group_by(pubid) %>%
  mutate(cum_sum = cumsum(cites)) %>%
  # Drop the grouping so later steps do not silently operate per paper.
  ungroup()
Plot for all papers…
# One semi-transparent line per paper; the legend is hidden because there
# would be one entry per paper.
ggplot(all_cites_cum, aes(x = year, y = cum_sum, colour = pubid)) +
  geom_line(size = 1, alpha = 0.5) +
  xlab("Year") +
  ylab("Cumulative citations") +
  ggtitle("All the papers") +
  theme(legend.position = "none")
That’s quite busy, so let’s try again, keeping only the more highly cited papers.
# Keep only papers with at least `min_cites` total citations so the plot
# stays readable.
min_cites <- 40
to_plot <- all_cites_cum %>%
  filter(total_cites >= min_cites)

to_plot %>%
  ggplot(aes(x = year,
             y = cum_sum,
             # Legend entries are truncated titles, ordered by descending
             # total citations.
             colour = reorder(str_trunc(title, 35), desc(total_cites)))) +
  # Smoothed curve per paper, drawn with the line geom.
  stat_smooth(geom = "line", size = 1.2, alpha = 0.5) +
  theme(legend.position = "right",
        legend.direction = "vertical") +
  labs(x = "Year",
       y = "Cumulative citations",
       colour = "Paper",
       title = paste("Papers cited at least", min_cites, "times")) +
  # Label every second year, running slightly past the last year plotted.
  scale_x_continuous(
    breaks = seq(min(to_plot$year), max(to_plot$year) + 2, 2)
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'