This page shows how to plot citations data using the scholar package by Guangchuang Yu et al. and the tidyverse.

Load the packages:

library(scholar)
library(tidyverse)

I’ll use my citations data:

# Google Scholar profile ID whose publication list is fetched below.
andi_id <- "xrY7bFYAAAAJ"
pubs <- andi_id |>
  get_publications()

Now pubs has a bunch of info on each of my papers:

# List the columns available for each publication.
colnames(pubs)
## [1] "title"   "author"  "journal" "number"  "cites"   "year"    "cid"    
## [8] "pubid"

Here’s the current year, which will come in handy later.

# Four-digit calendar year of today's date, stored as a number.
current_year <- Sys.Date() |>
  format("%Y") |>
  as.numeric()

It was 2023 when I ran this.

Now let’s look at the citations each paper got by year. Get all citation data:

# One call per publication: fetch its yearly citation counts, then stack the
# per-paper data frames row-wise into a single table.
all_cites <- pubs$pubid |>
  map(\(pub_id) get_article_cite_history(andi_id, pub_id)) |>
  bind_rows()

This has one row per cited paper per year:

# Preview the first six rows.
head(all_cites, n = 6)
##   year cites        pubid
## 1 2014     4 PaBasH6fAo0C
## 2 2015     8 PaBasH6fAo0C
## 3 2016    32 PaBasH6fAo0C
## 4 2017    50 PaBasH6fAo0C
## 5 2018   108 PaBasH6fAo0C
## 6 2019   131 PaBasH6fAo0C

Now glue the original paper info back on. First reduce it a bit:

# Keep only the paper-level columns needed later. `year` and `cites` are
# renamed on the way so they do not clash with the per-year `year` and `cites`
# columns of the citation history.
red_pubs <- pubs |>
  select(title,
         author,
         paper_year = year,
         journal,
         pubid,
         total_cites = cites)

Join:

# Attach the paper-level details to every citation-history row. The key is
# spelled out so the join cannot silently start matching on any other column
# the two tables might come to share.
all_cites_pub <- all_cites |>
  left_join(red_pubs, by = join_by(pubid))

Next, we need to add in implicit zero-cites where a year is missing, from the publication year up to the current year.

This function does it for one paper’s cite info:

#' Fill in missing citation years for one paper
#'
#' Adds a zero-citation row for every year from the paper's publication year
#' up to `end_year` that has no row in `one_paper`, so that the paper ends up
#' with a gap-free yearly series.
#'
#' @param one_paper Data frame holding the citation history of a single paper.
#'   Must contain the columns `year`, `cites`, `pubid`, `title`, `author`,
#'   `paper_year`, `journal` and `total_cites`.
#' @param end_year Last year to fill up to. Defaults to the global
#'   `current_year`, so existing one-argument calls behave as before.
#' @return The rows of `one_paper` plus the added zero-citation rows, sorted
#'   by `year`.
complete_paper_cites <- function(one_paper, end_year = current_year) {
  # All rows describe the same paper, so this is normally a single value.
  start_year <- unique(one_paper$paper_year)

  # `NA:end_year` is an error, so a paper without a recorded publication year
  # would abort the whole run. Fall back to the first year it was cited.
  if (anyNA(start_year)) {
    start_year <- min(one_paper$year)
  }

  # Years in the range that have no row yet.
  expand_years <- setdiff(start_year:end_year, one_paper$year)

  # `complete()` is handed only the missing years, each paired with
  # `cites = 0` and the paper's constant columns. It combines those with the
  # existing rows, so the original counts are kept untouched.
  complete(one_paper,
           year = expand_years,
           cites = 0,
           pubid,
           title,
           author,
           paper_year,
           journal,
           total_cites) |>
    arrange(year)
}

Give it a quick test using this input:

# Citation history of a single paper, used to try the function out.
test_input <- filter(all_cites_pub, pubid == "43bX7VzcjpAC")

# Show just the columns that matter for the year filling.
select(test_input, paper_year, year, cites)
##   paper_year year cites
## 1       2013 2016     1
## 2       2013 2017     0
## 3       2013 2018     0
## 4       2013 2019     1
## 5       2013 2020     0
## 6       2013 2021     0
## 7       2013 2022     1

Here’s the output:

# Run the function on the sample paper and show the same three columns.
test_input |>
  complete_paper_cites() |>
  select(paper_year, year, cites)
## # A tibble: 11 × 3
##    paper_year  year cites
##         <dbl> <int> <dbl>
##  1       2013  2013     0
##  2       2013  2014     0
##  3       2013  2015     0
##  4       2013  2016     1
##  5       2013  2017     0
##  6       2013  2018     0
##  7       2013  2019     1
##  8       2013  2020     0
##  9       2013  2021     0
## 10       2013  2022     1
## 11       2013  2023     0

Looks good, so tidy up:

# The scratch data frame is no longer needed.
rm("test_input")

Now do this for all papers:

# Split into one data frame per paper, fill each paper's missing years, stack
# the pieces back together, and add the paper's age in years at each row.
all_cites_filled <- all_cites_pub |>
  group_split(pubid) |>
  map(complete_paper_cites) |>
  bind_rows() |>
  mutate(age = year - paper_year)

Calculate the cumulative citation sums:

# Running citation total per paper. `cumsum()` depends on row order, so this
# relies on each paper's rows being in year order (complete_paper_cites()
# sorts by year). The grouping is dropped afterwards so that later steps work
# on a plain, ungrouped table.
all_cites_cum <- all_cites_filled |>
  group_by(pubid) |>
  mutate(cum_sum = cumsum(cites)) |>
  ungroup()

Plot for all papers…

# One line per paper, coloured by publication ID. The legend is switched off.
ggplot(all_cites_cum, aes(x = year, y = cum_sum, colour = pubid)) +
  geom_line(linewidth = 1, alpha = 0.5) +
  labs(title = "All the papers",
       x = "Year",
       y = "Cumulative citations") +
  theme(legend.position = "none")

That’s quite busy so try again.

# Only keep papers with at least this many citations in total.
min_cites <- 50
to_plot <- all_cites_cum |>
  filter(total_cites >= min_cites)

to_plot |>
  ggplot(aes(x = year,
             y = cum_sum,
             # Legend label: title shortened to 20 characters, ordered from
             # the most to the least cited paper.
             colour = reorder(str_trunc(title, 20), desc(total_cites)))) +
  # Draw the smoothed curve for each paper as a line.
  stat_smooth(geom = "line", linewidth = 1, alpha = 0.5) +
  # `legend.direction` is spelled out in full; the previous misspelling only
  # worked where partial argument matching happened to resolve it.
  theme(legend.position = "right",
        legend.direction = "vertical") +
  labs(x = "Year",
       y = "Cumulative citations",
       colour = "Paper",
       title = paste("Papers cited at least", min_cites, "times")) +
  # A break every 2 years on x, and every 200 citations on y up to the next
  # multiple of 200 at or above the largest cumulative count.
  scale_x_continuous(
    breaks = seq(min(to_plot$year), max(to_plot$year) + 2, 2)
  ) +
  scale_y_continuous(
    breaks = seq(0, ceiling(max(to_plot$cum_sum) / 200) * 200, 200)
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Same selection of papers, now plotted against the paper's age instead of the
# calendar year, with cumulative citations on a log2 axis.
to_plot <- all_cites_cum |>
  filter(total_cites >= min_cites)

to_plot |>
  # Drop rows dated before publication, and rows where the running total is
  # still zero (zero has no position on a log axis).
  filter(age >= 0 & cum_sum > 0) |>
  ggplot(aes(x = age,
             y = cum_sum,
             # Legend label: title shortened to 20 characters, ordered from
             # the most to the least cited paper.
             colour = reorder(str_trunc(title, 20), desc(total_cites)))) +
  # Draw the smoothed curve for each paper as a line.
  stat_smooth(geom = "line", linewidth = 1, alpha = 0.5) +
  # `legend.direction` is spelled out in full; the previous misspelling only
  # worked where partial argument matching happened to resolve it.
  theme(legend.position = "right",
        legend.direction = "vertical") +
  labs(x = "Age (years)",
       y = "Cumulative citations",
       colour = "Paper",
       title = paste("Papers cited at least", min_cites, "times")) +
  # One break per year of age.
  scale_x_continuous(
    breaks = seq(min(to_plot$age), max(to_plot$age) + 2, 1)
  ) +
  scale_y_continuous(trans = "log2")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'