# http://rpsychologist.com/an-r-script-to-automatically-look-at-pubmed-citation-counts-by-year-of-publication/

# Make sure every package used by this file is installed. The packages are
# only checked (and installed when missing), never attached, so every call
# into them further down is namespace-qualified (pkg::fun).
local({
  # Takes bare (unquoted) package names and installs those whose namespace
  # cannot be loaded.
  tmp_require_package_namespace <- function(...) {
    # match.call() keeps `...` unevaluated, so the bare names can be turned
    # into strings.
    packages <- as.character(match.call(expand.dots = FALSE)[[2]])
    for (p in packages) {
      if (!requireNamespace(p)) {
        install.packages(p)
      }
    }
  }
  tmp_require_package_namespace(
    RCurl,  # before RCurl installation: sudo apt-get install libcurl3-dev
    XML,
    plyr,  # ldply() is still used below; it is called as plyr::ldply()
    ggplot2,
    directlabels
  )
})

#==============================================================================
# Namespace-like method: http://stackoverflow.com/questions/1266279/#1319786
#==============================================================================

PubMedTrends <- new.env()

########################
# Download PubMed Data #
########################

# Query the NCBI E-utilities `esearch` endpoint once per year and per query
# term, and collect the number of PubMed hits.
#
# Arguments:
#   query    Character vector of PubMed search terms. NOTE(review): when the
#            vector is named, plyr::ldply() is expected to add a `.id` column
#            holding those names; PubTotalHits() below reads that column.
#   yrStart  First publication year to search (numeric, >= 1800).
#   yrMax    Last publication year to search (numeric, >= yrStart and not
#            later than the current year).
#   calculate_relative_frequencies
#            If TRUE, load the object `total.table` from the file
#            "total_table" in the working directory and add the columns
#            `total_count` and `relative`.
#
# Returns a data frame with the columns `year` and `count`; years with zero
# hits are omitted.
#
# Side effects: one HTTP request per year and term with a 0.5 s pause after
# each, plus a text progress bar and status messages on the console.
PubMedTrends$PubMedTrend <- function(query, yrStart = 1950, yrMax = 2009,
                                     calculate_relative_frequencies = FALSE) {

  ### Some error checking ###
  if (!is.numeric(yrStart) || !is.numeric(yrMax)) {
    stop("One of the year values is not numeric")
  }
  if (yrStart < 1800) {
    stop(paste0("Sure you want to look for hits from the 17th century ",
                "(yrStart = ", yrStart, ")?\n"))
  }
  this.year <- as.integer(format(Sys.time(), "%Y"))
  if (yrMax > this.year) {
    stop(paste("Are you from the future?",
               "Please check your year interval; yrMax =", yrMax, "\n"))
  }
  if (yrMax < yrStart) {
    stop("yrMax is smaller than yrStart!")
  }

  ### Start main search function ###
  # Returns the yearly hit counts for a single search term.
  getCount <- function(query.term) {
    # convert spaces to '+'
    query.gsub <- gsub(" ", "+", query.term)
    # convert some characters to browser friendly text
    # (better to be safe than sorry)
    query.gsub <- gsub("\\*", "%2A", query.gsub)
    query.gsub <- gsub('"', "%22", query.gsub)
    query.gsub <- gsub("\\[", "%5B", query.gsub)
    query.gsub <- gsub("\\]", "%5D", query.gsub)

    base.url <- paste0(
      "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
      "?db=pubmed&rettype=count&term="
    )

    # Preallocate one slot per year instead of growing a data frame with
    # rbind() inside the loop.
    years <- yrStart:yrMax
    counts <- numeric(length(years))

    # add progressbar; `min` is one below the first year because
    # txtProgressBar() requires max > min, which would otherwise fail for a
    # single-year search (yrStart == yrMax).
    pb <- txtProgressBar(min = yrStart - 1, max = yrMax, style = 3)
    # close the progress bar even when a request or the parsing fails
    on.exit(close(pb), add = TRUE)

    cat("Searching for: ", query.term, "\n")

    # Start retrieval loop
    for (k in seq_along(years)) {
      i <- years[k]
      # add publication date [dp] to query
      query.parsed <- paste0(query.gsub, "+AND+", i, "%5Bppdat%5D")
      # Get XML with number of hits for query.parsed
      url <- paste0(base.url, query.parsed)
      pub.esearch <- RCurl::getURL(url)
      # Parse XML
      pub.esearch <- XML::xmlTreeParse(pub.esearch, asText = TRUE)
      # Get number of hits from XML
      count.node <- pub.esearch[["doc"]][["eSearchResult"]][["Count"]]
      pub.count <- as.numeric(XML::xmlValue(count.node))
      # Fail with a readable message when the response carries no usable
      # <Count> value, rather than with an opaque error from a later `if`.
      if (length(pub.count) != 1L || is.na(pub.count)) {
        stop("Could not read a hit count from the PubMed response for year ",
             i)
      }
      counts[k] <- pub.count
      # tell progressbar how it's going
      setTxtProgressBar(pb, i)
      # Wait 0.5 sec
      Sys.sleep(0.5)
    }

    # Don't report years whose count is 0
    keep <- counts != 0
    if (!any(keep)) {
      return(data.frame(NULL))
    }
    data.frame("year" = years[keep], "count" = counts[keep])
  }

  # Run getCount() for all query terms
  df <- plyr::ldply(query, getCount)

  # RNC made conditional:
  if (calculate_relative_frequencies) {
    ### Calculate relative frequencies ###
    # load file with pubmed total citations from 1947-2009
    # (creates `total.table` in this function's environment)
    load(file = "total_table")
    # match year
    year.index <- match(df$year, total.table$year)
    # add total count
    df$total_count <- total.table$total_count[year.index]
    # compute relative count times 10 000, i.e. the number of matches per
    # 10,000 PubMed citations
    df$relative <- (df$count / df$total_count) * 10000
  }

  cat("\nAll done!")
  return(df)
}

#######################
### Show total hits ###
#######################

# Summarise the total number of hits per search term.
#
# NOTE: this function takes no data argument. It reads the free variables
# `df` (a result of PubMedTrend() that has a `.id` column, with the counts in
# the third column) and `query` (the query vector used) from the environment
# it is called in, normally the global environment.
#
# Arguments:
#   args  FALSE (default): return search name, query and total hits.
#         "query": return the query and total hits only.
#         Any other value: return search name and total hits only.
#
# Returns a data frame with one row per distinct value of `df$.id`.
PubMedTrends$PubTotalHits <- function(args = FALSE) {
  # Get column total for query 'x'
  GetCount <- function(x) {
    data.frame("search_name" = x,
               "total_hits" = colSums(df[df$.id == x, ][3]))
  }

  # Index all query names
  query.index <- unique(df$.id)
  # Use GetCount() for every term in 'query.index' and return as data.frame
  totals <- plyr::ldply(query.index, GetCount)

  # if argument is 'query' add full query instead of query name.
  # if there is no argument specified both name and query will be shown
  if (args == "query" || args == FALSE) {
    # remove names (this modifies a local copy of `query` only)
    names(query) <- NULL
    # add queries to the result
    totals <- cbind(totals, "query" = query)
    # reorder columns
    totals <- totals[, c(1, 3, 2)]
    # remove 'names' if we only want queries
    if (args == "query") {
      totals <- totals[-1]
    }
  }
  return(totals)
}

#==============================================================================
# Namespace-like method: http://stackoverflow.com/questions/1266279/#1319786
#==============================================================================

if ("PubMedTrends" %in% search()) detach("PubMedTrends")
attach(PubMedTrends)  # subsequent additions not found, so attach at the end