Merge pull request #420 from massimoaria/develop
issue #404: OpenAlex integration
massimoaria authored Feb 23, 2024
2 parents bc5a38c + 1b8ca20 commit 76e56a0
Showing 8 changed files with 194 additions and 110 deletions.
52 changes: 42 additions & 10 deletions R/convert2df.R
@@ -12,9 +12,11 @@
#' d)\tab 'lens' \tab Lens.org (in csv '.csv');\cr
#' e)\tab 'pubmed' \tab an object of the class \code{pubmedR (package pubmedR)} containing a collection obtained from a query performed with pubmedR package;\cr
#' f)\tab 'dimensions' \tab an object of the class \code{dimensionsR (package dimensionsR)} containing a collection obtained from a query performed with dimensionsR package;\cr
-#' g)\tab 'openalex' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
+#' g)\tab 'openalex' \tab OpenAlex .csv file;\cr
+#' h)\tab 'openalex_api' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
#' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex','openalex_api','pubmed','scopus','wos','lens')}. Default is \code{dbsource = "isi"}.
-#' @param format is a character indicating the format of the SCOPUS and Clarivate Analytics WoS export file. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param format is a character indicating the export file format for SCOPUS, Clarivate Analytics WoS, and the other supported databases. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param remove.duplicates is logical. If TRUE, the function removes duplicated documents, matching them by DOI and database ID.
#' @return a data frame with cases corresponding to articles and variables corresponding to Field Tags in the original export file.
#'
#' E.g., if we have three files downloaded from Web of Science in plaintext format, the file argument will be:
@@ -57,10 +59,10 @@
#'
#' @export

-convert2df<-function(file,dbsource="wos",format="plaintext"){
+convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=TRUE){

allowed_formats <- c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')
-allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')
+allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'openalex_api','pubmed','scopus','wos', 'lens')

cat("\nConverting your",dbsource,"collection into a bibliographic dataframe\n\n")
if (length(setdiff(dbsource,allowed_db))>0){
@@ -147,7 +149,10 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
})

},
-         openalex = {
+         openalex={
+           M <- csvOA2df(file)
+         },
+         openalex_api = {
if (!"bibliometrixDB" %in% class(file)){
M <- openalexR::oa2bibliometrix(file)
} else {
@@ -168,11 +173,11 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ","",M$CR))) # remove foreign characters from CR (e.g., Chinese or Russian characters)
}

if (dbsource!="cochrane"){M$AU=gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
if (dbsource!="cochrane"){M$AU <- gsub(intToUtf8(8217),intToUtf8(39),M$AU)}

cat("Done!\n\n")

if (!(dbsource %in% c("pubmed", "lens", "openalex"))) {
if (!(dbsource %in% c("pubmed", "lens", "openalex_api"))) {
## AU_UN field creation
if ("C1" %in% names(M)) {
cat("\nGenerating affiliation field tag AU_UN from C1: ")
@@ -204,10 +209,37 @@
}

### SR field creation
+  if (isTRUE(remove.duplicates)){
+    switch(dbsource,
+           isi={
+             id_field <- "UT"
+           },
+           scopus={
+             id_field <- "UT"
+           },
+           openalex={
+             id_field <- "id_oa"
+           },
+           openalex_api={
+             id_field <- "id_oa"
+           },
+           dimensions={
+             id_field <- "UT"
+           },
+           pubmed={
+             id_field <- "PMID"
+           },
+           lens={
+             id_field <- "UT"
+           },
+           {
+             id_field <- "TI"
+           })
+    d <- duplicated(M[id_field])
+    if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
+    M <- M[!d,]
+  }
suppressWarnings(M <- metaTagExtraction(M, Field="SR"))
d <- duplicated(M$SR)
if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
M <- M[!d,]
row.names(M) <- M$SR

### bibliometrix>DB class
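Taken together, these changes give convert2df() a CSV-based 'openalex' path, an 'openalex_api' path for openalexR data frames, and optional duplicate removal. A minimal usage sketch (file name and query are hypothetical):

library(bibliometrix)

# OpenAlex CSV export downloaded from the web interface (hypothetical file name)
M <- convert2df(file = "openalex_export.csv",
                dbsource = "openalex",
                format = "csv",
                remove.duplicates = TRUE)   # for OpenAlex data, dedup uses id_oa

# Collection fetched with openalexR (the 'openalex_api' path)
# works <- openalexR::oa_fetch(entity = "works", search = "bibliometrics")
# M <- convert2df(works, dbsource = "openalex_api", format = "api")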
7 changes: 4 additions & 3 deletions R/csvLens2df.R
@@ -54,12 +54,13 @@ csvLens2df <- function(file){

# Iso Source Titles
DATA$SO[DATA$SO==""] <- DATA$Publisher[DATA$SO==""]
-  DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
-  DATA$J9 <- gsub("\\.","",DATA$JI)
+  # DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
+  # DATA$J9 <- gsub("\\.","",DATA$JI)
+  DATA$JI <- DATA$J9 <- DATA$SO
DATA$ID <- DATA$DE
DI <- DATA$DI
URL <- DATA$URL
-  DATA <- data.frame(lapply(DATA,toUpper))
+  DATA <- data.frame(lapply(DATA,toupper))
DATA$DI <- DI
DATA$URL <- URL
DATA$AU_CO <- "NA"
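The effect of this change on a toy record: JI and J9 now simply mirror SO instead of being abbreviated with AbbrevTitle(), and upper-casing relies on base R (a sketch, not part of the diff):

DATA <- data.frame(SO = "Journal of Informetrics", stringsAsFactors = FALSE)
DATA$JI <- DATA$J9 <- DATA$SO               # no AbbrevTitle() call any more
DATA <- data.frame(lapply(DATA, toupper))   # base::toupper replaces toUpper()
DATA$JI                                     # "JOURNAL OF INFORMETRICS"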
123 changes: 123 additions & 0 deletions R/csvOA2df.R
@@ -0,0 +1,123 @@
utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID"))

csvOA2df <- function(file){
options(readr.num_columns = 0)

## import all files in a single data frame
for (i in seq_along(file)){
#D <- read.csv(file[i], quote='"', check.names = F, stringsAsFactors = F) #fileEncoding = "UTF-8-BOM")
D <- read_csv(file[i], na=character(), quote='"', trim_ws = FALSE, progress = show_progress(), show_col_types = FALSE) %>%
mutate(across(where(is.numeric), as.character)) %>%
mutate(across(where(is.character), \(x) tidyr::replace_na(x,""))) %>%
as.data.frame()

if (i>1){
l <- intersect(l,names(D))
DATA <- rbind(DATA[l],D[l])
}else{
l <- names(D)
DATA <- D}
}
rm(D)

## Post-Processing

# column re-labelling
DATA <- relabelling(DATA)

# recode as numeric
DATA$TC <- as.numeric(DATA$TC)
DATA$PY <- as.numeric(DATA$PY)
DATA$relevance_score <- as.numeric(DATA$relevance_score)

# replace | with ;
DATA <- DATA %>%
mutate(across(where(is.character), ~ stringi::stri_replace_all_regex(.,"\\|",";")))

DATA$AF <- DATA$AU
DATA$ID <- DATA$DE
DATA$AB=""
DATA$CR <- gsub("https://openalex.org/","",DATA$CR)
DATA$AU_ID <- gsub("https://openalex.org/","",DATA$AU_ID)
DATA$id_oa <- gsub("https://openalex.org/","",DATA$id_oa)
DATA$JI <- DATA$J9 <- gsub("https://openalex.org/","",DATA$SO_ID)
DATA$corresponding_author_ids <- gsub("https://openalex.org/","",DATA$corresponding_author_ids)
DATA$C1 <- gsub("https://", "", DATA$C1)
DATA$DB <- "OPENALEX"

## corresponding author
UN <- strsplit(DATA$C1,";")
corresp <- strsplit(DATA$authorships_is_corresponding,";")
df_UN <- data.frame(UN=unlist(UN), id_oa=rep(DATA$id_oa,lengths(UN))) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_COR <- data.frame(corr=unlist(corresp), id_oa=rep(DATA$id_oa,lengths(corresp))) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_UN <- df_UN %>%
left_join(df_COR, by=(c("id_oa","n")))
AU <- strsplit(DATA$AU,";")
AU_df <- data.frame(RP = unlist(AU), AU_ID=unlist(strsplit(DATA$AU_ID,";")), id_oa=rep(DATA$id_oa,lengths(AU))) %>%
group_by(id_oa) %>%
mutate(n=row_number()) %>%
left_join(df_UN %>% select("UN","id_oa", "corr", "n"),
by = c("id_oa","n")) %>%
dplyr::filter(corr == "true") %>%
mutate(RP = paste(RP,UN, sep=", ")) %>%
ungroup() %>%
select("RP", "AU_ID") %>%
distinct(AU_ID, .keep_all = TRUE)
DATA <- DATA %>%
left_join(AU_df, by = c("corresponding_author_ids" = "AU_ID"))


# upper-case all character columns except those containing URLs
ind <- apply(DATA,2,function(x){
sum(regexpr("https://",x)>-1, na.rm = TRUE)>0
})
label <- names(ind)[ind==FALSE & !is.na(ind)]

DATA <- DATA %>%
mutate(across(all_of(label), toupper),
DI = gsub("https://doi.org/","",DI),
DI = ifelse(DI == "null",NA,DI))

return(DATA)
}

relabelling <- function(DATA){
## column re-labelling
label <- names(DATA)
label[label %in% "id"] <- "id_oa"
label[label %in% "display_name"] <- "TI"
label[label %in% "primary_location_display_name"] <- "SO"
label[label %in% "primary_location_id"] <- "SO_ID"
label[label %in% "primary_location_host_organization"] <- "PU"
label[label %in% "primary_location_issns"] <- "ISSN"
label[label %in% "primary_location_issn_l"] <- "ISSN_I"
label[label %in% "primary_location_landing_page_url"] <- "URL"
label[label %in% "primary_location_pdf_url"] <- "URL_PDF"
label[label %in% "author_ids"] <- "AU_ID"
label[label %in% "author_names"] <- "AU"
label[label %in% "author_orcids"] <- "OI"
label[label %in% "author_institution_names"] <- "C3"
label[label %in% "cited_by_count"] <- "TC"
label[label %in% "publication_year"] <- "PY"
label[label %in% "type"] <- "DT"
label[label %in% "biblio_issue"] <- "IS"
label[label %in% "biblio_volume"] <- "VL"
label[label %in% "referenced_works" ] <- "CR"
label[label %in% "keywords_keyword"] <- "DE"
label[label %in% "concepts_display_name"] <- "CONCEPTS"
label[label %in% "topics_display_name"] <- "TOPICS"
label[label %in% "sustainable_development_goals_display_name"] <- "SDG"
label[label %in% "primary_topic_field_display_name"] <- "SC"
label[label %in% "mesh_descriptor_name"] <- "MESH"
label[label %in% "referenced_works_count"] <- "NR"
label[label %in% "language"] <- "LA"
label[label %in% "authorships_author_position"] <- "AU_POSITION"
label[label %in% "authorships_raw_affiliation_string"] <- "C1"
label[label %in% "doi"] <- "DI"
names(DATA) <- label
return(DATA)
}
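A toy sketch of what the new importer does to a raw export — relabelling plus URL stripping — on a hypothetical four-column subset (a real OpenAlex CSV carries many more fields):

D <- data.frame(id = "https://openalex.org/W0000000001",
                display_name = "A sample work title",
                cited_by_count = "100",
                publication_year = "2017",
                stringsAsFactors = FALSE)
D <- relabelling(D)   # id -> id_oa, display_name -> TI, cited_by_count -> TC, publication_year -> PY
D$id_oa <- gsub("https://openalex.org/", "", D$id_oa)   # "W0000000001"
D$TC <- as.numeric(D$TC)
D$PY <- as.numeric(D$PY)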
93 changes: 2 additions & 91 deletions R/histNetwork.R
@@ -195,95 +195,6 @@ wos <- function(M, min.citations, sep, network, verbose){
return(results)
}

# scopus <- function(M, min.citations, sep, network, verbose){
#
# if (isTRUE(verbose)) {
# cat("\nSCOPUS DB: Searching local citations (LCS) by document titles (TI) and DOIs...\n")
# }
#
# if (!("SR_FULL" %in% names(M))) {
# M = metaTagExtraction(M, Field = "SR")
# }
#
# M$nCITING <- 1:nrow(M)
# papers <- M$nCITING[M$TC >= min.citations]
#
# TIpost <-
# paste(gsub("[[:punct:]]", "", M$TI[papers]), " ", M$PY[papers], " ", sep = "")
#
# CR <- gsub("[[:punct:]]", "", M$CR)
# n <- nchar(CR)
# n[is.na(n)] <- 2
# n <- n + 1
# nCum <- c(1, cumsum(n[-length(n)]))
# CR <- paste(CR, collapse = " ")
#
# #L <- str_locate_all(CR, TIpost)
# L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
#
# LCS <- lengths(L) / 2
#
# M$LCS <- 0
# M$LCS[papers] <- LCS
#
#
# ### HistData
# histData <- M %>%
# select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>%
# rename(
# Paper = .data$SR_FULL,
# Title = .data$TI,
# Author_Keywords = .data$DE,
# KeywordsPlus = .data$ID,
# DOI = .data$DI,
# Year = .data$PY,
# GCS = .data$TC
# ) %>%
# arrange(.data$Year) %>%
# dplyr::filter(.data$GCS>=min.citations) %>%
# as.data.frame()
#
#
# if (isTRUE(network)) {
# ## Network matrix
# df <- lapply(seq_along(L), function(i) {
# l <-
# data.frame(
# ref = L[[i]],
# paper = rep(papers[i], length(L[[i]][, 1]))
# )
# })
# df <- (do.call(rbind, df))
#
# A <- outer(df$ref.start, nCum, "-")
# A[A < 0] <- NA
# df$CITINGn <- unlist(apply(A, 1, which.min))
# df$CITING <- M$SR[df$CITINGn]
# df$CITED <- M$SR[df$paper]
# df <- df %>%
# dplyr::filter(.data$CITING %in% histData$Paper)
#
# NetMatrix <-
# (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T)))
# } else{
# NetMatrix = NULL
# }
#
# if (isTRUE(verbose)) {
# cat("\nFound",
# length(M$LCS[M$LCS > 0]),
# "documents with no empty Local Citations (LCS)\n")
# }
#
# results <-
# list(
# NetMatrix = NetMatrix,
# histData = histData,
# M = M,
# LCS = M$LCS
# )
# }

# New algorithm for Scopus
# Local citation matching is based on First Author, Year and PP
scopus <- function(M, min.citations, sep, network, verbose){
@@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){

openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){

M$CR[is.na(M$CR)] <- "none"
M$CR[is.na(M$CR) | M$CR==""] <- "none"
ids <- M$id_oa
CR <- strsplit(M$CR, ";")
CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>%
@@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, v
SRrow <- WLCR %>% select(.data$id_oa) %>%
left_join(M %>%
select(.data$id_oa, .data$SR),
by="id_oa")
by="id_oa")

SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>%
left_join(M %>%
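The one-line fix in openalex() matters because strsplit() turns an empty string into a zero-length vector, so a document with an empty CR field would silently drop out of the reference table built with rep(..., lengths(CR)). A quick sketch of the guard:

CR <- c("W1;W2", "", NA)
lengths(strsplit(CR, ";"))           # 2 0 1 -> rows no longer align with documents
CR[is.na(CR) | CR == ""] <- "none"
lengths(strsplit(CR, ";"))           # 2 1 1 -> exactly one entry per document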
2 changes: 1 addition & 1 deletion R/metaTagExtraction.R
@@ -405,7 +405,7 @@ AU_UN<-function(M,sep){
})
AFFL=unlist(AFFL)
M$AU_UN=AFFL
if (M$DB[1]=="ISI" & "C3" %in% names(M)){
if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)){
M$AU_UN[!is.na(M$C3) & M$C3!=""] <- M$C3[!is.na(M$C3) & M$C3!=""]
}
M$AU_UN=gsub("\\\\&","AND",M$AU_UN)
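A sketch of the extended branch on a one-row toy frame (values hypothetical): for WoS and now also OpenAlex records, a non-empty C3 field overrides the affiliation parsed from C1.

M <- data.frame(DB = "OPENALEX",
                AU_UN = "AFFILIATION PARSED FROM C1",
                C3 = "UNIVERSITY OF NAPLES FEDERICO II",
                stringsAsFactors = FALSE)
if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)) {
  M$AU_UN[!is.na(M$C3) & M$C3 != ""] <- M$C3[!is.na(M$C3) & M$C3 != ""]
}
M$AU_UN   # "UNIVERSITY OF NAPLES FEDERICO II"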
10 changes: 9 additions & 1 deletion inst/biblioshiny/server.R
@@ -281,7 +281,15 @@ To ensure the functionality of Biblioshiny,
})
})
},
-             openalex = {
+             openalex={
+               withProgress(message = 'Conversion in progress',
+                            value = 0, {
+                              M <- convert2df(inFile$datapath,
+                                              dbsource = input$dbsource,
+                                              format = "csv")
+                            })
+             },
+             openalex_api = {
M <- smart_load(inFile$datapath)
},
lens = {
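Outside Shiny, the two upload branches correspond to calls like these (file names hypothetical; the openalex_api branch restores a previously saved openalexR data frame, approximated here with load() in place of biblioshiny's smart_load()):

# 'openalex': a CSV export uploaded by the user
M <- convert2df("openalex_export.csv", dbsource = "openalex", format = "csv")

# 'openalex_api': an .RData file holding a data frame fetched with openalexR
load("openalex_works.RData")   # assumption: restores an object named 'works'
M <- convert2df(works, dbsource = "openalex_api", format = "api")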
3 changes: 2 additions & 1 deletion inst/biblioshiny/ui.R
@@ -254,7 +254,8 @@ body <- dashboardBody(
"Web of Science (WoS/WoK)" = "isi",
"Scopus" = "scopus",
"Dimensions" = "dimensions",
"OpenAlex (via openalexR)" = "openalex",
"Openalex" ="openalex",
"OpenAlex API (via openalexR)" = "openalex_api",
"Lens.org" = "lens",
"PubMed" = "pubmed",
"Cochrane Library" = "cochrane"