Merge pull request #420 from massimoaria/develop
issue #404: OpenAlex integration
massimoaria authored Feb 23, 2024
2 parents bc5a38c + 1b8ca20 commit 76e56a0
Showing 8 changed files with 194 additions and 110 deletions.
52 changes: 42 additions & 10 deletions R/convert2df.R
@@ -12,9 +12,11 @@
#' d)\tab 'lens' \tab Lens.org (in csv '.csv');\cr
#' e)\tab 'pubmed' \tab an object of the class \code{pubmedR (package pubmedR)} containing a collection obtained from a query performed with pubmedR package;\cr
#' f)\tab 'dimensions' \tab an object of the class \code{dimensionsR (package dimensionsR)} containing a collection obtained from a query performed with dimensionsR package;\cr
-#' g)\tab 'openalex' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
+#' g)\tab 'openalex' \tab OpenAlex .csv file;\cr
+#' h)\tab 'openalex_api' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
#' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex','openalex_api','pubmed','scopus','wos','lens')}. Default is \code{dbsource = "isi"}.
-#' @param format is a character indicating the format of the SCOPUS and Clarivate Analytics WoS export file. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param format is a character indicating the export file format for SCOPUS, Clarivate Analytics WoS, and the other supported databases. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param remove.duplicates is logical. If TRUE, the function removes duplicated documents, matching them by DOI and database ID.
#' @return a data frame with cases corresponding to articles and variables corresponding to Field Tags in the original export file.
#'
#' E.g., if we have three files downloaded from Web of Science in plaintext format, the file argument will be:
@@ -57,10 +59,10 @@
#'
#' @export

-convert2df<-function(file,dbsource="wos",format="plaintext"){
+convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=TRUE){

allowed_formats <- c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')
-allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')
+allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'openalex_api','pubmed','scopus','wos', 'lens')

cat("\nConverting your",dbsource,"collection into a bibliographic dataframe\n\n")
if (length(setdiff(dbsource,allowed_db))>0){
@@ -147,7 +149,10 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
})

},
-         openalex = {
+         openalex={
+           M <- csvOA2df(file)
+         },
+         openalex_api = {
if (!"bibliometrixDB" %in% class(file)){
M <- openalexR::oa2bibliometrix(file)
} else {
@@ -168,11 +173,11 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ","",M$CR))) # remove foreign characters from CR (e.g., Chinese or Russian characters)
}

if (dbsource!="cochrane"){M$AU=gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
if (dbsource!="cochrane"){M$AU <- gsub(intToUtf8(8217),intToUtf8(39),M$AU)}

cat("Done!\n\n")

if (!(dbsource %in% c("pubmed", "lens", "openalex"))) {
if (!(dbsource %in% c("pubmed", "lens", "openalex_api"))) {
## AU_UN field creation
if ("C1" %in% names(M)) {
cat("\nGenerating affiliation field tag AU_UN from C1: ")
@@ -204,10 +209,37 @@
}

### SR field creation
+  if (isTRUE(remove.duplicates)){
+    switch(dbsource,
+           isi={
+             id_field <- "UT"
+           },
+           scopus={
+             id_field <- "UT"
+           },
+           openalex={
+             id_field <- "id_oa"
+           },
+           openalex_api={
+             id_field <- "id_oa"
+           },
+           dimensions={
+             id_field <- "UT"
+           },
+           pubmed={
+             id_field <- "PMID"
+           },
+           lens={
+             id_field <- "UT"
+           },
+           {
+             id_field <- "TI"
+           })
+    d <- duplicated(M[id_field])
+    if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
+    M <- M[!d,]
+  }
suppressWarnings(M <- metaTagExtraction(M, Field="SR"))
d <- duplicated(M$SR)
if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
M <- M[!d,]
row.names(M) <- M$SR

### bibliometrix>DB class
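Taken together, these changes give convert2df() a CSV-based 'openalex' path, an 'openalex_api' path for openalexR data frames, and optional duplicate removal. A minimal usage sketch (file name and query are hypothetical):

library(bibliometrix)

# OpenAlex CSV export downloaded from the web interface (hypothetical file name)
M <- convert2df(file = "openalex_export.csv",
                dbsource = "openalex",
                format = "csv",
                remove.duplicates = TRUE)   # for OpenAlex data, dedup uses id_oa

# Collection fetched with openalexR (the 'openalex_api' path)
# works <- openalexR::oa_fetch(entity = "works", search = "bibliometrics")
# M <- convert2df(works, dbsource = "openalex_api", format = "api")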
7 changes: 4 additions & 3 deletions R/csvLens2df.R
@@ -54,12 +54,13 @@ csvLens2df <- function(file){

# Iso Source Titles
DATA$SO[DATA$SO==""] <- DATA$Publisher[DATA$SO==""]
-  DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
-  DATA$J9 <- gsub("\\.","",DATA$JI)
+  # DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
+  # DATA$J9 <- gsub("\\.","",DATA$JI)
+  DATA$JI <- DATA$J9 <- DATA$SO
DATA$ID <- DATA$DE
DI <- DATA$DI
URL <- DATA$URL
-  DATA <- data.frame(lapply(DATA,toUpper))
+  DATA <- data.frame(lapply(DATA,toupper))
DATA$DI <- DI
DATA$URL <- URL
DATA$AU_CO <- "NA"
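The effect of this change on a toy record: JI and J9 now simply mirror SO instead of being abbreviated with AbbrevTitle(), and upper-casing relies on base R (a sketch, not part of the diff):

DATA <- data.frame(SO = "Journal of Informetrics", stringsAsFactors = FALSE)
DATA$JI <- DATA$J9 <- DATA$SO               # no AbbrevTitle() call any more
DATA <- data.frame(lapply(DATA, toupper))   # base::toupper replaces toUpper()
DATA$JI                                     # "JOURNAL OF INFORMETRICS"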
123 changes: 123 additions & 0 deletions R/csvOA2df.R
@@ -0,0 +1,123 @@
utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID"))

csvOA2df <- function(file){
options(readr.num_columns = 0)

## import all files in a single data frame
for (i in seq_along(file)){
#D <- read.csv(file[i], quote='"', check.names = F, stringsAsFactors = F) #fileEncoding = "UTF-8-BOM")
D <- read_csv(file[i], na=character(), quote='"', trim_ws = FALSE, progress = show_progress(), show_col_types = FALSE) %>%
mutate(across(where(is.numeric), as.character)) %>%
mutate(across(where(is.character), \(x) tidyr::replace_na(x,""))) %>%
as.data.frame()

if (i>1){
l <- intersect(l,names(D))
DATA <- rbind(DATA[l],D[l])
}else{
l <- names(D)
DATA <- D}
}
rm(D)

## Post-Processing

# column re-labelling
DATA <- relabelling(DATA)

# recode as numeric
DATA$TC <- as.numeric(DATA$TC)
DATA$PY <- as.numeric(DATA$PY)
DATA$relevance_score <- as.numeric(DATA$relevance_score)

# replace | with ;
DATA <- DATA %>%
mutate(across(where(is.character), ~ stringi::stri_replace_all_regex(.,"\\|",";")))

DATA$AF <- DATA$AU
DATA$ID <- DATA$DE
DATA$AB=""
DATA$CR <- gsub("https://openalex.org/","",DATA$CR)
DATA$AU_ID <- gsub("https://openalex.org/","",DATA$AU_ID)
DATA$id_oa <- gsub("https://openalex.org/","",DATA$id_oa)
DATA$JI <- DATA$J9 <- gsub("https://openalex.org/","",DATA$SO_ID)
DATA$corresponding_author_ids <- gsub("https://openalex.org/","",DATA$corresponding_author_ids)
DATA$C1 <- gsub("https://", "", DATA$C1)
DATA$DB <- "OPENALEX"

## corresponding author
UN <- strsplit(DATA$C1,";")
corresp <- strsplit(DATA$authorships_is_corresponding,";")
df_UN <- data.frame(UN=unlist(UN), id_oa=rep(DATA$id_oa,lengths(UN))) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_COR <- data.frame(corr=unlist(corresp), id_oa=rep(DATA$id_oa,lengths(corresp))) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_UN <- df_UN %>%
left_join(df_COR, by=(c("id_oa","n")))
AU <- strsplit(DATA$AU,";")
AU_df <- data.frame(RP = unlist(AU), AU_ID=unlist(strsplit(DATA$AU_ID,";")), id_oa=rep(DATA$id_oa,lengths(AU))) %>%
group_by(id_oa) %>%
mutate(n=row_number()) %>%
left_join(df_UN %>% select("UN","id_oa", "corr", "n"),
by = c("id_oa","n")) %>%
dplyr::filter(corr == "true") %>%
mutate(RP = paste(RP,UN, sep=", ")) %>%
ungroup() %>%
select("RP", "AU_ID") %>%
distinct(AU_ID, .keep_all = TRUE)
DATA <- DATA %>%
left_join(AU_df, by = c("corresponding_author_ids" = "AU_ID"))


# upper-case all character columns except those containing URLs
ind <- apply(DATA,2,function(x){
sum(regexpr("https://",x)>-1, na.rm = TRUE)>0
})
label <- names(ind)[ind==FALSE & !is.na(ind)]

DATA <- DATA %>%
mutate(across(all_of(label), toupper),
DI = gsub("https://doi.org/","",DI),
DI = ifelse(DI == "null",NA,DI))

return(DATA)
}

relabelling <- function(DATA){
## column re-labelling
label <- names(DATA)
label[label %in% "id"] <- "id_oa"
label[label %in% "display_name"] <- "TI"
label[label %in% "primary_location_display_name"] <- "SO"
label[label %in% "primary_location_id"] <- "SO_ID"
label[label %in% "primary_location_host_organization"] <- "PU"
label[label %in% "primary_location_issns"] <- "ISSN"
label[label %in% "primary_location_issn_l"] <- "ISSN_I"
label[label %in% "primary_location_landing_page_url"] <- "URL"
label[label %in% "primary_location_pdf_url"] <- "URL_PDF"
label[label %in% "author_ids"] <- "AU_ID"
label[label %in% "author_names"] <- "AU"
label[label %in% "author_orcids"] <- "OI"
label[label %in% "author_institution_names"] <- "C3"
label[label %in% "cited_by_count"] <- "TC"
label[label %in% "publication_year"] <- "PY"
label[label %in% "type"] <- "DT"
label[label %in% "biblio_issue"] <- "IS"
label[label %in% "biblio_volume"] <- "VL"
label[label %in% "referenced_works" ] <- "CR"
label[label %in% "keywords_keyword"] <- "DE"
label[label %in% "concepts_display_name"] <- "CONCEPTS"
label[label %in% "topics_display_name"] <- "TOPICS"
label[label %in% "sustainable_development_goals_display_name"] <- "SDG"
label[label %in% "primary_topic_field_display_name"] <- "SC"
label[label %in% "mesh_descriptor_name"] <- "MESH"
label[label %in% "referenced_works_count"] <- "NR"
label[label %in% "language"] <- "LA"
label[label %in% "authorships_author_position"] <- "AU_POSITION"
label[label %in% "authorships_raw_affiliation_string"] <- "C1"
label[label %in% "doi"] <- "DI"
names(DATA) <- label
return(DATA)
}
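A toy sketch of what the new importer does to a raw export — relabelling plus URL stripping — on a hypothetical four-column subset (a real OpenAlex CSV carries many more fields):

D <- data.frame(id = "https://openalex.org/W0000000001",
                display_name = "A sample work title",
                cited_by_count = "100",
                publication_year = "2017",
                stringsAsFactors = FALSE)
D <- relabelling(D)   # id -> id_oa, display_name -> TI, cited_by_count -> TC, publication_year -> PY
D$id_oa <- gsub("https://openalex.org/", "", D$id_oa)   # "W0000000001"
D$TC <- as.numeric(D$TC)
D$PY <- as.numeric(D$PY)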
93 changes: 2 additions & 91 deletions R/histNetwork.R
@@ -195,95 +195,6 @@ wos <- function(M, min.citations, sep, network, verbose){
return(results)
}

# scopus <- function(M, min.citations, sep, network, verbose){
#
# if (isTRUE(verbose)) {
# cat("\nSCOPUS DB: Searching local citations (LCS) by document titles (TI) and DOIs...\n")
# }
#
# if (!("SR_FULL" %in% names(M))) {
# M = metaTagExtraction(M, Field = "SR")
# }
#
# M$nCITING <- 1:nrow(M)
# papers <- M$nCITING[M$TC >= min.citations]
#
# TIpost <-
# paste(gsub("[[:punct:]]", "", M$TI[papers]), " ", M$PY[papers], " ", sep = "")
#
# CR <- gsub("[[:punct:]]", "", M$CR)
# n <- nchar(CR)
# n[is.na(n)] <- 2
# n <- n + 1
# nCum <- c(1, cumsum(n[-length(n)]))
# CR <- paste(CR, collapse = " ")
#
# #L <- str_locate_all(CR, TIpost)
# L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
#
# LCS <- lengths(L) / 2
#
# M$LCS <- 0
# M$LCS[papers] <- LCS
#
#
# ### HistData
# histData <- M %>%
# select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>%
# rename(
# Paper = .data$SR_FULL,
# Title = .data$TI,
# Author_Keywords = .data$DE,
# KeywordsPlus = .data$ID,
# DOI = .data$DI,
# Year = .data$PY,
# GCS = .data$TC
# ) %>%
# arrange(.data$Year) %>%
# dplyr::filter(.data$GCS>=min.citations) %>%
# as.data.frame()
#
#
# if (isTRUE(network)) {
# ## Network matrix
# df <- lapply(seq_along(L), function(i) {
# l <-
# data.frame(
# ref = L[[i]],
# paper = rep(papers[i], length(L[[i]][, 1]))
# )
# })
# df <- (do.call(rbind, df))
#
# A <- outer(df$ref.start, nCum, "-")
# A[A < 0] <- NA
# df$CITINGn <- unlist(apply(A, 1, which.min))
# df$CITING <- M$SR[df$CITINGn]
# df$CITED <- M$SR[df$paper]
# df <- df %>%
# dplyr::filter(.data$CITING %in% histData$Paper)
#
# NetMatrix <-
# (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T)))
# } else{
# NetMatrix = NULL
# }
#
# if (isTRUE(verbose)) {
# cat("\nFound",
# length(M$LCS[M$LCS > 0]),
# "documents with no empty Local Citations (LCS)\n")
# }
#
# results <-
# list(
# NetMatrix = NetMatrix,
# histData = histData,
# M = M,
# LCS = M$LCS
# )
# }

# New algorithm for Scopus
# Local citation matching is based on First Author, Year and PP
scopus <- function(M, min.citations, sep, network, verbose){
@@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){

openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){

M$CR[is.na(M$CR)] <- "none"
M$CR[is.na(M$CR) | M$CR==""] <- "none"
ids <- M$id_oa
CR <- strsplit(M$CR, ";")
CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>%
@@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, v
SRrow <- WLCR %>% select(.data$id_oa) %>%
left_join(M %>%
select(.data$id_oa, .data$SR),
by="id_oa")
by="id_oa")

SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>%
left_join(M %>%
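The one-line fix in openalex() matters because strsplit() turns an empty string into a zero-length vector, so a document with an empty CR field would silently drop out of the reference table built with rep(..., lengths(CR)). A quick sketch of the guard:

CR <- c("W1;W2", "", NA)
lengths(strsplit(CR, ";"))           # 2 0 1 -> rows no longer align with documents
CR[is.na(CR) | CR == ""] <- "none"
lengths(strsplit(CR, ";"))           # 2 1 1 -> exactly one entry per document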
2 changes: 1 addition & 1 deletion R/metaTagExtraction.R
@@ -405,7 +405,7 @@ AU_UN<-function(M,sep){
})
AFFL=unlist(AFFL)
M$AU_UN=AFFL
if (M$DB[1]=="ISI" & "C3" %in% names(M)){
if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)){
M$AU_UN[!is.na(M$C3) & M$C3!=""] <- M$C3[!is.na(M$C3) & M$C3!=""]
}
M$AU_UN=gsub("\\\\&","AND",M$AU_UN)
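A sketch of the extended branch on a one-row toy frame (values hypothetical): for WoS and now also OpenAlex records, a non-empty C3 field overrides the affiliation parsed from C1.

M <- data.frame(DB = "OPENALEX",
                AU_UN = "AFFILIATION PARSED FROM C1",
                C3 = "UNIVERSITY OF NAPLES FEDERICO II",
                stringsAsFactors = FALSE)
if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)) {
  M$AU_UN[!is.na(M$C3) & M$C3 != ""] <- M$C3[!is.na(M$C3) & M$C3 != ""]
}
M$AU_UN   # "UNIVERSITY OF NAPLES FEDERICO II"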
10 changes: 9 additions & 1 deletion inst/biblioshiny/server.R
@@ -281,7 +281,15 @@ To ensure the functionality of Biblioshiny,
})
})
},
-             openalex = {
+             openalex={
+               withProgress(message = 'Conversion in progress',
+                            value = 0, {
+                              M <- convert2df(inFile$datapath,
+                                              dbsource = input$dbsource,
+                                              format = "csv")
+                            })
+             },
+             openalex_api = {
M <- smart_load(inFile$datapath)
},
lens = {
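Outside Shiny, the two upload branches correspond to calls like these (file names hypothetical; the openalex_api branch restores a previously saved openalexR data frame, approximated here with load() in place of biblioshiny's smart_load()):

# 'openalex': a CSV export uploaded by the user
M <- convert2df("openalex_export.csv", dbsource = "openalex", format = "csv")

# 'openalex_api': an .RData file holding a data frame fetched with openalexR
load("openalex_works.RData")   # assumption: restores an object named 'works'
M <- convert2df(works, dbsource = "openalex_api", format = "api")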
3 changes: 2 additions & 1 deletion inst/biblioshiny/ui.R
@@ -254,7 +254,8 @@ body <- dashboardBody(
"Web of Science (WoS/WoK)" = "isi",
"Scopus" = "scopus",
"Dimensions" = "dimensions",
"OpenAlex (via openalexR)" = "openalex",
"Openalex" ="openalex",
"OpenAlex API (via openalexR)" = "openalex_api",
"Lens.org" = "lens",
"PubMed" = "pubmed",
"Cochrane Library" = "cochrane"