Fixes for non-ASCII name list fields in toBiblatex and toBibtex

* Add latexify() from dplR instead of tools::encoded_text_to_latex to improve conversion of non-ASCII characters to valid latex * Fixes #102, #105, #106 Signed-off-by: Mathew W. McLean <[email protected]>
ropensci · Oct 7, 2024 · 876ccea · 876ccea
1 parent 2d33c38
commit 876ccea
Show file tree

Hide file tree

Showing 8 changed files with 353 additions and 22 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,9 +1,11 @@
 Package: RefManageR
-Version: 1.4.0
+Version: 1.4.3
 Title: Straightforward 'BibTeX' and 'BibLaTeX' Bibliography Management
-Authors@R: person(c("Mathew", "W."), "McLean", role = c("aut", "cre"),
+Authors@R: c(person(c("Mathew", "W."), "McLean", role = c("aut", "cre"),
  email = "[email protected]",
- comment = c(ORCID = "0000-0002-7891-9645"))
+ comment = c(ORCID = "0000-0002-7891-9645")),
+ person("Andy", "Bunn", role = "ctb",
+ email = "[email protected]", comment = "function latexify used by toBiblatex"))
 Maintainer: Mathew W. McLean <[email protected]>
 Description: Provides tools for importing and working with bibliographic
  references. It greatly enhances the 'bibentry' class by providing a class
@@ -27,6 +29,8 @@ Imports:
  httr,
  lubridate (>= 1.5.0),
  stringr,
+ stringi,
+ R.utils,
  methods,
  bibtex (>= 0.4.1)
 Suggests:
@@ -39,4 +43,4 @@ Depends:
 VignetteBuilder: knitr
 BugReports: https://github.com/ropensci/RefManageR/issues
 URL: https://github.com/ropensci/RefManageR/
-RoxygenNote: 7.2.1
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
@@ -49,6 +49,7 @@ export(as.BibEntry)
 export(fields)
 export(is.BibEntry)
 export(toBiblatex)
+importFrom(R.utils,captureOutput)
 importFrom(bibtex,do_read_bib)
 importFrom(httr,GET)
 importFrom(httr,POST)
@@ -73,6 +74,9 @@ importFrom(methods,hasArg)
 importFrom(plyr,llply)
 importFrom(plyr,progress_text)
 importFrom(stats,setNames)
+importFrom(stringi,stri_trans_nfc)
+importFrom(stringi,stri_trans_nfd)
+importFrom(stringi,stri_unescape_unicode)
 importFrom(stringr,str_length)
 importFrom(stringr,str_sub)
 importFrom(stringr,str_trim)
@@ -82,7 +86,6 @@ importFrom(tools,Rd2txt)
 importFrom(tools,Rd2txt_options)
 importFrom(tools,bibstyle)
 importFrom(tools,deparseLatex)
-importFrom(tools,encoded_text_to_latex)
 importFrom(tools,getBibstyle)
 importFrom(tools,latexToUtf8)
 importFrom(tools,loadPkgRdMacros)

diff --git a/R/WriteBib.R b/R/WriteBib.R
@@ -21,7 +21,6 @@
 #' @note To write the contents of \code{bib} \dQuote{as is}, the argument
 #' \code{biblatex} should be \code{TRUE}, otherwise
 #' conversion is done as in \code{\link{toBibtex.BibEntry}}.
-#' @importFrom tools encoded_text_to_latex
 #' @author McLean, M. W. based on \code{write.bib} by Gaujoux, R.
 #' in package \code{bibtex}.
 #' @export

diff --git a/R/toBiblatex.R b/R/toBiblatex.R
@@ -10,6 +10,9 @@
 #' @param extra.fields character vector; fields that are not supported in standard BibTeX styles are by default dropped
 #' in the result return by the toBibtex function.
 #' Any fields specified in extra.fields will \emph{not} be dropped if present in an entry.
+#' @param encoded.names.to.latex if \code{TRUE} (the default) then name list fields
+#' such as \sQuote{author} and \sQuote{editor} will have non-ASCII characters
+#' translated to LaTeX escape sequences by \code{\link[dplR]{latexify}}.
 #' @param ... ignored
 #' @export
 #' @return an object of class \dQuote{Bibtex} - character vectors where each element holds one line of a BibTeX or BibLaTeX file
@@ -50,7 +53,7 @@
 #' }
 #' @seealso \code{\link{toBibtex}}, \code{\link{BibEntry}}, \code{\link{print.BibEntry}}
 #' @author McLean, M. W. \email{mathew.w.mclean@@gmail.com}
-#' @importFrom tools encoded_text_to_latex parseLatex deparseLatex latexToUtf8
+#' @importFrom tools parseLatex deparseLatex latexToUtf8
 #' @keywords database IO utilities
 #' @aliases toBibtex.BibEntry toBibtex
 #' @examples
@@ -60,14 +63,16 @@
 #' toBiblatex(bib[70:72])
 #' toBibtex(bib[70:72])
 #' }
-toBiblatex <- function(object, ...){
+toBiblatex <- function(object, encoded.names.to.latex = TRUE, ...){
  format_bibentry1 <- function(object) {
  object <- unclass(object)[[1L]]
  rval <- paste0("@", attr(object, "bibtype"), "{", attr(object,
  "key"), ",")
- nl.ind <- which(names(object) %in% .BibEntryNameList)
- for (i in nl.ind)
- object[i] <- encoded_text_to_latex(format_author(object[[i]]), "UTF-8")
+ if (encoded.names.to.latex) {
+ nl.ind <- which(names(object) %in% .BibEntryNameList)
+ for (i in nl.ind)
+ object[i] <- EncodedNameListToLaTeX(object[[i]])
+ }
  rval <- c(rval, vapply(names(object), function(n) paste0(" ",
  n, " = {", object[[n]], "},"), ""), "}", "")
  return(rval)
@@ -81,3 +86,214 @@ toBiblatex <- function(object, ...){
  class(rval) <- "Bibtex"
  rval
 }
+
+#' Wrapper for dplr::latexify that returns original
+#' text if translation to LaTeX fails
+#' @noRd
+#' @seealso \url{https://github.com/ropensci/RefManageR/issues/106}
+EncodedNameListToLaTeX <- function(name.list, encoding = "UTF-8")
+{
+ formatted.text <- format_author(name.list)
+ out <- latexify(formatted.text, encoding, doublebackslash = FALSE)
+ if (grepl("^[{]?[?]", out))
+ return(formatted.text)
+ return(out)
+}
+
+#' Modified version of dplR::latexify
+#' @importFrom stringi stri_trans_nfc stri_trans_nfd stri_unescape_unicode
+#' @importFrom R.utils captureOutput
+#' @author Andy Bunn
+latexify <- function(x, 
+ doublebackslash = FALSE, 
+ dashdash = TRUE, 
+ quotes = c("straight", "curved"), 
+ packages = c("fontenc", "textcomp")) 
+{
+ y <- as.character(x)
+ encBytes <- Encoding(y) == "bytes"
+ if (any(encBytes)) {
+ y[encBytes] <- captureOutput(cat(y[encBytes], sep = "\n"))
+ }
+ y <- stri_trans_nfd(y)
+ Letters <- paste0(c(LETTERS, letters), collapse = "")
+ fontenc <- "fontenc" %in% packages
+ textcomp <- "textcomp" %in% packages
+ eurosym <- "eurosym" %in% packages
+ straightQuotes <- match.arg(quotes) == "straight"
+ y <- gsub("(?![[:space:]])[[:cntrl:]]", "", y, perl = TRUE)
+ y <- gsub("[[:space:]]+", " ", y)
+ substitutions <- list(c("\\\\^", "\\\\textasciicircum{}"),
+ c("~", "\\\\textasciitilde{}"), c("<", "\\\\textless{}"),
+ c(">", "\\\\textgreater{}"), c("\\\\|", "\\\\textbar{}"),
+ c("([#$%&_])", "\\\\\\1"), if (isTRUE(dashdash)) {
+ c("-", "\\\\mbox{-}")
+ }, if (textcomp && straightQuotes) {
+ c("'", "\\\\textquotesingle{}")
+ }, if (textcomp && straightQuotes) {
+ c("`", "\\\\textasciigrave{}")
+ }, c("\"", if (fontenc && straightQuotes) {
+ "\\\\textquotedbl{}"
+ } else {
+ "\\\\textquotedblright{}"
+ }), c("/", "\\\\slash{}"))
+ substitutions <- substitutions[!vapply(substitutions, is.null, 
+ logical(1))]
+ substitutions <- c(substitutions, list(c("\\u0132", "\\\\IJ{}"),
+ c("\\u0133", "\\\\ij{}"),
+ c("\\u01f1", "DZ"), c("\\u01f2", "Dz"), 
+ c("\\u01f3", "dz"), c("\\u01c4", "DZ\\u033c"),
+ c("\\u01c5", "Dz\\u030c"),
+ c("\\u01c6", "dz\\u030c"),
+ c("\\u01c7", "LJ"), c("\\u01c8", "Lj"),
+ c("\\u01c9", "lj"), c("\\u01ca", "NJ"),
+ c("\\u01cb", "Nj"), c("\\u01cc", "nj"),
+ c("\\ufb00", "ff"), c("\\ufb01", "fi"),
+ c("\\ufb02", "fl"), c("\\ufb03", "ffi"),
+ c("\\ufb04", "ffl"), c("\\ufb05", "\\u017ft"),
+ c("\\ufb06", "st")))
+ above <- list(diaeresis = c("\\u0308", "\""), acute = c("\\u0301", "'"),
+ dotabove = c("\\u0307", "."), macron = c("\\u0304", "="),
+ circumflex = c("\\u0302", "^"), grave = c("\\u0300", "`"),
+ tilde = c("\\u0303", "~"), doubleacute = c("\\u030b", "H"),
+ ringabove = c("\\u030a", "r"), breve = c("\\u0306", "u"),
+ caron = c("\\u030c", "v"), invbreve = c("\\u0311", "newtie"))
+
+ below <- list(macronbelow = c("\\u0331", "b"), cedilla = c("\\u0327", "c"),
+ dotbelow = c("\\u0323", "d"), tie = c("\\u0361", "t"),
+ ogonek = c("\\u0328", "k"))
+ accents <- c(above, below)
+ command <- paste0("\\\\[", Letters, "]+|\\\\.")
+ combining <- paste0(vapply(accents, "[", character(1), 1), 
+ collapse = "")
+ accPre <- paste0("(", command, "|.)({})?(?<![", combining, 
+ "])")
+ accPost <- paste0("(?![", combining, "])")
+ aboveInCode <- vapply(above, "[", character(1), 1)
+ ijPattern <- paste0("([ij])", aboveInCode, accPost)
+ otherPattern <- paste0(accPre, aboveInCode, accPost)
+ aboveOutCode <- vapply(above, "[", character(1), 2)
+ ijReplacement <- paste0("\\\\", aboveOutCode, "{\\\\\\1}")
+ otherReplacement <- paste0("\\\\", aboveOutCode, "{\\1}")
+ belowInCode <- vapply(below, "[", character(1), 1)
+ belowPattern <- paste0(accPre, belowInCode, accPost)
+ belowOutCode <- vapply(below, "[", character(1), 2)
+ belowReplacement <- paste0("\\\\", belowOutCode, "{\\1}")
+ circPre <- paste0("(", command, "({([^}]|\\\\})+})?|.)({})?")
+ circPattern <- paste0(circPre, "\\u20dd", accPost)
+ circReplacement <- "\\\\textcircled{\\1}"
+ substitutions <- c(substitutions, lapply(lapply(mapply(list, 
+ list(as.name("c")), 
+ c(ijPattern, otherPattern, 
+ belowPattern, circPattern), 
+ c(ijReplacement, otherReplacement, 
+ belowReplacement, circReplacement), 
+ SIMPLIFY = FALSE), 
+ as.call), eval))
+ substitutions <- c(substitutions, list(c("\\u00a1", "\\\\textexclamdown{}"),
+ c("\\u00a3", "\\\\pounds{}"),
+ c("\\u00a7", "\\\\S{}"),
+ c("\\u00a9", "\\\\copyright{}"),
+ c("\\u00aa", "\\\\textordfeminine{}"), c("\\u00ae", "\\\\textregistered{}"),
+ c("\\u00b6", "\\\\P{}"), c("\\u00b7", "\\\\textperiodcentered{}"),
+ c("\\u00ba", "\\\\textordmasculine{}"), c("\\u00bf", "\\\\textquestiondown{}"),
+ c("\\u2013", "\\\\textendash{}"), c("\\u2014", "\\\\textemdash{}"),
+ c("\\u2018", "\\\\textquoteleft{}"), c("\\u2019", "\\\\textquoteright{}"),
+ c("\\u201c", "\\\\textquotedblleft{}"), c("\\u201d", "\\\\textquotedblright{}"),
+ c("\\u2020", "\\\\dag{}"), c("\\u2021", "\\\\ddag{}"),
+ c("\\u2022", "\\\\textbullet{}"), c("\\u2026", "\\\\dots{}"),
+ c("\\u2122", "\\\\texttrademark{}"), c("\\u2423", "\\\\textvisiblespace{}"),
+ c("\\u00c6", "\\\\AE{}"), c("\\u00e6", "\\\\ae{}"), c("\\u0152", "\\\\OE{}"),
+ c("\\u0153", "\\\\oe{}"), c("\\u00d8", "\\\\O{}"), c("\\u00f8", "\\\\o{}"),
+ c("\\u0141", "\\\\L{}"), c("\\u0142", "\\\\l{}"),
+ c("\\u1e9e", "\\\\ifdefined\\\\XeTeXrevision\\\\iffontchar\\\\font\"1E9E\\\\symbol{\"1E9E}\\\\else\\\\SS\\\\fi\\\\else\\\\ifdefined\\\\directlua\\\\iffontchar\\\\font\"1E9E\\\\symbol{\"1E9E}\\\\else\\\\SS\\\\fi\\\\else\\\\SS\\\\fi\\\\fi{}"),
+ c("\\u00df", "\\\\ss{}"), c("\\u017f", "\\\\ifdefined\\\\XeTeXrevision\\\\symbol{\"017F}\\\\else\\\\ifdefined\\\\directlua\\\\symbol{\"017F}\\\\else{\\\\fontencoding{TS1}\\\\selectfont s}\\\\fi\\\\fi{}")))
+ substitutions <- c(substitutions, list(c("\\u00d0", "\\\\DH{}"), c("\\u00f0", "\\\\dh{}"),
+ c("\\u0110", "\\\\DJ{}"), c("\\u0111", "\\\\dj{}"),
+ c("\\u014a", "\\\\NG{}"),
+ c("\\u014b", "\\\\ng{}"), c("\\u00de", "\\\\TH{}"),
+ c("\\u00fe", "\\\\th{}"),
+ c("\\u00ab", "\\\\guillemotleft{}"),
+ c("\\u00bb", "\\\\guillemotright{}"),
+ c("\\u201a", "\\\\quotesinglbase{}"),
+ c("\\u201e", "\\\\quotedblbase{}"),
+ c("\\u2039", "\\\\guilsinglleft{}"),
+ c("\\u203a", "\\\\guilsinglright{}")))
+ substitutions <- c(substitutions, list(c("\\u00ad", "\\\\-"), c("\\u200b", "\\\\hspace{0pt}"),
+ c("\\u2217", "\\\\textasteriskcentered{}"),
+ c("\\u2016", "\\\\textbardbl{}"),
+ c("\\u25ef", "\\\\textbigcircle{}"),
+ c("\\u2422", "\\\\textblank{}"),
+ c("\\u00a6", "\\\\textbrokenbar{}"),
+ c("\\u2052", "\\\\textdiscount{}"),
+ c("\\u212e", "\\\\textestimated{}"),
+ c("\\u203d", "\\\\textinterrobang{}"),
+ c("\\u2e18", "\\\\textinterrobangdown{}"),
+ c("\\u2116", "\\\\textnumero{}"),
+ c("\\u25e6", "\\\\textopenbullet{}"),
+ c("\\u2030", "\\\\textperthousand{}"),
+ c("\\u2031", "\\\\textpertenthousand{}"),
+ c("\\u211e", "\\\\textrecipe{}"),
+ c("\\u203b", "\\\\textreferencemark{}"),
+ c("\\u02f7", "\\\\texttildelow{}"),
+ c("\\u2190", "\\\\textleftarrow{}"),
+ c("\\u2191", "\\\\textuparrow{}"),
+ c("\\u2192", "\\\\textrightarrow{}"),
+ c("\\u2193", "\\\\textdownarrow{}"),
+ c("\\u3008", "\\\\textlangle{}"),
+ c("\\u3009", "\\\\textrangle{}"),
+ c("\\u301a", "\\\\textlbrackdbl{}"),
+ c("\\u301b", "\\\\textrbrackdbl{}"),
+ c("\\u2045", "\\\\textlquill{}"),
+ c("\\u2046", "\\\\textrquill{}"),
+ c("\\u2117", "\\\\textcircledP{}"),
+ c("\\u2120", "\\\\textservicemark{}"),
+ c("\\u2103", "\\\\textcelsius{}"),
+ c("\\u2127", "\\\\textmho{}"),
+ c("\\u00b5", "\\\\textmu{}"),
+ c("\\u03a9", "\\\\textohm{}"),
+ c("\\u0e3f", "\\\\textbaht{}"),
+ c("\\u00a2", "\\\\textcent{}"),
+ c("\\u20a1", "\\\\textcolonmonetary{}"),
+ c("\\u00a4", "\\\\textcurrency{}"),
+ c("\\u20ab", "\\\\textdong{}"),
+ c("\\u20ac", "\\\\texteuro{}"),
+ c("\\u20b2", "\\\\textguarani{}"),
+ c("\\u20a4", "\\\\textlira{}"),
+ c("\\u20a6", "\\\\textnaira{}"),
+ c("\\u20b1", "\\\\textpeso{}"),
+ c("\\u20a9", "\\\\textwon{}"),
+ c("\\u00a5", "\\\\textyen{}"),
+ c("\\u02dd", "\\\\textacutedbl{}"),
+ c("\\u00b4", "\\\\textasciiacute{}"),
+ c("\\u00b8", "\\\\c{}"),
+ c("\\u02d8", "\\\\textasciibreve{}"),
+ c("\\u02c7", "\\\\textasciicaron{}"),
+ c("\\u00a8", "\\\\textasciidieresis{}"),
+ c("\\u00af", "\\\\textasciimacron{}"),
+ c("\\u00b0", "\\\\textdegree{}"),
+ c("\\u00f7", "\\\\textdiv{}"),
+ c("\\u00bc", "\\\\textonequarter{}"),
+ c("\\u00bd", "\\\\textonehalf{}"),
+ c("\\u00be", "\\\\textthreequarters{}"),
+ c("\\u00d7", "\\\\texttimes{}"),
+ c("\\u00b1", "\\\\textpm{}"),
+ c("\\u00b9", "\\\\textonesuperior{}"),
+ c("\\u00b2", "\\\\texttwosuperior{}"),
+ c("\\u00b3", "\\\\textthreesuperior{}"),
+ c("\\u2044", "\\\\textfractionsolidus{}"),
+ c("\\u221a", "\\\\textsurd{}"),
+ c("\\u00ac", "\\\\textlnot{}"),
+ c("\\u2212", "\\\\textminus{}")))
+ tmp <- paste0("(\\\\[", Letters, "]+){}")
+ substitutions <- c(substitutions, list(c(paste0(tmp, "(?=$|[[:digit:],.?!;:\\\\}+*/-])"),
+ "\\1"), c(paste0(tmp, "(?! )"), "\\1 ")))
+ for (subst in substitutions) {
+ y <- gsub(stri_unescape_unicode(subst[1]), subst[2], y, perl = TRUE)
+ }
+ if (isTRUE(doublebackslash)) {
+ y <- gsub("\\", "\\\\", y, fixed = TRUE)
+ }
+ stri_trans_nfc(y)
+}
+
diff --git a/R/toBibtex.R b/R/toBibtex.R
@@ -5,14 +5,17 @@
 toBibtex.BibEntry <- function(object,
  note.replace.field = c('urldate', "pubsate",
  "addendum"),
- extra.fields = NULL, ...){
+ extra.fields = NULL, 
+ encoded.names.to.latex = TRUE,
+ ...){
 
 
  object <- .BibEntry_expand_crossrefs(unclass(object), to.bibtex = TRUE)
  if (length(object)) {
  object$.index <- NULL
  rval <- head(unlist(lapply(object, ConvertToBibtex,
- note.replace.field, extra.fields)), 
+ note.replace.field, extra.fields, 
+ encoded.names.to.latex)), 
  -1L)
  }
  else rval <- character()
@@ -21,18 +24,20 @@ toBibtex.BibEntry <- function(object,
 }
 
 #' @noRd
-ConvertToBibtex <- function(object, note.replace.field, extra.fields){
+ConvertToBibtex <- function(object,
+ note.replace.field,
+ extra.fields,
+ encoded.names.to.latex) {
  object <- unclass(object)[[1L]]
  bibtype <- tolower(attr(object, "bibtype"))
  obj.names <- names(object)
- if ("author" %in% obj.names)
- object$author <- encoded_text_to_latex(format_author(object$author),
-  "UTF-8")
- if ("editor" %in% obj.names)
- object$editor <- encoded_text_to_latex(format_author(object$editor),
-  "UTF-8")
+ if (encoded.names.to.latex) {
+ if ("author" %in% obj.names)
+ object$author <- EncodedNameListToLaTeX(object$author)
+  if ("editor" %in% obj.names)
+  object$editor <- EncodedNameListToLaTeX(object$editor)
+ }
  # see 2.3 Usage Notes p. 28
-
  if (bibtype == "article" && 'journaltitle' %in% obj.names &&
  is.null(object$journal))
  object$journal <- object$journaltitle