From b45845fbea02225291e93be826fb46188d1dad35 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 18 Apr 2024 12:38:19 -0700 Subject: [PATCH 01/37] testing attribute-based protection [skip ci] I've modified the escape-text function escape text based on wether or not it exists in an escapable range. This commit implements a proof of concept that protects the first escapable character and will not pass check. --- R/asis-nodes.R | 22 ++++++++++--- inst/stylesheets/xml2md_gfm.xsl | 58 +++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index e3264dc..353883b 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -121,11 +121,12 @@ protect_inline_math <- function(body, ns) { # protect math that is strictly inline if (length(imath)) { - new_nodes <- purrr::map(imath, fix_fully_inline) + purrr::map(imath, label_fully_inline) + # new_nodes <- purrr::map(imath, fix_fully_inline) # since we split up the nodes, we have to do this node by node - for (i in seq(new_nodes)) { - add_node_siblings(imath[[i]], new_nodes[[i]], remove = TRUE) - } + # for (i in seq(new_nodes)) { + # add_node_siblings(imath[[i]], new_nodes[[i]], remove = TRUE) + # } } # protect math that is broken across lines or markdown elements @@ -205,6 +206,19 @@ fix_fully_inline <- function(math) { make_text_nodes(char) } +label_fully_inline <- function(math) { + char <- xml2::xml_text(math) + locations <- gregexpr(pattern = inline_dollars_regex("full"), + char, + perl = TRUE + ) + pos <- paste(locations[[1]], collapse = " ") + len <- paste(attr(locations[[1]], "match.len"), collapse = " ") + xml2::xml_set_attr(math, "protect.pos", pos) + xml2::xml_set_attr(math, "protect.len", len) + +} + #' Transform a character vector of XML into text nodes #' #' This is useful in the case where we want to modify some text content to diff --git a/inst/stylesheets/xml2md_gfm.xsl b/inst/stylesheets/xml2md_gfm.xsl index 54816bc..6ea95c4 100644 --- a/inst/stylesheets/xml2md_gfm.xsl +++ b/inst/stylesheets/xml2md_gfm.xsl @@ -8,6 +8,7 @@ + @@ -16,8 +17,65 @@ + + + + + + + + + + + + + + + + + + + safe: + + length: + + position: + + range: + + .. + + translated: + + + + + + \ + + + + + + + + + + + + + + + + + + + + + + - + + + + ===>Escaping + + \ + + - + + + @@ -72,7 +158,7 @@ - + From f1c57ce4bfc2d0250c3d21c48d635ebc2f34a473 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Fri, 19 Apr 2024 09:33:50 -0700 Subject: [PATCH 03/37] move helper templates; document; remove comments [skip ci] --- inst/stylesheets/tinkr-helpers.xsl | 136 +++++++++++++++++++++++++++++ inst/stylesheets/xml2md_gfm.xsl | 136 +---------------------------- 2 files changed, 137 insertions(+), 135 deletions(-) create mode 100644 inst/stylesheets/tinkr-helpers.xsl diff --git a/inst/stylesheets/tinkr-helpers.xsl b/inst/stylesheets/tinkr-helpers.xsl new file mode 100644 index 0000000..18fb060 --- /dev/null +++ b/inst/stylesheets/tinkr-helpers.xsl @@ -0,0 +1,136 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ + + + + + + + + + + + + + + + + + + + + diff --git a/inst/stylesheets/xml2md_gfm.xsl b/inst/stylesheets/xml2md_gfm.xsl index 44376c9..8710940 100644 --- a/inst/stylesheets/xml2md_gfm.xsl +++ b/inst/stylesheets/xml2md_gfm.xsl @@ -8,6 +8,7 @@ + @@ -17,141 +18,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - safe: - - length: - - position: - - range: - - .. - - positions: - ( - - ) - ( - - ) - translated: - - - - - - - - - ===>Escaping - - - \ - - - - - - - - - - - - - - - - - - From 30414604f81bd0d565441057c249f548d4ed2d6f Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 25 Apr 2024 12:59:58 -0700 Subject: [PATCH 04/37] [xml] use embedded str:tokenize function via EXSLT I had initially found a tokenize template and had contacted the author about license information (she gave permission): When I was working with it, I found that the function exists as part of libxml because it bundles EXSLT functions, which allows me to do this easier and more efficient by tracking and modifying a single index instead of a pair of strings. --- inst/stylesheets/tinkr-helpers.xsl | 111 ++++++----------------------- inst/stylesheets/xml2md_gfm.xsl | 12 ++-- 2 files changed, 30 insertions(+), 93 deletions(-) diff --git a/inst/stylesheets/tinkr-helpers.xsl b/inst/stylesheets/tinkr-helpers.xsl index 18fb060..c9652d8 100644 --- a/inst/stylesheets/tinkr-helpers.xsl +++ b/inst/stylesheets/tinkr-helpers.xsl @@ -2,60 +2,9 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -63,53 +12,36 @@ + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + - + + + @@ -123,8 +55,9 @@ - - + + + diff --git a/inst/stylesheets/xml2md_gfm.xsl b/inst/stylesheets/xml2md_gfm.xsl index 8710940..2f1aeb9 100644 --- a/inst/stylesheets/xml2md_gfm.xsl +++ b/inst/stylesheets/xml2md_gfm.xsl @@ -2,6 +2,7 @@ @@ -18,13 +19,16 @@ - - + + + - - + + From 31e5a0fde3b1f6d97f88f3aaf720b4f588f6dafc Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 25 Apr 2024 13:15:36 -0700 Subject: [PATCH 05/37] update comments --- inst/stylesheets/tinkr-helpers.xsl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/inst/stylesheets/tinkr-helpers.xsl b/inst/stylesheets/tinkr-helpers.xsl index c9652d8..b7ddf59 100644 --- a/inst/stylesheets/tinkr-helpers.xsl +++ b/inst/stylesheets/tinkr-helpers.xsl @@ -12,10 +12,11 @@ + - + - + @@ -26,11 +27,15 @@ + + + + @@ -42,6 +47,7 @@ + @@ -49,6 +55,7 @@ + From 90561ef7725d587ab020bd05fd67e31f7971ac77 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Fri, 26 Apr 2024 11:53:42 -0700 Subject: [PATCH 06/37] add range updator --- R/asis-nodes.R | 13 ----- R/node-protection.R | 71 +++++++++++++++++++++++++++ man/protect_unescaped.Rd | 2 +- tests/testthat/test-node-protection.R | 69 ++++++++++++++++++++++++++ 4 files changed, 141 insertions(+), 14 deletions(-) create mode 100644 R/node-protection.R create mode 100644 tests/testthat/test-node-protection.R diff --git a/R/asis-nodes.R b/R/asis-nodes.R index fc1138c..5ba5c70 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -206,19 +206,6 @@ fix_fully_inline <- function(math) { make_text_nodes(char) } -label_fully_inline <- function(math) { - char <- xml2::xml_text(math) - locations <- gregexpr(pattern = inline_dollars_regex("full"), - char, - perl = TRUE - ) - pos <- locations[[1]] - len <- attr(locations[[1]], "match.len") - xml2::xml_set_attr(math, "protect.pos", paste(pos, collapse = " ")) - xml2::xml_set_attr(math, "protect.end", paste(pos + len, collapse = " ")) - -} - #' Transform a character vector of XML into text nodes #' #' This is useful in the case where we want to modify some text content to diff --git a/R/node-protection.R b/R/node-protection.R new file mode 100644 index 0000000..d52715a --- /dev/null +++ b/R/node-protection.R @@ -0,0 +1,71 @@ +label_fully_inline <- function(math) { + char <- xml2::xml_text(math) + locations <- gregexpr(pattern = inline_dollars_regex("full"), + char, + perl = TRUE + ) + add_protection(math, locations) +} + +add_protection <- function(node, locations) { + start <- locations[[1]] + end <- start + attr(locations[[1]], "match.len") + if (xml2::xml_has_attr(node, "protect.pos")) { + # extract the ranges from the attributes + ostart <- strsplit(xml2::xml_attr(node, "protect.pos"), " ")[[1]] + oend <- strsplit(xml2::xml_attr(node, "protect.end"), " ")[[1]] + # update the ranges and the variables + new_ranges <- update_ranges( + start = c(as.integer(ostart), start), + end = c(as.integer(oend), end), + ) + start <- new_ranges$start + end <- new_ranges$end + } + xml2::xml_set_attr(node, "protect.pos", paste(start, collapse = " ")) + xml2::xml_set_attr(node, "protect.end", paste(end, collapse = " ")) +} + +inrange <- function(s1, e1, s2, e2) { + s1 <= s2 & e1 >= e2 +} + +overlap <- function(s1, e1, s2, e2) { + s1 <= e2 & s2 <= e1 +} + + +# https://www.geeksforgeeks.org/merging-intervals/ +update_ranges <- function(start, end) { + # Sort the intervals based on the increasing order of starting time. + ord <- order(start) + start <- start[ord] + end <- end[ord] + n <- length(start) + nstart <- integer(n) + nend <- integer(n) + # Push the first interval into a stack. + nstart[1] <- start[1] + nend[1] <- end[1] + i <- 2 + j <- 1 + while (i <= n) { + # For each interval do the following: + if (overlap(nstart[j], nend[j], start[i], end[i])) { + # If the current interval overlap with the top of the stack then, + # update the stack top with the ending time of the current interval. + nend[j] <- max(nend[j], end[i]) + } else { + # If the current interval does not overlap with the top of the stack + # then, push the current interval into the stack. + j <- j + 1 + nstart[j] <- start[i] + nend[j] <- end[i] + } + i <- i + 1 + # The end stack contains the merged intervals. + } + keep <- seq(j) + return(list(start = nstart[keep], end = nend[keep])) + +} diff --git a/man/protect_unescaped.Rd b/man/protect_unescaped.Rd index 6d31652..44a9de4 100644 --- a/man/protect_unescaped.Rd +++ b/man/protect_unescaped.Rd @@ -79,7 +79,7 @@ md <- yarn$new(f, sourcepos = TRUE, unescaped = FALSE) md$show() if (requireNamespace("withr")) { lines <- readLines(f)[-length(md$yaml)] -lnks <- withr::with_namespace("tinkr", +lnks <- withr::with_namespace("tinkr", protect_unescaped(body = md$body, txt = lines)) md$body <- lnks md$show() diff --git a/tests/testthat/test-node-protection.R b/tests/testthat/test-node-protection.R new file mode 100644 index 0000000..93c8650 --- /dev/null +++ b/tests/testthat/test-node-protection.R @@ -0,0 +1,69 @@ + +test_that("overlap returns false for separate overlaps", { + expect_false(overlap(1, 3, 5, 8)) + expect_false(overlap(5, 8, 1, 3)) + expect_false(overlap(1, 1, 5, 5)) +}) +test_that("overlap returns true for separate overlaps", { + expect_true(overlap(1, 8, 5, 13)) + expect_true(overlap(5, 13, 1, 8)) + expect_true(overlap(5, 9, 1, 10)) + expect_true(overlap(1, 10, 5, 9)) +}) + + +test_that("update_ranges will add non-overlapping ranges", { + a = list( + start = c(1, 5, 10), + end = c(3, 8, 50) + ) + b = list( + start = c(100, 500, 1000), + end = c(300, 800, 5000) + ) + expect_equal(update_ranges(c(b$start, a$start), c(b$end, a$end)), + list(start = c(a$start, b$start), end = c(a$end, b$end)) + ) +}) + +test_that("update_ranges will merge overlapping ranges", { + # in this scenario, we get the ranges from (a) [5, 8], [10, 50], + # joined by the first range from (b) [5, 35] + # which ends up as [5, 50] + a = list( + start = c(1, 5, 10), + end = c(3, 8, 50) + ) + b = list( + start = c(5, 100, 500, 1000), + end = c(35, 300, 800, 5000) + ) + expect_equal(update_ranges(c(b$start, a$start), c(b$end, a$end)), + list(start = c(a$start[-3], b$start[-1]), end = c(a$end[-2], b$end[-1])) + ) +}) + + +test_that("update_ranges will merge all overlapping ranges", { + a = list( + start = c(1, 5, 10), + end = c(3, 8, 50) + ) + b = list( + start = c(100, 500, 1000), + end = c(300, 800, 5000) + ) + c = list( + start = 1, + end = 2500 + ) + expect_equal( + update_ranges( + c(b$start, a$start, c$start), + c(b$end, a$end, c$end) + ), + list(start = 1, end = 5000) + ) + +}) + From a7b33c8b812c955046ddea64aed709b383767bc8 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 10:01:43 -0700 Subject: [PATCH 07/37] update math protection to use labels. --- R/asis-nodes.R | 85 ++++++++++++++++++++-------------- R/node-protection.R | 108 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 134 insertions(+), 59 deletions(-) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index 5ba5c70..1bfaa86 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -104,7 +104,6 @@ find_broken_math <- function(math) { #' cat(tinkr::to_md(list(body = protxt, yaml = "")), sep = "\n") #' } protect_inline_math <- function(body, ns) { - body <- copy_xml(body) math <- find_inline_math(body, ns) if (length(math) == 0) { return(body) @@ -120,17 +119,12 @@ protect_inline_math <- function(body, ns) { bmath <- math[!bespoke] # protect math that is strictly inline - if (length(imath)) { - purrr::map(imath, label_fully_inline) - # new_nodes <- purrr::map(imath, fix_fully_inline) - # since we split up the nodes, we have to do this node by node - # for (i in seq(new_nodes)) { - # add_node_siblings(imath[[i]], new_nodes[[i]], remove = TRUE) - # } + if (length(imath) > 0L) { + purrr::walk(imath, label_fully_inline) } # protect math that is broken across lines or markdown elements - if (length(bmath)) { + if (length(bmath) > 0L) { if (any(broke$ambiguous)) { # ambiguous math may be due to inline r code that produces an answer: # $R^2 = `r runif(1)`$ @@ -156,7 +150,7 @@ protect_inline_math <- function(body, ns) { fix_partial_inline(i, body, ns) } } - copy_xml(body) + body } # Partial inline math are math elements that are not entirely embedded in a @@ -172,38 +166,61 @@ fix_partial_inline <- function(tag, body, ns) { # find everything between the tagged pair math_lines <- find_between_inlines(body, ns, tag) # make sure everything between the tagged pair is labeled as 'asis' + # this is explicitly for symbols like `_`, which denote subscripts in LaTeX, + # but make _emph_ text in markdown. filling <- math_lines[is.na(xml2::xml_attr(math_lines, "latex-pair"))] set_asis(filling) filling <- xml2::xml_find_all(filling, ".//node()") set_asis(filling) - # paste the lines together and create new nodes - n <- length(math_lines) - char <- as.character(math_lines) - char[[1]] <- sub("[$]", "$", char[[1]]) - char[[n]] <- sub("[<]text ", "this is $\LaTeX$ text - # becomes - # this is $\LaTeX$ text - char <- gsub( - pattern = inline_dollars_regex("full"), - replacement = "\\1", - x = char, +label_partial_inline <- function(math) { + char <- xml2::xml_text(math) + # find lines that begin with `$` but do not have an end. + start <- gregexpr(inline_dollars_regex("start"), + char, + perl = TRUE + ) + # find lines that end with `$` but do not have a beginning. + stop <- gregexpr(inline_dollars_regex("stop"), + char, + perl = TRUE + ) + has_start <- start[[1]][1] > 0 + has_end <- stop[[1]][1] > 0 + if (has_start) { + # if the line contains the beginning of an inline math fragment, + # we start at the match and end at the end of the string + begin <- start[[1]] + end <- nchar(char) + } else if (has_end) { + # if the line contains the end of an inline math fragment, + # we start at the beginning of the string and end at the end of the match + begin <- 1 + end <- stop[[1]] + attr(stop[[1]], "match.len") + } else { + # otherwise, the entire range should be protected. + begin <- 1 + end <- nchar(char) + } + add_protected_ranges(math, begin, end) +} + +label_fully_inline <- function(math) { + char <- xml2::xml_text(math) + # Find the locations of inline math that is complete + locations <- gregexpr(pattern = inline_dollars_regex("full"), + char, perl = TRUE ) - make_text_nodes(char) + # add the ranges to the attributes + # this is $\LaTeX$ text + # becomes + # this is $\LaTeX$ text + start <- locations[[1]] + end <- start + attr(locations[[1]], "match.len") + add_protected_ranges(math, start, end) } #' Transform a character vector of XML into text nodes diff --git a/R/node-protection.R b/R/node-protection.R index d52715a..654b162 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -1,41 +1,99 @@ -label_fully_inline <- function(math) { - char <- xml2::xml_text(math) - locations <- gregexpr(pattern = inline_dollars_regex("full"), - char, - perl = TRUE - ) - add_protection(math, locations) -} - -add_protection <- function(node, locations) { - start <- locations[[1]] - end <- start + attr(locations[[1]], "match.len") - if (xml2::xml_has_attr(node, "protect.pos")) { +#' Handle protected ranges for a node +#' +#' @param node an XML `` node. +#' @param start `\[integer\]` a vector of starting indices of a set of ranges +#' @param end `\[integer\]` a vector of ending indices that are paired with +#' `start` +#' @return +#' - `add_protected_ranges()`: the modified node +#' - `remove_protected_ranges()`: the modified node +#' - `is_protected()`: `TRUE` if the node has protection attributes +#' - `get_protected_ranges()` a list containing integer vectors `start` and +#' `end` if the node is protected, otherwise, it returns NULL +#' @rdname protected_ranges +add_protected_ranges <- function(node, start, end) { + if (any(start < 1)) { + # return early if there are no ranges to protect + return(node) + } + if (is_protected(node)) { # extract the ranges from the attributes - ostart <- strsplit(xml2::xml_attr(node, "protect.pos"), " ")[[1]] - oend <- strsplit(xml2::xml_attr(node, "protect.end"), " ")[[1]] + orig <- get_protected_ranges(node) # update the ranges and the variables - new_ranges <- update_ranges( - start = c(as.integer(ostart), start), - end = c(as.integer(oend), end), - ) - start <- new_ranges$start - end <- new_ranges$end + new <- update_ranges(start = c(start, orig$start), end = c(end, orig$end)) + start <- new$start + end <- new$end } xml2::xml_set_attr(node, "protect.pos", paste(start, collapse = " ")) xml2::xml_set_attr(node, "protect.end", paste(end, collapse = " ")) + return(node) } -inrange <- function(s1, e1, s2, e2) { - s1 <= s2 & e1 >= e2 +#' @rdname protected_ranges +is_protected <- function(node) { + xml2::xml_has_attr(node, "protect.pos") && + xml2::xml_has_attr(node, "protect.end") } +#' @rdname protected_ranges +get_protected_ranges <- function(node) { + if (is_protected(node)) { + start <- strsplit(xml2::xml_attr(node, "protect.pos"), " ")[[1]] + end <- strsplit(xml2::xml_attr(node, "protect.end"), " ")[[1]] + } else { + return(NULL) + } + return(list(start = start, end = end)) +} + +#' @rdname protected_ranges +remove_protected_ranges <- function(node) { + xml2::xml_set_attr(node, "protect.pos", NULL) + xml2::xml_set_attr(node, "protect.end", NULL) + return(node) +} + +#' Detect if two ranges are overlapping +#' +#' @param s1 \[integer\] starting index of first range +#' @param e1 \[integer\] ending index of first range +#' @param s2 \[integer\] starting index of second range +#' @param e2 \[integer\] ending index of second range +#' @return `TRUE` if the ranges overlap and `FALSE` if they do not +#' +#' @noRd +#' @examples +#' overlap(1, 10, 5, 15) # TRUE +#' overlap(1, 4, 5, 15) # FALSE overlap <- function(s1, e1, s2, e2) { s1 <= e2 & s2 <= e1 } - -# https://www.geeksforgeeks.org/merging-intervals/ +#' Update a set of ranges +#' +#' @param start `\[integer\]` a vector of starting indices of a set of ranges +#' @param end `\[integer\]` a vector of ending indices that are paired with +#' `start` +#' @return a list of two integer vectors each with a length of at least one and +#' at most the same length as the input. +#' - `start` +#' - `end` +#' +#' @details +#' This function merges a set of ranges based on the algorithm presented in +#' . If none of the intervals +#' overlap, then the original `start` and `end` variables are returned sorted by +#' the starting order. +#' +#' If there are overlaps, they will be condensed, removing up to n - 1 intervals +#' +#' @noRd +#' @examples +#' # in this example, the ranges of [10, 20] overlaps with [5, 15] +#' ranges <- data.frame(start = c(1, 10, 100), end = c(2, 20, 200)) +#' new <- data.frame(start = c(5, 50, 500), end = c(15, 150, 1500)) +#' # thus they become [5, 20] +#' update_ranges(c(ranges$start, new$start), c(ranges$end, new$end)) update_ranges <- function(start, end) { # Sort the intervals based on the increasing order of starting time. ord <- order(start) From aeb3f58b92ed01510e3a7f0cb831c63ad84ab66d Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 11:04:06 -0700 Subject: [PATCH 08/37] make sure node protection exists with an empty set --- R/node-protection.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/node-protection.R b/R/node-protection.R index 654b162..b7ce509 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -12,7 +12,7 @@ #' `end` if the node is protected, otherwise, it returns NULL #' @rdname protected_ranges add_protected_ranges <- function(node, start, end) { - if (any(start < 1)) { + if (length(start) == 0 || any(start < 1)) { # return early if there are no ranges to protect return(node) } From 8aa181b5e9b7f97423ab4df00e3b36c74bd0ab43 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 11:31:06 -0700 Subject: [PATCH 09/37] use attributes to protect square bracket nodes --- R/asis-nodes.R | 66 +++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index 1bfaa86..42581b6 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -391,11 +391,10 @@ protect_unescaped <- function(body, txt, ns = md_ns()) { warning(msg, call. = FALSE) return(body) } - body <- copy_xml(body) XPATH <- ".//md:text[not(@asis)][contains(text(), '[') or contains(text(), ']')]" snodes <- xml2::xml_find_all(body, XPATH, ns = ns) fix_unescaped_squares(snodes, txt) - copy_xml(body) + return(body) } #' Find the escaped square braces in text vector @@ -425,9 +424,8 @@ find_escaped_squares <- function(txt) { #' not represented as markup, we use their `sourcepos` attributes to determine #' the lines and columns of the `txt` where _escaped_ square braces are. #' -#' Knowing this, we can process each node by its line number and wrap all -#' unescpaed square braces in text nodes with the `@asis` attribute, which is -#' performed with the [fix_unescaped()] function. +#' Knowing this, we can add protection attributes to the positions that should +#' not be escaped. #' #' @return nothing, invisibly. This function is called for its side-effect. #' @noRd @@ -456,7 +454,7 @@ fix_unescaped_squares <- function(nodes, txt) { } } } - invisible() + return(invisible()) } @@ -474,17 +472,17 @@ fix_unescaped_squares <- function(nodes, txt) { #' will produce a text node like this: #' #' ```html -#' this is [unescaped] and this is [escaped] +#' +#' this is [unescaped] and this is [escaped] +#' #' ``` #' #' This function will replace the text node with this: #' #' ```html -#' this is -#' [ -#' unescaped -#' ] -#' and this is [escaped] +#' +#' this is [unescaped] and this is [escaped] +#' #' ``` #' #' This will ensure that the unescaped markdown remains unescaped. @@ -497,35 +495,37 @@ fix_unescaped_squares <- function(nodes, txt) { #' list items will have an offset of 4L because they are preceeded by ` - `. #' Defaults to `1L`, indicating that this text node starts as a paragraph #' whose parent is the root of the document. -#' @return new XML nodes, invisibly +#' @return modified XML nodes, invisibly #' @noRd fix_unescaped <- function(node, escaped = integer(0), offset = 1L) { txt <- as.character(node) - if (length(escaped) == 0) { - # If we have no escaped characters, then we can do a broad substitution - unescaped <- TRUE - } else { - # Converted to text, the node becomes Actual text Because - # the position is based on the actual text, we need to find the start of - # the actual text in the node text - text_start <- gregexpr("[>]", txt)[[1]][[1]] + 1L + if (length(escaped) > 0) { # Because the escaped characters were stripped off, we have to account for # a rolling count of the number of escapes missing_chars <- seq_along(escaped) - 1L - # If the source starts with markup, we have to take into account the offset - # position. This will set the escaped to start at the end of the XML markup - unescaped <- -(escaped + text_start - offset - missing_chars) + # If the source starts with markup (e.g. a list item), we have to take into + # account the offset position. This will set the escaped to start at the + # end of the XML markup + escaped <- escaped - missing_chars - offset + 1L } - # Here we split the character and exclude the escaped braces, protecting - # the unescaped braces. - chars <- strsplit(txt, "")[[1]] - chars[unescaped] <- sub( + return(label_unescaped(node, except = escaped)) +} + + +label_unescaped <- function(node, except = integer(0)) { + char <- xml2::xml_text(node) + # Find the locations of inline chars that is complete + locations <- gregexpr( pattern = "(\\[|\\])", - replacement = "\\1", - x = chars[unescaped] + char, + perl = TRUE ) - new_nodes <- make_text_nodes(paste(chars, collapse = "")) - add_node_siblings(node, new_nodes, remove = TRUE) + # add the ranges to the attributes + # this is $\LaTeX$ text + # becomes + # this is $\LaTeX$ text + pos <- locations[[1]] + pos <- pos[!pos %in% except] + return(add_protected_ranges(node, pos, pos)) } - From e51eceb4005e7aba9147564a67ec3e18ae13fe31 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 11:31:54 -0700 Subject: [PATCH 10/37] use attributes to protect curly nodes This will address #105 --- R/attr-nodes.R | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/R/attr-nodes.R b/R/attr-nodes.R index 488ec3a..ff5cf6d 100644 --- a/R/attr-nodes.R +++ b/R/attr-nodes.R @@ -25,25 +25,27 @@ find_curly <- function(body, ns) { } digest_curly <- function(curly, ns) { - char <- as.character(curly) - curlies <- regmatches(char, gregexpr("\\{.*?\\}", char))[[1]] - for (curl in curlies) { - attributes <- "curly='true'" + label_curly_nodes(curly) + char <- xml2::xml_text(curly) + alt_fragment <- regmatches(char, gregexpr("alt=['\"].*?['\"]", char))[[1]] + if (length(alt_fragment) > 0) { + alt_text <- sub("^alt=", "", alt_fragment) + xml2::xml_set_attr(curly, "alt", alt_text) + } +} - alt_fragment <- regmatches(curl, gregexpr("alt=['\"].*?['\"]", curl))[[1]] - if (length(alt_fragment) > 0) { - alt_text <- sub("^alt=", "", alt_fragment) - attributes <- sprintf("%s alt=%s", attributes, alt_text) - } +label_curly_nodes <- function(node) { + char <- xml2::xml_text(node) + # Find the locations of inline chars that is complete + locations <- gregexpr( + pattern = "\\{.*?\\}", + char, + perl = TRUE + ) + start <- locations[[1]] + end <- start + attr(locations[[1]], "match.len") + return(add_protected_ranges(node, start, end)) - char <- sub( - curl, - sprintf("%s", attributes, curl), - char, - fixed = TRUE - ) - } - make_text_nodes(char) } #' Protect curly elements for further processing @@ -71,12 +73,7 @@ digest_curly <- function(curly, ns) { #' m$body <- protect_curly(m$body) #' xml2::xml_child(m$body) protect_curly <- function(body, ns = md_ns()) { - body <- copy_xml(body) - curly <- find_curly(body, ns) - new_nodes <- purrr::map(curly, digest_curly, ns = ns) - # since we split up the nodes, we have to do this node by node - for (i in seq(new_nodes)) { - add_node_siblings(curly[[i]], new_nodes[[i]], remove = TRUE) - } - copy_xml(body) + curly <- find_curly(body, ns) + purrr::walk(curly, digest_curly, ns = ns) + return(body) } From 1b9ae6ab29a3a8bd4dcb1190d6598fd0816786fd Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 11:32:32 -0700 Subject: [PATCH 11/37] update tests --- tests/testthat/_snaps/attr-nodes.md | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/tests/testthat/_snaps/attr-nodes.md b/tests/testthat/_snaps/attr-nodes.md index 55f8e57..2e0b7c4 100644 --- a/tests/testthat/_snaps/attr-nodes.md +++ b/tests/testthat/_snaps/attr-nodes.md @@ -7,19 +7,13 @@ - preface - {#pre-face .unnumbered} - + preface {#pre-face .unnumbered} hello - I like - {xml2} - but of course - {tinkr} - is even cooler! + I like {xml2} but of course {tinkr} is even cooler! Images that use pandoc style will have curlies with content that should be translated and should be protected. @@ -28,29 +22,19 @@ a pretty kitten - - {#kitteh alt='a picture of a kitten'} - + {#kitteh alt='a picture of a kitten'} a pretty puppy - - {#dog alt="a picture + {#dog alt="a picture of a dog"} - - - [ - a span with attributes - ] - - {.span-with-attributes + [a span with attributes]{.span-with-attributes style='color: red;'} - From 32d2ffdd52053300626241561e2cba103c684645 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 12:14:53 -0700 Subject: [PATCH 12/37] add NEWS; bump description --- DESCRIPTION | 2 +- NEWS.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index bb4cc73..1c487b2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tinkr Title: Cast '(R)Markdown' Files to 'XML' and Back Again -Version: 0.2.0 +Version: 0.2.0.9000 Authors@R: c(person(given = "Maëlle", family = "Salmon", diff --git a/NEWS.md b/NEWS.md index 1f0533a..3ecfcbb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,32 @@ +# tinkr 1.0.0 + +## BREAKING CHANGES + +Node protection will no longer fragment text nodes into groups of 'asis' and +regular text nodes. Instead, two attributes `protect.pos` and `protect.end` +are added that record the ranges of the protected characters. If you have been +relying on using `@asis` nodes to manipulate your documents, your code will +break. We are instead exporting helper functions to handle this. + + - `protect_math()`, `protect_curly()`, and `protect_unescaped()` now modify + the document in place. + +## NEW FUNCTIONS + + - `add_protected_ranges()` adds and updates protected ranges for a given text + node + - `is_protected()` an indicator if a node has protection or not + - `get_protected_ranges()` returns a list of integer vectors that indicate the + protected ranges + - `remove_protected_ranges()` removes the `protect.pos` and `protect.end` + attributes from a node + +## BUG FIX + + - `protect_math()` no longer failes if `protect_curly()` was run before it + (reported: @maelle, #105; fixed: @zkamvar) + + # tinkr 0.2.0 ## BUG FIX From cd95f58619e4c0c168170f6b7f9da88931f90356 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 12:17:20 -0700 Subject: [PATCH 13/37] export protection functions --- NAMESPACE | 4 ++++ R/node-protection.R | 4 ++++ man/protected_ranges.Rd | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 man/protected_ranges.Rd diff --git a/NAMESPACE b/NAMESPACE index 93033f3..5c0412a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,13 @@ # Generated by roxygen2: do not edit by hand +export(add_protected_ranges) export(find_between) +export(get_protected_ranges) +export(is_protected) export(md_ns) export(protect_curly) export(protect_math) +export(remove_protected_ranges) export(stylesheet) export(to_md) export(to_xml) diff --git a/R/node-protection.R b/R/node-protection.R index b7ce509..3a09a1a 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -11,6 +11,7 @@ #' - `get_protected_ranges()` a list containing integer vectors `start` and #' `end` if the node is protected, otherwise, it returns NULL #' @rdname protected_ranges +#' @export add_protected_ranges <- function(node, start, end) { if (length(start) == 0 || any(start < 1)) { # return early if there are no ranges to protect @@ -30,12 +31,14 @@ add_protected_ranges <- function(node, start, end) { } #' @rdname protected_ranges +#' @export is_protected <- function(node) { xml2::xml_has_attr(node, "protect.pos") && xml2::xml_has_attr(node, "protect.end") } #' @rdname protected_ranges +#' @export get_protected_ranges <- function(node) { if (is_protected(node)) { start <- strsplit(xml2::xml_attr(node, "protect.pos"), " ")[[1]] @@ -47,6 +50,7 @@ get_protected_ranges <- function(node) { } #' @rdname protected_ranges +#' @export remove_protected_ranges <- function(node) { xml2::xml_set_attr(node, "protect.pos", NULL) xml2::xml_set_attr(node, "protect.end", NULL) diff --git a/man/protected_ranges.Rd b/man/protected_ranges.Rd new file mode 100644 index 0000000..1b471ab --- /dev/null +++ b/man/protected_ranges.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/node-protection.R +\name{add_protected_ranges} +\alias{add_protected_ranges} +\alias{is_protected} +\alias{get_protected_ranges} +\alias{remove_protected_ranges} +\title{Handle protected ranges for a node} +\usage{ +add_protected_ranges(node, start, end) + +is_protected(node) + +get_protected_ranges(node) + +remove_protected_ranges(node) +} +\arguments{ +\item{node}{an XML \verb{} node.} + +\item{start}{\verb{\[integer\]} a vector of starting indices of a set of ranges} + +\item{end}{\verb{\[integer\]} a vector of ending indices that are paired with +\code{start}} +} +\value{ +\itemize{ +\item \code{add_protected_ranges()}: the modified node +\item \code{remove_protected_ranges()}: the modified node +\item \code{is_protected()}: \code{TRUE} if the node has protection attributes +\item \code{get_protected_ranges()} a list containing integer vectors \code{start} and +\code{end} if the node is protected, otherwise, it returns NULL +} +} +\description{ +Handle protected ranges for a node +} From 7c8d35b5e0a9a5d1f767f8ab4ee86b36b360950e Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 15:50:37 -0700 Subject: [PATCH 14/37] add test for #105 --- tests/testthat/test-asis-nodes.R | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/testthat/test-asis-nodes.R b/tests/testthat/test-asis-nodes.R index 206548d..7fab492 100644 --- a/tests/testthat/test-asis-nodes.R +++ b/tests/testthat/test-asis-nodes.R @@ -100,3 +100,26 @@ test_that("protect_unescaped() will throw a warning if no sourcpos is available" }) }) + +test_that("(105) protection of one element does not impede protection of another", { + + expected <- "example\n\n$a_{ij}$ \n" + + temp_file <- withr::local_tempfile() + brio::write_lines(expected, temp_file) + wool <- tinkr::yarn$new(temp_file) + n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) + expect_length(n, 0) + + wool$protect_curly() + + n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) + expect_length(n, 1) + expect_equal(get_protected_ranges(n[[1]]), list(start = 4L, end = 7L)) + expect_no_error(wool$protect_math()) + n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) + expect_length(n, 1) + expect_equal(get_protected_ranges(n[[1]]), list(start = 1L, end = 8L)) + expect_snapshot(show_user(wool$show(), force = TRUE)) +}) + From fd45728abd705d3b38c5aab418b30584824e81e7 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 15:51:20 -0700 Subject: [PATCH 15/37] ensure output of get_protected_ranges is integer --- R/node-protection.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/node-protection.R b/R/node-protection.R index 3a09a1a..ca8b0b4 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -46,7 +46,7 @@ get_protected_ranges <- function(node) { } else { return(NULL) } - return(list(start = start, end = end)) + return(list(start = as.integer(start), end = as.integer(end))) } #' @rdname protected_ranges From f79c1239bf7763020dff8f8531ccc80bf1650526 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 15:51:41 -0700 Subject: [PATCH 16/37] fix off-by-one errors --- R/asis-nodes.R | 6 +++--- R/attr-nodes.R | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index 42581b6..33195b4 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -198,7 +198,7 @@ label_partial_inline <- function(math) { # if the line contains the end of an inline math fragment, # we start at the beginning of the string and end at the end of the match begin <- 1 - end <- stop[[1]] + attr(stop[[1]], "match.len") + end <- stop[[1]] + attr(stop[[1]], "match.len") - 1L } else { # otherwise, the entire range should be protected. begin <- 1 @@ -219,7 +219,7 @@ label_fully_inline <- function(math) { # becomes # this is $\LaTeX$ text start <- locations[[1]] - end <- start + attr(locations[[1]], "match.len") + end <- start + attr(locations[[1]], "match.len") - 1L add_protected_ranges(math, start, end) } @@ -403,7 +403,7 @@ protect_unescaped <- function(body, txt, ns = md_ns()) { #' @return the same output as [base::gregexpr()]: a list the same length as #' `txt` with integer vectors indicating the character positions of the matches #' with attributes: -#' 1. match.length the length of the match (will be '2') +#' 1. match.len the length of the match (will be '2') #' @noRd find_escaped_squares <- function(txt) { gregexpr("(\\\\\\])|(\\\\\\[)", txt, useBytes = FALSE) diff --git a/R/attr-nodes.R b/R/attr-nodes.R index ff5cf6d..2d1ca1e 100644 --- a/R/attr-nodes.R +++ b/R/attr-nodes.R @@ -43,7 +43,7 @@ label_curly_nodes <- function(node) { perl = TRUE ) start <- locations[[1]] - end <- start + attr(locations[[1]], "match.len") + end <- start + attr(locations[[1]], "match.len") - 1L return(add_protected_ranges(node, start, end)) } From 29b9b3d1f9516bcd877f3b705208b0479767a589 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 29 Apr 2024 15:51:51 -0700 Subject: [PATCH 17/37] rerun snaps --- tests/testthat/_snaps/asis-nodes.md | 10 ++++++++++ tests/testthat/_snaps/attr-nodes.md | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/testthat/_snaps/asis-nodes.md b/tests/testthat/_snaps/asis-nodes.md index 9f14f0b..18f62d9 100644 --- a/tests/testthat/_snaps/asis-nodes.md +++ b/tests/testthat/_snaps/asis-nodes.md @@ -87,3 +87,13 @@ - [This is a link](https://ropensci.org) - \[this is an example\] +# (105) protection of one element does not impede protection of another + + Code + show_user(wool$show(), force = TRUE) + Output + example + + $a_{ij}$ + + diff --git a/tests/testthat/_snaps/attr-nodes.md b/tests/testthat/_snaps/attr-nodes.md index 2e0b7c4..a5658b4 100644 --- a/tests/testthat/_snaps/attr-nodes.md +++ b/tests/testthat/_snaps/attr-nodes.md @@ -7,13 +7,13 @@ - preface {#pre-face .unnumbered} + preface {#pre-face .unnumbered} hello - I like {xml2} but of course {tinkr} is even cooler! + I like {xml2} but of course {tinkr} is even cooler! Images that use pandoc style will have curlies with content that should be translated and should be protected. @@ -22,7 +22,7 @@ a pretty kitten - {#kitteh alt='a picture of a kitten'} + {#kitteh alt='a picture of a kitten'} From bbc1e610e576a157d04d6f1cd1dd9773bd694db8 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 08:46:04 -0700 Subject: [PATCH 18/37] add extra checks for `add_protected_ranges()` --- R/node-protection.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/node-protection.R b/R/node-protection.R index ca8b0b4..eb270ad 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -13,7 +13,9 @@ #' @rdname protected_ranges #' @export add_protected_ranges <- function(node, start, end) { - if (length(start) == 0 || any(start < 1)) { + no_beginning <- length(start) == 0 || any(start < 1) + can_protect <- inherits(node, "xml_node") && xml2::xml_name(node) == "text" + if (no_beginning || !can_protect) { # return early if there are no ranges to protect return(node) } From b7c94ee9844b6b7269bd68840ff438fe2865f0df Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 09:46:32 -0700 Subject: [PATCH 19/37] fix failing CI test The square bracket _should_ be escaped since it's outside of the protected range. --- tests/testthat/test-asis-nodes.R | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-asis-nodes.R b/tests/testthat/test-asis-nodes.R index 7fab492..6633930 100644 --- a/tests/testthat/test-asis-nodes.R +++ b/tests/testthat/test-asis-nodes.R @@ -5,14 +5,36 @@ test_that("mal-formed inline math throws an informative error", { }) test_that("multi-line inline math can have punctutation after", { - template <- "C) $E(\\text{Weight}) = 81.37 + 1.26 \\times x_1 +\n2.65 \\times x_2$punk\n" - for (punk in c('--', '---', ',', ';', '.', '?', ')', ']', '}', '>')) { - expected <- sub("punk", punk, template) + template <- c( + "C) $E(\\text{Weight}) = 81.37 + 1.26 \\times x_1 +", + "2.65 \\times x_2$punk\n" + ) + for (punk in c('--', '---', ',', ';', '.', '?', ')', '\\\\]', '}', '>')) { + expected <- paste(sub("punk", punk, template), collapse = "\n") math <- commonmark::markdown_xml(expected) txt <- xml2::read_xml(math) + nodes <- xml2::xml_find_all(txt, ".//md:text", ns = md_ns()) + # no protection initially + expect_equal( + xml2::xml_attr(nodes, "protect.pos"), + c(NA_character_, NA_character_) + ) protxt <- protect_inline_math(txt, md_ns()) + # the transformed content is identical. + expect_identical(txt, protxt) + # protection exists + expect_equal( + xml2::xml_attr(nodes, "protect.pos"), + c('4', '1') + ) + expect_equal( + xml2::xml_attr(nodes, "protect.end"), + c('48', '16') + ) actual <- to_md(list(yaml = NULL, body = protxt)) - expect_equal(actual, expected) + act <- substring(actual, nchar(actual) - 2, nchar(actual) - 1) + + expect_equal(actual, expected, label = act, expected.label = punk) } }) From b478e97f0b30459e2db91d6f628e0347df4c59eb Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 09:48:40 -0700 Subject: [PATCH 20/37] add text node boolean functions --- R/node-protection.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/R/node-protection.R b/R/node-protection.R index eb270ad..23d852b 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -14,8 +14,7 @@ #' @export add_protected_ranges <- function(node, start, end) { no_beginning <- length(start) == 0 || any(start < 1) - can_protect <- inherits(node, "xml_node") && xml2::xml_name(node) == "text" - if (no_beginning || !can_protect) { + if (no_beginning || not_text_node(node)) { # return early if there are no ranges to protect return(node) } @@ -32,6 +31,12 @@ add_protected_ranges <- function(node, start, end) { return(node) } +is_text_node <- function(node) { + inherits(node, "xml_node") && xml2::xml_name(node) == "text" +} + +not_text_node <- Negate(is_text_node) + #' @rdname protected_ranges #' @export is_protected <- function(node) { @@ -42,7 +47,7 @@ is_protected <- function(node) { #' @rdname protected_ranges #' @export get_protected_ranges <- function(node) { - if (is_protected(node)) { + if (is_text_node(node) && is_protected(node)) { start <- strsplit(xml2::xml_attr(node, "protect.pos"), " ")[[1]] end <- strsplit(xml2::xml_attr(node, "protect.end"), " ")[[1]] } else { From 741fbd781bcc82180f22dd64bdd478565b037da7 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 09:49:27 -0700 Subject: [PATCH 21/37] add comments to test file --- tests/testthat/test-asis-nodes.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/testthat/test-asis-nodes.R b/tests/testthat/test-asis-nodes.R index 6633930..745978c 100644 --- a/tests/testthat/test-asis-nodes.R +++ b/tests/testthat/test-asis-nodes.R @@ -130,17 +130,23 @@ test_that("(105) protection of one element does not impede protection of another temp_file <- withr::local_tempfile() brio::write_lines(expected, temp_file) wool <- tinkr::yarn$new(temp_file) + # no protection initially n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) expect_length(n, 0) wool$protect_curly() + # protection exists n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) expect_length(n, 1) + # the ranges are initially betwen the curly braces expect_equal(get_protected_ranges(n[[1]]), list(start = 4L, end = 7L)) + + # protecting for math does not throw an error expect_no_error(wool$protect_math()) n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) expect_length(n, 1) + # the protected range now extends to the whole line expect_equal(get_protected_ranges(n[[1]]), list(start = 1L, end = 8L)) expect_snapshot(show_user(wool$show(), force = TRUE)) }) From 2066ba78c16720e6adcc4140c326f51fd0f3c74b Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 09:49:40 -0700 Subject: [PATCH 22/37] add protection tests --- tests/testthat/test-node-protection.R | 112 ++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/tests/testthat/test-node-protection.R b/tests/testthat/test-node-protection.R index 93c8650..716b4bd 100644 --- a/tests/testthat/test-node-protection.R +++ b/tests/testthat/test-node-protection.R @@ -67,3 +67,115 @@ test_that("update_ranges will merge all overlapping ranges", { }) + + + +test_that("protection can be added and removed", { + expected <- c( + "\\a\\b\\c\\d", + "\\e\\f\\g\\h", + "" + ) + temp_file <- withr::local_tempfile() + brio::write_lines(expected, temp_file) + wool <- tinkr::yarn$new(temp_file, unescaped = FALSE) + nodes <- xml2::xml_find_all(wool$body, ".//md:text", ns = md_ns()) + + # NO RANGES ------------------------------------------------------- + # the body is not a text node + expect_true(not_text_node(wool$body)) + # the text nodes are not a single text node + expect_true(not_text_node(nodes)) + # individual text nodes are nodes + expect_true(is_text_node(nodes[[1]])) + expect_false(not_text_node(nodes[[1]])) + expect_false(is_protected(nodes[[1]])) + expect_false(is_protected(nodes[[2]])) + + # no range protection exists + expect_null(get_protected_ranges(nodes)) + expect_equal(lapply(nodes, get_protected_ranges), + vector(mode = "list", length = 2) # empty list + ) + + # no protection applied, the text is all escaped + no_protection <- wool$show() + expect_equal(no_protection, gsub("\\\\", "\\\\\\\\", expected)) + + # ADDING PROTECTION ----------------------------------------------- + # protecting second and fourth entities + expect_false(is_protected(nodes[[1]])) + + add_protected_ranges(nodes[[1]], start = c(3, 7), end = c(4, 8)) + + expect_true(is_protected(nodes[[1]])) + + # protecting everything + expect_false(is_protected(nodes[[2]])) + + add_protected_ranges(nodes[[2]], start = 1, end = 8) + + expect_true(is_protected(nodes[[2]])) + + expect_equal(get_protected_ranges(nodes[[1]]), + list(start = c(3, 7), end = c(4, 8)) + ) + expect_equal(get_protected_ranges(nodes[[2]]), + list(start = 1, end = 8) + ) + some_protection <- wool$show() + # we expect all but the first and third entities to be protected + some_expected <- gsub("\\\\([ac])", "\\\\\\\\\\1", expected) + expect_equal(object = some_protection, expected = some_expected) + + # OVERLAPPING PROTECTIONS ---------------------------------------- + # adding identical protections do not duplicate them + add_protected_ranges(nodes[[1]], start = c(3, 7), end = c(4, 8)) + expect_equal(get_protected_ranges(nodes[[1]]), + list(start = c(3, 7), end = c(4, 8)) + ) + # adding completely overlapping protections does not cause an error + add_protected_ranges(nodes[[1]], start = c(3, 7), end = c(4, 8)) + expect_equal(get_protected_ranges(nodes[[2]]), + list(start = 1, end = 8) + ) + some_protection <- wool$show() + # we expect all but the first and third entities to be protected + some_expected <- gsub("\\\\([ac])", "\\\\\\\\\\1", expected) + expect_equal(object = some_protection, expected = some_expected) + + # NEW PROTECTIONS ----------------------------------------------- + # we can add a new range that does not overlap (but it can abut) + add_protected_ranges(nodes[[1]], start = 1, end = 2) + expect_equal(get_protected_ranges(nodes[[1]]), + list(start = c(1, 3, 7), end = c(2, 4, 8)) + ) + some_protection <- wool$show() + some_expected <- gsub("\\\\([c])", "\\\\\\\\\\1", expected) + expect_equal(object = some_protection, expected = some_expected) + + # if we add an overlapping range, they are connected + add_protected_ranges(nodes[[1]], start = 1, end = 4) + expect_equal(get_protected_ranges(nodes[[1]]), + list(start = c(1, 7), end = c(4, 8)) + ) + some_protection <- wool$show() + some_expected <- gsub("\\\\([c])", "\\\\\\\\\\1", expected) + expect_equal(object = some_protection, expected = some_expected) + + # REMOVING PROTECTIONS ----------------------------------------- + expect_true(is_protected(nodes[[1]])) + + remove_protected_ranges(nodes[[1]]) + + expect_false(is_protected(nodes[[1]])) + + expect_null(get_protected_ranges(nodes[[1]])) + + # now the top line is unprotected + some_protection <- wool$show() + some_expected <- gsub("\\\\([abcd])", "\\\\\\\\\\1", expected) + expect_equal(object = some_protection, expected = some_expected) + +}) + From 739020a16df3cf97f20d435a7ee2a1d0096e6583 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 09:49:55 -0700 Subject: [PATCH 23/37] update documentation a bit --- R/asis-nodes.R | 2 +- man/protect_math.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index 33195b4..8fb5962 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -17,7 +17,7 @@ #' txt <- textConnection(tinkr::to_md(m)) #' cat(tail(readLines(txt)), sep = "\n") # broken math #' close(txt) -#' m$body <- protect_math(m$body) +#' protect_math(m$body) #' txt <- textConnection(tinkr::to_md(m)) #' cat(tail(readLines(txt)), sep = "\n") # fixed math #' close(txt) diff --git a/man/protect_math.Rd b/man/protect_math.Rd index bbf25bf..5dc4daa 100644 --- a/man/protect_math.Rd +++ b/man/protect_math.Rd @@ -32,7 +32,7 @@ m <- tinkr::to_xml(system.file("extdata", "math-example.md", package = "tinkr")) txt <- textConnection(tinkr::to_md(m)) cat(tail(readLines(txt)), sep = "\n") # broken math close(txt) -m$body <- protect_math(m$body) +protect_math(m$body) txt <- textConnection(tinkr::to_md(m)) cat(tail(readLines(txt)), sep = "\n") # fixed math close(txt) From 9357b95874193ea79a3b65bd943c13d0be147b86 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 10:03:39 -0700 Subject: [PATCH 24/37] document node protection --- DESCRIPTION | 1 + R/node-protection.R | 46 ++++++++++++++++++++++++++++++++++++++++ man/protected_ranges.Rd | 47 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 1c487b2..a3e7e73 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,6 +50,7 @@ Suggests: rmarkdown, covr, testthat (>= 3.0.0), + brio, withr Config/testthat/edition: 3 Encoding: UTF-8 diff --git a/R/node-protection.R b/R/node-protection.R index 23d852b..dc979b7 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -12,6 +12,52 @@ #' `end` if the node is protected, otherwise, it returns NULL #' @rdname protected_ranges #' @export +#' @examples +#' # example of text to protect +#' # SETUP --------------------- +#' expected <- c( +#' "\\a\\b\\c\\d", +#' "\\e\\f\\g\\h", +#' "" +#' ) +#' temp_file <- tempfile() +#' writeLines(expected, temp_file) +#' wool <- tinkr::yarn$new(temp_file) +#' nodes <- xml2::xml_find_all(wool$body, ".//md:text", ns = md_ns()) +#' writeLines(expected) # this is how it should appear +#' wool$show() # nothing is protected, so the '\' are escaped +#' # ADDING PROTECTION ---------- +#' # protections are added _per node_ +#' add_protected_ranges(nodes[[1]], start = 1, end = 8) # protect whole range +#' add_protected_ranges(nodes[[2]], start = c(1, 7), end = c(2, 8)) # partial +#' wool$show() # the first row and "\e" and "\h" are protected +#' +#' # extract the ranges +#' is_protected(nodes[[1]]) +#' is_protected(nodes[[2]]) +#' get_protected_ranges(nodes[[1]]) +#' get_protected_ranges(nodes[[2]]) +#' +#' # Add additional protection that overlaps. +#' # The current range is [1, 2] and [7, 8]. If we add [5, 8], the range +#' # will be updated +#' add_protected_ranges(nodes[[2]], start = 5, end = 8) +#' get_protected_ranges(nodes[[2]]) +#' +#' # overlapping protection is not duplicated +#' add_protected_ranges(nodes[[1]], start = 1, end = 4) +#' get_protected_ranges(nodes[[1]]) +#' +#' wool$show() # the first row and "\e", "\g", and "\h" are protected +#' +#' # REMOVING PROTECTION -------- +#' remove_protected_ranges(nodes[[2]]) +#' is_protected(nodes[[2]]) +#' get_protected_ranges(nodes[[2]]) +#' wool$show() +#' +#' # CLEAN UP ------------------- +#' if (file.exists(temp_file)) unlink(temp_file) add_protected_ranges <- function(node, start, end) { no_beginning <- length(start) == 0 || any(start < 1) if (no_beginning || not_text_node(node)) { diff --git a/man/protected_ranges.Rd b/man/protected_ranges.Rd index 1b471ab..5892110 100644 --- a/man/protected_ranges.Rd +++ b/man/protected_ranges.Rd @@ -35,3 +35,50 @@ remove_protected_ranges(node) \description{ Handle protected ranges for a node } +\examples{ +# example of text to protect +# SETUP --------------------- +expected <- c( + "\\\\a\\\\b\\\\c\\\\d", + "\\\\e\\\\f\\\\g\\\\h", + "" +) +temp_file <- tempfile() +writeLines(expected, temp_file) +wool <- tinkr::yarn$new(temp_file) +nodes <- xml2::xml_find_all(wool$body, ".//md:text", ns = md_ns()) +writeLines(expected) # this is how it should appear +wool$show() # nothing is protected, so the '\' are escaped +# ADDING PROTECTION ---------- +# protections are added _per node_ +add_protected_ranges(nodes[[1]], start = 1, end = 8) # protect whole range +add_protected_ranges(nodes[[2]], start = c(1, 7), end = c(2, 8)) # partial +wool$show() # the first row and "\e" and "\h" are protected + +# extract the ranges +is_protected(nodes[[1]]) +is_protected(nodes[[2]]) +get_protected_ranges(nodes[[1]]) +get_protected_ranges(nodes[[2]]) + +# Add additional protection that overlaps. +# The current range is [1, 2] and [7, 8]. If we add [5, 8], the range +# will be updated +add_protected_ranges(nodes[[2]], start = 5, end = 8) +get_protected_ranges(nodes[[2]]) + +# overlapping protection is not duplicated +add_protected_ranges(nodes[[1]], start = 1, end = 4) +get_protected_ranges(nodes[[1]]) + +wool$show() # the first row and "\e", "\g", and "\h" are protected + +# REMOVING PROTECTION -------- +remove_protected_ranges(nodes[[2]]) +is_protected(nodes[[2]]) +get_protected_ranges(nodes[[2]]) +wool$show() + +# CLEAN UP ------------------- +if (file.exists(temp_file)) unlink(temp_file) +} From 2614c20b832dddb6200e6d07ad9d5570a9c031d3 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Tue, 30 Apr 2024 10:08:55 -0700 Subject: [PATCH 25/37] rename protect.pos -> protect.start --- NEWS.md | 4 ++-- R/asis-nodes.R | 2 +- R/node-protection.R | 8 ++++---- inst/extdata/xml_protect.xml | 2 +- inst/stylesheets/tinkr-helpers.xsl | 6 +++--- inst/stylesheets/xml2md_gfm.xsl | 6 +++--- tests/testthat/_snaps/attr-nodes.md | 8 ++++---- tests/testthat/test-asis-nodes.R | 10 +++++----- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3ed330e..30b9c64 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,7 @@ ## BREAKING CHANGES Node protection will no longer fragment text nodes into groups of 'asis' and -regular text nodes. Instead, two attributes `protect.pos` and `protect.end` +regular text nodes. Instead, two attributes `protect.start` and `protect.end` are added that record the ranges of the protected characters. If you have been relying on using `@asis` nodes to manipulate your documents, your code will break. We are instead exporting helper functions to handle this. @@ -18,7 +18,7 @@ break. We are instead exporting helper functions to handle this. - `is_protected()` an indicator if a node has protection or not - `get_protected_ranges()` returns a list of integer vectors that indicate the protected ranges - - `remove_protected_ranges()` removes the `protect.pos` and `protect.end` + - `remove_protected_ranges()` removes the `protect.start` and `protect.end` attributes from a node ## BUG FIX diff --git a/R/asis-nodes.R b/R/asis-nodes.R index 8fb5962..d3dcc0d 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -480,7 +480,7 @@ fix_unescaped_squares <- function(nodes, txt) { #' This function will replace the text node with this: #' #' ```html -#' +#' #' this is [unescaped] and this is [escaped] #' #' ``` diff --git a/R/node-protection.R b/R/node-protection.R index dc979b7..a3dc802 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -72,7 +72,7 @@ add_protected_ranges <- function(node, start, end) { start <- new$start end <- new$end } - xml2::xml_set_attr(node, "protect.pos", paste(start, collapse = " ")) + xml2::xml_set_attr(node, "protect.start", paste(start, collapse = " ")) xml2::xml_set_attr(node, "protect.end", paste(end, collapse = " ")) return(node) } @@ -86,7 +86,7 @@ not_text_node <- Negate(is_text_node) #' @rdname protected_ranges #' @export is_protected <- function(node) { - xml2::xml_has_attr(node, "protect.pos") && + xml2::xml_has_attr(node, "protect.start") && xml2::xml_has_attr(node, "protect.end") } @@ -94,7 +94,7 @@ is_protected <- function(node) { #' @export get_protected_ranges <- function(node) { if (is_text_node(node) && is_protected(node)) { - start <- strsplit(xml2::xml_attr(node, "protect.pos"), " ")[[1]] + start <- strsplit(xml2::xml_attr(node, "protect.start"), " ")[[1]] end <- strsplit(xml2::xml_attr(node, "protect.end"), " ")[[1]] } else { return(NULL) @@ -105,7 +105,7 @@ get_protected_ranges <- function(node) { #' @rdname protected_ranges #' @export remove_protected_ranges <- function(node) { - xml2::xml_set_attr(node, "protect.pos", NULL) + xml2::xml_set_attr(node, "protect.start", NULL) xml2::xml_set_attr(node, "protect.end", NULL) return(node) } diff --git a/inst/extdata/xml_protect.xml b/inst/extdata/xml_protect.xml index 125c967..3213fa5 100644 --- a/inst/extdata/xml_protect.xml +++ b/inst/extdata/xml_protect.xml @@ -2,6 +2,6 @@ - This is a thing with some $x_i$ math elements $x_j$ and $let x_i = x_j$ [not math] with $\frac{i}{\pi}$ after all. + This is a thing with some $x_i$ math elements $x_j$ and $let x_i = x_j$ [not math] with $\frac{i}{\pi}$ after all. diff --git a/inst/stylesheets/tinkr-helpers.xsl b/inst/stylesheets/tinkr-helpers.xsl index b7ddf59..a8969e1 100644 --- a/inst/stylesheets/tinkr-helpers.xsl +++ b/inst/stylesheets/tinkr-helpers.xsl @@ -15,7 +15,7 @@ - + @@ -45,7 +45,7 @@ - + @@ -63,7 +63,7 @@ - + diff --git a/inst/stylesheets/xml2md_gfm.xsl b/inst/stylesheets/xml2md_gfm.xsl index 2f1aeb9..ac6eaa3 100644 --- a/inst/stylesheets/xml2md_gfm.xsl +++ b/inst/stylesheets/xml2md_gfm.xsl @@ -22,11 +22,11 @@ - + - + diff --git a/tests/testthat/_snaps/attr-nodes.md b/tests/testthat/_snaps/attr-nodes.md index a5658b4..bd2becb 100644 --- a/tests/testthat/_snaps/attr-nodes.md +++ b/tests/testthat/_snaps/attr-nodes.md @@ -7,13 +7,13 @@ - preface {#pre-face .unnumbered} + preface {#pre-face .unnumbered} hello - I like {xml2} but of course {tinkr} is even cooler! + I like {xml2} but of course {tinkr} is even cooler! Images that use pandoc style will have curlies with content that should be translated and should be protected. @@ -22,7 +22,7 @@ a pretty kitten - {#kitteh alt='a picture of a kitten'} + {#kitteh alt='a picture of a kitten'} @@ -33,7 +33,7 @@ - [a span with attributes]{.span-with-attributes + [a span with attributes]{.span-with-attributes style='color: red;'} diff --git a/tests/testthat/test-asis-nodes.R b/tests/testthat/test-asis-nodes.R index 745978c..2a1bb2d 100644 --- a/tests/testthat/test-asis-nodes.R +++ b/tests/testthat/test-asis-nodes.R @@ -16,7 +16,7 @@ test_that("multi-line inline math can have punctutation after", { nodes <- xml2::xml_find_all(txt, ".//md:text", ns = md_ns()) # no protection initially expect_equal( - xml2::xml_attr(nodes, "protect.pos"), + xml2::xml_attr(nodes, "protect.start"), c(NA_character_, NA_character_) ) protxt <- protect_inline_math(txt, md_ns()) @@ -24,7 +24,7 @@ test_that("multi-line inline math can have punctutation after", { expect_identical(txt, protxt) # protection exists expect_equal( - xml2::xml_attr(nodes, "protect.pos"), + xml2::xml_attr(nodes, "protect.start"), c('4', '1') ) expect_equal( @@ -131,20 +131,20 @@ test_that("(105) protection of one element does not impede protection of another brio::write_lines(expected, temp_file) wool <- tinkr::yarn$new(temp_file) # no protection initially - n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) + n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.start]", ns = md_ns()) expect_length(n, 0) wool$protect_curly() # protection exists - n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) + n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.start]", ns = md_ns()) expect_length(n, 1) # the ranges are initially betwen the curly braces expect_equal(get_protected_ranges(n[[1]]), list(start = 4L, end = 7L)) # protecting for math does not throw an error expect_no_error(wool$protect_math()) - n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.pos]", ns = md_ns()) + n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.start]", ns = md_ns()) expect_length(n, 1) # the protected range now extends to the whole line expect_equal(get_protected_ranges(n[[1]]), list(start = 1L, end = 8L)) From afef059f841992371736d9ef332ea18018b4f6b7 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Wed, 1 May 2024 16:46:15 -0700 Subject: [PATCH 26/37] create utils for playing with sourcepos --- R/resolve-links.R | 18 ----------------- R/utils-sourcepos.R | 49 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 R/utils-sourcepos.R diff --git a/R/resolve-links.R b/R/resolve-links.R index 94e43f5..f7b3c49 100644 --- a/R/resolve-links.R +++ b/R/resolve-links.R @@ -198,21 +198,3 @@ al_title <- function(link) { escape_ampersand(titles) } -#nocov start -# Get the position of an element -get_pos <- function(x, e = 1) { - as.integer( - gsub( - "^(\\d+?):(\\d+?)[-](\\d+?):(\\d+?)$", - glue::glue("\\{e}"), - xml2::xml_attr(x, "sourcepos") - ) - ) -} - -# helpers for get_pos -get_linestart <- function(x) get_pos(x, e = 1) -get_colstart <- function(x) get_pos(x, e = 2) -get_lineend <- function(x) get_pos(x, e = 3) -get_colend <- function(x) get_pos(x, e = 4) -#nocov end diff --git a/R/utils-sourcepos.R b/R/utils-sourcepos.R new file mode 100644 index 0000000..02f25da --- /dev/null +++ b/R/utils-sourcepos.R @@ -0,0 +1,49 @@ +has_sourcepos <- function(node) { + xml2::xml_has_attr(node, "sourcepos") +} +# Get the position of an element +get_pos <- function(x, e = 1) { + as.integer( + gsub( + "^(\\d+?):(\\d+?)[-](\\d+?):(\\d+?)$", + glue::glue("\\{e}"), + xml2::xml_attr(x, "sourcepos") + ) + ) +} + +# helpers for get_pos +get_linestart <- function(x) get_pos(x, e = 1) +get_colstart <- function(x) get_pos(x, e = 2) +get_lineend <- function(x) get_pos(x, e = 3) +get_colend <- function(x) get_pos(x, e = 4) + +get_sourcepos <- function(node) { + list( + linestart = get_linestart(node), + colstart = get_colstart(node), + lineend = get_lineend(node), + colend = get_colend(node) + ) +} +make_sourcepos <- function(pos) { + glue::glue("{pos$linestart}:{pos$colstart}-{pos$lineend}:{pos$colend}") +} + +split_sourcepos <- function(node) { + pos <- get_sourcepos(node) + ranges <- get_full_ranges(node) + offset <- pos$colstart - 1L + pos$colstart <- ranges$start + offset + pos$colend <- ranges$end + make_sourcepos(pos) +} + +join_sourcepos <- function(nodes) { + pos <- get_sourcepos(nodes) + pos$linestart <- min(pos$linestart) + pos$colstart <- min(pos$colstart) + pos$lineend <- max(pos$lineend) + pos$colend <- max(pos$colend) + make_sourcepos(pos) +} From 54c1d1f70ad25b6f5e38bcda5dbb098d8a48909d Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Wed, 1 May 2024 16:46:42 -0700 Subject: [PATCH 27/37] add capability to split and rejoin protected nodes This begins to address limitations of the attribute-based protection by providing a way to separate and rejoin nodes that were previously split. --- DESCRIPTION | 2 +- NAMESPACE | 4 + R/asis-nodes.R | 2 +- R/node-protection.R | 57 ++++++++++++ R/node-splits.R | 142 ++++++++++++++++++++++++++++++ man/protected_ranges.Rd | 13 +++ man/split_protected_nodes.Rd | 49 +++++++++++ tests/testthat/test-node-splits.R | 75 ++++++++++++++++ 8 files changed, 342 insertions(+), 2 deletions(-) create mode 100644 R/node-splits.R create mode 100644 man/split_protected_nodes.Rd create mode 100644 tests/testthat/test-node-splits.R diff --git a/DESCRIPTION b/DESCRIPTION index a3e7e73..fc95947 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -56,5 +56,5 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 5c0412a..dbd28aa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,15 +2,19 @@ export(add_protected_ranges) export(find_between) +export(get_protected_nodes) export(get_protected_ranges) export(is_protected) +export(join_split_nodes) export(md_ns) export(protect_curly) export(protect_math) export(remove_protected_ranges) +export(split_protected_nodes) export(stylesheet) export(to_md) export(to_xml) +export(xpath_protected) export(yarn) importFrom(R6,R6Class) importFrom(magrittr,"%>%") diff --git a/R/asis-nodes.R b/R/asis-nodes.R index d3dcc0d..8b55f4b 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -391,7 +391,7 @@ protect_unescaped <- function(body, txt, ns = md_ns()) { warning(msg, call. = FALSE) return(body) } - XPATH <- ".//md:text[not(@asis)][contains(text(), '[') or contains(text(), ']')]" + XPATH <- ".//md:text[contains(text(), '[') or contains(text(), ']')]" snodes <- xml2::xml_find_all(body, XPATH, ns = ns) fix_unescaped_squares(snodes, txt) return(body) diff --git a/R/node-protection.R b/R/node-protection.R index a3dc802..af83f8b 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -90,6 +90,10 @@ is_protected <- function(node) { xml2::xml_has_attr(node, "protect.end") } +#' @rdname protected_ranges +#' @export +xpath_protected <- ".//node()[@protect.start and @protect.end]" + #' @rdname protected_ranges #' @export get_protected_ranges <- function(node) { @@ -110,6 +114,26 @@ remove_protected_ranges <- function(node) { return(node) } +#' @rdname protected_ranges +#' @param find all nodes that +#' @export +get_protected_nodes <- function(body) { + xml2::xml_find_all(body, xpath_protected) +} + +get_full_ranges <- function(node) { + if (!is_protected(node)) { + return(node) + } + ranges <- get_protected_ranges(node) + txt <- xml2::xml_text(node) + inv <- inverse_ranges(nchar(txt), ranges$start, ranges$end) + full <- update_ranges(c(inv$start, ranges$start), c(inv$end, ranges$end)) + full$protected <- full$start %in% ranges$start + return(full) +} + + #' Detect if two ranges are overlapping #' #' @param s1 \[integer\] starting index of first range @@ -184,3 +208,36 @@ update_ranges <- function(start, end) { return(list(start = nstart[keep], end = nend[keep])) } + +inverse_ranges <- function(upper, start, end) { + original <- seq.int(upper) + n <- length(start) + 1 + res <- list(start = integer(n), end = integer(n)) + i <- 1 + j <- 1 + # create a mask to kee track of where we have been + mask <- original > 0 + while (i < n) { + mask <- mask & (original > end[i] | original < start[i]) + keep <- mask & original < end[i] + if (!any(keep)) { + i <- i + 1 + next + } + rng <- range(original[keep]) + res$start[j] <- rng[1] + res$end[j] <- rng[2] + # update the mask to make sure we do not add this again + mask[rng[1]:rng[2]] <- FALSE + j <- j + 1 + i <- i + 1 + } + if (end[n - 1] < upper) { + res$start[j] <- end[n - 1] + 1 + res$end[j] <- upper + } + # remove any empty cells + res$start <- res$start[res$start != 0] + res$end <- res$end[res$end != 0] + return(res) +} diff --git a/R/node-splits.R b/R/node-splits.R new file mode 100644 index 0000000..ed8e876 --- /dev/null +++ b/R/node-splits.R @@ -0,0 +1,142 @@ +#' Split and Join nodes that have been protected +#' +#' @param body an `xml_document` class object +#' @return +#' - `split_protected_nodes()` - a copy of `body` where text nodes with +#' protection are split into 'asis' and regular nodes. +#' - `join_split_nodes()` - the `body` object where text nodes that have been +#' split (those with the `@split-id` attribute) are joined into a single text +#' node. +#' @rdname split_protected_nodes +#' @export +#' @examples +#' ex <- system.file("extdata", "math-example.md", package = "tinkr") +#' m <- tinkr::yarn$new(ex) +#' m$protect_math() +#' # protection gives us protected nodes +#' get_protected_nodes(m$body) +#' original_body <- m$body +#' # ---- splitting -------------------------------------------- +#' # splitting transforms those nodes into split text nodes +#' split_body <- split_protected_nodes(m$body) +#' get_protected_nodes(split_body) +#' xml2::xml_find_all(split_body, ".//node()[@split-id]") +#' # The effect is the same +#' m$head(10) +#' m$body <- split_body +#' m$head(10) +#' # ---- joining ---------------------------------------------- +#' # joining is done in place +#' join_split_nodes(m$body) +#' m$head(10) +#' get_protected_nodes(m$body) +#' get_protected_nodes(original_body) +split_protected_nodes <- function(body) { + body <- copy_xml(body) + protected <- get_protected_nodes(body) + if (length(protected) == 0) { + return(body) + } + purrr::iwalk(protected, split_node) + copy_xml(body) +} + +# split a node into adjacent text nodes +split_node <- function(node, id) { + if (!is_protected(node)) { + return(node) + } + frag <- split_node_text(node) + new_nodes <- glue::glue("{frag$string}") + new_nodes <- make_text_nodes(new_nodes) + attrs <- xml2::xml_attrs(node) + attrs <- attrs[!startsWith(names(attrs), "protect")] + for (attr in names(attrs)) { + val <- attrs[[attr]] + if (attr == "space") { + attr <- "xml:space" + } + xml2::xml_set_attr(new_nodes, attr, val) + } + if (has_sourcepos(node)) { + xml2::xml_set_attr(new_nodes, "sourcepos", split_sourcepos(node)) + } + remove_protected_ranges(node) + xml2::xml_set_attr(new_nodes, "split-id", id) + xml2::xml_set_attr(new_nodes[frag$protected], "asis", "true") + add_node_siblings(node, new_nodes, remove = TRUE) +} + +# splits node text based on protected ranges +split_node_text <- function(node) { + if (!is_protected(node)) { + return(node) + } + full <- get_full_ranges(node) + txt <- xml2::xml_text(node) + parts <- list( + string = substring(txt, full$start, full$end), + protected = full$protected + ) + return(parts) +} + +#' @rdname split_protected_nodes +#' @export +join_split_nodes <- function(body) { + nodes <- xml2::xml_find_all(body, ".//md:text[@split-id]", ns = md_ns()) + if (length(nodes) == 0) { + return(body) + } + ids <- xml2::xml_attr(nodes, "split-id") + purrr::walk(unique(ids), function(i) join_text_nodes(nodes[ids == i])) + return(body) +} + +join_text_nodes <- function(nodes) { + nodetxt <- xml2::xml_text(nodes) + asis_nodes <- xml2::xml_has_attr(nodes, "asis") + # if there is a single node (e.g. it was part of a multiline math problem) + # and it was _all_ protected, then we need to catch it and return it early + if (length(nodes) == 1) { + new_node <- nodes[[1]] + if (has_sourcepos(nodes)) { + start <- 1 + end <- get_colend(nodes)[asis_nodes] - get_colstart(nodes)[asis_nodes] + 1 + } else { + start <- 1 + end <- nchar(nodetxt) + } + add_protected_ranges(new_node, start, end) + xml2::xml_set_attr(new_node, "split-id", NULL) + xml2::xml_set_attr(new_node, "asis", NULL) + return(new_node) + } + # In this nodeset, we need to make sure to relabel the asis nodes + txt <- paste(nodetxt, collapse = "") + # our new node is the donor for all other nodes + new_node <- nodes[[1]] + xml2::xml_set_text(new_node, txt) + if (has_sourcepos(new_node)) { + pos <- join_sourcepos(nodes) + offset <- get_colstart(new_node) - 1 + start <- get_colstart(nodes)[asis_nodes] - offset + end <- get_colend(nodes)[asis_nodes] - offset + xml2::xml_set_attr(new_node, "sourcepos", pos) + } else { + # compute the protection from the string lengths + n <- cumsum(nchar(nodetxt)) + start <- n[!asis_nodes] + 1 + end <- n[asis_nodes] + # if the first node is an asis node, th + if (isTRUE(asis_nodes[1])) { + start <- c(1, n[!asis_nodes] + 1) + } + start <- start[seq_along(end)] + } + add_protected_ranges(new_node, start, end) + xml2::xml_set_attr(new_node, "split-id", NULL) + xml2::xml_set_attr(new_node, "asis", NULL) + xml2::xml_remove(nodes[-1]) +} + diff --git a/man/protected_ranges.Rd b/man/protected_ranges.Rd index 5892110..54dad63 100644 --- a/man/protected_ranges.Rd +++ b/man/protected_ranges.Rd @@ -1,19 +1,29 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/node-protection.R +\docType{data} \name{add_protected_ranges} \alias{add_protected_ranges} \alias{is_protected} +\alias{xpath_protected} \alias{get_protected_ranges} \alias{remove_protected_ranges} +\alias{get_protected_nodes} \title{Handle protected ranges for a node} +\format{ +An object of class \code{character} of length 1. +} \usage{ add_protected_ranges(node, start, end) is_protected(node) +xpath_protected + get_protected_ranges(node) remove_protected_ranges(node) + +get_protected_nodes(body) } \arguments{ \item{node}{an XML \verb{} node.} @@ -22,6 +32,8 @@ remove_protected_ranges(node) \item{end}{\verb{\[integer\]} a vector of ending indices that are paired with \code{start}} + +\item{find}{all nodes that} } \value{ \itemize{ @@ -82,3 +94,4 @@ wool$show() # CLEAN UP ------------------- if (file.exists(temp_file)) unlink(temp_file) } +\keyword{datasets} diff --git a/man/split_protected_nodes.Rd b/man/split_protected_nodes.Rd new file mode 100644 index 0000000..2503c87 --- /dev/null +++ b/man/split_protected_nodes.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/node-splits.R +\name{split_protected_nodes} +\alias{split_protected_nodes} +\alias{join_split_nodes} +\title{Split and Join nodes that have been protected} +\usage{ +split_protected_nodes(body) + +join_split_nodes(body) +} +\arguments{ +\item{body}{an \code{xml_document} class object} +} +\value{ +\itemize{ +\item \code{split_protected_nodes()} - a copy of \code{body} where text nodes with +protection are split into 'asis' and regular nodes. +\item \code{join_split_nodes()} - the \code{body} object where text nodes that have been +split (those with the \verb{@split-id} attribute) are joined into a single text +node. +} +} +\description{ +Split and Join nodes that have been protected +} +\examples{ +ex <- system.file("extdata", "math-example.md", package = "tinkr") +m <- tinkr::yarn$new(ex) +m$protect_math() +# protection gives us protected nodes +get_protected_nodes(m$body) +original_body <- m$body +# ---- splitting -------------------------------------------- +# splitting transforms those nodes into split text nodes +split_body <- split_protected_nodes(m$body) +get_protected_nodes(split_body) +xml2::xml_find_all(split_body, ".//node()[@split-id]") +# The effect is the same +m$head(10) +m$body <- split_body +m$head(10) +# ---- joining ---------------------------------------------- +# joining is done in place +join_split_nodes(m$body) +m$head(10) +get_protected_nodes(m$body) +get_protected_nodes(original_body) +} diff --git a/tests/testthat/test-node-splits.R b/tests/testthat/test-node-splits.R new file mode 100644 index 0000000..a44577b --- /dev/null +++ b/tests/testthat/test-node-splits.R @@ -0,0 +1,75 @@ + +test_that("splitting and joining protected nodes will work round trip", { + + ex <- system.file("extdata", "math-example.md", package = "tinkr") + m <- tinkr::yarn$new(ex) + m$protect_math() + # protection gives us protected nodes + protected <- length(get_protected_nodes(m$body)) + expect_gt(protected, 0) + # ---- splitting -------------------------------------------- + # splitting transforms those nodes into split text nodes + split_body <- split_protected_nodes(m$body) + # no protected nodes exist + expect_false(identical(split_body, m$body)) + expect_length(get_protected_nodes(split_body), 0) + + # there should be the same number of unique ids as the protected + splits <- xml2::xml_find_all(split_body, ".//node()[@split-id]") + expect_gt(length(splits), 0) + ids <- unique(xml2::xml_attr(splits, "split-id")) + expect_equal(length(ids), protected) + + # The effect is the same + h1 <- m$head(10) + m$body <- split_body + h2 <- m$head(10) + expect_equal(h1, h2) + # ---- joining ---------------------------------------------- + # joining is done in place + join_split_nodes(m$body) + expect_identical(m$body, split_body) + h3 <- m$head(10) + expect_equal(h1, h3) + expect_equal(length(get_protected_nodes(m$body)), protected) + +}) + + +test_that("splitting and joining protected nodes will work round trip with sourcepos", { + + ex <- system.file("extdata", "math-example.md", package = "tinkr") + m <- tinkr::yarn$new(ex, sourcepos = TRUE) + m$protect_math() + # the source positions exist + expect_true(has_sourcepos(m$body)) + # protection gives us protected nodes + protected <- length(get_protected_nodes(m$body)) + expect_gt(protected, 0) + # ---- splitting -------------------------------------------- + # splitting transforms those nodes into split text nodes + split_body <- split_protected_nodes(m$body) + # no protected nodes exist + expect_false(identical(split_body, m$body)) + expect_length(get_protected_nodes(split_body), 0) + + # there should be the same number of unique ids as the protected + splits <- xml2::xml_find_all(split_body, ".//node()[@split-id]") + expect_gt(length(splits), 0) + ids <- unique(xml2::xml_attr(splits, "split-id")) + expect_equal(length(ids), protected) + + # The effect is the same + h1 <- m$head(10) + m$body <- split_body + h2 <- m$head(10) + expect_equal(h1, h2) + # ---- joining ---------------------------------------------- + # joining is done in place + join_split_nodes(m$body) + expect_identical(m$body, split_body) + h3 <- m$head(10) + expect_equal(h1, h3) + expect_equal(length(get_protected_nodes(m$body)), protected) + +}) From febfa2f8dd73f2419622c662eb5f814e2c36f610 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Wed, 1 May 2024 16:56:28 -0700 Subject: [PATCH 28/37] fix doc booboo --- R/node-protection.R | 4 +++- man/protected_ranges.Rd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/node-protection.R b/R/node-protection.R index af83f8b..6b6e449 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -4,12 +4,14 @@ #' @param start `\[integer\]` a vector of starting indices of a set of ranges #' @param end `\[integer\]` a vector of ending indices that are paired with #' `start` +#' @param body an XML document #' @return #' - `add_protected_ranges()`: the modified node #' - `remove_protected_ranges()`: the modified node #' - `is_protected()`: `TRUE` if the node has protection attributes #' - `get_protected_ranges()` a list containing integer vectors `start` and #' `end` if the node is protected, otherwise, it returns NULL +#' - `get_protected_nodes()` a nodelist #' @rdname protected_ranges #' @export #' @examples @@ -49,6 +51,7 @@ #' get_protected_ranges(nodes[[1]]) #' #' wool$show() # the first row and "\e", "\g", and "\h" are protected +#' get_protected_nodes(wool$body) # showing the nodes that are protected #' #' # REMOVING PROTECTION -------- #' remove_protected_ranges(nodes[[2]]) @@ -115,7 +118,6 @@ remove_protected_ranges <- function(node) { } #' @rdname protected_ranges -#' @param find all nodes that #' @export get_protected_nodes <- function(body) { xml2::xml_find_all(body, xpath_protected) diff --git a/man/protected_ranges.Rd b/man/protected_ranges.Rd index 54dad63..123d891 100644 --- a/man/protected_ranges.Rd +++ b/man/protected_ranges.Rd @@ -33,7 +33,7 @@ get_protected_nodes(body) \item{end}{\verb{\[integer\]} a vector of ending indices that are paired with \code{start}} -\item{find}{all nodes that} +\item{body}{an XML document} } \value{ \itemize{ @@ -42,6 +42,7 @@ get_protected_nodes(body) \item \code{is_protected()}: \code{TRUE} if the node has protection attributes \item \code{get_protected_ranges()} a list containing integer vectors \code{start} and \code{end} if the node is protected, otherwise, it returns NULL +\item \code{get_protected_nodes()} a nodelist } } \description{ @@ -84,6 +85,7 @@ add_protected_ranges(nodes[[1]], start = 1, end = 4) get_protected_ranges(nodes[[1]]) wool$show() # the first row and "\e", "\g", and "\h" are protected +get_protected_nodes(wool$body) # showing the nodes that are protected # REMOVING PROTECTION -------- remove_protected_ranges(nodes[[2]]) From 70612418bbe47f5c78cc40acbf0d63f871f98641 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 09:31:25 -0700 Subject: [PATCH 29/37] ensure complete round trip The previous iteration was not quite correct because it had assumed that the sourcepos would match up exactly with the protection ranges, but these were two separate numbers. This does the following: 1. when a protected range spans the entire node, then it is labeled "asis" 2. `split_sourcepos()` now reflects the actual end of the sourcepos instead of the computed end 3. an awkward catch for single nodes in `join_split_nodes()` is now eliminated 4. `join_split_nodes()` no longer re-comuputes the protected ranges from the sourcepos --- R/asis-nodes.R | 42 ++----------- R/node-protection.R | 13 +++- R/node-splits.R | 87 ++++++++++++++++++--------- R/utils-sourcepos.R | 9 ++- man/split_protected_nodes.Rd | 3 + tests/testthat/_snaps/asis-nodes.md | 2 +- tests/testthat/_snaps/attr-nodes.md | 2 +- tests/testthat/test-asis-nodes.R | 16 ++--- tests/testthat/test-node-protection.R | 16 ++--- tests/testthat/test-node-splits.R | 4 ++ 10 files changed, 106 insertions(+), 88 deletions(-) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index 8b55f4b..ac874c6 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -32,6 +32,10 @@ set_asis <- function(nodes) { xml2::xml_set_attr(nodes[xml2::xml_name(nodes) != "softbreak"], "asis", "true") } +is_asis <- function(node) { + xml2::xml_has_attr(node, "asis") +} + # INLINE MATH ------------------------------------------------------------------ # finding inline math consists of searching for $ and excluding $$ @@ -223,44 +227,6 @@ label_fully_inline <- function(math) { add_protected_ranges(math, start, end) } -#' Transform a character vector of XML into text nodes -#' -#' This is useful in the case where we want to modify some text content to -#' split it and label a portion of it 'asis' to protect it from commonmark's -#' escape processing. -#' -#' `fix_fully_inline()` uses `make_text_nodes()` to modify a single text node -#' into several text nodes. It first takes a string of a single text node like -#' below... -#' -#' ```html -#' this is $\LaTeX$ text -#' ``` -#' -#' ...and splits it into three text nodes, surrounding the LaTeX math with text -#' tags that have the 'asis' attribute. -#' -#' ```html -#' this is $\LaTeX$ text -#' ``` -#' -#' The `make_text_nodes()` function takes the above text string and converts it -#' into nodes so that the original text node can be replaced. -#' -#' @param a character vector of modified text nodes -#' @return a nodeset with no associated namespace -#' @noRd -make_text_nodes <- function(txt) { - # We are hijacking commonmark here to produce an XML markdown document with - # a single element: {paste(txt, collapse = ''). This gets passed to glue where - # it is expanded into nodes that we can read in via {xml2}, strip the - # namespace, and extract all nodes below - doc <- glue::glue(commonmark::markdown_xml("{paste(txt, collapse = '')}")) - nodes <- xml2::xml_ns_strip(xml2::read_xml(doc)) - xml2::xml_find_all(nodes, ".//paragraph/text/*") -} - - # BLOCK MATH ------------------------------------------------------------------ find_block_math <- function(body, ns) { diff --git a/R/node-protection.R b/R/node-protection.R index 6b6e449..0a6c339 100644 --- a/R/node-protection.R +++ b/R/node-protection.R @@ -63,8 +63,9 @@ #' if (file.exists(temp_file)) unlink(temp_file) add_protected_ranges <- function(node, start, end) { no_beginning <- length(start) == 0 || any(start < 1) - if (no_beginning || not_text_node(node)) { - # return early if there are no ranges to protect + if (is_asis(node) || no_beginning || not_text_node(node)) { + # return early if node is already protected + # or there are no ranges to protect return(node) } if (is_protected(node)) { @@ -75,6 +76,14 @@ add_protected_ranges <- function(node, start, end) { start <- new$start end <- new$end } + n <- nchar(xml2::xml_text(node)) + if (length(end) == 1 && start == 1 && end == n ) { + # if the protection ends up spanning the entire node, just return asis + xml2::xml_set_attr(node, "asis", "true") + xml2::xml_set_attr(node, "protect.start", NULL) + xml2::xml_set_attr(node, "protect.end", NULL) + return(node) + } xml2::xml_set_attr(node, "protect.start", paste(start, collapse = " ")) xml2::xml_set_attr(node, "protect.end", paste(end, collapse = " ")) return(node) diff --git a/R/node-splits.R b/R/node-splits.R index ed8e876..a5ecfb7 100644 --- a/R/node-splits.R +++ b/R/node-splits.R @@ -1,3 +1,40 @@ +#' Transform a character vector of XML into text nodes +#' +#' This is useful in the case where we want to modify some text content to +#' split it and label a portion of it 'asis' to protect it from commonmark's +#' escape processing. +#' +#' `fix_fully_inline()` uses `make_text_nodes()` to modify a single text node +#' into several text nodes. It first takes a string of a single text node like +#' below... +#' +#' ```html +#' this is $\LaTeX$ text +#' ``` +#' +#' ...and splits it into three text nodes, surrounding the LaTeX math with text +#' tags that have the 'asis' attribute. +#' +#' ```html +#' this is $\LaTeX$ text +#' ``` +#' +#' The `make_text_nodes()` function takes the above text string and converts it +#' into nodes so that the original text node can be replaced. +#' +#' @param a character vector of modified text nodes +#' @return a nodeset with no associated namespace +#' @noRd +make_text_nodes <- function(txt) { + # We are hijacking commonmark here to produce an XML markdown document with + # a single element: {paste(txt, collapse = ''). This gets passed to glue where + # it is expanded into nodes that we can read in via {xml2}, strip the + # namespace, and extract all nodes below + doc <- glue::glue(commonmark::markdown_xml("{paste(txt, collapse = '')}")) + nodes <- xml2::xml_ns_strip(xml2::read_xml(doc)) + xml2::xml_find_all(nodes, ".//paragraph/text/*") +} + #' Split and Join nodes that have been protected #' #' @param body an `xml_document` class object @@ -31,6 +68,9 @@ #' m$head(10) #' get_protected_nodes(m$body) #' get_protected_nodes(original_body) +#' +#' # the context is identical even after transformation +#' identical(as.character(m$body), as.character(original_body)) split_protected_nodes <- function(body) { body <- copy_xml(body) protected <- get_protected_nodes(body) @@ -47,6 +87,14 @@ split_node <- function(node, id) { return(node) } frag <- split_node_text(node) + if (length(frag$string) == 1) { + # If we have a single fragment, we can label it "asis" and be on our way + # This is likely the situation where a math equation contains + # two underscores, which the interpreter reads as nodes. + xml2::xml_set_attr(node, "asis", "true") + remove_protected_ranges(node) + return(node) + } new_nodes <- glue::glue("{frag$string}") new_nodes <- make_text_nodes(new_nodes) attrs <- xml2::xml_attrs(node) @@ -96,44 +144,25 @@ join_split_nodes <- function(body) { join_text_nodes <- function(nodes) { nodetxt <- xml2::xml_text(nodes) asis_nodes <- xml2::xml_has_attr(nodes, "asis") - # if there is a single node (e.g. it was part of a multiline math problem) - # and it was _all_ protected, then we need to catch it and return it early - if (length(nodes) == 1) { - new_node <- nodes[[1]] - if (has_sourcepos(nodes)) { - start <- 1 - end <- get_colend(nodes)[asis_nodes] - get_colstart(nodes)[asis_nodes] + 1 - } else { - start <- 1 - end <- nchar(nodetxt) - } - add_protected_ranges(new_node, start, end) - xml2::xml_set_attr(new_node, "split-id", NULL) - xml2::xml_set_attr(new_node, "asis", NULL) - return(new_node) - } # In this nodeset, we need to make sure to relabel the asis nodes txt <- paste(nodetxt, collapse = "") # our new node is the donor for all other nodes new_node <- nodes[[1]] xml2::xml_set_text(new_node, txt) if (has_sourcepos(new_node)) { + # restore the sourcepos of the original nodes pos <- join_sourcepos(nodes) - offset <- get_colstart(new_node) - 1 - start <- get_colstart(nodes)[asis_nodes] - offset - end <- get_colend(nodes)[asis_nodes] - offset xml2::xml_set_attr(new_node, "sourcepos", pos) - } else { - # compute the protection from the string lengths - n <- cumsum(nchar(nodetxt)) - start <- n[!asis_nodes] + 1 - end <- n[asis_nodes] - # if the first node is an asis node, th - if (isTRUE(asis_nodes[1])) { - start <- c(1, n[!asis_nodes] + 1) - } - start <- start[seq_along(end)] } + # compute the protection from the string lengths + n <- cumsum(nchar(nodetxt)) + start <- n[!asis_nodes] + 1 + end <- n[asis_nodes] + # if the first node is an asis node, th + if (isTRUE(asis_nodes[1])) { + start <- c(1, n[!asis_nodes] + 1) + } + start <- start[seq_along(end)] add_protected_ranges(new_node, start, end) xml2::xml_set_attr(new_node, "split-id", NULL) xml2::xml_set_attr(new_node, "asis", NULL) diff --git a/R/utils-sourcepos.R b/R/utils-sourcepos.R index 02f25da..33396f2 100644 --- a/R/utils-sourcepos.R +++ b/R/utils-sourcepos.R @@ -32,10 +32,17 @@ make_sourcepos <- function(pos) { split_sourcepos <- function(node) { pos <- get_sourcepos(node) + # sometimes the source position will run beyond the ranges of the node + # this preserves the original end for the source position + original_end <- pos$colend ranges <- get_full_ranges(node) + # the colstart and ends should be offset by the original colstart + # in the case of lists or indented paragraphs offset <- pos$colstart - 1L pos$colstart <- ranges$start + offset - pos$colend <- ranges$end + pos$colend <- ranges$end + offset + # we append the original end + pos$colend[length(pos$colend)] <- original_end make_sourcepos(pos) } diff --git a/man/split_protected_nodes.Rd b/man/split_protected_nodes.Rd index 2503c87..86a8597 100644 --- a/man/split_protected_nodes.Rd +++ b/man/split_protected_nodes.Rd @@ -46,4 +46,7 @@ join_split_nodes(m$body) m$head(10) get_protected_nodes(m$body) get_protected_nodes(original_body) + +# the context is identical even after transformation +identical(as.character(m$body), as.character(original_body)) } diff --git a/tests/testthat/_snaps/asis-nodes.md b/tests/testthat/_snaps/asis-nodes.md index 18f62d9..3935f69 100644 --- a/tests/testthat/_snaps/asis-nodes.md +++ b/tests/testthat/_snaps/asis-nodes.md @@ -92,7 +92,7 @@ Code show_user(wool$show(), force = TRUE) Output - example + example inline $b_{ij}$ $a_{ij}$ diff --git a/tests/testthat/_snaps/attr-nodes.md b/tests/testthat/_snaps/attr-nodes.md index bd2becb..d6a39fc 100644 --- a/tests/testthat/_snaps/attr-nodes.md +++ b/tests/testthat/_snaps/attr-nodes.md @@ -22,7 +22,7 @@ a pretty kitten - {#kitteh alt='a picture of a kitten'} + {#kitteh alt='a picture of a kitten'} diff --git a/tests/testthat/test-asis-nodes.R b/tests/testthat/test-asis-nodes.R index 2a1bb2d..2d381ca 100644 --- a/tests/testthat/test-asis-nodes.R +++ b/tests/testthat/test-asis-nodes.R @@ -125,7 +125,7 @@ test_that("protect_unescaped() will throw a warning if no sourcpos is available" test_that("(105) protection of one element does not impede protection of another", { - expected <- "example\n\n$a_{ij}$ \n" + expected <- "example inline $b_{ij}$\n\n$a_{ij}$ \n" temp_file <- withr::local_tempfile() brio::write_lines(expected, temp_file) @@ -137,17 +137,17 @@ test_that("(105) protection of one element does not impede protection of another wool$protect_curly() # protection exists - n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.start]", ns = md_ns()) - expect_length(n, 1) + the_nodes <- xml2::xml_find_all(wool$body, ".//node()[@protect.start]") + expect_length(the_nodes, 2) # the ranges are initially betwen the curly braces - expect_equal(get_protected_ranges(n[[1]]), list(start = 4L, end = 7L)) + expect_equal(get_protected_ranges(the_nodes[[1]]), list(start = 19L, end = 22L)) + expect_equal(get_protected_ranges(the_nodes[[2]]), list(start = 4L, end = 7L)) # protecting for math does not throw an error expect_no_error(wool$protect_math()) - n <- xml2::xml_find_all(wool$body, ".//md:text[@protect.start]", ns = md_ns()) - expect_length(n, 1) - # the protected range now extends to the whole line - expect_equal(get_protected_ranges(n[[1]]), list(start = 1L, end = 8L)) + # the protected range now extends to the whole line without the trailing space + expect_equal(get_protected_ranges(the_nodes[[1]]), list(start = 16L, end = 23L)) + expect_equal(get_protected_ranges(the_nodes[[2]]), NULL) expect_snapshot(show_user(wool$show(), force = TRUE)) }) diff --git a/tests/testthat/test-node-protection.R b/tests/testthat/test-node-protection.R index 716b4bd..fd8e867 100644 --- a/tests/testthat/test-node-protection.R +++ b/tests/testthat/test-node-protection.R @@ -115,14 +115,14 @@ test_that("protection can be added and removed", { add_protected_ranges(nodes[[2]], start = 1, end = 8) - expect_true(is_protected(nodes[[2]])) + expect_false(is_protected(nodes[[2]])) + expect_true(is_asis(nodes[[2]])) expect_equal(get_protected_ranges(nodes[[1]]), list(start = c(3, 7), end = c(4, 8)) ) - expect_equal(get_protected_ranges(nodes[[2]]), - list(start = 1, end = 8) - ) + expect_equal(get_protected_ranges(nodes[[2]]), NULL) + some_protection <- wool$show() # we expect all but the first and third entities to be protected some_expected <- gsub("\\\\([ac])", "\\\\\\\\\\1", expected) @@ -135,10 +135,10 @@ test_that("protection can be added and removed", { list(start = c(3, 7), end = c(4, 8)) ) # adding completely overlapping protections does not cause an error - add_protected_ranges(nodes[[1]], start = c(3, 7), end = c(4, 8)) - expect_equal(get_protected_ranges(nodes[[2]]), - list(start = 1, end = 8) - ) + add_protected_ranges(nodes[[2]], start = c(3, 7), end = c(4, 8)) + expect_true(is_asis(nodes[[2]])) + expect_equal(get_protected_ranges(nodes[[2]]), NULL) + some_protection <- wool$show() # we expect all but the first and third entities to be protected some_expected <- gsub("\\\\([ac])", "\\\\\\\\\\1", expected) diff --git a/tests/testthat/test-node-splits.R b/tests/testthat/test-node-splits.R index a44577b..2ba79b6 100644 --- a/tests/testthat/test-node-splits.R +++ b/tests/testthat/test-node-splits.R @@ -6,6 +6,7 @@ test_that("splitting and joining protected nodes will work round trip", { m$protect_math() # protection gives us protected nodes protected <- length(get_protected_nodes(m$body)) + orig <- m$body expect_gt(protected, 0) # ---- splitting -------------------------------------------- # splitting transforms those nodes into split text nodes @@ -32,6 +33,7 @@ test_that("splitting and joining protected nodes will work round trip", { h3 <- m$head(10) expect_equal(h1, h3) expect_equal(length(get_protected_nodes(m$body)), protected) + expect_equal(as.character(m$body), as.character(orig)) }) @@ -41,6 +43,7 @@ test_that("splitting and joining protected nodes will work round trip with sourc ex <- system.file("extdata", "math-example.md", package = "tinkr") m <- tinkr::yarn$new(ex, sourcepos = TRUE) m$protect_math() + orig <- m$body # the source positions exist expect_true(has_sourcepos(m$body)) # protection gives us protected nodes @@ -71,5 +74,6 @@ test_that("splitting and joining protected nodes will work round trip with sourc h3 <- m$head(10) expect_equal(h1, h3) expect_equal(length(get_protected_nodes(m$body)), protected) + expect_equal(object = as.character(m$body), expected = as.character(orig)) }) From 9e7de7281293f254a8f2c0628f824ca891c9b544 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 10:58:14 -0700 Subject: [PATCH 30/37] add find_between_nodes from pegboard This allows us to search for internal nodes using their identities --- R/utils.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/R/utils.R b/R/utils.R index 741c099..70f7709 100644 --- a/R/utils.R +++ b/R/utils.R @@ -3,6 +3,22 @@ show_user <- function(out, force = FALSE) { invisible(out) } +# taken from pegboard +find_between_nodes <- function(a, b, include = TRUE) { + the_parent <- xml2::xml_parent(a) + if (!identical(the_parent, xml2::xml_parent(b))) { + # we cannot return a space between nodes on different levels + return(xml2::xml_missing()) + } + the_children <- xml2::xml_children(the_parent) + # find the node in question by testing for identity since they represent the + # same object, they will be identical. + offset <- if (include) 0 else 1 + ida <- which(purrr::map_lgl(the_children, identical, a)) + offset + idb <- which(purrr::map_lgl(the_children, identical, b)) - offset + the_children[seq(ida, idb)] +} + unbalanced_math_error <- function(bmath, endless, headless, le, lh) { no_end <- xml2::xml_text(bmath[endless]) no_beginning <- xml2::xml_text(bmath[headless]) From acd560f4e9bd7b99651b14082941d2a8959a6696 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 10:59:05 -0700 Subject: [PATCH 31/37] use helpers for asis --- R/node-splits.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/node-splits.R b/R/node-splits.R index a5ecfb7..0fbf0f9 100644 --- a/R/node-splits.R +++ b/R/node-splits.R @@ -91,7 +91,7 @@ split_node <- function(node, id) { # If we have a single fragment, we can label it "asis" and be on our way # This is likely the situation where a math equation contains # two underscores, which the interpreter reads as nodes. - xml2::xml_set_attr(node, "asis", "true") + set_asis(node) remove_protected_ranges(node) return(node) } @@ -143,7 +143,7 @@ join_split_nodes <- function(body) { join_text_nodes <- function(nodes) { nodetxt <- xml2::xml_text(nodes) - asis_nodes <- xml2::xml_has_attr(nodes, "asis") + asis_nodes <- is_asis(nodes) # In this nodeset, we need to make sure to relabel the asis nodes txt <- paste(nodetxt, collapse = "") # our new node is the donor for all other nodes From f22d2aac7270098a5fec0138a8a72be55fa3830c Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 10:59:24 -0700 Subject: [PATCH 32/37] remove "curly" attr when setting a math node --- R/asis-nodes.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index ac874c6..8709362 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -208,6 +208,9 @@ label_partial_inline <- function(math) { begin <- 1 end <- nchar(char) } + # if any of the nodes have been set as curly, unset + xml2::xml_set_attr(math, "curly", NULL) + xml2::xml_set_attr(math, "curly-id", NULL) add_protected_ranges(math, begin, end) } @@ -224,6 +227,9 @@ label_fully_inline <- function(math) { # this is $\LaTeX$ text start <- locations[[1]] end <- start + attr(locations[[1]], "match.len") - 1L + # if any of the nodes have been set as curly, unset + xml2::xml_set_attr(math, "curly", NULL) + xml2::xml_set_attr(math, "curly-id", NULL) add_protected_ranges(math, start, end) } @@ -242,6 +248,9 @@ protect_block_math <- function(body, ns) { bm <- find_block_math(body, ns) # get all of the internal nodes bm <- xml2::xml_find_all(bm, ".//descendant-or-self::md:*", ns = ns) + # if any of the nodes have been set as curly, unset + xml2::xml_set_attr(bm, "curly", NULL) + xml2::xml_set_attr(math, "curly-id", NULL) set_asis(bm) } From 7875dea9bc64bf62d9f3909aa36f6bec42b100fa Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 11:00:28 -0700 Subject: [PATCH 33/37] update curly processing to better handle alt text --- R/attr-nodes.R | 75 +++++++++++++++++------ tests/testthat/_snaps/attr-nodes.md | 18 +++--- tests/testthat/_snaps/attr-nodes/yarn.Rmd | 2 - tests/testthat/_snaps/attr-nodes/yarn.md | 2 - tests/testthat/test-node-splits.R | 6 +- 5 files changed, 71 insertions(+), 32 deletions(-) diff --git a/R/attr-nodes.R b/R/attr-nodes.R index 2d1ca1e..b6ce752 100644 --- a/R/attr-nodes.R +++ b/R/attr-nodes.R @@ -6,46 +6,85 @@ find_curly <- function(body, ns) { attr_texts <- xml2::xml_text(curlies) no_closing <- !grepl("[}]", attr_texts) if (any(no_closing)) { + curlies <- lapply(curlies, function(i) i) close_xpath <- "self::*/following-sibling::md:text[contains(text(), '}')]" - for (not_closed in curlies[no_closing]) { - closing <- xml2::xml_find_all( + for (i in which(no_closing)) { + not_closed <- curlies[[i]] + closing <- xml2::xml_find_first( not_closed, glue::glue("./{close_xpath}"), ns ) - xml2::xml_text(not_closed) <- paste( - xml2::xml_text(not_closed), - xml2::xml_text(closing), - sep = "\n" - ) - xml2::xml_remove(closing) + all_nodes <- find_between_nodes(not_closed, closing, include = TRUE) + curlies[[i]] <- all_nodes + # xml2::xml_text(not_closed) <- paste( + # xml2::xml_text(all_nodes), + # collapse = "\n" + # ) + # xml2::xml_remove(all_nodes[-1]) } } curlies } -digest_curly <- function(curly, ns) { - label_curly_nodes(curly) - char <- xml2::xml_text(curly) +digest_curly <- function(curly, id, ns) { + xml2::xml_set_attr(curly, "curly-id", id) + if (inherits(curly, "xml_node")) { + label_curly_node(curly) + } else { + n <- length(curly) + label_curly_node(curly[[1]], type = "start") + label_curly_node(curly[[n]], type = "stop") + if (n > 2) { + set_asis(curly[-c(1, n)]) + } + } + label_alt(curly) + return(curly) +} + +label_alt <- function(curly) { + char <- paste(trimws(xml2::xml_text(curly)), collapse = " ") + res <- if (inherits(curly, "xml_node")) curly else curly[[1]] alt_fragment <- regmatches(char, gregexpr("alt=['\"].*?['\"]", char))[[1]] if (length(alt_fragment) > 0) { alt_text <- sub("^alt=", "", alt_fragment) - xml2::xml_set_attr(curly, "alt", alt_text) + xml2::xml_set_attr(res, "alt", alt_text) } } -label_curly_nodes <- function(node) { +label_curly_node <- function(node, type = c("full", "start", "stop")) { char <- xml2::xml_text(node) # Find the locations of inline chars that is complete + pattern <- switch(match.arg(type), + full = "\\{.*?\\}", + start = "\\{[^}]*?", + stop = "[^{]*?\\}", + ) locations <- gregexpr( - pattern = "\\{.*?\\}", + pattern = pattern, char, perl = TRUE ) start <- locations[[1]] - end <- start + attr(locations[[1]], "match.len") - 1L - return(add_protected_ranges(node, start, end)) - + end <- if (match.arg(type) == "start") { + nchar(char) + } else { + start + attr(locations[[1]], "match.len") - 1L + } + add_protected_ranges(node, start, end) + if (is_asis(node)) { + xml2::xml_set_attr(node, "curly", "true") + } else { + xml2::xml_set_attr(node, "curly", + paste( + paste(start, collapse = " "), + paste(end, collapse = " "), + sep = ":" + ) + ) + } + return(node) } #' Protect curly elements for further processing @@ -74,6 +113,6 @@ label_curly_nodes <- function(node) { #' xml2::xml_child(m$body) protect_curly <- function(body, ns = md_ns()) { curly <- find_curly(body, ns) - purrr::walk(curly, digest_curly, ns = ns) + purrr::iwalk(curly, digest_curly, ns = ns) return(body) } diff --git a/tests/testthat/_snaps/attr-nodes.md b/tests/testthat/_snaps/attr-nodes.md index d6a39fc..f33e365 100644 --- a/tests/testthat/_snaps/attr-nodes.md +++ b/tests/testthat/_snaps/attr-nodes.md @@ -7,13 +7,13 @@ - preface {#pre-face .unnumbered} + preface {#pre-face .unnumbered} hello - I like {xml2} but of course {tinkr} is even cooler! + I like {xml2} but of course {tinkr} is even cooler! Images that use pandoc style will have curlies with content that should be translated and should be protected. @@ -22,20 +22,20 @@ a pretty kitten - {#kitteh alt='a picture of a kitten'} + {#kitteh alt='a picture of a kitten'} a pretty puppy - {#dog alt="a picture - of a dog"} - + {#dog alt="a picture + + of a dog"} - [a span with attributes]{.span-with-attributes - style='color: red;'} - + [a span with attributes]{.span-with-attributes + + style='color: red;'} diff --git a/tests/testthat/_snaps/attr-nodes/yarn.Rmd b/tests/testthat/_snaps/attr-nodes/yarn.Rmd index ef2353e..bd8c761 100644 --- a/tests/testthat/_snaps/attr-nodes/yarn.Rmd +++ b/tests/testthat/_snaps/attr-nodes/yarn.Rmd @@ -22,9 +22,7 @@ Images that use pandoc style will have curlies with content that should be trans ![a pretty puppy](https://placedog.net/200/300){#dog alt="a picture of a dog"} - [a span with attributes]{.span-with-attributes style='color: red;'} - diff --git a/tests/testthat/_snaps/attr-nodes/yarn.md b/tests/testthat/_snaps/attr-nodes/yarn.md index 972133a..811ef46 100644 --- a/tests/testthat/_snaps/attr-nodes/yarn.md +++ b/tests/testthat/_snaps/attr-nodes/yarn.md @@ -15,9 +15,7 @@ Images that use pandoc style will have curlies with content that should be trans ![a pretty puppy](https://placedog.net/200/300){#dog alt="a picture of a dog"} - [a span with attributes]{.span-with-attributes style='color: red;'} - diff --git a/tests/testthat/test-node-splits.R b/tests/testthat/test-node-splits.R index 2ba79b6..17209fc 100644 --- a/tests/testthat/test-node-splits.R +++ b/tests/testthat/test-node-splits.R @@ -33,6 +33,8 @@ test_that("splitting and joining protected nodes will work round trip", { h3 <- m$head(10) expect_equal(h1, h3) expect_equal(length(get_protected_nodes(m$body)), protected) + + # CONFIRM ROUND TRIP ---------------------------------------- expect_equal(as.character(m$body), as.character(orig)) }) @@ -49,7 +51,7 @@ test_that("splitting and joining protected nodes will work round trip with sourc # protection gives us protected nodes protected <- length(get_protected_nodes(m$body)) expect_gt(protected, 0) - # ---- splitting -------------------------------------------- + # ---- splitting --------------------------------------------- # splitting transforms those nodes into split text nodes split_body <- split_protected_nodes(m$body) # no protected nodes exist @@ -74,6 +76,8 @@ test_that("splitting and joining protected nodes will work round trip with sourc h3 <- m$head(10) expect_equal(h1, h3) expect_equal(length(get_protected_nodes(m$body)), protected) + + # CONFIRM ROUND TRIP ---------------------------------------- expect_equal(object = as.character(m$body), expected = as.character(orig)) }) From 37332f70d05cda3890091801162ba0120a861547 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 11:11:20 -0700 Subject: [PATCH 34/37] update processing of alt text to remove breaks --- R/attr-nodes.R | 3 +- tests/testthat/_snaps/attr-nodes.md | 58 ++++++++++++++++++++++++++++- tests/testthat/test-attr-nodes.R | 22 +++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/R/attr-nodes.R b/R/attr-nodes.R index b6ce752..36d5c5f 100644 --- a/R/attr-nodes.R +++ b/R/attr-nodes.R @@ -44,7 +44,8 @@ digest_curly <- function(curly, id, ns) { } label_alt <- function(curly) { - char <- paste(trimws(xml2::xml_text(curly)), collapse = " ") + char <- trimws(xml2::xml_text(curly)) + char <- paste(char[char != ""], collapse = " ") res <- if (inherits(curly, "xml_node")) curly else curly[[1]] alt_fragment <- regmatches(char, gregexpr("alt=['\"].*?['\"]", char))[[1]] if (length(alt_fragment) > 0) { diff --git a/tests/testthat/_snaps/attr-nodes.md b/tests/testthat/_snaps/attr-nodes.md index f33e365..5f8e94e 100644 --- a/tests/testthat/_snaps/attr-nodes.md +++ b/tests/testthat/_snaps/attr-nodes.md @@ -28,7 +28,7 @@ a pretty puppy - {#dog alt="a picture + {#dog alt="a picture of a dog"} @@ -39,3 +39,59 @@ +# multiline alt text can be processed + + Code + cat(as.character(protect_curly(curly$body))) + Output + + + + + preface {#pre-face .unnumbered} + + + hello + + + I like {xml2} but of course {tinkr} is even cooler! + + + Images that use pandoc style will have curlies with content that should be translated and should be protected. + + + + a pretty kitten + + {#kitteh alt='a picture of a kitten'} + + + + a pretty puppy + + {#dog alt="a picture + + of a dog"} + + + [a span with attributes]{.span-with-attributes + + style='color: red;'} + + + image with long alt text + + {#image alt='this is + + long alt text that should be all + + included in the image'} + + + + image with short alt text + + {#img alt='short alt text'} + + + diff --git a/tests/testthat/test-attr-nodes.R b/tests/testthat/test-attr-nodes.R index f35a200..201538d 100644 --- a/tests/testthat/test-attr-nodes.R +++ b/tests/testthat/test-attr-nodes.R @@ -4,6 +4,28 @@ test_that("protect_curly() works", { expect_snapshot(cat(as.character(protect_curly(curly$body)))) }) + +test_that("multiline alt text can be processed", { + pathcurly <- system.file("extdata", "basic-curly.md", package = "tinkr") + tmpfile <- withr::local_tempfile() + file.copy(pathcurly, tmpfile, overwrite = TRUE) + cat(c("![image with long alt text](image.png){#image alt='this is", + "long alt text that should be all", + "included in the image'}", + "", + "![image with short alt text](img.png){#img alt='short alt text'}", + "" + ), + sep = "\n", + file = tmpfile, + append = TRUE + ) + curly <- yarn$new(tmpfile, sourcepos = TRUE) + expect_snapshot(cat(as.character(protect_curly(curly$body)))) +}) + + + test_that("a curly-protected yarn object can be written back to markdown", { md_pathcurly <- system.file("extdata", "basic-curly.md", package = "tinkr") rmd_pathcurly <- system.file("extdata", "basic-curly2.Rmd", package = "tinkr") From bb0032cf5adc99252c6e717a1af2b3dcbfe3bc35 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 11:12:44 -0700 Subject: [PATCH 35/37] fix typo --- R/asis-nodes.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/asis-nodes.R b/R/asis-nodes.R index 8709362..0ab7258 100644 --- a/R/asis-nodes.R +++ b/R/asis-nodes.R @@ -250,7 +250,7 @@ protect_block_math <- function(body, ns) { bm <- xml2::xml_find_all(bm, ".//descendant-or-self::md:*", ns = ns) # if any of the nodes have been set as curly, unset xml2::xml_set_attr(bm, "curly", NULL) - xml2::xml_set_attr(math, "curly-id", NULL) + xml2::xml_set_attr(bm, "curly-id", NULL) set_asis(bm) } From 4db21899bae6189b6c9c5366ae3024f5ff1860de Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 13:50:02 -0700 Subject: [PATCH 36/37] simplify node joinery --- R/node-splits.R | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/R/node-splits.R b/R/node-splits.R index 0fbf0f9..b76ccd0 100644 --- a/R/node-splits.R +++ b/R/node-splits.R @@ -142,30 +142,32 @@ join_split_nodes <- function(body) { } join_text_nodes <- function(nodes) { - nodetxt <- xml2::xml_text(nodes) - asis_nodes <- is_asis(nodes) - # In this nodeset, we need to make sure to relabel the asis nodes - txt <- paste(nodetxt, collapse = "") + txt <- paste(xml2::xml_text(nodes), collapse = "") + prtct <- protected_ranges_from_fragments(nodes) # our new node is the donor for all other nodes new_node <- nodes[[1]] + xml2::xml_set_attr(new_node, "asis", NULL) + add_protected_ranges(new_node, prtct$start, prtct$end) xml2::xml_set_text(new_node, txt) if (has_sourcepos(new_node)) { # restore the sourcepos of the original nodes pos <- join_sourcepos(nodes) xml2::xml_set_attr(new_node, "sourcepos", pos) } - # compute the protection from the string lengths - n <- cumsum(nchar(nodetxt)) - start <- n[!asis_nodes] + 1 - end <- n[asis_nodes] - # if the first node is an asis node, th - if (isTRUE(asis_nodes[1])) { - start <- c(1, n[!asis_nodes] + 1) - } - start <- start[seq_along(end)] - add_protected_ranges(new_node, start, end) + # need to remove 'asis' before setting the protected range xml2::xml_set_attr(new_node, "split-id", NULL) - xml2::xml_set_attr(new_node, "asis", NULL) xml2::xml_remove(nodes[-1]) } +protected_ranges_from_fragments <- function(nodes) { + txt <- xml2::xml_text(nodes) + asis_nodes <- is_asis(nodes) + # number of characters + N <- nchar(txt) + # ending positions of the characters + P <- cumsum(N) + start <- P[asis_nodes] - N[asis_nodes] + 1 + end <- P[asis_nodes] + return(list(start = start, end = end)) +} + From 37fc979930ac77769aa19755badba7ea69231b02 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Thu, 2 May 2024 14:10:02 -0700 Subject: [PATCH 37/37] update tests for curly --- tests/testthat/_snaps/attr-nodes.md | 124 +++++++++++++++++++++++++++- tests/testthat/test-attr-nodes.R | 47 ++++++++++- 2 files changed, 169 insertions(+), 2 deletions(-) diff --git a/tests/testthat/_snaps/attr-nodes.md b/tests/testthat/_snaps/attr-nodes.md index 5f8e94e..b8e6fbe 100644 --- a/tests/testthat/_snaps/attr-nodes.md +++ b/tests/testthat/_snaps/attr-nodes.md @@ -1,7 +1,7 @@ # protect_curly() works Code - cat(as.character(protect_curly(curly$body))) + cat(as.character(protec)) Output @@ -39,6 +39,128 @@ +# protect_curly() can be reversed + + Code + cat(as.character(splitsville)) + Output + + + + + preface + {#pre-face .unnumbered} + + + hello + + + I like + {xml2} + but of course + {tinkr} + is even cooler! + + + Images that use pandoc style will have curlies with content that should be translated and should be protected. + + + + a pretty kitten + + {#kitteh alt='a picture of a kitten'} + + + + a pretty puppy + + {#dog alt="a picture + + of a dog"} + + + [ + a span with attributes + ] + {.span-with-attributes + + style='color: red;'} + + + + image with long alt text + + {#image alt='this is + + long alt text that should be all + + included in the image'} + + + + image with short alt text + + {#img alt='short alt text'} + + + +--- + + Code + cat(as.character(joinsville)) + Output + + + + + preface {#pre-face .unnumbered} + + + hello + + + I like {xml2} but of course {tinkr} is even cooler! + + + Images that use pandoc style will have curlies with content that should be translated and should be protected. + + + + a pretty kitten + + {#kitteh alt='a picture of a kitten'} + + + + a pretty puppy + + {#dog alt="a picture + + of a dog"} + + + [a span with attributes]{.span-with-attributes + + style='color: red;'} + + + + image with long alt text + + {#image alt='this is + + long alt text that should be all + + included in the image'} + + + + image with short alt text + + {#img alt='short alt text'} + + + # multiline alt text can be processed Code diff --git a/tests/testthat/test-attr-nodes.R b/tests/testthat/test-attr-nodes.R index 201538d..b881610 100644 --- a/tests/testthat/test-attr-nodes.R +++ b/tests/testthat/test-attr-nodes.R @@ -1,10 +1,55 @@ test_that("protect_curly() works", { pathcurly <- system.file("extdata", "basic-curly.md", package = "tinkr") curly <- yarn$new(pathcurly, sourcepos = TRUE) - expect_snapshot(cat(as.character(protect_curly(curly$body)))) + protec <- protect_curly(curly$body) + expect_snapshot(cat(as.character(protec))) +}) + + +test_that("protect_curly() can be reversed", { + pathcurly <- system.file("extdata", "basic-curly.md", package = "tinkr") + tmpfile <- withr::local_tempfile() + file.copy(pathcurly, tmpfile, overwrite = TRUE) + cat(c( + "", + "![image with long alt text](image.png){#image alt='this is", + "long alt text that should be all", + "included in the image'}", + "", + "![image with short alt text](img.png){#img alt='short alt text'}", + "" + ), + sep = "\n", + file = tmpfile, + append = TRUE + ) + curly <- yarn$new(tmpfile, sourcepos = TRUE) + curly$protect_curly() + orig <- copy_xml(curly$body) + + expect_length(get_protected_nodes(curly$body), 3) + expect_length(xml2::xml_find_all(curly$body, ".//node()[@curly]"), 10) + + # when split the protected nodes go away + splitsville <- split_protected_nodes(curly$body) + expect_snapshot(cat(as.character(splitsville))) + expect_length(get_protected_nodes(splitsville), 0) + + joinsville <- join_split_nodes(splitsville) + expect_identical(joinsville, splitsville) + # joining restores these nodes + sprotec <- get_protected_nodes(splitsville) + expect_length(sprotec, 3) + expect_equal( + lapply(sprotec, get_protected_ranges), + lapply(get_protected_nodes(orig), get_protected_ranges) + ) + expect_length(xml2::xml_find_all(splitsville, ".//node()[@curly]"), 10) + expect_snapshot(cat(as.character(joinsville))) }) + test_that("multiline alt text can be processed", { pathcurly <- system.file("extdata", "basic-curly.md", package = "tinkr") tmpfile <- withr::local_tempfile()