diff --git a/.Rbuildignore b/.Rbuildignore index 12e15eb..1c65280 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,4 @@ Doxyfile ^CRAN-RELEASE$ ^LICENSE$ ^\.scribblr$ +^CRAN-SUBMISSION$ diff --git a/DESCRIPTION b/DESCRIPTION index c6f2f07..4533a2d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: kgrams Title: Classical k-gram Language Models -Version: 0.1.4 +Version: 0.1.5 Authors@R: person(given = "Valerio", family = "Gherardi", @@ -8,14 +8,14 @@ Authors@R: email = "vgherard840@gmail.com", comment = c(ORCID = "0000-0002-8215-3013")) Description: - Tools for training and evaluating k-gram language models in R, + Training and evaluating k-gram language models in R, supporting several probability smoothing techniques, perplexity computations, random text generation and more. License: GPL (>= 3) Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE, roclets = c ("namespace", "rd")) -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 LinkingTo: Rcpp, RcppProgress Imports: diff --git a/NEWS.md b/NEWS.md index 97998ec..edf3f9f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# kgrams 0.1.5 + +* Removed "Tools for..." at the beginning of package DESCRIPTION, as per CRAN's +request. +* Simplified examples in `?kgram_freqs`. + # kgrams 0.1.4 * Updated `R` requirements `3.5 -> 4.0`. 
diff --git a/R/kgram_freqs.R b/R/kgram_freqs.R index 79800bf..67d8982 100644 --- a/R/kgram_freqs.R +++ b/R/kgram_freqs.R @@ -157,38 +157,13 @@ #' # Build a k-gram frequency table from a file connection #' #' \dontrun{ -#' f <- kgram_freqs(file("myfile.txt"), 3) +#' f <- kgram_freqs(file("my_text_file.txt"), 3) #' } #' #' #' # Build a k-gram frequency table from an URL connection #' \dontrun{ -#' ### Shakespeare's "Much Ado About Nothing" (entire play) -#' con <- url("http://shakespeare.mit.edu/much_ado/full.html") -#' -#' # Apply some basic preprocessing -#' .preprocess <- function(x) { -#' # Remove character names and locations (boldfaced in original html) -#' x <- gsub("[A-z]+", "", x) -#' # Remove other html tags -#' x <- gsub("<[^>]+>||<[^>]+$||^[^>]+>$", "", x) -#' # Apply standard preprocessing including lower-case -#' x <- kgrams::preprocess(x) -#' return(x) -#' } -#' -#' .tknz_sent <- function(x) { -#' # Tokenize sentences keeping Shakespeare's punctuation -#' x <- kgrams::tknz_sent(x, keep_first = TRUE) -#' # Remove empty sentences -#' x <- x[x != ""] -#' return(x) -#' } -#' -#' f <- kgram_freqs(con, 3, .preprocess, .tknz_sent, batch_size = 1000) -#' summary(f) -#' -#' query(f, c("leonato", "thy", "smartphones")) # c(145, 52, 0) +#' f <- kgram_freqs(url("http://my.website/my_text_file.txt"), 3) #' } #' @name kgram_freqs NULL diff --git a/cran-comments.md b/cran-comments.md index 2e6c20c..3a3873d 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,4 +1,4 @@ -## kgrams v0.1.4 +## kgrams v0.1.5 `kgrams` v0.1.2 got archived because the former vignette used an online data source which became unavailable. This online source has been replaced by local @@ -10,4 +10,53 @@ R CMD CHECK produces a Note: All declared Imports should be used. This note is spurious, as the imported package is used. It is only called from -C++ source code, which may be the reason behind this false positive. 
\ No newline at end of file +C++ source code, which may be the reason behind this false positive. + +--- + +### Follow-up to CRAN review + +#### Comment 1 + + The Title field should be in title case. Current version is: + 'Classical k-gram Language Models' + In title case that is: + 'Classical k-Gram Language Models' + +"k-gram" is a mathematical term; as such, it should be considered as a +single word. In particular, "gram" does not need to be capitalized. + +#### Comment 2 + +Please omit the redundant "Tools for" at the beginning of your description. + +Done. + +#### Comment 3 + +If there are references describing the methods in your package, please add these in the description field of your DESCRIPTION file in the form +authors (year) <doi:...> +authors (year) <arXiv:...> +authors (year, ISBN:...) +or if those are not available: <https:...> +with no space after 'doi:', 'arXiv:', 'https:' and angle brackets for auto-linking. +(If you want to add a title as well please put it in quotes: "Title") + +The package implements a multitude of mathematical methods for language models +(that can, to some extent, be considered "classical" literature). These are +properly referenced throughout the package documentation. +Documenting them in the DESCRIPTION would require citing tens of articles (some +of them published at the beginning of the 20th century), which I think is beside +the point. + +#### Comment 4 + +\dontrun{} should only be used if the example really cannot be executed (e.g. because of missing additional software, missing API keys, ...) by the user. That's why wrapping examples in \dontrun{} adds the comment ("# Not run:") as a warning for the user. +Does not seem necessary. +Please unwrap the examples if they are executable in < 5 sec, or replace \dontrun{} with \donttest{}. + +The only two examples that are left wrapped in the \dontrun{} command +cannot be run with 100% confidence - because they reference a dummy +"my_text_file.txt" local or online resource, which does not exist. 
I believe +this is the most transparent way to document (abstractly) the relevant features +here. diff --git a/man/kgram_freqs.Rd b/man/kgram_freqs.Rd index 75e6831..30f6559 100644 --- a/man/kgram_freqs.Rd +++ b/man/kgram_freqs.Rd @@ -245,38 +245,13 @@ query(f1, c("a", "b")) # c(3, 4): the new 'f1' stores the updated counts # Build a k-gram frequency table from a file connection \dontrun{ -f <- kgram_freqs(file("myfile.txt"), 3) +f <- kgram_freqs(file("my_text_file.txt"), 3) } # Build a k-gram frequency table from an URL connection \dontrun{ -### Shakespeare's "Much Ado About Nothing" (entire play) -con <- url("http://shakespeare.mit.edu/much_ado/full.html") - -# Apply some basic preprocessing -.preprocess <- function(x) { - # Remove character names and locations (boldfaced in original html) - x <- gsub("[A-z]+", "", x) - # Remove other html tags - x <- gsub("<[^>]+>||<[^>]+$||^[^>]+>$", "", x) - # Apply standard preprocessing including lower-case - x <- kgrams::preprocess(x) - return(x) -} - -.tknz_sent <- function(x) { - # Tokenize sentences keeping Shakespeare's punctuation - x <- kgrams::tknz_sent(x, keep_first = TRUE) - # Remove empty sentences - x <- x[x != ""] - return(x) -} - -f <- kgram_freqs(con, 3, .preprocess, .tknz_sent, batch_size = 1000) -summary(f) - -query(f, c("leonato", "thy", "smartphones")) # c(145, 52, 0) +f <- kgram_freqs(url("http://my.website/my_text_file.txt"), 3) } } \seealso{ diff --git a/man/kgrams-package.Rd b/man/kgrams-package.Rd index 1a65ede..c9a1748 100644 --- a/man/kgrams-package.Rd +++ b/man/kgrams-package.Rd @@ -6,7 +6,7 @@ \alias{kgrams-package} \title{kgrams: Classical k-gram Language Models} \description{ -Tools for training and evaluating k-gram language models in R, supporting several probability smoothing techniques, perplexity computations, random text generation and more. 
+Training and evaluating k-gram language models in R, supporting several probability smoothing techniques, perplexity computations, random text generation and more. } \seealso{ Useful links: