diff --git a/.Rbuildignore b/.Rbuildignore
index 12e15eb..1c65280 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -20,3 +20,4 @@ Doxyfile
^CRAN-RELEASE$
^LICENSE$
^\.scribblr$
+^CRAN-SUBMISSION$
diff --git a/DESCRIPTION b/DESCRIPTION
index c6f2f07..4533a2d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: kgrams
Title: Classical k-gram Language Models
-Version: 0.1.4
+Version: 0.1.5
Authors@R:
person(given = "Valerio",
family = "Gherardi",
@@ -8,14 +8,14 @@ Authors@R:
email = "vgherard840@gmail.com",
comment = c(ORCID = "0000-0002-8215-3013"))
Description:
- Tools for training and evaluating k-gram language models in R,
+ Training and evaluating k-gram language models in R,
supporting several probability smoothing techniques,
perplexity computations, random text generation and more.
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE, roclets = c ("namespace", "rd"))
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
LinkingTo:
Rcpp, RcppProgress
Imports:
diff --git a/NEWS.md b/NEWS.md
index 97998ec..edf3f9f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,9 @@
+# kgrams 0.1.5
+
+* Removed "Tools for..." at the beginning of package DESCRIPTION, as per CRAN's
+request.
+* Simplified examples in `?kgram_freqs`.
+
# kgrams 0.1.4
* Updated `R` requirements `3.5 -> 4.0`.
diff --git a/R/kgram_freqs.R b/R/kgram_freqs.R
index 79800bf..67d8982 100644
--- a/R/kgram_freqs.R
+++ b/R/kgram_freqs.R
@@ -157,38 +157,13 @@
#' # Build a k-gram frequency table from a file connection
#'
#' \dontrun{
-#' f <- kgram_freqs(file("myfile.txt"), 3)
+#' f <- kgram_freqs(file("my_text_file.txt"), 3)
#' }
#'
#'
#' # Build a k-gram frequency table from an URL connection
#' \dontrun{
-#' ### Shakespeare's "Much Ado About Nothing" (entire play)
-#' con <- url("http://shakespeare.mit.edu/much_ado/full.html")
-#'
-#' # Apply some basic preprocessing
-#' .preprocess <- function(x) {
-#' # Remove character names and locations (boldfaced in original html)
-#' x <- gsub("[A-z]+", "", x)
-#' # Remove other html tags
-#' x <- gsub("<[^>]+>||<[^>]+$||^[^>]+>$", "", x)
-#' # Apply standard preprocessing including lower-case
-#' x <- kgrams::preprocess(x)
-#' return(x)
-#' }
-#'
-#' .tknz_sent <- function(x) {
-#' # Tokenize sentences keeping Shakespeare's punctuation
-#' x <- kgrams::tknz_sent(x, keep_first = TRUE)
-#' # Remove empty sentences
-#' x <- x[x != ""]
-#' return(x)
-#' }
-#'
-#' f <- kgram_freqs(con, 3, .preprocess, .tknz_sent, batch_size = 1000)
-#' summary(f)
-#'
-#' query(f, c("leonato", "thy", "smartphones")) # c(145, 52, 0)
+#' f <- kgram_freqs(url("http://my.website/my_text_file.txt"), 3)
#' }
#' @name kgram_freqs
NULL
diff --git a/cran-comments.md b/cran-comments.md
index 2e6c20c..3a3873d 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,4 +1,4 @@
-## kgrams v0.1.4
+## kgrams v0.1.5
`kgrams` v0.1.2 got archived because the former vignette used an online data
source which became unavailable. This online source has been replaced by local
@@ -10,4 +10,53 @@ R CMD CHECK produces a Note:
All declared Imports should be used.
This note is spurious, as the imported package is used. It is only called from
-C++ source code, which may be the reason behind this false positive.
\ No newline at end of file
+C++ source code, which may be the reason behind this false positive.
+
+---
+
+### Follow-up to CRAN review
+
+#### Comment 1
+
+ The Title field should be in title case. Current version is:
+ 'Classical k-gram Language Models'
+ In title case that is:
+ 'Classical k-Gram Language Models'
+
+"k-gram" is a mathematical term and, as such, should be treated as a
+single word. In particular, "gram" does not need to be capitalized.
+
+#### Comment 2
+
+Please omit the redundant "Tools for" at the beginning of your description.
+
+Done.
+
+#### Comment 3
+
+If there are references describing the methods in your package, please add these in the description field of your DESCRIPTION file in the form
+authors (year) <doi:...>
+authors (year) <arXiv:...>
+authors (year, ISBN:...)
+or if those are not available: <https:...>
+with no space after 'doi:', 'arXiv:', 'https:' and angle brackets for auto-linking.
+(If you want to add a title as well please put it in quotes: "Title")
+
+The package implements a wide range of mathematical methods for language
+models (which can, to some extent, be considered "classical" literature). These
+are properly referenced throughout the package documentation.
+Documenting them in the DESCRIPTION would require citing tens of articles (some
+of them published at the beginning of the 20th century), which I think is
+beside the point.
+
+#### Comment 4
+
+\dontrun{} should only be used if the example really cannot be executed (e.g. because of missing additional software, missing API keys, ...) by the user. That's why wrapping examples in \dontrun{} adds the comment ("# Not run:") as a warning for the user.
+Does not seem necessary.
+Please unwrap the examples if they are executable in < 5 sec, or replace \dontrun{} with \donttest{}.
+
+The only two examples that are left wrapped in the \dontrun{} command
+cannot be run reliably, because they reference a placeholder
+"my_text_file.txt" local or online resource, which does not exist. I believe
+this is the most transparent way to document (abstractly) the relevant features
+here.
diff --git a/man/kgram_freqs.Rd b/man/kgram_freqs.Rd
index 75e6831..30f6559 100644
--- a/man/kgram_freqs.Rd
+++ b/man/kgram_freqs.Rd
@@ -245,38 +245,13 @@ query(f1, c("a", "b")) # c(3, 4): the new 'f1' stores the updated counts
# Build a k-gram frequency table from a file connection
\dontrun{
-f <- kgram_freqs(file("myfile.txt"), 3)
+f <- kgram_freqs(file("my_text_file.txt"), 3)
}
# Build a k-gram frequency table from an URL connection
\dontrun{
-### Shakespeare's "Much Ado About Nothing" (entire play)
-con <- url("http://shakespeare.mit.edu/much_ado/full.html")
-
-# Apply some basic preprocessing
-.preprocess <- function(x) {
- # Remove character names and locations (boldfaced in original html)
- x <- gsub("[A-z]+", "", x)
- # Remove other html tags
- x <- gsub("<[^>]+>||<[^>]+$||^[^>]+>$", "", x)
- # Apply standard preprocessing including lower-case
- x <- kgrams::preprocess(x)
- return(x)
-}
-
-.tknz_sent <- function(x) {
- # Tokenize sentences keeping Shakespeare's punctuation
- x <- kgrams::tknz_sent(x, keep_first = TRUE)
- # Remove empty sentences
- x <- x[x != ""]
- return(x)
-}
-
-f <- kgram_freqs(con, 3, .preprocess, .tknz_sent, batch_size = 1000)
-summary(f)
-
-query(f, c("leonato", "thy", "smartphones")) # c(145, 52, 0)
+f <- kgram_freqs(url("http://my.website/my_text_file.txt"), 3)
}
}
\seealso{
diff --git a/man/kgrams-package.Rd b/man/kgrams-package.Rd
index 1a65ede..c9a1748 100644
--- a/man/kgrams-package.Rd
+++ b/man/kgrams-package.Rd
@@ -6,7 +6,7 @@
\alias{kgrams-package}
\title{kgrams: Classical k-gram Language Models}
\description{
-Tools for training and evaluating k-gram language models in R, supporting several probability smoothing techniques, perplexity computations, random text generation and more.
+Training and evaluating k-gram language models in R, supporting several probability smoothing techniques, perplexity computations, random text generation and more.
}
\seealso{
Useful links: