Skip to content

Commit

Permalink
Merge pull request #124 from oscar-project/dev-checksum
Browse files — browse the repository at this point in the history
Optional corpus checksum + have language folders rather than flat files
  • Loading branch information
Uinelj authored Aug 31, 2023
2 parents 9234d13 + 12f096e commit 2e80e39
Show file tree
Hide file tree
Showing 9 changed files with 458 additions and 177 deletions.
578 changes: 408 additions & 170 deletions Cargo.lock

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ twox-hash = "1.6"
glob = "0.3.0"
sha2 = "0.9.5"

oscar-tools = { version = "0.4.0", optional = true }

serde = { version = "1", features = ["derive"] }
serde_json = "1"
schemars = "0.8.3"
Expand All @@ -59,6 +61,7 @@ ctclib-pp = { version = "0.2.0", optional = true }

[features]
kenlm = ["dep:ctclib-pp"]
checksum = ["dep:oscar-tools"]

[dev-dependencies]
rand_distr = "0.4.2"
Expand Down Expand Up @@ -97,4 +100,9 @@ ci = ["github"]
# The installers to generate for each app
installers = []
# Target platforms to build apps for (Rust target-triple syntax)
targets = ["x86_64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-pc-windows-msvc", "aarch64-apple-darwin"]
targets = [
"x86_64-unknown-linux-gnu",
"x86_64-apple-darwin",
"x86_64-pc-windows-msvc",
"aarch64-apple-darwin",
]
4 changes: 4 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,8 @@ pub struct Pipeline {

#[structopt(short = "c", long = "comp", help = "Enables zstd compression")]
pub comp: bool,

#[cfg(feature = "checksum")]
#[structopt(long = "checksum", help = "compute checksums")]
pub checksum: bool,
}
12 changes: 9 additions & 3 deletions src/io/langfiles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,13 @@ impl LangFilesDoc {
} else {
None
};
let w = Writer::new(dst, lang, part_size_bytes, comp)?;

// add lang subfolder
let mut subfolder = dst.to_path_buf();
subfolder.push(lang.to_string());
std::fs::create_dir(&subfolder)?;

let w = Writer::new(&subfolder, lang, part_size_bytes, comp)?;

Ok(Arc::new(Mutex::new(w)))
}
Expand Down Expand Up @@ -219,7 +225,7 @@ mod tests {
w.flush().unwrap();
}
let mut read_path = PathBuf::from(dst.path());
read_path.push("en.jsonl");
read_path.push("en/en.jsonl");

let b = File::open(read_path).unwrap();
let doc_from_file: Document = serde_json::from_reader(b).unwrap();
Expand Down Expand Up @@ -252,7 +258,7 @@ mod tests {
}

let mut read_path = PathBuf::from(dst.path());
read_path.push("en.jsonl.zstd");
read_path.push("en/en.jsonl.zstd");

let b = File::open(&read_path).unwrap();
let dec = zstd::decode_all(b).unwrap();
Expand Down
2 changes: 2 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ async fn main() -> Result<(), error::Error> {
p.kenlms_path,
p.split.map(|size_mbytes| size_mbytes * 1_000_000),
p.comp,
#[cfg(feature = "checksum")]
p.checksum,
);
p.run()?;

Expand Down
20 changes: 20 additions & 0 deletions src/pipelines/oscardoc/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,15 @@ pub struct OscarDoc {
kenlms_path: Option<PathBuf>,
split: Option<u64>, // in bytes
comp: bool,
#[cfg(feature = "checksum")]
checksum: bool,
}

#[cfg(feature = "checksum")]
use oscar_tools::Checksum;
// Implements the `Checksum` trait (imported from `oscar_tools`) for `OscarDoc`
// when the `checksum` feature is enabled. The impl body is empty, so nothing
// from the trait is overridden here.
// NOTE(review): the pipeline calls `Self::checksum_folder(&self.dst, 1)` after
// flushing the writers; presumably that is a provided method of this trait —
// confirm against the oscar-tools documentation.
#[cfg(feature = "checksum")]
impl Checksum for OscarDoc {}

impl OscarDoc {
pub fn new(
src: PathBuf,
Expand All @@ -71,6 +78,7 @@ impl OscarDoc {
kenlms_path: Option<PathBuf>,
split: Option<u64>,
comp: bool,
#[cfg(feature = "checksum")] checksum: bool,
) -> Self {
if blocklist.is_none() {
warn!("No blocklist folder specified! No adult content tagging will be done.");
Expand All @@ -85,6 +93,8 @@ impl OscarDoc {
kenlms_path,
split,
comp,
#[cfg(feature = "checksum")]
checksum,
}
}

Expand Down Expand Up @@ -519,6 +529,16 @@ impl Pipeline<()> for OscarDoc {
// flush writers
info!("Flushing writers");
langfiles.flush_all()?;
std::mem::drop(langfiles);

Check warning on line 532 in src/pipelines/oscardoc/pipeline.rs

View check run for this annotation

Codecov / codecov/patch

src/pipelines/oscardoc/pipeline.rs#L530-L532

Added lines #L530 - L532 were not covered by tests
//compute checksums
#[cfg(feature = "checksum")]
{
if self.checksum {
info!("Checksumming");
Self::checksum_folder(&self.dst, 1).unwrap();
}
}

info!("Done");

Check warning on line 542 in src/pipelines/oscardoc/pipeline.rs

View check run for this annotation

Codecov / codecov/patch

src/pipelines/oscardoc/pipeline.rs#L542

Added line #L542 was not covered by tests
Ok(())
}
Expand Down
2 changes: 0 additions & 2 deletions src/pipelines/oscardoc/types/location.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,6 @@ impl Default for Location {

#[cfg(test)]
mod tests {



use super::Location;
use super::LocationBuilder;
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ pub use annotate::Annotate;
pub use annotate::Annotator;
pub use content_detector::ContentDetector;
pub use header::Header;
pub use lsh::LSH;
#[cfg(feature = "kenlm")]
pub use kenlm::AdultDetector;
#[cfg(feature = "kenlm")]
pub use kenlm::AdultDetectorBuilder;
#[cfg(feature = "kenlm")]
pub use kenlm::Models;
pub use lsh::LSH;
pub use noisy::Noisy;
pub use sentence_filter::Conv;
pub use sentence_filter::RemoveShortSentences;
Expand Down
5 changes: 5 additions & 0 deletions tests/oscardoc_rebuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,12 @@ fn gen_corpus() {
let kenlm = Path::new("res/kenlm/").to_path_buf();

//TODO test with custom blocklists
#[cfg(feature = "checksum")]
let pipeline = OscarDoc::new(src, dst, lid, Some(bl), Some(kenlm), None, false, false);

#[cfg(not(feature = "checksum"))]
let pipeline = OscarDoc::new(src, dst, lid, Some(bl), Some(kenlm), None, false);

pipeline.run().expect(
"Ensure to have shards in res/shards, lid.176.bin at root and blocklist at res/blocklist",
);
Expand Down

0 comments on commit 2e80e39

Please sign in to comment.