From ff84806a59cca3390b435054a8ef19dffed05f35 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Sun, 29 Oct 2023 11:55:18 -0700 Subject: [PATCH] Add further unit tests around case folding and charsets Add some low level tests to exercise these functions. --- src/codepointset.rs | 220 ++++++++++++++++++++++++++++++++++++++++--- src/parse.rs | 2 +- src/unicode.rs | 225 +++++++++++++++++++++++++++++++++----------- 3 files changed, 381 insertions(+), 66 deletions(-) diff --git a/src/codepointset.rs b/src/codepointset.rs index 77632ae..3a2f688 100644 --- a/src/codepointset.rs +++ b/src/codepointset.rs @@ -1,23 +1,23 @@ use crate::util::SliceHelp; #[cfg(not(feature = "std"))] use alloc::vec::Vec; -use core::cmp::Ordering; -use core::iter::once; +use core::cmp::{self, Ordering}; pub type CodePoint = u32; /// The maximum (inclusive) code point. pub const CODE_POINT_MAX: CodePoint = 0x10FFFF; -/// An list of sorted, inclusive, non-empty ranges of code points. +/// An inclusive range of code points. /// This is more efficient than InclusiveRange because it does not need to carry /// around the Option. -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct Interval { pub first: CodePoint, pub last: CodePoint, } +/// A list of sorted, inclusive, non-empty ranges of code points. impl Interval { /// Return whether self is before rhs. fn is_before(self, other: Interval) -> bool { @@ -34,7 +34,7 @@ impl Interval { /// Compare two intervals. /// Overlapping *or abutting* intervals are considered equal. 
- fn mergecmp(self, rhs: Interval) -> Ordering { + fn mergecmp(self, rhs: Interval) -> cmp::Ordering { if self.is_strictly_before(rhs) { Ordering::Less } else if rhs.is_strictly_before(self) { @@ -81,7 +81,7 @@ fn merge_intervals(x: Interval, y: &Interval) -> Interval { } } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct CodePointSet { ivs: Vec, } @@ -92,6 +92,7 @@ impl CodePointSet { CodePointSet { ivs: Vec::new() } } + #[inline] fn assert_is_well_formed(&self) { if cfg!(debug_assertions) { for iv in &self.ivs { @@ -120,6 +121,7 @@ impl CodePointSet { // Check our work. if cfg!(debug_assertions) { + debug_assert!(new_iv.first <= new_iv.last); for (idx, iv) in self.ivs.iter().enumerate() { if idx < mergeable.start { debug_assert!(iv.is_strictly_before(new_iv)); @@ -132,15 +134,36 @@ impl CodePointSet { } // Merge all the overlapping intervals (possibly none), and then replace the - // range. - let merged_iv = self.ivs[mergeable.clone()] - .iter() - .fold(new_iv, merge_intervals); - self.ivs.splice(mergeable, once(merged_iv)); + // range. Tests show that drain(), which modifies the vector, is not effectively + // optimized, so try to avoid it in the cases of a new entry or replacing an existing + // entry. + match mergeable.end - mergeable.start { + 0 => { + // New entry. + self.ivs.insert(mergeable.start, new_iv); + } + 1 => { + // Replace a single entry. + let entry = &mut self.ivs[mergeable.start]; + *entry = Interval { + first: cmp::min(entry.first, new_iv.first), + last: cmp::max(entry.last, new_iv.last), + }; + } + _ => { + // Replace range of entries. + let merged_iv: Interval = self.ivs[mergeable.clone()] + .iter() + .fold(new_iv, merge_intervals); + self.ivs[mergeable.start] = merged_iv; + self.ivs.drain(mergeable.start + 1..mergeable.end); + } + } self.assert_is_well_formed(); } /// Add a single code point to the set. 
+ #[inline] pub fn add_one(&mut self, cp: CodePoint) { self.add(Interval { first: cp, @@ -206,3 +229,178 @@ impl CodePointSet { CodePointSet::from_sorted_disjoint_intervals(inverted_ivs) } } + +#[cfg(test)] +mod tests { + use super::*; + + fn iv(first: u32, last: u32) -> Interval { + Interval { first, last } + } + + #[test] + fn test_is_before() { + let a = iv(0, 9); + let b = iv(10, 19); + assert!(a.is_before(b)); + assert!(!b.is_before(a)); + } + + #[test] + fn test_is_strictly_before() { + let a = iv(0, 9); + let b = iv(10, 19); + let c = iv(11, 19); + assert!(!a.is_strictly_before(b)); + assert!(a.is_strictly_before(c)); + assert!(!b.is_strictly_before(a)); + assert!(!b.is_strictly_before(c)); + } + + #[test] + fn test_mergecmp() { + let a = iv(0, 9); + let b = iv(10, 19); + let c = iv(9, 18); + assert_eq!(a.mergecmp(b), Ordering::Equal); + assert_eq!(b.mergecmp(a), Ordering::Equal); + assert_eq!(a.mergecmp(c), Ordering::Equal); + assert_eq!(c.mergecmp(a), Ordering::Equal); + + let d = iv(11, 19); + assert_eq!(a.mergecmp(d), Ordering::Less); + assert_eq!(d.mergecmp(a), Ordering::Greater); + assert_eq!(b.mergecmp(d), Ordering::Equal); + assert_eq!(d.mergecmp(b), Ordering::Equal); + assert_eq!(c.mergecmp(d), Ordering::Equal); + assert_eq!(d.mergecmp(c), Ordering::Equal); + + let e = iv(100, 109); + assert_eq!(a.mergecmp(e), Ordering::Less); + assert_eq!(e.mergecmp(a), Ordering::Greater); + } + + #[test] + fn test_mergeable() { + let a = iv(0, 9); + let b = iv(9, 19); + assert!(a.mergeable(a)); + assert!(a.mergeable(b)); + assert!(b.mergeable(b)); + } + + #[test] + fn test_contains() { + let a = iv(0, 9); + assert!(a.contains(0)); + assert!(a.contains(9)); + assert!(!a.contains(10)); + } + + #[test] + fn test_overlaps() { + let a = iv(0, 9); + let b = iv(5, 14); + let c = iv(10, 19); + assert!(a.overlaps(b)); + assert!(!a.overlaps(c)); + } + + #[test] + fn test_codepoints() { + let a = iv(0, 9); + assert_eq!(a.codepoints(), 0..10); + } + + #[test] + fn 
test_count_codepoints() { + assert_eq!(iv(0, 9).count_codepoints(), 10); + assert_eq!(iv(0, 0).count_codepoints(), 1); + assert_eq!( + iv(0, CODE_POINT_MAX).count_codepoints(), + (CODE_POINT_MAX + 1) as usize + ); + } + + #[test] + fn test_add() { + let mut set = CodePointSet::new(); + set.add(iv(10, 20)); + set.add(iv(30, 40)); + set.add(iv(15, 35)); + assert_eq!(set.intervals(), &[iv(10, 40)]); + } + + #[test] + fn test_add_one() { + let mut set = CodePointSet::new(); + set.add_one(10); + set.add_one(20); + set.add_one(15); + assert_eq!(set.intervals(), &[iv(10, 10), iv(15, 15), iv(20, 20)]); + } + + #[test] + fn test_add_set() { + let mut set1 = CodePointSet::new(); + set1.add(iv(10, 20)); + set1.add(iv(30, 40)); + let mut set2 = CodePointSet::new(); + set2.add(iv(15, 25)); + set2.add(iv(35, 45)); + set1.add_set(set2); + assert_eq!(set1.intervals(), &[iv(10, 25), iv(30, 45)]); + } + + #[test] + fn test_inverted() { + let mut set = CodePointSet::new(); + set.add(iv(10, 20)); + set.add(iv(30, 40)); + let inverted_set = set.inverted(); + assert_eq!( + inverted_set.intervals(), + &[iv(0, 9), iv(21, 29), iv(41, CODE_POINT_MAX)] + ); + let set_again = inverted_set.inverted(); + assert_eq!(set_again.intervals(), set.intervals()); + + assert_eq!( + set.inverted_interval_count(), + inverted_set.intervals().len() + ); + assert_eq!( + inverted_set.inverted_interval_count(), + set.intervals().len() + ); + } + + #[test] + fn test_adds_torture() { + let mut set = CodePointSet::new(); + set.add(iv(1, 3)); + assert_eq!(&set.intervals(), &[iv(1, 3)]); + set.add(iv(0, 0)); + assert_eq!(&set.intervals(), &[iv(0, 3)]); + set.add(iv(3, 5)); + assert_eq!(&set.intervals(), &[iv(0, 5)]); + set.add(iv(6, 10)); + assert_eq!(&set.intervals(), &[iv(0, 10)]); + set.add(iv(15, 15)); + assert_eq!(&set.intervals(), &[iv(0, 10), iv(15, 15)]); + set.add(iv(12, 14)); + assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 15)]); + set.add(iv(16, 20)); + assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 
20)]); + set.add(iv(21, 22)); + assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 22)]); + set.add(iv(23, 23)); + assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 23)]); + set.add(iv(100, 200)); + assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 23), iv(100, 200)]); + set.add(iv(201, 250)); + assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 23), iv(100, 250)]); + set.add(iv(0, 0x10ffff)); + assert_eq!(&set.intervals(), &[iv(0, 0x10ffff)]); + } +} diff --git a/src/parse.rs b/src/parse.rs index 9ee0817..60b8d60 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -416,7 +416,7 @@ where Some(']') => { self.consume(']'); if self.flags.icase { - result.cps = unicode::fold_code_points(result.cps); + result.cps = unicode::add_icase_code_points(result.cps); } return Ok(ir::Node::Bracket(result)); } diff --git a/src/unicode.rs b/src/unicode.rs index aa9a92f..6056d99 100644 --- a/src/unicode.rs +++ b/src/unicode.rs @@ -4,8 +4,6 @@ use crate::util::SliceHelp; #[cfg(not(feature = "std"))] use alloc::vec::Vec; use core::cmp::Ordering; -#[cfg(test)] -use std::collections::HashMap; // CodePointRange packs a code point and a length together into a u32. // We currently do not need to store any information about code points in plane 16 (U+100000), @@ -191,8 +189,9 @@ pub fn fold(cu: u32) -> u32 { } } +// Add all folded characters in the given interval to the given code point set. +// This skips characters which fold to themselves. fn fold_interval(iv: Interval, recv: &mut CodePointSet) { - // Find the range of folds which overlap our interval. let overlaps = FOLDS.equal_range_by(|tr| { if tr.first() > iv.last { Ordering::Greater @@ -240,6 +239,7 @@ fn unfold_interval(iv: Interval, recv: &mut CodePointSet) { /// \return all the characters which fold to c's fold. /// This is a slow linear search across all ranges. +/// The result always contains c. 
pub fn unfold_char(c: u32) -> Vec { let mut res = vec![c]; let fcp = fold(c); @@ -265,7 +265,7 @@ pub fn unfold_char(c: u32) -> Vec { } // Fold every character in \p input, then find all the prefolds. -pub fn fold_code_points(mut input: CodePointSet) -> CodePointSet { +pub fn add_icase_code_points(mut input: CodePointSet) -> CodePointSet { let mut folded = input.clone(); for iv in input.intervals() { fold_interval(*iv, &mut folded) @@ -350,70 +350,187 @@ pub(crate) fn is_character_class(c: u32, property_escape: &PropertyEscape) -> bo } } -#[test] -fn test_folds() { - for c in 0..0x41 { - assert_eq!(fold(c), c); - } - for c in 0x41..=0x5A { - assert_eq!(fold(c), c + 0x20); +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + // Map from folded char to the chars that folded to it. + // If an entry is missing, it means either nothing folds to the char, + // or it folds exclusively to itself; this can be determined by comparing + // the char to its fold. + fn get_unfold_map() -> HashMap> { + let mut unfold_map: HashMap> = HashMap::new(); + for c in 0..=0x10FFFF { + let fc = fold(c); + if fc != c { + unfold_map.entry(fc).or_insert_with(Vec::new).push(c); + } + } + + // We neglected self-folds - add them now, but only for entries + // where something else folds to it, else our map would be quite large. + // Also sort them all. 
+ for (&k, v) in unfold_map.iter_mut() { + assert_eq!(k, fold(k), "folds should be idempotent"); + v.push(k); + v.sort_unstable(); + } + unfold_map } - assert_eq!(fold(0xB5), 0x3BC); - assert_eq!(fold(0xC0), 0xE0); - assert_eq!(fold(0x1B8), 0x1B9); - assert_eq!(fold(0x1B9), 0x1B9); - assert_eq!(fold(0x1BA), 0x1BA); - assert_eq!(fold(0x1BB), 0x1BB); - assert_eq!(fold(0x1BC), 0x1BD); - assert_eq!(fold(0x1BD), 0x1BD); + #[test] + fn test_folds() { + for c in 0..0x41 { + assert_eq!(fold(c), c); + } + for c in 0x41..=0x5A { + assert_eq!(fold(c), c + 0x20); + } + assert_eq!(fold(0xB5), 0x3BC); + assert_eq!(fold(0xC0), 0xE0); + + assert_eq!(fold(0x1B8), 0x1B9); + assert_eq!(fold(0x1B9), 0x1B9); + assert_eq!(fold(0x1BA), 0x1BA); + assert_eq!(fold(0x1BB), 0x1BB); + assert_eq!(fold(0x1BC), 0x1BD); + assert_eq!(fold(0x1BD), 0x1BD); + + for c in 0x1F8..0x21F { + if c % 2 == 0 { + assert_eq!(fold(c), c + 1); + } else { + assert_eq!(fold(c), c); + } + } - for c in 0x1F8..0x21F { - if c % 2 == 0 { - assert_eq!(fold(c), c + 1); - } else { + assert_eq!(fold(0x37F), 0x3F3); + assert_eq!(fold(0x380), 0x380); + assert_eq!(fold(0x16E40), 0x16E60); + assert_eq!(fold(0x16E41), 0x16E61); + assert_eq!(fold(0x16E42), 0x16E62); + assert_eq!(fold(0x1E900), 0x1E922); + assert_eq!(fold(0x1E901), 0x1E923); + for c in 0xF0000..=0x10FFFF { assert_eq!(fold(c), c); } } - assert_eq!(fold(0x37F), 0x3F3); - assert_eq!(fold(0x380), 0x380); - assert_eq!(fold(0x16E40), 0x16E60); - assert_eq!(fold(0x16E41), 0x16E61); - assert_eq!(fold(0x16E42), 0x16E62); - assert_eq!(fold(0x1E900), 0x1E922); - assert_eq!(fold(0x1E901), 0x1E923); - for c in 0xF0000..=0x10FFFF { - assert_eq!(fold(c), c); + #[test] + fn test_fold_idempotent() { + for c in 0..=0x10FFFF { + let fc = fold(c); + let ffc = fold(fc); + assert_eq!(ffc, fc); + } } -} -#[test] -fn test_fold_idempotent() { - for c in 0..=0x10FFFF { - let fc = fold(c); - let ffc = fold(fc); - assert_eq!(ffc, fc); + #[test] + fn test_unfolds_refold() { + for c in 
0..=0x10FFFF { + let fc = fold(c); + let unfolds = unfold_char(c); + for uc in unfolds { + assert_eq!(fold(uc), fc); + } + } } -} -#[test] -fn test_unfold_chars() { - // Map from folded char to the chars that folded to it. - let mut fold_map: HashMap> = HashMap::new(); - for c in 0..=0x10FFFF { - let fc = fold(c); - fold_map.entry(fc).or_insert_with(Vec::new).push(c); + #[test] + fn test_unfold_chars() { + let unfold_map = get_unfold_map(); + for c in 0..=0x10FFFF { + let mut unfolded = unfold_char(c); + unfolded.sort_unstable(); + let fc = fold(c); + if let Some(expected) = unfold_map.get(&fc) { + // Explicit list of unfolds. + assert_eq!(&unfolded, expected); + } else { + // No entry in our testing unfold map: that means that either the + // character folds to itself and nothing else does, or the character + // folds to a different character - but that different character + // should fold to itself (folding is idempotent) so we should always + // have multiple characters in that case. Therefore we expect this + // character's unfolds to be itself exclusively. + assert_eq!(&unfolded, &[c]); + } + } } - // Sort them all. - for v in fold_map.values_mut() { - v.sort_unstable(); + #[test] + fn test_add_icase_code_points() { + let unfold_map = get_unfold_map(); + let locs = [ + 0x0, 0x42, 0x100, 0xdeba, 0x11419, 0x278f8, 0x2e000, 0x35df7, 0x462d6, 0x4bc29, + 0x4f4c0, 0x58a9b, 0x5bafc, 0x62383, 0x66d60, 0x6974a, 0x77628, 0x87804, 0x9262b, + 0x931e4, 0xaa08c, 0xad7a8, 0xca6b0, 0xcce27, 0xcd897, 0xcf5e7, 0xe2802, 0xe561b, + 0xe5f43, 0xf4339, 0xfb78c, 0xfc5ee, 0x104fa9, 0x10e402, 0x10e6cf, 0x10FFFF, + ]; + for (idx, &first) in locs.iter().enumerate() { + // Keep a running set of the unfolded code points we expect to be in the + // range [first, last]. + let mut expected = CodePointSet::default(); + let mut from = first; + for &last in &locs[idx..] { + // Add both folded and unfolded characters to expected. 
+ for c in from..=last { + let fc = fold(c); + if let Some(unfolded) = unfold_map.get(&fc) { + // Some nontrivial set of characters fold to fc. + for &ufc in unfolded { + expected.add_one(ufc); + } + } else { + // Only fc folds to fc. + expected.add_one(fc); + } + } + let mut input = CodePointSet::new(); + input.add(Interval { first, last }); + let folded = add_icase_code_points(input); + assert_eq!(folded, expected); + from = last; + } + } } - for c in 0..=0x10FFFF { - let mut unfolded = unfold_char(c); - unfolded.sort_unstable(); - assert_eq!(unfolded, fold_map[&fold(c)]); + #[test] + fn test_fold_interval() { + let locs = [ + 0, 0x894, 0x59ac, 0xfa64, 0x10980, 0x12159, 0x16b8d, 0x1aaa2, 0x1f973, 0x1fcd4, + 0x20c35, 0x23d8a, 0x276af, 0x2c6b8, 0x2fb25, 0x30b9b, 0x338ad, 0x35ab3, 0x38d37, + 0x3bfa7, 0x3fba6, 0x404c9, 0x44572, 0x480c9, 0x4b5c4, 0x4f371, 0x5a9fa, 0x5ad6c, + 0x5e395, 0x5f103, 0x5fa98, 0x617fa, 0x6500e, 0x68890, 0x6a3fc, 0x6eab3, 0x704a6, + 0x70c22, 0x72efb, 0x737cc, 0x76796, 0x79da8, 0x7a450, 0x7b023, 0x7cc5c, 0x82027, + 0x84ef4, 0x8ac66, 0x8b898, 0x8bd1a, 0x95841, 0x98a48, 0x9e6cd, 0xa035a, 0xa41fb, + 0xa50e3, 0xa6387, 0xa7ba1, 0xaad9a, 0xabed8, 0xacc88, 0xb2737, 0xb31b1, 0xb6daf, + 0xb7ff4, 0xba2b4, 0xbde4f, 0xbe38b, 0xbe7a5, 0xc4eb2, 0xc5670, 0xc7703, 0xc995d, + 0xccb72, 0xcdfe3, 0xcfc99, 0xd09eb, 0xd2773, 0xd357d, 0xd6696, 0xd9aec, 0xdc3fa, + 0xdc8ae, 0xdc9d5, 0xde31d, 0xe2edb, 0xe652b, 0xe92d5, 0xebf2d, 0xee335, 0xef45f, + 0xf4280, 0xf74b1, 0xf9ac4, 0xfafca, 0x10208d, 0x107d63, 0x10821e, 0x108818, 0x10911f, + 0x10b6fd, 0x10FFFF, + ]; + for (idx, &first) in locs.iter().enumerate() { + // Keep a running set of the folded code points we expect to be in the + // range [first, last]. + let mut expected = CodePointSet::default(); + let mut from = first; + for &last in &locs[idx..] { + // Add characters to expected which do not fold to themselves. 
+ for c in from..=last { + let fc = fold(c); + if fc != c { + expected.add_one(fc); + } + } + let mut cps = CodePointSet::default(); + fold_interval(Interval { first, last }, &mut cps); + assert_eq!(cps.intervals(), expected.intervals()); + + from = last; + } + } } }