From ff84806a59cca3390b435054a8ef19dffed05f35 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Sun, 29 Oct 2023 11:55:18 -0700
Subject: [PATCH] Add further unit tests around case folding and charsets

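Add some low-level tests to exercise these functions. Also rename
fold_code_points to add_icase_code_points, and special-case inserting or
replacing a single interval in CodePointSet::add instead of always calling
splice().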
---
 src/codepointset.rs | 220 ++++++++++++++++++++++++++++++++++++++++---
 src/parse.rs        |   2 +-
 src/unicode.rs      | 225 +++++++++++++++++++++++++++++++++-----------
 3 files changed, 381 insertions(+), 66 deletions(-)
diff --git a/src/codepointset.rs b/src/codepointset.rs
index 77632ae..3a2f688 100644
--- a/src/codepointset.rs
+++ b/src/codepointset.rs
@@ -1,23 +1,23 @@
 use crate::util::SliceHelp;
 #[cfg(not(feature = "std"))]
 use alloc::vec::Vec;
-use core::cmp::Ordering;
-use core::iter::once;
+use core::cmp::{self, Ordering};
 
 pub type CodePoint = u32;
 
 /// The maximum (inclusive) code point.
 pub const CODE_POINT_MAX: CodePoint = 0x10FFFF;
 
-/// An list of sorted, inclusive, non-empty ranges of code points.
+/// An inclusive range of code points.
 /// This is more efficient than InclusiveRange because it does not need to carry
 /// around the Option<bool>.
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub struct Interval {
     pub first: CodePoint,
     pub last: CodePoint,
 }
 
+/// Comparison and query helpers for a single inclusive [`Interval`].
 impl Interval {
     /// Return whether self is before rhs.
     fn is_before(self, other: Interval) -> bool {
@@ -34,7 +34,7 @@ impl Interval {
 
     /// Compare two intervals.
     /// Overlapping *or abutting* intervals are considered equal.
-    fn mergecmp(self, rhs: Interval) -> Ordering {
+    fn mergecmp(self, rhs: Interval) -> cmp::Ordering {
         if self.is_strictly_before(rhs) {
             Ordering::Less
         } else if rhs.is_strictly_before(self) {
@@ -81,7 +81,7 @@ fn merge_intervals(x: Interval, y: &Interval) -> Interval {
     }
 }
 
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct CodePointSet {
     ivs: Vec<Interval>,
 }
@@ -92,6 +92,7 @@ impl CodePointSet {
         CodePointSet { ivs: Vec::new() }
     }
 
+    #[inline]
     fn assert_is_well_formed(&self) {
         if cfg!(debug_assertions) {
             for iv in &self.ivs {
@@ -120,6 +121,7 @@ impl CodePointSet {
 
         // Check our work.
         if cfg!(debug_assertions) {
+            debug_assert!(new_iv.first <= new_iv.last);
             for (idx, iv) in self.ivs.iter().enumerate() {
                 if idx < mergeable.start {
                     debug_assert!(iv.is_strictly_before(new_iv));
@@ -132,15 +134,36 @@ impl CodePointSet {
         }
 
         // Merge all the overlapping intervals (possibly none), and then replace the
-        // range.
-        let merged_iv = self.ivs[mergeable.clone()]
-            .iter()
-            .fold(new_iv, merge_intervals);
-        self.ivs.splice(mergeable, once(merged_iv));
+        // range. Tests show that drain(), which modifies the vector, is not effectively
+        // optimized, so try to avoid it in the cases of a new entry or replacing an existing
+        // entry.
+        match mergeable.end - mergeable.start {
+            0 => {
+                // New entry.
+                self.ivs.insert(mergeable.start, new_iv);
+            }
+            1 => {
+                // Replace a single entry.
+                let entry = &mut self.ivs[mergeable.start];
+                *entry = Interval {
+                    first: cmp::min(entry.first, new_iv.first),
+                    last: cmp::max(entry.last, new_iv.last),
+                };
+            }
+            _ => {
+                // Replace range of entries.
+                let merged_iv: Interval = self.ivs[mergeable.clone()]
+                    .iter()
+                    .fold(new_iv, merge_intervals);
+                self.ivs[mergeable.start] = merged_iv;
+                self.ivs.drain(mergeable.start + 1..mergeable.end);
+            }
+        }
         self.assert_is_well_formed();
     }
 
     /// Add a single code point to the set.
+    #[inline]
     pub fn add_one(&mut self, cp: CodePoint) {
         self.add(Interval {
             first: cp,
@@ -206,3 +229,178 @@ impl CodePointSet {
         CodePointSet::from_sorted_disjoint_intervals(inverted_ivs)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn iv(first: u32, last: u32) -> Interval {
+        Interval { first, last }
+    }
+
+    #[test]
+    fn test_is_before() {
+        let a = iv(0, 9);
+        let b = iv(10, 19);
+        assert!(a.is_before(b));
+        assert!(!b.is_before(a));
+    }
+
+    #[test]
+    fn test_is_strictly_before() {
+        let a = iv(0, 9);
+        let b = iv(10, 19);
+        let c = iv(11, 19);
+        assert!(!a.is_strictly_before(b));
+        assert!(a.is_strictly_before(c));
+        assert!(!b.is_strictly_before(a));
+        assert!(!b.is_strictly_before(c));
+    }
+
+    #[test]
+    fn test_mergecmp() {
+        let a = iv(0, 9);
+        let b = iv(10, 19);
+        let c = iv(9, 18);
+        assert_eq!(a.mergecmp(b), Ordering::Equal);
+        assert_eq!(b.mergecmp(a), Ordering::Equal);
+        assert_eq!(a.mergecmp(c), Ordering::Equal);
+        assert_eq!(c.mergecmp(a), Ordering::Equal);
+
+        let d = iv(11, 19);
+        assert_eq!(a.mergecmp(d), Ordering::Less);
+        assert_eq!(d.mergecmp(a), Ordering::Greater);
+        assert_eq!(b.mergecmp(d), Ordering::Equal);
+        assert_eq!(d.mergecmp(b), Ordering::Equal);
+        assert_eq!(c.mergecmp(d), Ordering::Equal);
+        assert_eq!(d.mergecmp(c), Ordering::Equal);
+
+        let e = iv(100, 109);
+        assert_eq!(a.mergecmp(e), Ordering::Less);
+        assert_eq!(e.mergecmp(a), Ordering::Greater);
+    }
+
+    #[test]
+    fn test_mergeable() {
+        let a = iv(0, 9);
+        let b = iv(9, 19);
+        assert!(a.mergeable(a));
+        assert!(a.mergeable(b));
+        assert!(b.mergeable(b));
+    }
+
+    #[test]
+    fn test_contains() {
+        let a = iv(0, 9);
+        assert!(a.contains(0));
+        assert!(a.contains(9));
+        assert!(!a.contains(10));
+    }
+
+    #[test]
+    fn test_overlaps() {
+        let a = iv(0, 9);
+        let b = iv(5, 14);
+        let c = iv(10, 19);
+        assert!(a.overlaps(b));
+        assert!(!a.overlaps(c));
+    }
+
+    #[test]
+    fn test_codepoints() {
+        let a = iv(0, 9);
+        assert_eq!(a.codepoints(), 0..10);
+    }
+
+    #[test]
+    fn test_count_codepoints() {
+        assert_eq!(iv(0, 9).count_codepoints(), 10);
+        assert_eq!(iv(0, 0).count_codepoints(), 1);
+        assert_eq!(
+            iv(0, CODE_POINT_MAX).count_codepoints(),
+            (CODE_POINT_MAX + 1) as usize
+        );
+    }
+
+    #[test]
+    fn test_add() {
+        let mut set = CodePointSet::new();
+        set.add(iv(10, 20));
+        set.add(iv(30, 40));
+        set.add(iv(15, 35));
+        assert_eq!(set.intervals(), &[iv(10, 40)]);
+    }
+
+    #[test]
+    fn test_add_one() {
+        let mut set = CodePointSet::new();
+        set.add_one(10);
+        set.add_one(20);
+        set.add_one(15);
+        assert_eq!(set.intervals(), &[iv(10, 10), iv(15, 15), iv(20, 20)]);
+    }
+
+    #[test]
+    fn test_add_set() {
+        let mut set1 = CodePointSet::new();
+        set1.add(iv(10, 20));
+        set1.add(iv(30, 40));
+        let mut set2 = CodePointSet::new();
+        set2.add(iv(15, 25));
+        set2.add(iv(35, 45));
+        set1.add_set(set2);
+        assert_eq!(set1.intervals(), &[iv(10, 25), iv(30, 45)]);
+    }
+
+    #[test]
+    fn test_inverted() {
+        let mut set = CodePointSet::new();
+        set.add(iv(10, 20));
+        set.add(iv(30, 40));
+        let inverted_set = set.inverted();
+        assert_eq!(
+            inverted_set.intervals(),
+            &[iv(0, 9), iv(21, 29), iv(41, CODE_POINT_MAX)]
+        );
+        let set_again = inverted_set.inverted();
+        assert_eq!(set_again.intervals(), set.intervals());
+
+        assert_eq!(
+            set.inverted_interval_count(),
+            inverted_set.intervals().len()
+        );
+        assert_eq!(
+            inverted_set.inverted_interval_count(),
+            set.intervals().len()
+        );
+    }
+
+    #[test]
+    fn test_adds_torture() {
+        let mut set = CodePointSet::new();
+        set.add(iv(1, 3));
+        assert_eq!(&set.intervals(), &[iv(1, 3)]);
+        set.add(iv(0, 0));
+        assert_eq!(&set.intervals(), &[iv(0, 3)]);
+        set.add(iv(3, 5));
+        assert_eq!(&set.intervals(), &[iv(0, 5)]);
+        set.add(iv(6, 10));
+        assert_eq!(&set.intervals(), &[iv(0, 10)]);
+        set.add(iv(15, 15));
+        assert_eq!(&set.intervals(), &[iv(0, 10), iv(15, 15)]);
+        set.add(iv(12, 14));
+        assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 15)]);
+        set.add(iv(16, 20));
+        assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 20)]);
+        set.add(iv(21, 22));
+        assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 22)]);
+        set.add(iv(23, 23));
+        assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 23)]);
+        set.add(iv(100, 200));
+        assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 23), iv(100, 200)]);
+        set.add(iv(201, 250));
+        assert_eq!(&set.intervals(), &[iv(0, 10), iv(12, 23), iv(100, 250)]);
+        set.add(iv(0, 0x10ffff));
+        assert_eq!(&set.intervals(), &[iv(0, 0x10ffff)]);
+    }
+}
diff --git a/src/parse.rs b/src/parse.rs
index 9ee0817..60b8d60 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -416,7 +416,7 @@ where
                 Some(']') => {
                     self.consume(']');
                     if self.flags.icase {
-                        result.cps = unicode::fold_code_points(result.cps);
+                        result.cps = unicode::add_icase_code_points(result.cps);
                     }
                     return Ok(ir::Node::Bracket(result));
                 }
diff --git a/src/unicode.rs b/src/unicode.rs
index aa9a92f..6056d99 100644
--- a/src/unicode.rs
+++ b/src/unicode.rs
@@ -4,8 +4,6 @@ use crate::util::SliceHelp;
 #[cfg(not(feature = "std"))]
 use alloc::vec::Vec;
 use core::cmp::Ordering;
-#[cfg(test)]
-use std::collections::HashMap;
 
 // CodePointRange packs a code point and a length together into a u32.
 // We currently do not need to store any information about code points in plane 16 (U+100000),
@@ -191,8 +189,9 @@ pub fn fold(cu: u32) -> u32 {
     }
 }
 
+// Add all folded characters in the given interval to the given code point set.
+// This skips characters which fold to themselves.
 fn fold_interval(iv: Interval, recv: &mut CodePointSet) {
-    // Find the range of folds which overlap our interval.
     let overlaps = FOLDS.equal_range_by(|tr| {
         if tr.first() > iv.last {
             Ordering::Greater
@@ -240,6 +239,7 @@ fn unfold_interval(iv: Interval, recv: &mut CodePointSet) {
 
 /// \return all the characters which fold to c's fold.
 /// This is a slow linear search across all ranges.
+/// The result always contains c.
 pub fn unfold_char(c: u32) -> Vec<u32> {
     let mut res = vec![c];
     let fcp = fold(c);
@@ -265,7 +265,7 @@ pub fn unfold_char(c: u32) -> Vec<u32> {
 }
 
 // Fold every character in \p input, then find all the prefolds.
-pub fn fold_code_points(mut input: CodePointSet) -> CodePointSet {
+pub fn add_icase_code_points(mut input: CodePointSet) -> CodePointSet {
     let mut folded = input.clone();
     for iv in input.intervals() {
         fold_interval(*iv, &mut folded)
@@ -350,70 +350,187 @@ pub(crate) fn is_character_class(c: u32, property_escape: &PropertyEscape) -> bo
     }
 }
 
-#[test]
-fn test_folds() {
-    for c in 0..0x41 {
-        assert_eq!(fold(c), c);
-    }
-    for c in 0x41..=0x5A {
-        assert_eq!(fold(c), c + 0x20);
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+
+    // Map from folded char to the chars that folded to it.
+    // If an entry is missing, it means either nothing folds to the char,
+    // or it folds exclusively to itself; this can be determined by comparing
+    // the char to its fold.
+    fn get_unfold_map() -> HashMap<u32, Vec<u32>> {
+        let mut unfold_map: HashMap<u32, Vec<u32>> = HashMap::new();
+        for c in 0..=0x10FFFF {
+            let fc = fold(c);
+            if fc != c {
+                unfold_map.entry(fc).or_insert_with(Vec::new).push(c);
+            }
+        }
+
+        // We neglected self-folds - add them now, but only for entries
+        // where something else folds to it, else our map would be quite large.
+        // Also sort them all.
+        for (&k, v) in unfold_map.iter_mut() {
+            assert_eq!(k, fold(k), "folds should be idempotent");
+            v.push(k);
+            v.sort_unstable();
+        }
+        unfold_map
     }
-    assert_eq!(fold(0xB5), 0x3BC);
-    assert_eq!(fold(0xC0), 0xE0);
 
-    assert_eq!(fold(0x1B8), 0x1B9);
-    assert_eq!(fold(0x1B9), 0x1B9);
-    assert_eq!(fold(0x1BA), 0x1BA);
-    assert_eq!(fold(0x1BB), 0x1BB);
-    assert_eq!(fold(0x1BC), 0x1BD);
-    assert_eq!(fold(0x1BD), 0x1BD);
+    #[test]
+    fn test_folds() {
+        for c in 0..0x41 {
+            assert_eq!(fold(c), c);
+        }
+        for c in 0x41..=0x5A {
+            assert_eq!(fold(c), c + 0x20);
+        }
+        assert_eq!(fold(0xB5), 0x3BC);
+        assert_eq!(fold(0xC0), 0xE0);
+
+        assert_eq!(fold(0x1B8), 0x1B9);
+        assert_eq!(fold(0x1B9), 0x1B9);
+        assert_eq!(fold(0x1BA), 0x1BA);
+        assert_eq!(fold(0x1BB), 0x1BB);
+        assert_eq!(fold(0x1BC), 0x1BD);
+        assert_eq!(fold(0x1BD), 0x1BD);
+
+        for c in 0x1F8..0x21F {
+            if c % 2 == 0 {
+                assert_eq!(fold(c), c + 1);
+            } else {
+                assert_eq!(fold(c), c);
+            }
+        }
 
-    for c in 0x1F8..0x21F {
-        if c % 2 == 0 {
-            assert_eq!(fold(c), c + 1);
-        } else {
+        assert_eq!(fold(0x37F), 0x3F3);
+        assert_eq!(fold(0x380), 0x380);
+        assert_eq!(fold(0x16E40), 0x16E60);
+        assert_eq!(fold(0x16E41), 0x16E61);
+        assert_eq!(fold(0x16E42), 0x16E62);
+        assert_eq!(fold(0x1E900), 0x1E922);
+        assert_eq!(fold(0x1E901), 0x1E923);
+        for c in 0xF0000..=0x10FFFF {
             assert_eq!(fold(c), c);
         }
     }
 
-    assert_eq!(fold(0x37F), 0x3F3);
-    assert_eq!(fold(0x380), 0x380);
-    assert_eq!(fold(0x16E40), 0x16E60);
-    assert_eq!(fold(0x16E41), 0x16E61);
-    assert_eq!(fold(0x16E42), 0x16E62);
-    assert_eq!(fold(0x1E900), 0x1E922);
-    assert_eq!(fold(0x1E901), 0x1E923);
-    for c in 0xF0000..=0x10FFFF {
-        assert_eq!(fold(c), c);
+    #[test]
+    fn test_fold_idempotent() {
+        for c in 0..=0x10FFFF {
+            let fc = fold(c);
+            let ffc = fold(fc);
+            assert_eq!(ffc, fc);
+        }
     }
-}
 
-#[test]
-fn test_fold_idempotent() {
-    for c in 0..=0x10FFFF {
-        let fc = fold(c);
-        let ffc = fold(fc);
-        assert_eq!(ffc, fc);
+    #[test]
+    fn test_unfolds_refold() {
+        for c in 0..=0x10FFFF {
+            let fc = fold(c);
+            let unfolds = unfold_char(c);
+            for uc in unfolds {
+                assert_eq!(fold(uc), fc);
+            }
+        }
     }
-}
 
-#[test]
-fn test_unfold_chars() {
-    // Map from folded char to the chars that folded to it.
-    let mut fold_map: HashMap<u32, Vec<u32>> = HashMap::new();
-    for c in 0..=0x10FFFF {
-        let fc = fold(c);
-        fold_map.entry(fc).or_insert_with(Vec::new).push(c);
+    #[test]
+    fn test_unfold_chars() {
+        let unfold_map = get_unfold_map();
+        for c in 0..=0x10FFFF {
+            let mut unfolded = unfold_char(c);
+            unfolded.sort_unstable();
+            let fc = fold(c);
+            if let Some(expected) = unfold_map.get(&fc) {
+                // Explicit list of unfolds.
+                assert_eq!(&unfolded, expected);
+            } else {
+                // No entry in our testing unfold map: that means that either the
+                // character folds to itself and nothing else does, or the character
+                // folds to a different character - but that different character
+                // should fold to itself (folding is idempotent) so we should always
+                // have multiple characters in that case. Therefore we expect this
+                // character's unfolds to be itself exclusively.
+                assert_eq!(&unfolded, &[c]);
+            }
+        }
     }
 
-    // Sort them all.
-    for v in fold_map.values_mut() {
-        v.sort_unstable();
+    #[test]
+    fn test_add_icase_code_points() {
+        let unfold_map = get_unfold_map();
+        let locs = [
+            0x0, 0x42, 0x100, 0xdeba, 0x11419, 0x278f8, 0x2e000, 0x35df7, 0x462d6, 0x4bc29,
+            0x4f4c0, 0x58a9b, 0x5bafc, 0x62383, 0x66d60, 0x6974a, 0x77628, 0x87804, 0x9262b,
+            0x931e4, 0xaa08c, 0xad7a8, 0xca6b0, 0xcce27, 0xcd897, 0xcf5e7, 0xe2802, 0xe561b,
+            0xe5f43, 0xf4339, 0xfb78c, 0xfc5ee, 0x104fa9, 0x10e402, 0x10e6cf, 0x10FFFF,
+        ];
+        for (idx, &first) in locs.iter().enumerate() {
+            // Keep a running set of the code points we expect after adding the case
+            // variants of every character in the range [first, last].
+            let mut expected = CodePointSet::default();
+            let mut from = first;
+            for &last in &locs[idx..] {
+                // Add both folded and unfolded characters to expected.
+                for c in from..=last {
+                    let fc = fold(c);
+                    if let Some(unfolded) = unfold_map.get(&fc) {
+                        // Some nontrivial set of characters fold to fc.
+                        for &ufc in unfolded {
+                            expected.add_one(ufc);
+                        }
+                    } else {
+                        // Only fc folds to fc.
+                        expected.add_one(fc);
+                    }
+                }
+                let mut input = CodePointSet::new();
+                input.add(Interval { first, last });
+                let folded = add_icase_code_points(input);
+                assert_eq!(folded, expected);
+                from = last;
+            }
+        }
     }
 
-    for c in 0..=0x10FFFF {
-        let mut unfolded = unfold_char(c);
-        unfolded.sort_unstable();
-        assert_eq!(unfolded, fold_map[&fold(c)]);
+    #[test]
+    fn test_fold_interval() {
+        let locs = [
+            0, 0x894, 0x59ac, 0xfa64, 0x10980, 0x12159, 0x16b8d, 0x1aaa2, 0x1f973, 0x1fcd4,
+            0x20c35, 0x23d8a, 0x276af, 0x2c6b8, 0x2fb25, 0x30b9b, 0x338ad, 0x35ab3, 0x38d37,
+            0x3bfa7, 0x3fba6, 0x404c9, 0x44572, 0x480c9, 0x4b5c4, 0x4f371, 0x5a9fa, 0x5ad6c,
+            0x5e395, 0x5f103, 0x5fa98, 0x617fa, 0x6500e, 0x68890, 0x6a3fc, 0x6eab3, 0x704a6,
+            0x70c22, 0x72efb, 0x737cc, 0x76796, 0x79da8, 0x7a450, 0x7b023, 0x7cc5c, 0x82027,
+            0x84ef4, 0x8ac66, 0x8b898, 0x8bd1a, 0x95841, 0x98a48, 0x9e6cd, 0xa035a, 0xa41fb,
+            0xa50e3, 0xa6387, 0xa7ba1, 0xaad9a, 0xabed8, 0xacc88, 0xb2737, 0xb31b1, 0xb6daf,
+            0xb7ff4, 0xba2b4, 0xbde4f, 0xbe38b, 0xbe7a5, 0xc4eb2, 0xc5670, 0xc7703, 0xc995d,
+            0xccb72, 0xcdfe3, 0xcfc99, 0xd09eb, 0xd2773, 0xd357d, 0xd6696, 0xd9aec, 0xdc3fa,
+            0xdc8ae, 0xdc9d5, 0xde31d, 0xe2edb, 0xe652b, 0xe92d5, 0xebf2d, 0xee335, 0xef45f,
+            0xf4280, 0xf74b1, 0xf9ac4, 0xfafca, 0x10208d, 0x107d63, 0x10821e, 0x108818, 0x10911f,
+            0x10b6fd, 0x10FFFF,
+        ];
+        for (idx, &first) in locs.iter().enumerate() {
+            // Keep a running set of the folds we expect fold_interval to produce for
+            // the range [first, last].
+            let mut expected = CodePointSet::default();
+            let mut from = first;
+            for &last in &locs[idx..] {
+                // Add the fold of each character which does not fold to itself.
+                for c in from..=last {
+                    let fc = fold(c);
+                    if fc != c {
+                        expected.add_one(fc);
+                    }
+                }
+                let mut cps = CodePointSet::default();
+                fold_interval(Interval { first, last }, &mut cps);
+                assert_eq!(cps.intervals(), expected.intervals());
+
+                from = last;
+            }
+        }
     }
 }