ridiculousfish · raskad · Jan 7, 2024 · Dec 16, 2023 · Dec 16, 2023 · Jan 7, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -29,6 +29,9 @@ index-positions = []
 # Prohibits all uses of unsafe code, for the paranoid.
 prohibit-unsafe = []
 
+# Enables UTF-16 support. This disables some optimizations, so it should only be used when necessary.
+utf16 = []
+
 [dependencies]
 hashbrown = "0.13.2"
 memchr = { version = "2.4.0", default-features = false }
diff --git a/src/api.rs b/src/api.rs
@@ -6,6 +6,12 @@ use crate::insn::CompiledRegex;
 use crate::optimizer;
 use crate::parse;
 
+#[cfg(feature = "utf16")]
+use crate::{
+    classicalbacktrack::MatchAttempter,
+    indexing::{InputIndexer, Ucs2Input, Utf16Input},
+};
+
 #[cfg(feature = "backend-pikevm")]
 use crate::pikevm;
 use crate::util::to_char_sat;
@@ -377,6 +383,42 @@ impl Regex {
     pub fn find_from_ascii<'r, 't>(&'r self, text: &'t str, start: usize) -> AsciiMatches<'r, 't> {
         backends::find(self, text, start)
     }
+
+    /// Returns an iterator over the matches found in `text` (treated as UTF-16), starting at index `start`.
+    #[cfg(feature = "utf16")]
+    pub fn find_from_utf16<'r, 't>(
+        &'r self,
+        text: &'t [u16],
+        start: usize,
+    ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf16Input<'t>>>
+    {
+        let input = Utf16Input::new(text);
+        exec::Matches::new(
+            super::classicalbacktrack::BacktrackExecutor::new(
+                input,
+                MatchAttempter::new(&self.cr, input.left_end()),
+            ),
+            start,
+        )
+    }
+
+    /// Returns an iterator over the matches found in `text` (treated as UCS-2), starting at index `start`.
+    #[cfg(feature = "utf16")]
+    pub fn find_from_ucs2<'r, 't>(
+        &'r self,
+        text: &'t [u16],
+        start: usize,
+    ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Ucs2Input<'t>>>
+    {
+        let input = Ucs2Input::new(text);
+        exec::Matches::new(
+            super::classicalbacktrack::BacktrackExecutor::new(
+                input,
+                MatchAttempter::new(&self.cr, input.left_end()),
+            ),
+            start,
+        )
+    }
 }
 
 impl FromStr for Regex {

diff --git a/src/classicalbacktrack.rs b/src/classicalbacktrack.rs
@@ -7,7 +7,9 @@ use crate::cursor::{Backward, Direction, Forward};
 use crate::exec;
 use crate::indexing;
 use crate::indexing::{AsciiInput, ElementType, InputIndexer, Utf8Input};
-use crate::insn::{CompiledRegex, Insn, LoopFields, StartPredicate};
+#[cfg(not(feature = "utf16"))]
+use crate::insn::StartPredicate;
+use crate::insn::{CompiledRegex, Insn, LoopFields};
 use crate::matchers;
 use crate::matchers::CharProperties;
 use crate::position::PositionType;
@@ -65,14 +67,14 @@ struct State<Position: PositionType> {
 }
 
 #[derive(Debug)]
-struct MatchAttempter<'a, Input: InputIndexer> {
+pub(crate) struct MatchAttempter<'a, Input: InputIndexer> {
     re: &'a CompiledRegex,
     bts: Vec<BacktrackInsn<Input>>,
     s: State<Input::Position>,
 }
 
 impl<'a, Input: InputIndexer> MatchAttempter<'a, Input> {
-    fn new(re: &'a CompiledRegex, entry: Input::Position) -> Self {
+    pub(crate) fn new(re: &'a CompiledRegex, entry: Input::Position) -> Self {
         Self {
             re,
             bts: vec![BacktrackInsn::Exhausted],
@@ -647,9 +649,11 @@ impl<'a, Input: InputIndexer> MatchAttempter<'a, Input> {
                         next_or_bt!(scm::MatchAny::new().matches(input, dir, &mut pos))
                     }
 
-                    Insn::MatchAnyExceptLineTerminator => next_or_bt!(
-                        scm::MatchAnyExceptLineTerminator::new().matches(input, dir, &mut pos)
-                    ),
+                    Insn::MatchAnyExceptLineTerminator => {
+                        next_or_bt!(
+                            scm::MatchAnyExceptLineTerminator::new().matches(input, dir, &mut pos)
+                        )
+                    }
 
                     &Insn::WordBoundary { invert } => {
                         // Copy the positions since these destructively move them.
@@ -906,6 +910,13 @@ pub struct BacktrackExecutor<'r, Input: InputIndexer> {
     matcher: MatchAttempter<'r, Input>,
 }
 
+#[cfg(feature = "utf16")]
+impl<'r, Input: InputIndexer> BacktrackExecutor<'r, Input> {
+    pub(crate) fn new(input: Input, matcher: MatchAttempter<'r, Input>) -> Self {
+        Self { input, matcher }
+    }
+}
+
 impl<'r, Input: InputIndexer> BacktrackExecutor<'r, Input> {
     fn successful_match(&mut self, start: Input::Position, end: Input::Position) -> Match {
         // We want to simultaneously map our groups to offsets, and clear the groups.
@@ -970,6 +981,11 @@ impl<'a, Input: InputIndexer> exec::MatchProducer for BacktrackExecutor<'a, Inpu
         pos: Input::Position,
         next_start: &mut Option<Input::Position>,
     ) -> Option<Match> {
+        // When UTF-16 support is enabled, prefix search is not used because of the different encoding.
+        #[cfg(feature = "utf16")]
+        return self.next_match_with_prefix_search(pos, next_start, &bytesearch::EmptyString {});
+
+        #[cfg(not(feature = "utf16"))]
         match &self.matcher.re.start_pred {
             StartPredicate::Arbitrary => {
                 self.next_match_with_prefix_search(pos, next_start, &bytesearch::EmptyString {})

diff --git a/src/cursor.rs b/src/cursor.rs
@@ -28,30 +28,6 @@ impl Direction for Backward {
     }
 }
 
-/// \return a slice of bytes of length \p len starting (or ending if not FORWARD) at \p pos.
-/// Advance (retreat) pos by that many bytes.
-#[inline(always)]
-fn try_slice<'a, Input: InputIndexer, Dir: Direction>(
-    input: &'a Input,
-    _dir: Dir,
-    pos: &mut Input::Position,
-    len: usize,
-) -> Option<&'a [u8]> {
-    // Note we may exit here if there's not enough bytes remaining.
-    let start;
-    let end;
-    if Dir::FORWARD {
-        start = *pos;
-        end = input.try_move_right(start, len)?;
-        *pos = end;
-    } else {
-        end = *pos;
-        start = input.try_move_left(end, len)?;
-        *pos = start;
-    }
-    Some(input.slice(start, end))
-}
-
 /// \return whether we match some literal bytes.
 /// If so, update the position. If not, the position is unspecified.
 #[inline(always)]
@@ -61,30 +37,7 @@ pub fn try_match_lit<Input: InputIndexer, Dir: Direction, Bytes: ByteSeq>(
     pos: &mut Input::Position,
     bytes: &Bytes,
 ) -> bool {
-    let len = Bytes::LENGTH;
-    debug_assert!(len > 0, "Should not have zero length");
-    if let Some(subr_slice) = try_slice(input, dir, pos, len) {
-        bytes.equals_known_len(subr_slice)
-    } else {
-        false
-    }
-}
-
-/// If the subrange [start, end) is byte-for-byte equal to a range of the same length starting (if FORWARD is true) or ending (if FORWARD
-/// is false) at \p pos, then return true and then advance (or retreat) the position.
-/// On failure, return false and the position is unspecified.
-pub fn subrange_eq<Input: InputIndexer, Dir: Direction>(
-    input: &Input,
-    dir: Dir,
-    pos: &mut Input::Position,
-    start: Input::Position,
-    end: Input::Position,
-) -> bool {
-    if let Some(subr_slice) = try_slice(input, dir, pos, end - start) {
-        subr_slice == input.slice(start, end)
-    } else {
-        false
-    }
+    input.match_bytes(dir, pos, bytes)
 }
 
 /// \return the next character, updating the position.