diff --git a/Cargo.toml b/Cargo.toml index e29e2b2..6ca7c1f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,9 @@ index-positions = [] # Prohibits all uses of unsafe code, for the paranoid. prohibit-unsafe = [] +# Enables UTF-16 support. This disables some optimizations, so it should only be used when necessary. +utf16 = [] + [dependencies] hashbrown = "0.13.2" memchr = { version = "2.4.0", default-features = false } diff --git a/src/api.rs b/src/api.rs index 60214aa..39e701f 100644 --- a/src/api.rs +++ b/src/api.rs @@ -6,6 +6,12 @@ use crate::insn::CompiledRegex; use crate::optimizer; use crate::parse; +#[cfg(feature = "utf16")] +use crate::{ + classicalbacktrack::MatchAttempter, + indexing::{InputIndexer, Ucs2Input, Utf16Input}, +}; + #[cfg(feature = "backend-pikevm")] use crate::pikevm; use crate::util::to_char_sat; @@ -377,6 +383,42 @@ impl Regex { pub fn find_from_ascii<'r, 't>(&'r self, text: &'t str, start: usize) -> AsciiMatches<'r, 't> { backends::find(self, text, start) } + + /// Returns an iterator for matches found in 'text' starting at index `start`. + #[cfg(feature = "utf16")] + pub fn find_from_utf16<'r, 't>( + &'r self, + text: &'t [u16], + start: usize, + ) -> exec::Matches>> + { + let input = Utf16Input::new(text); + exec::Matches::new( + super::classicalbacktrack::BacktrackExecutor::new( + input, + MatchAttempter::new(&self.cr, input.left_end()), + ), + start, + ) + } + + /// Returns an iterator for matches found in 'text' starting at index `start`. + #[cfg(feature = "utf16")] + pub fn find_from_ucs2<'r, 't>( + &'r self, + text: &'t [u16], + start: usize, + ) -> exec::Matches>> + { + let input = Ucs2Input::new(text); + exec::Matches::new( + super::classicalbacktrack::BacktrackExecutor::new( + input, + MatchAttempter::new(&self.cr, input.left_end()), + ), + start, + ) + } } impl FromStr for Regex { diff --git a/src/classicalbacktrack.rs b/src/classicalbacktrack.rs index 8858efb..312af97 100644 --- a/src/classicalbacktrack.rs +++ b/src/classicalbacktrack.rs @@ -7,7 +7,9 @@ use crate::cursor::{Backward, Direction, Forward}; use crate::exec; use crate::indexing; use crate::indexing::{AsciiInput, ElementType, InputIndexer, Utf8Input}; -use crate::insn::{CompiledRegex, Insn, LoopFields, StartPredicate}; +#[cfg(not(feature = "utf16"))] +use crate::insn::StartPredicate; +use crate::insn::{CompiledRegex, Insn, LoopFields}; use crate::matchers; use crate::matchers::CharProperties; use crate::position::PositionType; @@ -65,14 +67,14 @@ struct State { } #[derive(Debug)] -struct MatchAttempter<'a, Input: InputIndexer> { +pub(crate) struct MatchAttempter<'a, Input: InputIndexer> { re: &'a CompiledRegex, bts: Vec>, s: State, } impl<'a, Input: InputIndexer> MatchAttempter<'a, Input> { - fn new(re: &'a CompiledRegex, entry: Input::Position) -> Self { + pub(crate) fn new(re: &'a CompiledRegex, entry: Input::Position) -> Self { Self { re, bts: vec![BacktrackInsn::Exhausted], @@ -647,9 +649,11 @@ impl<'a, Input: InputIndexer> MatchAttempter<'a, Input> { next_or_bt!(scm::MatchAny::new().matches(input, dir, &mut pos)) } - Insn::MatchAnyExceptLineTerminator => next_or_bt!( - scm::MatchAnyExceptLineTerminator::new().matches(input, dir, &mut pos) - ), + Insn::MatchAnyExceptLineTerminator => { + next_or_bt!( + scm::MatchAnyExceptLineTerminator::new().matches(input, dir, &mut pos) + ) + } &Insn::WordBoundary { invert } => { // Copy the positions since these destructively move them. @@ -906,6 +910,13 @@ pub struct BacktrackExecutor<'r, Input: InputIndexer> { matcher: MatchAttempter<'r, Input>, } +#[cfg(feature = "utf16")] +impl<'r, Input: InputIndexer> BacktrackExecutor<'r, Input> { + pub(crate) fn new(input: Input, matcher: MatchAttempter<'r, Input>) -> Self { + Self { input, matcher } + } +} + impl<'r, Input: InputIndexer> BacktrackExecutor<'r, Input> { fn successful_match(&mut self, start: Input::Position, end: Input::Position) -> Match { // We want to simultaneously map our groups to offsets, and clear the groups. @@ -970,6 +981,11 @@ impl<'a, Input: InputIndexer> exec::MatchProducer for BacktrackExecutor<'a, Inpu pos: Input::Position, next_start: &mut Option, ) -> Option { + // When UTF-16 support is active prefix search is not used due to the different encoding. + #[cfg(feature = "utf16")] + return self.next_match_with_prefix_search(pos, next_start, &bytesearch::EmptyString {}); + + #[cfg(not(feature = "utf16"))] match &self.matcher.re.start_pred { StartPredicate::Arbitrary => { self.next_match_with_prefix_search(pos, next_start, &bytesearch::EmptyString {}) diff --git a/src/cursor.rs b/src/cursor.rs index 92d64f2..e6dc5fd 100644 --- a/src/cursor.rs +++ b/src/cursor.rs @@ -28,30 +28,6 @@ impl Direction for Backward { } } -/// \return a slice of bytes of length \p len starting (or ending if not FORWARD) at \p pos. -/// Advance (retreat) pos by that many bytes. -#[inline(always)] -fn try_slice<'a, Input: InputIndexer, Dir: Direction>( - input: &'a Input, - _dir: Dir, - pos: &mut Input::Position, - len: usize, -) -> Option<&'a [u8]> { - // Note we may exit here if there's not enough bytes remaining. - let start; - let end; - if Dir::FORWARD { - start = *pos; - end = input.try_move_right(start, len)?; - *pos = end; - } else { - end = *pos; - start = input.try_move_left(end, len)?; - *pos = start; - } - Some(input.slice(start, end)) -} - /// \return whether we match some literal bytes. /// If so, update the position. If not, the position is unspecified. #[inline(always)] @@ -61,30 +37,7 @@ pub fn try_match_lit( pos: &mut Input::Position, bytes: &Bytes, ) -> bool { - let len = Bytes::LENGTH; - debug_assert!(len > 0, "Should not have zero length"); - if let Some(subr_slice) = try_slice(input, dir, pos, len) { - bytes.equals_known_len(subr_slice) - } else { - false - } -} - -/// If the subrange [start, end) is byte-for-byte equal to a range of the same length starting (if FORWARD is true) or ending (if FORWARD -/// is false) at \p pos, then return true and then advance (or retreat) the position. -/// On failure, return false and the position is unspecified. -pub fn subrange_eq( - input: &Input, - dir: Dir, - pos: &mut Input::Position, - start: Input::Position, - end: Input::Position, -) -> bool { - if let Some(subr_slice) = try_slice(input, dir, pos, end - start) { - subr_slice == input.slice(start, end) - } else { - false - } + input.match_bytes(dir, pos, bytes) } /// \return the next character, updating the position. diff --git a/src/indexing.rs b/src/indexing.rs index 1921eea..b3f8fd2 100644 --- a/src/indexing.rs +++ b/src/indexing.rs @@ -1,8 +1,12 @@ -use crate::bytesearch; +use crate::bytesearch::{self, ByteSeq}; +use crate::cursor::Direction; use crate::matchers; +#[cfg(feature = "utf16")] +use crate::position::IndexPosition; use crate::position::{DefPosition, PositionType}; use crate::util::{is_utf8_continuation, utf8_w2, utf8_w3, utf8_w4}; use core::convert::TryInto; +use core::ops::Range; use core::{ops, str}; // A type which may be an Element. @@ -15,9 +19,6 @@ pub trait ElementType: + core::convert::Into + core::convert::TryFrom { - /// Return the length of ourself in bytes. - fn bytelength(self) -> usize; - /// Return another ElementType as self. #[inline(always)] fn try_from(v: Elem) -> Option { @@ -32,26 +33,11 @@ pub trait ElementType: } } -impl ElementType for u32 { - #[inline(always)] - fn bytelength(self) -> usize { - 4 - } -} +impl ElementType for char {} -impl ElementType for char { - #[inline(always)] - fn bytelength(self) -> usize { - self.len_utf8() - } -} +impl ElementType for u8 {} -impl ElementType for u8 { - #[inline(always)] - fn bytelength(self) -> usize { - 1 - } -} +impl ElementType for u32 {} // A helper type that holds a string and allows indexing into it. pub trait InputIndexer: core::fmt::Debug + Copy + Clone @@ -67,17 +53,6 @@ where /// A type which references a position in the input string. type Position: PositionType; - /// \return the byte contents. - fn contents(&self) -> &[u8]; - - /// \return the length of the contents, in bytes. - fn bytelength(&self) -> usize { - self.contents().len() - } - - /// \return a slice of the contents. - fn slice(&self, start: Self::Position, end: Self::Position) -> &[u8]; - /// \return a sub-input. Note that positions in the original may no longer be valid in the sub-input. fn subinput(&self, range: ops::Range) -> Self; @@ -139,6 +114,25 @@ where fn peek_left(&self, mut pos: Self::Position) -> Option { self.next_left(&mut pos) } + + /// Check if the subrange `range` is byte-for-byte equal to a range of the same length from the current position `pos`. + /// If `dir` is FORWARD, then the range is checked starting at `pos` and ending at `pos + range.len()`. + /// If `dir` is BACKWARD, then the range is checked starting at `pos - range.len()` and ending at `pos`. + fn subrange_eq( + &self, + dir: Dir, + pos: &mut Self::Position, + range: Range, + ) -> bool; + + /// Return whether we match some literal bytes. + /// If so, update the position. If not, the position is unspecified. + fn match_bytes( + &self, + dir: Dir, + pos: &mut Self::Position, + bytes: &Bytes, + ) -> bool; } /// \return the length of a UTF8 sequence starting with this byte. @@ -170,6 +164,39 @@ pub struct Utf8Input<'a> { } impl<'a> Utf8Input<'a> { + #[inline(always)] + fn contents(&self) -> &[u8] { + self.input.as_bytes() + } + + #[inline(always)] + fn bytelength(&self) -> usize { + self.input.as_bytes().len() + } + + #[inline(always)] + fn slice( + &self, + start: ::Position, + end: ::Position, + ) -> &[u8] { + self.debug_assert_valid_pos(start); + self.debug_assert_valid_pos(end); + debug_assert!(end >= start, "Slice start after end"); + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let res = &self.contents()[core::ops::Range { + start: self.pos_to_offset(start), + end: self.pos_to_offset(end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let res = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; + + debug_assert!(res.len() <= self.bytelength() && res.len() == end - start); + res + } + #[inline(always)] pub fn new(s: &'a str) -> Self { // The big idea of RefPosition is enforced here. @@ -229,30 +256,6 @@ impl<'a> InputIndexer for Utf8Input<'a> { type Element = char; type CharProps = matchers::UTF8CharProperties; - #[inline(always)] - fn contents(&self) -> &[u8] { - self.input.as_bytes() - } - - #[inline(always)] - fn slice(&self, start: Self::Position, end: Self::Position) -> &[u8] { - self.debug_assert_valid_pos(start); - self.debug_assert_valid_pos(end); - debug_assert!(end >= start, "Slice start after end"); - - #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] - let res = &self.contents()[core::ops::Range { - start: self.pos_to_offset(start), - end: self.pos_to_offset(end), - }]; - - #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] - let res = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; - - debug_assert!(res.len() <= self.bytelength() && res.len() == end - start); - res - } - #[inline(always)] fn subinput(&self, range: ops::Range) -> Self { Self::new(self.str_slice(range)) @@ -459,6 +462,86 @@ impl<'a> InputIndexer for Utf8Input<'a> { let idx = search.find_in(rem)?; Some(pos + idx) } + + fn subrange_eq( + &self, + _dir: Dir, + pos: &mut Self::Position, + range: Range, + ) -> bool { + let len = range.end - range.start; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let new_range = &self.contents()[core::ops::Range { + start: self.pos_to_offset(start), + end: self.pos_to_offset(end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let new_range = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let old_range = &self.contents()[core::ops::Range { + start: self.pos_to_offset(range.start), + end: self.pos_to_offset(range.end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let old_range = + unsafe { core::slice::from_raw_parts(range.start.ptr(), range.end - range.start) }; + + new_range == old_range + } + + fn match_bytes( + &self, + _dir: Dir, + pos: &mut Self::Position, + bytes: &Bytes, + ) -> bool { + let len = Bytes::LENGTH; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let new_range = &self.contents()[core::ops::Range { + start: self.pos_to_offset(start), + end: self.pos_to_offset(end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let new_range = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; + + bytes.equals_known_len(new_range) + } } #[derive(Debug, Copy, Clone)] @@ -467,6 +550,38 @@ pub struct AsciiInput<'a> { } impl<'a> AsciiInput<'a> { + #[inline(always)] + fn contents(&self) -> &[u8] { + self.input + } + + #[inline(always)] + fn bytelength(&self) -> usize { + self.input.len() + } + + #[inline(always)] + fn slice( + &self, + start: ::Position, + end: ::Position, + ) -> &[u8] { + self.debug_assert_valid_pos(start); + self.debug_assert_valid_pos(end); + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let res = &self.contents()[core::ops::Range { + start: self.pos_to_offset(start), + end: self.pos_to_offset(end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let res = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; + + debug_assert!(res.len() <= self.bytelength() && res.len() == end - start); + res + } + pub fn new(s: &'a str) -> Self { // The big idea of RefPosition is enforced here. ::Position::check_size(); @@ -500,29 +615,6 @@ impl<'a> InputIndexer for AsciiInput<'a> { type Element = u8; type CharProps = matchers::ASCIICharProperties; - #[inline(always)] - fn contents(&self) -> &[u8] { - self.input - } - - #[inline(always)] - fn slice(&self, start: Self::Position, end: Self::Position) -> &[u8] { - self.debug_assert_valid_pos(start); - self.debug_assert_valid_pos(end); - - #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] - let res = &self.contents()[core::ops::Range { - start: self.pos_to_offset(start), - end: self.pos_to_offset(end), - }]; - - #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] - let res = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; - - debug_assert!(res.len() <= self.bytelength() && res.len() == end - start); - res - } - #[inline(always)] fn subinput(&self, range: ops::Range) -> AsciiInput<'a> { self.debug_assert_valid_pos(range.start); @@ -645,4 +737,544 @@ impl<'a> InputIndexer for AsciiInput<'a> { let idx = search.find_in(rem)?; Some(pos + idx) } + + fn subrange_eq( + &self, + _dir: Dir, + pos: &mut Self::Position, + range: Range, + ) -> bool { + let len = range.end - range.start; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let new_range = &self.contents()[core::ops::Range { + start: self.pos_to_offset(start), + end: self.pos_to_offset(end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let new_range = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let old_range = &self.contents()[core::ops::Range { + start: self.pos_to_offset(range.start), + end: self.pos_to_offset(range.end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let old_range = + unsafe { core::slice::from_raw_parts(range.start.ptr(), range.end - range.start) }; + + new_range == old_range + } + + fn match_bytes( + &self, + _dir: Dir, + pos: &mut Self::Position, + bytes: &Bytes, + ) -> bool { + let len = Bytes::LENGTH; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + #[cfg(any(feature = "index-positions", feature = "prohibit-unsafe"))] + let new_range = &self.contents()[core::ops::Range { + start: self.pos_to_offset(start), + end: self.pos_to_offset(end), + }]; + + #[cfg(all(not(feature = "index-positions"), not(feature = "prohibit-unsafe")))] + let new_range = unsafe { core::slice::from_raw_parts(start.ptr(), end - start) }; + + bytes.equals_known_len(new_range) + } +} + +#[cfg(feature = "utf16")] +#[derive(Debug, Copy, Clone)] +pub struct Utf16Input<'a> { + input: &'a [u16], +} + +#[cfg(feature = "utf16")] +impl<'a> Utf16Input<'a> { + pub fn new(s: &'a [u16]) -> Self { + Self { input: s } + } + + #[inline(always)] + fn debug_assert_valid_pos(&self, pos: ::Position) -> &Self { + debug_assert!(self.left_end() <= pos && pos <= self.right_end()); + self + } + + const SURROGATE_HIGH_START: u16 = 0xD800; + const SURROGATE_HIGH_END: u16 = 0xDBFF; + const SURROGATE_LOW_START: u16 = 0xDC00; + const SURROGATE_LOW_END: u16 = 0xDFFF; + + #[inline(always)] + fn is_high_surrogate(b: u16) -> bool { + b >= Self::SURROGATE_HIGH_START && b <= Self::SURROGATE_HIGH_END + } + + #[inline(always)] + fn is_low_surrogate(b: u16) -> bool { + b >= Self::SURROGATE_LOW_START && b <= Self::SURROGATE_LOW_END + } + + #[inline(always)] + fn code_point_from_surrogates(high: u16, low: u16) -> u32 { + (((high & 0x3ff) as u32) << 10 | (low & 0x3ff) as u32) + 0x1_0000 + } +} + +#[cfg(feature = "utf16")] +impl<'a> InputIndexer for Utf16Input<'a> { + type Position = IndexPosition<'a>; + type Element = u32; + type CharProps = matchers::Utf16CharProperties; + + #[inline(always)] + fn subinput(&self, range: ops::Range) -> Utf16Input<'a> { + self.debug_assert_valid_pos(range.start); + self.debug_assert_valid_pos(range.end); + debug_assert!(range.end >= range.start); + Utf16Input { + input: &self.input[core::ops::Range { + start: self.pos_to_offset(range.start), + end: self.pos_to_offset(range.end), + }], + } + } + + #[inline(always)] + fn next_right(&self, pos: &mut Self::Position) -> Option { + let u1 = self.input.get(self.pos_to_offset(*pos)).copied()?; + *pos += 1; + + // If the code unit is not a high surrogate, it is not the start of a surrogate pair. + if !Self::is_high_surrogate(u1) { + return Some(u1.into()); + } + + let Some(u2) = self.input.get(self.pos_to_offset(*pos)).copied() else { + return Some(u1.into()); + }; + + // If the code unit is not a low surrogate, it is not a surrogate pair. + if !Self::is_low_surrogate(u2) { + return Some(u1.into()); + } + + *pos += 1; + Some(Self::code_point_from_surrogates(u1, u2)) + } + + #[inline(always)] + fn next_left(&self, pos: &mut Self::Position) -> Option { + let u2 = self.input.get(self.pos_to_offset(*pos - 1)).copied()?; + *pos -= 1; + + // If the code unit is not a low surrogate, it is not the end of a surrogate pair. + if !Self::is_low_surrogate(u2) { + return Some(u2.into()); + } + + let Some(u1) = self.input.get(self.pos_to_offset(*pos - 1)).copied() else { + return Some(u2.into()); + }; + + // If the code unit is not a high surrogate, it is not a surrogate pair. + if !Self::is_high_surrogate(u1) { + return Some(u2.into()); + } + + *pos -= 1; + Some(Self::code_point_from_surrogates(u1, u2)) + } + + #[inline(always)] + fn next_right_pos(&self, pos: Self::Position) -> Option { + self.try_move_right(pos, 1) + } + + #[inline(always)] + fn next_left_pos(&self, pos: Self::Position) -> Option { + self.try_move_left(pos, 1) + } + + #[inline(always)] + fn peek_byte_right(&self, mut pos: Self::Position) -> Option { + if let Some(c) = self.next_right(&mut pos) { + if cfg!(target_endian = "big") { + Some(c.to_be_bytes()[0]) + } else { + Some(c.to_le_bytes()[0]) + } + } else { + None + } + } + + #[inline(always)] + fn peek_byte_left(&self, mut pos: Self::Position) -> Option { + if let Some(c) = self.next_left(&mut pos) { + if cfg!(target_endian = "big") { + Some(c.to_be_bytes()[0]) + } else { + Some(c.to_le_bytes()[0]) + } + } else { + None + } + } + + #[inline(always)] + fn left_end(&self) -> Self::Position { + Self::Position::new(0) + } + + #[inline(always)] + fn right_end(&self) -> Self::Position { + Self::Position::new(self.input.len()) + } + + #[inline(always)] + fn pos_to_offset(&self, pos: Self::Position) -> usize { + debug_assert!(self.left_end() <= pos && pos <= self.right_end()); + pos - self.left_end() + } + + fn try_move_right(&self, mut pos: Self::Position, amt: usize) -> Option { + self.debug_assert_valid_pos(pos); + if self.right_end() - pos < amt { + None + } else { + pos += amt; + self.debug_assert_valid_pos(pos); + Some(pos) + } + } + + #[inline(always)] + fn try_move_left(&self, mut pos: Self::Position, amt: usize) -> Option { + self.debug_assert_valid_pos(pos); + if pos - self.left_end() < amt { + None + } else { + pos -= amt; + self.debug_assert_valid_pos(pos); + Some(pos) + } + } + + #[inline(always)] + fn find_bytes( + &self, + pos: Self::Position, + search: &Search, + ) -> Option { + let idx = search.find_in( + &self.input[self.pos_to_offset(pos)..self.pos_to_offset(self.right_end())] + .iter() + .map(|c| *c as u8) + .collect::>(), + )?; + Some(pos + idx) + } + + fn subrange_eq( + &self, + _dir: Dir, + pos: &mut Self::Position, + range: Range, + ) -> bool { + let len = range.end - range.start; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + let new_range = &self.input[self.pos_to_offset(start)..self.pos_to_offset(end)]; + let old_range = &self.input[self.pos_to_offset(range.start)..self.pos_to_offset(range.end)]; + + new_range == old_range + } + + fn match_bytes( + &self, + _dir: Dir, + pos: &mut Self::Position, + bytes: &Bytes, + ) -> bool { + let len = Bytes::LENGTH; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + bytes.equals_known_len( + &self.input[self.pos_to_offset(start)..self.pos_to_offset(end)] + .iter() + .map(|c| *c as u8) + .collect::>(), + ) + } +} + +#[cfg(feature = "utf16")] +#[derive(Debug, Copy, Clone)] +pub struct Ucs2Input<'a> { + input: &'a [u16], +} + +#[cfg(feature = "utf16")] +impl<'a> Ucs2Input<'a> { + pub fn new(s: &'a [u16]) -> Self { + Self { input: s } + } + + #[inline(always)] + fn debug_assert_valid_pos(&self, pos: ::Position) -> &Self { + debug_assert!(self.left_end() <= pos && pos <= self.right_end()); + self + } +} + +#[cfg(feature = "utf16")] +impl<'a> InputIndexer for Ucs2Input<'a> { + type Position = IndexPosition<'a>; + type Element = u32; + type CharProps = matchers::Utf16CharProperties; + + #[inline(always)] + fn subinput(&self, range: ops::Range) -> Ucs2Input<'a> { + self.debug_assert_valid_pos(range.start); + self.debug_assert_valid_pos(range.end); + debug_assert!(range.end >= range.start); + Ucs2Input { + input: &self.input[core::ops::Range { + start: self.pos_to_offset(range.start), + end: self.pos_to_offset(range.end), + }], + } + } + + #[inline(always)] + fn next_right(&self, pos: &mut Self::Position) -> Option { + let u1 = self.input.get(self.pos_to_offset(*pos)).copied()?; + *pos += 1; + + Some(u1.into()) + } + + #[inline(always)] + fn next_left(&self, pos: &mut Self::Position) -> Option { + let u2 = self.input.get(self.pos_to_offset(*pos - 1)).copied()?; + *pos -= 1; + + Some(u2.into()) + } + + #[inline(always)] + fn next_right_pos(&self, pos: Self::Position) -> Option { + self.try_move_right(pos, 1) + } + + #[inline(always)] + fn next_left_pos(&self, pos: Self::Position) -> Option { + self.try_move_left(pos, 1) + } + + #[inline(always)] + fn peek_byte_right(&self, mut pos: Self::Position) -> Option { + if let Some(c) = self.next_right(&mut pos) { + if cfg!(target_endian = "big") { + Some(c.to_be_bytes()[0]) + } else { + Some(c.to_le_bytes()[0]) + } + } else { + None + } + } + + #[inline(always)] + fn peek_byte_left(&self, mut pos: Self::Position) -> Option { + if let Some(c) = self.next_left(&mut pos) { + if cfg!(target_endian = "big") { + Some(c.to_be_bytes()[0]) + } else { + Some(c.to_le_bytes()[0]) + } + } else { + None + } + } + + #[inline(always)] + fn left_end(&self) -> Self::Position { + Self::Position::new(0) + } + + #[inline(always)] + fn right_end(&self) -> Self::Position { + Self::Position::new(self.input.len()) + } + + #[inline(always)] + fn pos_to_offset(&self, pos: Self::Position) -> usize { + debug_assert!(self.left_end() <= pos && pos <= self.right_end()); + pos - self.left_end() + } + + fn try_move_right(&self, mut pos: Self::Position, amt: usize) -> Option { + self.debug_assert_valid_pos(pos); + if self.right_end() - pos < amt { + None + } else { + pos += amt; + self.debug_assert_valid_pos(pos); + Some(pos) + } + } + + #[inline(always)] + fn try_move_left(&self, mut pos: Self::Position, amt: usize) -> Option { + self.debug_assert_valid_pos(pos); + if pos - self.left_end() < amt { + None + } else { + pos -= amt; + self.debug_assert_valid_pos(pos); + Some(pos) + } + } + + #[inline(always)] + fn find_bytes( + &self, + pos: Self::Position, + search: &Search, + ) -> Option { + let idx = search.find_in( + &self.input[self.pos_to_offset(pos)..self.pos_to_offset(self.right_end())] + .iter() + .map(|c| *c as u8) + .collect::>(), + )?; + Some(pos + idx) + } + + fn subrange_eq( + &self, + _dir: Dir, + pos: &mut Self::Position, + range: Range, + ) -> bool { + let len = range.end - range.start; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + let new_range = &self.input[self.pos_to_offset(start)..self.pos_to_offset(end)]; + let old_range = &self.input[self.pos_to_offset(range.start)..self.pos_to_offset(range.end)]; + + new_range == old_range + } + + fn match_bytes( + &self, + _dir: Dir, + pos: &mut Self::Position, + bytes: &Bytes, + ) -> bool { + let len = Bytes::LENGTH; + let (start, end) = if Dir::FORWARD { + if let Some(end) = self.try_move_right(*pos, len) { + let start = *pos; + *pos = end; + (start, end) + } else { + return false; + } + } else if let Some(start) = self.try_move_left(*pos, len) { + let end = *pos; + *pos = start; + (start, end) + } else { + return false; + }; + + bytes.equals_known_len( + &self.input[self.pos_to_offset(start)..self.pos_to_offset(end)] + .iter() + .map(|c| *c as u8) + .collect::>(), + ) + } } diff --git a/src/matchers.rs b/src/matchers.rs index 1039191..25cf120 100644 --- a/src/matchers.rs +++ b/src/matchers.rs @@ -67,6 +67,22 @@ impl CharProperties for ASCIICharProperties { } } +#[cfg(feature = "utf16")] +pub struct Utf16CharProperties {} + +#[cfg(feature = "utf16")] +impl CharProperties for Utf16CharProperties { + type Element = u32; + + fn fold(c: Self::Element) -> Self::Element { + if char::from_u32(c).is_some() { + unicode::fold(c) + } else { + c + } + } +} + /// Check whether the \p orig_range within \p cursor matches position \p pos. pub fn backref( input: &Input, @@ -74,7 +90,7 @@ pub fn backref( orig_range: core::ops::Range, pos: &mut Input::Position, ) -> bool { - cursor::subrange_eq(input, dir, pos, orig_range.start, orig_range.end) + input.subrange_eq(dir, pos, orig_range) } pub fn backref_icase( diff --git a/src/optimizer.rs b/src/optimizer.rs index 8d8e3d3..6d2eac1 100644 --- a/src/optimizer.rs +++ b/src/optimizer.rs @@ -321,6 +321,7 @@ fn form_literal_bytes(n: &mut Node, walk: &Walk) -> PassAction { } } match n { + #[cfg(not(feature = "utf16"))] Node::Char { c, icase } if !*icase => { if let Some(c) = char::from_u32(*c) { let mut buff = [0; 4];