Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement UTF-16 based matching #75

Merged
merged 3 commits into from
Jan 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ index-positions = []
# Prohibits all uses of unsafe code, for the paranoid.
prohibit-unsafe = []

# Enables UTF-16 support. This disables some optimizations, so it should only be used when necessary.
utf16 = []

[dependencies]
hashbrown = "0.13.2"
memchr = { version = "2.4.0", default-features = false }
42 changes: 42 additions & 0 deletions src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ use crate::insn::CompiledRegex;
use crate::optimizer;
use crate::parse;

#[cfg(feature = "utf16")]
use crate::{
classicalbacktrack::MatchAttempter,
indexing::{InputIndexer, Ucs2Input, Utf16Input},
};

#[cfg(feature = "backend-pikevm")]
use crate::pikevm;
use crate::util::to_char_sat;
Expand Down Expand Up @@ -377,6 +383,42 @@ impl Regex {
pub fn find_from_ascii<'r, 't>(&'r self, text: &'t str, start: usize) -> AsciiMatches<'r, 't> {
backends::find(self, text, start)
}

/// Returns an iterator over the matches found in `text`, starting at index `start`.
///
/// `text` is a slice of `u16` values that is read through a `Utf16Input`.
/// Matching is driven by the classical backtracking executor.
#[cfg(feature = "utf16")]
pub fn find_from_utf16<'r, 't>(
    &'r self,
    text: &'t [u16],
    start: usize,
) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf16Input<'t>>>
{
    let input = Utf16Input::new(text);
    // The attempter is seeded with the left end of the input as its entry position.
    let attempter = MatchAttempter::new(&self.cr, input.left_end());
    let executor = super::classicalbacktrack::BacktrackExecutor::new(input, attempter);
    exec::Matches::new(executor, start)
}

/// Returns an iterator over the matches found in `text`, starting at index `start`.
///
/// Unlike `find_from_utf16`, the `u16` slice is read through a `Ucs2Input`.
/// Matching is driven by the classical backtracking executor.
#[cfg(feature = "utf16")]
pub fn find_from_ucs2<'r, 't>(
    &'r self,
    text: &'t [u16],
    start: usize,
) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Ucs2Input<'t>>>
{
    let input = Ucs2Input::new(text);
    // Entry position for the attempter: the left end of the wrapped input.
    let entry = input.left_end();
    let executor = super::classicalbacktrack::BacktrackExecutor::new(
        input,
        MatchAttempter::new(&self.cr, entry),
    );
    exec::Matches::new(executor, start)
}
}

impl FromStr for Regex {
Expand Down
28 changes: 22 additions & 6 deletions src/classicalbacktrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ use crate::cursor::{Backward, Direction, Forward};
use crate::exec;
use crate::indexing;
use crate::indexing::{AsciiInput, ElementType, InputIndexer, Utf8Input};
use crate::insn::{CompiledRegex, Insn, LoopFields, StartPredicate};
#[cfg(not(feature = "utf16"))]
use crate::insn::StartPredicate;
use crate::insn::{CompiledRegex, Insn, LoopFields};
use crate::matchers;
use crate::matchers::CharProperties;
use crate::position::PositionType;
Expand Down Expand Up @@ -65,14 +67,14 @@ struct State<Position: PositionType> {
}

#[derive(Debug)]
struct MatchAttempter<'a, Input: InputIndexer> {
pub(crate) struct MatchAttempter<'a, Input: InputIndexer> {
re: &'a CompiledRegex,
bts: Vec<BacktrackInsn<Input>>,
s: State<Input::Position>,
}

impl<'a, Input: InputIndexer> MatchAttempter<'a, Input> {
fn new(re: &'a CompiledRegex, entry: Input::Position) -> Self {
pub(crate) fn new(re: &'a CompiledRegex, entry: Input::Position) -> Self {
Self {
re,
bts: vec![BacktrackInsn::Exhausted],
Expand Down Expand Up @@ -647,9 +649,11 @@ impl<'a, Input: InputIndexer> MatchAttempter<'a, Input> {
next_or_bt!(scm::MatchAny::new().matches(input, dir, &mut pos))
}

Insn::MatchAnyExceptLineTerminator => next_or_bt!(
scm::MatchAnyExceptLineTerminator::new().matches(input, dir, &mut pos)
),
Insn::MatchAnyExceptLineTerminator => {
next_or_bt!(
scm::MatchAnyExceptLineTerminator::new().matches(input, dir, &mut pos)
)
}

&Insn::WordBoundary { invert } => {
// Copy the positions since these destructively move them.
Expand Down Expand Up @@ -906,6 +910,13 @@ pub struct BacktrackExecutor<'r, Input: InputIndexer> {
matcher: MatchAttempter<'r, Input>,
}

#[cfg(feature = "utf16")]
impl<'r, Input: InputIndexer> BacktrackExecutor<'r, Input> {
    /// Assembles an executor from an already-built `input` and its `matcher`.
    ///
    /// Only compiled when the `utf16` feature is enabled.
    pub(crate) fn new(input: Input, matcher: MatchAttempter<'r, Input>) -> Self {
        BacktrackExecutor { input, matcher }
    }
}

impl<'r, Input: InputIndexer> BacktrackExecutor<'r, Input> {
fn successful_match(&mut self, start: Input::Position, end: Input::Position) -> Match {
// We want to simultaneously map our groups to offsets, and clear the groups.
Expand Down Expand Up @@ -970,6 +981,11 @@ impl<'a, Input: InputIndexer> exec::MatchProducer for BacktrackExecutor<'a, Inpu
pos: Input::Position,
next_start: &mut Option<Input::Position>,
) -> Option<Match> {
// When UTF-16 support is active, prefix search is not used due to the different encoding.
#[cfg(feature = "utf16")]
return self.next_match_with_prefix_search(pos, next_start, &bytesearch::EmptyString {});

#[cfg(not(feature = "utf16"))]
match &self.matcher.re.start_pred {
StartPredicate::Arbitrary => {
self.next_match_with_prefix_search(pos, next_start, &bytesearch::EmptyString {})
Expand Down
49 changes: 1 addition & 48 deletions src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,6 @@ impl Direction for Backward {
}
}

/// \return the \p len bytes that begin at \p pos (when moving forward) or end at \p pos
/// (when moving backward), or None if the input cannot be traversed that far.
/// On success \p pos is advanced (retreated) past the returned bytes; on failure it is untouched.
#[inline(always)]
fn try_slice<'a, Input: InputIndexer, Dir: Direction>(
    input: &'a Input,
    _dir: Dir,
    pos: &mut Input::Position,
    len: usize,
) -> Option<&'a [u8]> {
    let anchor = *pos;
    // Either move may bail out early when there aren't enough bytes remaining.
    let (start, end) = if Dir::FORWARD {
        (anchor, input.try_move_right(anchor, len)?)
    } else {
        (input.try_move_left(anchor, len)?, anchor)
    };
    // The cursor ends up on the edge of the range opposite to where it started.
    *pos = if Dir::FORWARD { end } else { start };
    Some(input.slice(start, end))
}

/// \return whether we match some literal bytes.
/// If so, update the position. If not, the position is unspecified.
#[inline(always)]
Expand All @@ -61,30 +37,7 @@ pub fn try_match_lit<Input: InputIndexer, Dir: Direction, Bytes: ByteSeq>(
pos: &mut Input::Position,
bytes: &Bytes,
) -> bool {
let len = Bytes::LENGTH;
debug_assert!(len > 0, "Should not have zero length");
if let Some(subr_slice) = try_slice(input, dir, pos, len) {
bytes.equals_known_len(subr_slice)
} else {
false
}
}

/// If the subrange [start, end) is byte-for-byte equal to a range of the same length starting (if FORWARD is true) or ending (if FORWARD
/// is false) at \p pos, then return true and then advance (or retreat) the position.
/// On failure, return false and the position is unspecified.
pub fn subrange_eq<Input: InputIndexer, Dir: Direction>(
input: &Input,
dir: Dir,
pos: &mut Input::Position,
start: Input::Position,
end: Input::Position,
) -> bool {
if let Some(subr_slice) = try_slice(input, dir, pos, end - start) {
subr_slice == input.slice(start, end)
} else {
false
}
input.match_bytes(dir, pos, bytes)
}

/// \return the next character, updating the position.
Expand Down
Loading
Loading