Skip to content

Commit

Permalink
Fix multiple escape parsing issues
Browse files Browse the repository at this point in the history
  • Loading branch information
raskad committed Dec 28, 2023
1 parent f4591c1 commit 55a95b8
Show file tree
Hide file tree
Showing 6 changed files with 555 additions and 471 deletions.
2 changes: 1 addition & 1 deletion gen-unicode/src/binary_properties.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ pub(crate) fn generate_tests(scope: &mut Scope) {
));

let mut b = Block::new("for regex in REGEXES");
b.line("let regex = tc.compile(regex);");
b.line(r#"let regex = tc.compilef(regex, "u");"#);

let mut bb = Block::new("for code_point in CODE_POINTS");
bb.line("regex.test_succeeds(code_point);");
Expand Down
4 changes: 2 additions & 2 deletions gen-unicode/src/general_category_values.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ pub(crate) fn generate_tests(scope: &mut Scope) {
));

let mut b = Block::new("for regex in REGEXES");
b.line("let regex = tc.compile(regex);");
b.line(r#"let regex = tc.compilef(regex, "u");"#);

let mut bb = Block::new("for code_point in CODE_POINTS");
bb.line("regex.test_succeeds(code_point);");
Expand Down Expand Up @@ -281,7 +281,7 @@ pub(crate) fn generate_tests(scope: &mut Scope) {
));

let mut b = Block::new("for regex in REGEXES");
b.line("let regex = tc.compile(regex);");
b.line(r#"let regex = tc.compilef(regex, "u");"#);

let mut bb = Block::new("for code_point in CODE_POINTS");
bb.line("regex.test_succeeds(code_point);");
Expand Down
2 changes: 1 addition & 1 deletion gen-unicode/src/scripts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ impl GenUnicode {
regexes.join(",\n ")
));

f.line(r#"for regex in REGEXES { let regex = tc.compile(regex); for range in CODE_POINTS { for cp in range { regex.test_succeeds(&char::from_u32(cp).unwrap().to_string()); } } }"#);
f.line(r#"for regex in REGEXES { let regex = tc.compilef(regex, "u"); for range in CODE_POINTS { for cp in range { regex.test_succeeds(&char::from_u32(cp).unwrap().to_string()); } } }"#);
}
}
}
Expand Down
243 changes: 161 additions & 82 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,6 @@ where
/// Named capture group references.
named_group_indices: HashMap<CaptureGroupName, u32>,

/// Maximum backreference encountered.
/// Note that values larger than will fit are early errors.
max_backref: u32,

/// Whether a lookbehind was encountered.
has_lookbehind: bool,
}
Expand Down Expand Up @@ -264,12 +260,13 @@ where
match to_char_sat(c) {
// A concatenation is terminated by closing parens or vertical bar (alternations).
')' | '|' => break,
// Term :: Assertion :: ^
'^' => {
self.consume('^');
result.push(ir::Node::Anchor(ir::AnchorType::StartOfLine));
quantifier_allowed = false;
}

// Term :: Assertion :: $
'$' => {
self.consume('$');
result.push(ir::Node::Anchor(ir::AnchorType::EndOfLine));
Expand All @@ -278,7 +275,50 @@ where

'\\' => {
self.consume('\\');
result.push(self.consume_atom_escape()?);
let Some(c) = self.peek() else {
return error("Incomplete escape");
};
match to_char_sat(c) {
// Term :: Assertion :: \b
'b' => {
self.consume('b');
result.push(ir::Node::WordBoundary { invert: false });
}
// Term :: Assertion :: \B
'B' => {
self.consume('B');
result.push(ir::Node::WordBoundary { invert: true });
}
// Term :: Atom :: \ AtomEscape :: CharacterEscape :: c AsciiLetter
// Term :: ExtendedAtom :: \ [lookahead = c]
'c' if !self.flags.unicode => {
self.consume('c');
if self
.peek()
.and_then(char::from_u32)
.map(|c| c.is_ascii_alphabetic())
== Some(true)
{
result.push(ir::Node::Char {
c: self.next().expect("char was not next") % 32,
icase: self.flags.icase,
});
} else {
result.push(ir::Node::Char {
c: u32::from('\\'),
icase: self.flags.icase,
});
result.push(ir::Node::Char {
c: u32::from('c'),
icase: self.flags.icase,
});
}
}
// Term :: Atom :: \ AtomEscape
_ => {
result.push(self.consume_atom_escape()?);
}
}
}

'.' => {
Expand Down Expand Up @@ -489,7 +529,7 @@ where
// End of bracket.
']' => Ok(None),

// Escape sequence.
// ClassEscape
'\\' => {
self.consume('\\');
let ec = if let Some(ec) = self.peek() {
Expand All @@ -508,6 +548,23 @@ where
self.consume('-');
Ok(Some(ClassAtom::CodePoint(u32::from('-'))))
}
// ClassEscape :: [~UnicodeMode] c ClassControlLetter
'c' if !self.flags.unicode => {
let input = self.input.clone();
self.consume('c');
match self.peek().map(to_char_sat) {
Some('0'..='9' | '_') => {
let next = self.next().expect("char was not next");
Ok(Some(ClassAtom::CodePoint(next & 0x1F)))
}
// ClassEscape :: CharacterEscape
_ => {
self.input = input;
let cc = self.consume_character_escape()?;
Ok(Some(ClassAtom::CodePoint(cc)))
}
}
}
// ClassEscape :: CharacterClassEscape :: d
'd' => {
self.consume('d');
Expand Down Expand Up @@ -708,101 +765,109 @@ where
}

fn consume_character_escape(&mut self) -> Result<u32, Error> {
let c = self.peek().expect("Should have a character");
match to_char_sat(c) {
'f' => {
self.consume('f');
Ok(0xC)
}
'n' => {
self.consume('n');
Ok(0xA)
}
'r' => {
self.consume('r');
Ok(0xD)
}
't' => {
self.consume('t');
Ok(0x9)
}
'v' => {
self.consume('v');
Ok(0xB)
}
let c = self.next().expect("Should have a character");
let ch = to_char_sat(c);
match ch {
// CharacterEscape :: ControlEscape :: f
'f' => Ok(0xC),
// CharacterEscape :: ControlEscape :: n
'n' => Ok(0xA),
// CharacterEscape :: ControlEscape :: r
'r' => Ok(0xD),
// CharacterEscape :: ControlEscape :: t
't' => Ok(0x9),
// CharacterEscape :: ControlEscape :: v
'v' => Ok(0xB),
// CharacterEscape :: c AsciiLetter
'c' => {
// Control escape.
self.consume('c');
if let Some(nc) = self.next().and_then(char::from_u32) {
if nc.is_ascii_lowercase() || nc.is_ascii_uppercase() {
return Ok((nc as u32) % 32);
}
}
error("Invalid character escape")
}
'0' => {
// CharacterEscape :: "0 [lookahead != DecimalDigit]"
self.consume('0');
match self.peek().and_then(char::from_u32) {
Some(c) if c.is_ascii_digit() => error("Invalid character escape"),
_ => Ok(0x0),
}
// CharacterEscape :: 0 [lookahead ∉ DecimalDigit]
'0' if self
.peek()
.and_then(char::from_u32)
.map(|c: char| c.is_ascii_digit())
!= Some(true) =>
{
Ok(0x0)
}

// CharacterEscape :: HexEscapeSequence :: x HexDigit HexDigit
'x' => {
// HexEscapeSequence :: x HexDigit HexDigit
// See ES6 11.8.3 HexDigit
let hex_to_digit = |c: char| c.to_digit(16);
self.consume('x');
let x1 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
let x2 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
match (x1, x2) {
(Some(x1), Some(x2)) => Ok(x1 * 16 + x2),
// CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
_ if !self.flags.unicode => Ok(c),
_ => error("Invalid character escape"),
}
}

// CharacterEscape :: RegExpUnicodeEscapeSequence
'u' => {
// Unicode escape
self.consume('u');
if let Some(c) = self.try_escape_unicode_sequence() {
Ok(c)
} else if !self.flags.unicode {
// CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
Ok(c)
} else {
error("Invalid unicode escape")
}
}

// Only syntax characters and / participate in IdentityEscape in Unicode regexp.
'^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
| '/' => Ok(self.consume(c)),

// TODO: currently we reject numeric characters in IdentityEscape to help some PCRE
// tests pass.
// Specifically a regex of the form [\p{Nd}]: in non-Unicode mode this is not a
// character property test and is expected to parse as just a bracket where \p is
// IdentityEscaped to p.
c if (!self.flags.unicode && !c.is_ascii_digit()) || c.is_ascii_alphabetic() => {
Ok(self.consume(c))
// CharacterEscape :: [~UnicodeMode] LegacyOctalEscapeSequence
'0'..='7' if !self.flags.unicode => {
let Some(c1) = self.peek() else {
return Ok(c - '0' as u32);
};
let ch1 = to_char_sat(c1);

match ch {
// 0 [lookahead ∈ { 8, 9 }]
'0' if ('8'..='9').contains(&ch1) => Ok(0x0),
// NonZeroOctalDigit [lookahead ∉ OctalDigit]
_ if !('0'..='7').contains(&ch1) => Ok(c - '0' as u32),
// FourToSeven OctalDigit
'4'..='7' => {
self.consume(c1);
Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
}
// ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
// ZeroToThree OctalDigit OctalDigit
'0'..='3' => {
self.consume(c1);
if self.peek().map(|c2| ('0'..='7').contains(&to_char_sat(c2)))
== Some(true)
{
let c2 = self.next().expect("char was not next");
Ok((c - '0' as u32) * 64 + (c1 - '0' as u32) * 8 + c2 - '0' as u32)
} else {
Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
}
}
_ => unreachable!(),
}
}

// CharacterEscape :: IdentityEscape :: [+UnicodeMode] SyntaxCharacter
// CharacterEscape :: IdentityEscape :: [+UnicodeMode] /
'^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
| '/' => Ok(c),
// CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
_ if !self.flags.unicode => Ok(c),
_ => error("Invalid character escape"),
}
}

// AtomEscape
fn consume_atom_escape(&mut self) -> Result<ir::Node, Error> {
let nc = self.peek();
if nc.is_none() {
let Some(c) = self.peek() else {
return error("Incomplete escape");
}
let c = nc.unwrap();
};
match to_char_sat(c) {
'b' | 'B' => {
self.consume(c);
Ok(ir::Node::WordBoundary {
invert: c == 'B' as u32,
})
}

'd' | 'D' => {
self.consume(c);
Ok(make_bracket_class(
Expand All @@ -827,7 +892,9 @@ where
))
}

'p' | 'P' => {
// ClassEscape :: CharacterClassEscape :: [+UnicodeMode] p{ UnicodePropertyValueExpression }
// ClassEscape :: CharacterClassEscape :: [+UnicodeMode] P{ UnicodePropertyValueExpression }
'p' | 'P' if self.flags.unicode => {
self.consume(c);

let property_escape = self.try_consume_unicode_property_escape()?;
Expand All @@ -839,20 +906,33 @@ where
})
}

// [+UnicodeMode] DecimalEscape
// Note: This is a backreference.
'1'..='9' if self.flags.unicode => {
let group = self.try_consume_decimal_integer_literal().unwrap();
if group <= self.group_count_max as usize {
Ok(ir::Node::BackRef(group as u32))
} else {
error("Invalid character escape")
}
}

// [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber of DecimalEscape
// is ≤ CountLeftCapturingParensWithin(the Pattern containing DecimalEscape)
// Note: This could be either a backreference, a legacy octal escape or an identity escape.
'1'..='9' => {
let val = self.try_consume_decimal_integer_literal().unwrap();
let input = self.input.clone();
let group = self.try_consume_decimal_integer_literal().unwrap();

// This is a backreference.
// Note we limit backreferences to u32 but the value may exceed that.
if val <= self.group_count_max as usize {
if val > MAX_CAPTURE_GROUPS {
return error(format!("Backreference \\{} too large", val));
}
let group = val as u32;
self.max_backref = core::cmp::max(self.max_backref, group);
Ok(ir::Node::BackRef(group))
if group <= self.group_count_max as usize {
Ok(ir::Node::BackRef(group as u32))
} else {
error("Invalid character escape")
self.input = input;
let c = self.consume_character_escape()?;
Ok(ir::Node::Char {
c: self.fold_if_icase(c),
icase: self.flags.icase,
})
}
}

Expand Down Expand Up @@ -1147,7 +1227,6 @@ where
group_count: 0,
named_group_indices: HashMap::new(),
group_count_max: 0,
max_backref: 0,
has_lookbehind: false,
};
p.try_parse()
Expand Down
Loading

0 comments on commit 55a95b8

Please sign in to comment.