From 68746d7a2b235478a0f2a8ec77854e53f070a56b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gustavo=20Gir=C3=A1ldez?= Date: Wed, 2 Oct 2024 19:31:43 -0300 Subject: [PATCH] Parser performance optimizations (#1119) This PR contains two optimizations: 1. Replaces `TextIndex` as the position tracking structure for the parser context and instead uses the byte offset stored in a `usize`. Most of the time, the other fields computed in `TextIndex` (i.e. line, column and UTF-16 offsets) are not used, and they are expensive to compute. In the very few cases where a `TextIndex` is required, it is computed on demand for the given byte offset. This brings a 39% reduction in instructions/cycles for the parser benchmark. ``` iai::benchmarks::parser Instructions: 2828523499|4666216487 (-39.3829%) [-1.64970x] L1 Hits: 4140897324|6755778229 (-38.7058%) [-1.63148x] L2 Hits: 3784868|3630727 (+4.24546%) [+1.04245x] RAM Hits: 67508|67774 (-0.39248%) [-1.00394x] Total read+write: 4144749700|6759476730 (-38.6824%) [-1.63085x] Estimated Cycles: 4162184444|6776303954 (-38.5774%) [-1.62806x] ``` 2. Caches the parsed leading trivia to avoid having to repeatedly re-parse it when backtracking. This reduces the number of instructions/cycles in the parser benchmark by a further 72%. ``` iai::benchmarks::parser Instructions: 772093852|2828523499 (-72.7033%) [-3.66345x] L1 Hits: 1131017179|4140897324 (-72.6867%) [-3.66122x] L2 Hits: 2047519|3784868 (-45.9025%) [-1.84851x] RAM Hits: 67411|67508 (-0.14369%) [-1.00144x] Total read+write: 1133132109|4144749700 (-72.6610%) [-3.65778x] Estimated Cycles: 1143614159|4162184444 (-72.5237%) [-3.63950x] ``` Overall, the number of instructions went from 4,666,216,487 to 772,093,852, a reduction of 83% (about 6x fewer). 
--- .../cargo/src/runtime/parser/lexer/mod.rs | 4 +- .../cargo/src/runtime/parser/parser.rs.jinja2 | 4 +- .../parser/parser_support/choice_helper.rs | 2 +- .../runtime/parser/parser_support/context.rs | 87 ++++++++++++++++--- .../parser/parser_support/parser_function.rs | 14 +-- .../runtime/parser/parser_support/recovery.rs | 4 +- .../src/runtime/parser/scanner_macros/mod.rs | 2 +- .../src/generated/parser/generated/parser.rs | 6 +- .../src/generated/parser/lexer/mod.rs | 4 +- .../parser/parser_support/choice_helper.rs | 2 +- .../parser/parser_support/context.rs | 87 ++++++++++++++++--- .../parser/parser_support/parser_function.rs | 14 +-- .../parser/parser_support/recovery.rs | 4 +- .../generated/parser/scanner_macros/mod.rs | 2 +- .../src/generated/parser/generated/parser.rs | 2 +- .../src/generated/parser/lexer/mod.rs | 4 +- .../parser/parser_support/choice_helper.rs | 2 +- .../parser/parser_support/context.rs | 87 ++++++++++++++++--- .../parser/parser_support/parser_function.rs | 14 +-- .../parser/parser_support/recovery.rs | 4 +- .../generated/parser/scanner_macros/mod.rs | 2 +- 21 files changed, 273 insertions(+), 78 deletions(-) diff --git a/crates/codegen/runtime/cargo/src/runtime/parser/lexer/mod.rs b/crates/codegen/runtime/cargo/src/runtime/parser/lexer/mod.rs index d461fb4d8b..71d28c520b 100644 --- a/crates/codegen/runtime/cargo/src/runtime/parser/lexer/mod.rs +++ b/crates/codegen/runtime/cargo/src/runtime/parser/lexer/mod.rs @@ -101,7 +101,7 @@ pub(crate) trait Lexer { ParserResult::r#match( vec![Edge::anonymous(Node::terminal( kind, - input.content(start.utf8..end.utf8), + input.content(start..end), ))], vec![], ) @@ -134,7 +134,7 @@ pub(crate) trait Lexer { let end = input.position(); children.push(Edge::anonymous(Node::terminal( kind, - input.content(start.utf8..end.utf8), + input.content(start..end), ))); let restore = input.position(); diff --git a/crates/codegen/runtime/cargo/src/runtime/parser/parser.rs.jinja2 
b/crates/codegen/runtime/cargo/src/runtime/parser/parser.rs.jinja2 index 7c1f8aa27e..223e0b0ae0 100644 --- a/crates/codegen/runtime/cargo/src/runtime/parser/parser.rs.jinja2 +++ b/crates/codegen/runtime/cargo/src/runtime/parser/parser.rs.jinja2 @@ -143,7 +143,7 @@ impl Lexer for Parser { {%- if rendering_in_stubs -%} unreachable!("Invoking leading_trivia in stubs: {input:#?}") {%- else -%} - Parser::leading_trivia(self, input) + input.cached_leading_trivia_or(|input| Parser::leading_trivia(self, input)) {%- endif -%} } @@ -238,7 +238,7 @@ impl Lexer for Parser { input.set_position(save); // TODO(#1001): Don't allocate a string here - let ident_value = input.content(save.utf8..furthest_position.utf8); + let ident_value = input.content(save..furthest_position); for keyword_compound_scanner in [ {%- for keyword_name, _ in context.keyword_compound_scanners %} diff --git a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/choice_helper.rs b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/choice_helper.rs index 2b5186168a..a52654b63c 100644 --- a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/choice_helper.rs +++ b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/choice_helper.rs @@ -14,7 +14,7 @@ pub struct ChoiceHelper { start_position: Marker, // Because we backtrack after every non-final pick, we store the progress // and the emitted errors from the time of a best pick, so that we can return to it later. 
- last_progress: TextIndex, + last_progress: usize, recovered_errors: Vec, } diff --git a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/context.rs b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/context.rs index 0343ec8780..c440a78aa7 100644 --- a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/context.rs +++ b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/context.rs @@ -1,21 +1,31 @@ use std::mem; use std::ops::Range; +use super::ParserResult; use crate::cst::{TerminalKind, TextIndex}; use crate::parser::ParseError; +#[derive(Debug)] +struct CachedParserResult { + start_position: usize, + end_position: usize, + result: ParserResult, +} + #[derive(Debug)] pub struct ParserContext<'s> { source: &'s str, - position: TextIndex, - undo_position: Option, + position: usize, + undo_position: Option, errors: Vec, closing_delimiters: Vec, + last_text_index: TextIndex, + leading_trivia_cache: Option, } #[derive(Copy, Clone)] pub struct Marker { - position: TextIndex, + position: usize, err_len: usize, } @@ -23,10 +33,12 @@ impl<'s> ParserContext<'s> { pub fn new(source: &'s str) -> Self { Self { source, - position: TextIndex::ZERO, + position: 0usize, undo_position: None, errors: vec![], closing_delimiters: vec![], + last_text_index: TextIndex::ZERO, + leading_trivia_cache: None, } } @@ -76,29 +88,54 @@ impl<'s> ParserContext<'s> { &self.closing_delimiters } - pub fn position(&self) -> TextIndex { + pub fn text_index_at(&mut self, position: usize) -> TextIndex { + // This is a minor optimization: we remember the last computed TextIndex + // and if the requested position is after, we start from that last + // index and avoid advancing over the same characters again. Otherwise, + // we do start from the beginning. 
+ let mut text_index = if self.last_text_index.utf8 <= position { + self.last_text_index + } else { + TextIndex::ZERO + }; + let mut from_iter = self.source[text_index.utf8..].chars(); + let Some(mut c) = from_iter.next() else { + return text_index; + }; + let mut next_c = from_iter.next(); + loop { + if text_index.utf8 >= position { + break; + } + text_index.advance(c, next_c.as_ref()); + c = match next_c { + Some(ch) => ch, + None => break, + }; + next_c = from_iter.next(); + } + self.last_text_index = text_index; + text_index + } + + pub fn position(&self) -> usize { self.position } - pub fn set_position(&mut self, position: TextIndex) { + pub fn set_position(&mut self, position: usize) { self.position = position; } pub fn peek(&self) -> Option { - self.source[self.position.utf8..].chars().next() - } - - pub fn peek_pair(&self) -> Option<(char, Option)> { - let mut iter = self.source[self.position.utf8..].chars(); - iter.next().map(|c| (c, iter.next())) + self.source[self.position..].chars().next() } #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Option { self.undo_position = Some(self.position); - if let Some((c, n)) = self.peek_pair() { - self.position.advance(c, n.as_ref()); + if let Some(c) = self.peek() { + self.position += c.len_utf8(); Some(c) } else { None @@ -113,6 +150,28 @@ impl<'s> ParserContext<'s> { pub fn content(&self, range: Range) -> String { self.source[range].to_owned() } + + pub fn cached_leading_trivia_or( + &mut self, + f: impl FnOnce(&mut Self) -> ParserResult, + ) -> ParserResult { + let position = self.position(); + if let Some(cache) = &self.leading_trivia_cache { + if cache.start_position == position { + let result = cache.result.clone(); + self.set_position(cache.end_position); + return result; + } + } + + let result = f(self); + self.leading_trivia_cache = Some(CachedParserResult { + start_position: position, + end_position: self.position(), + result: result.clone(), + }); + result + } } pub(crate) struct 
DelimiterGuard<'a, 's> { diff --git a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/parser_function.rs b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/parser_function.rs index 16a63be252..2cb239078e 100644 --- a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/parser_function.rs +++ b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/parser_function.rs @@ -105,23 +105,27 @@ where // Mark the rest of the unconsumed stream as skipped and report an error // NOTE: IncompleteMatch internally consumes the stream when picked via choice, // so needs a separate check here. - if start.utf8 < input.len() || is_incomplete || is_recovering { + if start < input.len() || is_incomplete || is_recovering { let start = if is_recovering { - topmost_node.text_len + topmost_node.text_len.utf8 } else { start }; - let kind = if input[start.utf8..].is_empty() { + let kind = if input[start..].is_empty() { TerminalKind::MISSING } else { TerminalKind::UNRECOGNIZED }; - let skipped_node = Node::terminal(kind, input[start.utf8..].to_string()); + let skipped_node = Node::terminal(kind, input[start..].to_string()); let mut new_children = topmost_node.children.clone(); new_children.push(Edge::anonymous(skipped_node)); + let start_index = stream.text_index_at(start); let mut errors = stream.into_errors(); - errors.push(ParseError::new(start..input.into(), expected_terminals)); + errors.push(ParseError::new( + start_index..input.into(), + expected_terminals, + )); ParseOutput { parse_tree: Node::nonterminal(topmost_node.kind, new_children), diff --git a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/recovery.rs b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/recovery.rs index 185c3b622e..cec55a393c 100644 --- a/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/recovery.rs +++ b/crates/codegen/runtime/cargo/src/runtime/parser/parser_support/recovery.rs @@ -141,7 +141,9 @@ pub(crate) fn 
skip_until_with_nested_delims( // Don't consume the delimiter; parent will consume it input.set_position(save); - return Some((terminal, start..save)); + let start_index = input.text_index_at(start); + let save_index = input.text_index_at(save); + return Some((terminal, start_index..save_index)); } // Found the local closing delimiter, pop the stack Some(terminal) if local_delims.last() == Some(&terminal) => { diff --git a/crates/codegen/runtime/cargo/src/runtime/parser/scanner_macros/mod.rs b/crates/codegen/runtime/cargo/src/runtime/parser/scanner_macros/mod.rs index d0d2784976..0ed780e2b1 100644 --- a/crates/codegen/runtime/cargo/src/runtime/parser/scanner_macros/mod.rs +++ b/crates/codegen/runtime/cargo/src/runtime/parser/scanner_macros/mod.rs @@ -80,7 +80,7 @@ macro_rules! scan_keyword_choice { $( { if let result @ (KeywordScan::Present(..) | KeywordScan::Reserved(..)) = ($scanner) { - if $ident.len() == $stream.position().utf8 - save.utf8 { + if $ident.len() == $stream.position() - save { break result; } } diff --git a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/generated/parser.rs b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/generated/parser.rs index 4e7026296c..e63b80d559 100644 --- a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/generated/parser.rs +++ b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/generated/parser.rs @@ -9523,7 +9523,7 @@ impl Parser { impl Lexer for Parser { fn leading_trivia(&self, input: &mut ParserContext<'_>) -> ParserResult { - Parser::leading_trivia(self, input) + input.cached_leading_trivia_or(|input| Parser::leading_trivia(self, input)) } fn trailing_trivia(&self, input: &mut ParserContext<'_>) -> ParserResult { @@ -10959,7 +10959,7 @@ impl Lexer for Parser { input.set_position(save); // TODO(#1001): Don't allocate a string here - let ident_value = input.content(save.utf8..furthest_position.utf8); + let ident_value = 
input.content(save..furthest_position); for keyword_compound_scanner in [ Self::bytes_keyword, @@ -13508,7 +13508,7 @@ impl Lexer for Parser { input.set_position(save); // TODO(#1001): Don't allocate a string here - let ident_value = input.content(save.utf8..furthest_position.utf8); + let ident_value = input.content(save..furthest_position); for keyword_compound_scanner in [ Self::yul_bytes_keyword, diff --git a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/lexer/mod.rs b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/lexer/mod.rs index 4853e3029f..02bbe2b817 100644 --- a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/lexer/mod.rs +++ b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/lexer/mod.rs @@ -103,7 +103,7 @@ pub(crate) trait Lexer { ParserResult::r#match( vec![Edge::anonymous(Node::terminal( kind, - input.content(start.utf8..end.utf8), + input.content(start..end), ))], vec![], ) @@ -136,7 +136,7 @@ pub(crate) trait Lexer { let end = input.position(); children.push(Edge::anonymous(Node::terminal( kind, - input.content(start.utf8..end.utf8), + input.content(start..end), ))); let restore = input.position(); diff --git a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/choice_helper.rs b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/choice_helper.rs index 558226833a..925582d5f7 100644 --- a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/choice_helper.rs +++ b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/choice_helper.rs @@ -16,7 +16,7 @@ pub struct ChoiceHelper { start_position: Marker, // Because we backtrack after every non-final pick, we store the progress // and the emitted errors from the time of a best pick, so that we can return to it later. 
- last_progress: TextIndex, + last_progress: usize, recovered_errors: Vec, } diff --git a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/context.rs b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/context.rs index 9b52ba03f0..43b75d69d6 100644 --- a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/context.rs +++ b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/context.rs @@ -3,21 +3,31 @@ use std::mem; use std::ops::Range; +use super::ParserResult; use crate::cst::{TerminalKind, TextIndex}; use crate::parser::ParseError; +#[derive(Debug)] +struct CachedParserResult { + start_position: usize, + end_position: usize, + result: ParserResult, +} + #[derive(Debug)] pub struct ParserContext<'s> { source: &'s str, - position: TextIndex, - undo_position: Option, + position: usize, + undo_position: Option, errors: Vec, closing_delimiters: Vec, + last_text_index: TextIndex, + leading_trivia_cache: Option, } #[derive(Copy, Clone)] pub struct Marker { - position: TextIndex, + position: usize, err_len: usize, } @@ -25,10 +35,12 @@ impl<'s> ParserContext<'s> { pub fn new(source: &'s str) -> Self { Self { source, - position: TextIndex::ZERO, + position: 0usize, undo_position: None, errors: vec![], closing_delimiters: vec![], + last_text_index: TextIndex::ZERO, + leading_trivia_cache: None, } } @@ -78,29 +90,54 @@ impl<'s> ParserContext<'s> { &self.closing_delimiters } - pub fn position(&self) -> TextIndex { + pub fn text_index_at(&mut self, position: usize) -> TextIndex { + // This is a minor optimization: we remember the last computed TextIndex + // and if the requested position is after, we start from that last + // index and avoid advancing over the same characters again. Otherwise, + // we do start from the beginning. 
+ let mut text_index = if self.last_text_index.utf8 <= position { + self.last_text_index + } else { + TextIndex::ZERO + }; + let mut from_iter = self.source[text_index.utf8..].chars(); + let Some(mut c) = from_iter.next() else { + return text_index; + }; + let mut next_c = from_iter.next(); + loop { + if text_index.utf8 >= position { + break; + } + text_index.advance(c, next_c.as_ref()); + c = match next_c { + Some(ch) => ch, + None => break, + }; + next_c = from_iter.next(); + } + self.last_text_index = text_index; + text_index + } + + pub fn position(&self) -> usize { self.position } - pub fn set_position(&mut self, position: TextIndex) { + pub fn set_position(&mut self, position: usize) { self.position = position; } pub fn peek(&self) -> Option { - self.source[self.position.utf8..].chars().next() - } - - pub fn peek_pair(&self) -> Option<(char, Option)> { - let mut iter = self.source[self.position.utf8..].chars(); - iter.next().map(|c| (c, iter.next())) + self.source[self.position..].chars().next() } #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Option { self.undo_position = Some(self.position); - if let Some((c, n)) = self.peek_pair() { - self.position.advance(c, n.as_ref()); + if let Some(c) = self.peek() { + self.position += c.len_utf8(); Some(c) } else { None @@ -115,6 +152,28 @@ impl<'s> ParserContext<'s> { pub fn content(&self, range: Range) -> String { self.source[range].to_owned() } + + pub fn cached_leading_trivia_or( + &mut self, + f: impl FnOnce(&mut Self) -> ParserResult, + ) -> ParserResult { + let position = self.position(); + if let Some(cache) = &self.leading_trivia_cache { + if cache.start_position == position { + let result = cache.result.clone(); + self.set_position(cache.end_position); + return result; + } + } + + let result = f(self); + self.leading_trivia_cache = Some(CachedParserResult { + start_position: position, + end_position: self.position(), + result: result.clone(), + }); + result + } } pub(crate) struct 
DelimiterGuard<'a, 's> { diff --git a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/parser_function.rs b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/parser_function.rs index 89682aad92..57f5d3ed4b 100644 --- a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/parser_function.rs +++ b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/parser_function.rs @@ -107,23 +107,27 @@ where // Mark the rest of the unconsumed stream as skipped and report an error // NOTE: IncompleteMatch internally consumes the stream when picked via choice, // so needs a separate check here. - if start.utf8 < input.len() || is_incomplete || is_recovering { + if start < input.len() || is_incomplete || is_recovering { let start = if is_recovering { - topmost_node.text_len + topmost_node.text_len.utf8 } else { start }; - let kind = if input[start.utf8..].is_empty() { + let kind = if input[start..].is_empty() { TerminalKind::MISSING } else { TerminalKind::UNRECOGNIZED }; - let skipped_node = Node::terminal(kind, input[start.utf8..].to_string()); + let skipped_node = Node::terminal(kind, input[start..].to_string()); let mut new_children = topmost_node.children.clone(); new_children.push(Edge::anonymous(skipped_node)); + let start_index = stream.text_index_at(start); let mut errors = stream.into_errors(); - errors.push(ParseError::new(start..input.into(), expected_terminals)); + errors.push(ParseError::new( + start_index..input.into(), + expected_terminals, + )); ParseOutput { parse_tree: Node::nonterminal(topmost_node.kind, new_children), diff --git a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/recovery.rs b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/recovery.rs index 49641de07c..f54ae45934 100644 --- 
a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/recovery.rs +++ b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/parser_support/recovery.rs @@ -143,7 +143,9 @@ pub(crate) fn skip_until_with_nested_delims( // Don't consume the delimiter; parent will consume it input.set_position(save); - return Some((terminal, start..save)); + let start_index = input.text_index_at(start); + let save_index = input.text_index_at(save); + return Some((terminal, start_index..save_index)); } // Found the local closing delimiter, pop the stack Some(terminal) if local_delims.last() == Some(&terminal) => { diff --git a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/scanner_macros/mod.rs b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/scanner_macros/mod.rs index fa1f9dfb39..feebf437d2 100644 --- a/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/scanner_macros/mod.rs +++ b/crates/solidity/outputs/cargo/slang_solidity/src/generated/parser/scanner_macros/mod.rs @@ -82,7 +82,7 @@ macro_rules! scan_keyword_choice { $( { if let result @ (KeywordScan::Present(..) 
| KeywordScan::Reserved(..)) = ($scanner) { - if $ident.len() == $stream.position().utf8 - save.utf8 { + if $ident.len() == $stream.position() - save { break result; } } diff --git a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/generated/parser.rs b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/generated/parser.rs index bfbfd0add8..ab7fd20c62 100644 --- a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/generated/parser.rs +++ b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/generated/parser.rs @@ -687,7 +687,7 @@ impl Parser { impl Lexer for Parser { fn leading_trivia(&self, input: &mut ParserContext<'_>) -> ParserResult { - Parser::leading_trivia(self, input) + input.cached_leading_trivia_or(|input| Parser::leading_trivia(self, input)) } fn trailing_trivia(&self, input: &mut ParserContext<'_>) -> ParserResult { diff --git a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/lexer/mod.rs b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/lexer/mod.rs index 4853e3029f..02bbe2b817 100644 --- a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/lexer/mod.rs +++ b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/lexer/mod.rs @@ -103,7 +103,7 @@ pub(crate) trait Lexer { ParserResult::r#match( vec![Edge::anonymous(Node::terminal( kind, - input.content(start.utf8..end.utf8), + input.content(start..end), ))], vec![], ) @@ -136,7 +136,7 @@ pub(crate) trait Lexer { let end = input.position(); children.push(Edge::anonymous(Node::terminal( kind, - input.content(start.utf8..end.utf8), + input.content(start..end), ))); let restore = input.position(); diff --git a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/choice_helper.rs b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/choice_helper.rs index 558226833a..925582d5f7 100644 --- 
a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/choice_helper.rs +++ b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/choice_helper.rs @@ -16,7 +16,7 @@ pub struct ChoiceHelper { start_position: Marker, // Because we backtrack after every non-final pick, we store the progress // and the emitted errors from the time of a best pick, so that we can return to it later. - last_progress: TextIndex, + last_progress: usize, recovered_errors: Vec, } diff --git a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/context.rs b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/context.rs index 9b52ba03f0..43b75d69d6 100644 --- a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/context.rs +++ b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/context.rs @@ -3,21 +3,31 @@ use std::mem; use std::ops::Range; +use super::ParserResult; use crate::cst::{TerminalKind, TextIndex}; use crate::parser::ParseError; +#[derive(Debug)] +struct CachedParserResult { + start_position: usize, + end_position: usize, + result: ParserResult, +} + #[derive(Debug)] pub struct ParserContext<'s> { source: &'s str, - position: TextIndex, - undo_position: Option, + position: usize, + undo_position: Option, errors: Vec, closing_delimiters: Vec, + last_text_index: TextIndex, + leading_trivia_cache: Option, } #[derive(Copy, Clone)] pub struct Marker { - position: TextIndex, + position: usize, err_len: usize, } @@ -25,10 +35,12 @@ impl<'s> ParserContext<'s> { pub fn new(source: &'s str) -> Self { Self { source, - position: TextIndex::ZERO, + position: 0usize, undo_position: None, errors: vec![], closing_delimiters: vec![], + last_text_index: TextIndex::ZERO, + leading_trivia_cache: None, } } @@ -78,29 +90,54 @@ impl<'s> ParserContext<'s> { &self.closing_delimiters } - pub fn position(&self) -> TextIndex { + pub fn 
text_index_at(&mut self, position: usize) -> TextIndex { + // This is a minor optimization: we remember the last computed TextIndex + // and if the requested position is after, we start from that last + // index and avoid advancing over the same characters again. Otherwise, + // we do start from the beginning. + let mut text_index = if self.last_text_index.utf8 <= position { + self.last_text_index + } else { + TextIndex::ZERO + }; + let mut from_iter = self.source[text_index.utf8..].chars(); + let Some(mut c) = from_iter.next() else { + return text_index; + }; + let mut next_c = from_iter.next(); + loop { + if text_index.utf8 >= position { + break; + } + text_index.advance(c, next_c.as_ref()); + c = match next_c { + Some(ch) => ch, + None => break, + }; + next_c = from_iter.next(); + } + self.last_text_index = text_index; + text_index + } + + pub fn position(&self) -> usize { self.position } - pub fn set_position(&mut self, position: TextIndex) { + pub fn set_position(&mut self, position: usize) { self.position = position; } pub fn peek(&self) -> Option { - self.source[self.position.utf8..].chars().next() - } - - pub fn peek_pair(&self) -> Option<(char, Option)> { - let mut iter = self.source[self.position.utf8..].chars(); - iter.next().map(|c| (c, iter.next())) + self.source[self.position..].chars().next() } #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Option { self.undo_position = Some(self.position); - if let Some((c, n)) = self.peek_pair() { - self.position.advance(c, n.as_ref()); + if let Some(c) = self.peek() { + self.position += c.len_utf8(); Some(c) } else { None @@ -115,6 +152,28 @@ impl<'s> ParserContext<'s> { pub fn content(&self, range: Range) -> String { self.source[range].to_owned() } + + pub fn cached_leading_trivia_or( + &mut self, + f: impl FnOnce(&mut Self) -> ParserResult, + ) -> ParserResult { + let position = self.position(); + if let Some(cache) = &self.leading_trivia_cache { + if cache.start_position == position { + let 
result = cache.result.clone(); + self.set_position(cache.end_position); + return result; + } + } + + let result = f(self); + self.leading_trivia_cache = Some(CachedParserResult { + start_position: position, + end_position: self.position(), + result: result.clone(), + }); + result + } } pub(crate) struct DelimiterGuard<'a, 's> { diff --git a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/parser_function.rs b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/parser_function.rs index 89682aad92..57f5d3ed4b 100644 --- a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/parser_function.rs +++ b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/parser_function.rs @@ -107,23 +107,27 @@ where // Mark the rest of the unconsumed stream as skipped and report an error // NOTE: IncompleteMatch internally consumes the stream when picked via choice, // so needs a separate check here. 
- if start.utf8 < input.len() || is_incomplete || is_recovering { + if start < input.len() || is_incomplete || is_recovering { let start = if is_recovering { - topmost_node.text_len + topmost_node.text_len.utf8 } else { start }; - let kind = if input[start.utf8..].is_empty() { + let kind = if input[start..].is_empty() { TerminalKind::MISSING } else { TerminalKind::UNRECOGNIZED }; - let skipped_node = Node::terminal(kind, input[start.utf8..].to_string()); + let skipped_node = Node::terminal(kind, input[start..].to_string()); let mut new_children = topmost_node.children.clone(); new_children.push(Edge::anonymous(skipped_node)); + let start_index = stream.text_index_at(start); let mut errors = stream.into_errors(); - errors.push(ParseError::new(start..input.into(), expected_terminals)); + errors.push(ParseError::new( + start_index..input.into(), + expected_terminals, + )); ParseOutput { parse_tree: Node::nonterminal(topmost_node.kind, new_children), diff --git a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/recovery.rs b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/recovery.rs index 49641de07c..f54ae45934 100644 --- a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/recovery.rs +++ b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/parser_support/recovery.rs @@ -143,7 +143,9 @@ pub(crate) fn skip_until_with_nested_delims( // Don't consume the delimiter; parent will consume it input.set_position(save); - return Some((terminal, start..save)); + let start_index = input.text_index_at(start); + let save_index = input.text_index_at(save); + return Some((terminal, start_index..save_index)); } // Found the local closing delimiter, pop the stack Some(terminal) if local_delims.last() == Some(&terminal) => { diff --git a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/scanner_macros/mod.rs 
b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/scanner_macros/mod.rs index fa1f9dfb39..feebf437d2 100644 --- a/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/scanner_macros/mod.rs +++ b/crates/testlang/outputs/cargo/slang_testlang/src/generated/parser/scanner_macros/mod.rs @@ -82,7 +82,7 @@ macro_rules! scan_keyword_choice { $( { if let result @ (KeywordScan::Present(..) | KeywordScan::Reserved(..)) = ($scanner) { - if $ident.len() == $stream.position().utf8 - save.utf8 { + if $ident.len() == $stream.position() - save { break result; } }