Skip to content

Commit

Permalink
Parser performance optimizations (#1119)
Browse files Browse the repository at this point in the history
This PR contains two optimizations:

1. Replaces `TextIndex` as the position tracking structure for the
parser context and instead uses the byte offset stored in a `usize`.
Most of the time, the other fields computed in `TextIndex` (i.e., line,
column and UTF-16 offsets) are not used, and they are expensive to
compute. Instead, for the very few cases where the `TextIndex` is
required, it is computed on demand for the given byte offset. This
brings a 39% reduction in instructions/cycles for the parser benchmark.

```
iai::benchmarks::parser
  Instructions:          2828523499|4666216487      (-39.3829%) [-1.64970x]
  L1 Hits:               4140897324|6755778229      (-38.7058%) [-1.63148x]
  L2 Hits:                  3784868|3630727         (+4.24546%) [+1.04245x]
  RAM Hits:                   67508|67774           (-0.39248%) [-1.00394x]
  Total read+write:      4144749700|6759476730      (-38.6824%) [-1.63085x]
  Estimated Cycles:      4162184444|6776303954      (-38.5774%) [-1.62806x]
```

2. Caches the parsed leading trivia to avoid having to repeatedly
re-parse it when backtracking. This reduces by a further 72% the number
of instructions/cycles in the parser benchmark.

```
iai::benchmarks::parser
  Instructions:           772093852|2828523499      (-72.7033%) [-3.66345x]
  L1 Hits:               1131017179|4140897324      (-72.6867%) [-3.66122x]
  L2 Hits:                  2047519|3784868         (-45.9025%) [-1.84851x]
  RAM Hits:                   67411|67508           (-0.14369%) [-1.00144x]
  Total read+write:      1133132109|4144749700      (-72.6610%) [-3.65778x]
  Estimated Cycles:      1143614159|4162184444      (-72.5237%) [-3.63950x]
```

Overall, the estimated number of instructions went from 4,666,216,487 to
772,093,852, a reduction of 83% (6x smaller).
  • Loading branch information
ggiraldez authored Oct 2, 2024
1 parent be7bb79 commit 68746d7
Show file tree
Hide file tree
Showing 21 changed files with 273 additions and 78 deletions.
4 changes: 2 additions & 2 deletions crates/codegen/runtime/cargo/src/runtime/parser/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ pub(crate) trait Lexer {
ParserResult::r#match(
vec![Edge::anonymous(Node::terminal(
kind,
input.content(start.utf8..end.utf8),
input.content(start..end),
))],
vec![],
)
Expand Down Expand Up @@ -134,7 +134,7 @@ pub(crate) trait Lexer {
let end = input.position();
children.push(Edge::anonymous(Node::terminal(
kind,
input.content(start.utf8..end.utf8),
input.content(start..end),
)));

let restore = input.position();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ impl Lexer for Parser {
{%- if rendering_in_stubs -%}
unreachable!("Invoking leading_trivia in stubs: {input:#?}")
{%- else -%}
Parser::leading_trivia(self, input)
input.cached_leading_trivia_or(|input| Parser::leading_trivia(self, input))
{%- endif -%}
}

Expand Down Expand Up @@ -238,7 +238,7 @@ impl Lexer for Parser {
input.set_position(save);

// TODO(#1001): Don't allocate a string here
let ident_value = input.content(save.utf8..furthest_position.utf8);
let ident_value = input.content(save..furthest_position);

for keyword_compound_scanner in [
{%- for keyword_name, _ in context.keyword_compound_scanners %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pub struct ChoiceHelper {
start_position: Marker,
// Because we backtrack after every non-final pick, we store the progress
// and the emitted errors from the time of a best pick, so that we can return to it later.
last_progress: TextIndex,
last_progress: usize,
recovered_errors: Vec<ParseError>,
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,32 +1,44 @@
use std::mem;
use std::ops::Range;

use super::ParserResult;
use crate::cst::{TerminalKind, TextIndex};
use crate::parser::ParseError;

#[derive(Debug)]
struct CachedParserResult {
start_position: usize,
end_position: usize,
result: ParserResult,
}

#[derive(Debug)]
pub struct ParserContext<'s> {
source: &'s str,
position: TextIndex,
undo_position: Option<TextIndex>,
position: usize,
undo_position: Option<usize>,
errors: Vec<ParseError>,
closing_delimiters: Vec<TerminalKind>,
last_text_index: TextIndex,
leading_trivia_cache: Option<CachedParserResult>,
}

#[derive(Copy, Clone)]
pub struct Marker {
position: TextIndex,
position: usize,
err_len: usize,
}

impl<'s> ParserContext<'s> {
pub fn new(source: &'s str) -> Self {
Self {
source,
position: TextIndex::ZERO,
position: 0usize,
undo_position: None,
errors: vec![],
closing_delimiters: vec![],
last_text_index: TextIndex::ZERO,
leading_trivia_cache: None,
}
}

Expand Down Expand Up @@ -76,29 +88,54 @@ impl<'s> ParserContext<'s> {
&self.closing_delimiters
}

pub fn position(&self) -> TextIndex {
pub fn text_index_at(&mut self, position: usize) -> TextIndex {
// This is a minor optimization: we remember the last computed TextIndex
// and if the requested position is after, we start from that last
// index and avoid advancing over the same characters again. Otherwise,
// we do start from the beginning.
let mut text_index = if self.last_text_index.utf8 <= position {
self.last_text_index
} else {
TextIndex::ZERO
};
let mut from_iter = self.source[text_index.utf8..].chars();
let Some(mut c) = from_iter.next() else {
return text_index;
};
let mut next_c = from_iter.next();
loop {
if text_index.utf8 >= position {
break;
}
text_index.advance(c, next_c.as_ref());
c = match next_c {
Some(ch) => ch,
None => break,
};
next_c = from_iter.next();
}
self.last_text_index = text_index;
text_index
}

pub fn position(&self) -> usize {
self.position
}

pub fn set_position(&mut self, position: TextIndex) {
pub fn set_position(&mut self, position: usize) {
self.position = position;
}

pub fn peek(&self) -> Option<char> {
self.source[self.position.utf8..].chars().next()
}

pub fn peek_pair(&self) -> Option<(char, Option<char>)> {
let mut iter = self.source[self.position.utf8..].chars();
iter.next().map(|c| (c, iter.next()))
self.source[self.position..].chars().next()
}

#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> Option<char> {
self.undo_position = Some(self.position);

if let Some((c, n)) = self.peek_pair() {
self.position.advance(c, n.as_ref());
if let Some(c) = self.peek() {
self.position += c.len_utf8();
Some(c)
} else {
None
Expand All @@ -113,6 +150,28 @@ impl<'s> ParserContext<'s> {
pub fn content(&self, range: Range<usize>) -> String {
self.source[range].to_owned()
}

pub fn cached_leading_trivia_or(
&mut self,
f: impl FnOnce(&mut Self) -> ParserResult,
) -> ParserResult {
let position = self.position();
if let Some(cache) = &self.leading_trivia_cache {
if cache.start_position == position {
let result = cache.result.clone();
self.set_position(cache.end_position);
return result;
}
}

let result = f(self);
self.leading_trivia_cache = Some(CachedParserResult {
start_position: position,
end_position: self.position(),
result: result.clone(),
});
result
}
}

pub(crate) struct DelimiterGuard<'a, 's> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,23 +105,27 @@ where
// Mark the rest of the unconsumed stream as skipped and report an error
// NOTE: IncompleteMatch internally consumes the stream when picked via choice,
// so needs a separate check here.
if start.utf8 < input.len() || is_incomplete || is_recovering {
if start < input.len() || is_incomplete || is_recovering {
let start = if is_recovering {
topmost_node.text_len
topmost_node.text_len.utf8
} else {
start
};
let kind = if input[start.utf8..].is_empty() {
let kind = if input[start..].is_empty() {
TerminalKind::MISSING
} else {
TerminalKind::UNRECOGNIZED
};
let skipped_node = Node::terminal(kind, input[start.utf8..].to_string());
let skipped_node = Node::terminal(kind, input[start..].to_string());
let mut new_children = topmost_node.children.clone();
new_children.push(Edge::anonymous(skipped_node));

let start_index = stream.text_index_at(start);
let mut errors = stream.into_errors();
errors.push(ParseError::new(start..input.into(), expected_terminals));
errors.push(ParseError::new(
start_index..input.into(),
expected_terminals,
));

ParseOutput {
parse_tree: Node::nonterminal(topmost_node.kind, new_children),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ pub(crate) fn skip_until_with_nested_delims<L: Lexer, LexCtx: IsLexicalContext>(
// Don't consume the delimiter; parent will consume it
input.set_position(save);

return Some((terminal, start..save));
let start_index = input.text_index_at(start);
let save_index = input.text_index_at(save);
return Some((terminal, start_index..save_index));
}
// Found the local closing delimiter, pop the stack
Some(terminal) if local_delims.last() == Some(&terminal) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ macro_rules! scan_keyword_choice {
$(
{
if let result @ (KeywordScan::Present(..) | KeywordScan::Reserved(..)) = ($scanner) {
if $ident.len() == $stream.position().utf8 - save.utf8 {
if $ident.len() == $stream.position() - save {
break result;
}
}
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 68746d7

Please sign in to comment.