From 42e6c965316dceaea61832f8b82fd12e17e98243 Mon Sep 17 00:00:00 2001 From: Fredrik Vraalsen <22197+fredriv@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:28:03 +0200 Subject: [PATCH 1/3] Improve performance of docs parsing. Make extract_top_level_blocks() faster. Duplicate the change from https://github.com/dbt-labs/dbt-core/pull/9045 into dbt-common --- dbt_common/clients/_jinja_blocks.py | 45 +++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/dbt_common/clients/_jinja_blocks.py b/dbt_common/clients/_jinja_blocks.py index e4eeba85..798dcd64 100644 --- a/dbt_common/clients/_jinja_blocks.py +++ b/dbt_common/clients/_jinja_blocks.py @@ -1,6 +1,7 @@ +import dataclasses import re from collections import namedtuple -from typing import Iterator, List, Optional, Set, Union +from typing import Dict, Iterator, List, Optional, Set, Union from dbt_common.exceptions import ( BlockDefinitionNotAtTopError, @@ -104,11 +105,24 @@ def end_pat(self) -> re.Pattern: QUOTE_START_PATTERN = regex(r"""(?P<quote>(['"]))""") +@dataclasses.dataclass +class PositionedMatch: + """Used to accelerate TagIterator. Records the result of searching a string, starting + at start_pos and finding match (or None).""" + + start_pos: int + match: Optional[re.Match] + + class TagIterator: def __init__(self, text: str) -> None: self.text: str = text self.pos: int = 0 + # Performance enhancement: A cache of the most recent matches seen for each pattern. + # Includes the start position used for the search. + self._past_matches: Dict[re.Pattern, PositionedMatch] = {} + def linepos(self, end: Optional[int] = None) -> str: """Return relative position in line. 
@@ -129,8 +143,33 @@ def advance(self, new_position: int) -> None: def rewind(self, amount: int = 1) -> None: self.pos -= amount - def _search(self, pattern: re.Pattern) -> Optional[re.Match]: - return pattern.search(self.text, self.pos) + def _search(self, pattern) -> Optional[re.Match]: + + # Check to see if we have a cached search on this pattern. + positioned_match = self._past_matches.get(pattern) + + if positioned_match is None or positioned_match.start_pos > self.pos: + # We did not have a cached search, or we did, but it was done at a location + # further along in the string. Do a new search and cache it. + match = pattern.search(self.text, self.pos) + self._past_matches[pattern] = PositionedMatch(self.pos, match) + else: + # We have a cached search and its start position falls before (or at) the + # current search position... + if positioned_match.match is None: + # ...but there is no match in the rest of the 'data'. + match = None + elif positioned_match.match.start() >= self.pos: + # ...and there is a match we can reuse, because we have not yet passed + # the start position of the match. It's still the next match. + match = positioned_match.match + else: + # ...but we have passed the start of the cached match, and need to do a + # new search from our current position and cache it. 
+ match = pattern.search(self.text, self.pos) + self._past_matches[pattern] = PositionedMatch(self.pos, match) + + return match def _match(self, pattern: re.Pattern) -> Optional[re.Match]: return pattern.match(self.text, self.pos) From 9a3aa221b9dd406e2e1b156f7b3a5a104af34ff6 Mon Sep 17 00:00:00 2001 From: Fredrik Vraalsen <22197+fredriv@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:35:39 +0200 Subject: [PATCH 2/3] Add changelog entry --- .changes/unreleased/Under the Hood-20240910-153447.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changes/unreleased/Under the Hood-20240910-153447.yaml diff --git a/.changes/unreleased/Under the Hood-20240910-153447.yaml b/.changes/unreleased/Under the Hood-20240910-153447.yaml new file mode 100644 index 00000000..2539089f --- /dev/null +++ b/.changes/unreleased/Under the Hood-20240910-153447.yaml @@ -0,0 +1,6 @@ +kind: Under the Hood +body: Improve docs parsing performance. +time: 2024-09-10T15:34:47.389953+02:00 +custom: + Author: fredriv + Issue: "9037" From a45798caaed2d35b1728faa37fd4c372e5dfa650 Mon Sep 17 00:00:00 2001 From: Fredrik Vraalsen Date: Mon, 7 Oct 2024 23:08:40 +0200 Subject: [PATCH 3/3] Fix formatting --- dbt_common/clients/_jinja_blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dbt_common/clients/_jinja_blocks.py b/dbt_common/clients/_jinja_blocks.py index 798dcd64..24af370f 100644 --- a/dbt_common/clients/_jinja_blocks.py +++ b/dbt_common/clients/_jinja_blocks.py @@ -144,7 +144,6 @@ def rewind(self, amount: int = 1) -> None: self.pos -= amount def _search(self, pattern) -> Optional[re.Match]: - # Check to see if we have a cached search on this pattern. positioned_match = self._past_matches.get(pattern)