Skip to content

Commit

Permalink
boa-dev#3780: seems like incorrect way
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikita-str committed Sep 30, 2024
1 parent 628e31c commit 9ee7627
Show file tree
Hide file tree
Showing 16 changed files with 383 additions and 134 deletions.
26 changes: 26 additions & 0 deletions core/interner/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ extern crate alloc;
mod fixed_string;
mod interned_str;
mod raw;
mod source_text;
mod sym;

#[cfg(test)]
mod tests;

use alloc::{borrow::Cow, format, string::String};
use raw::RawInterner;
use source_text::SourceText;

pub use sym::*;

Expand Down Expand Up @@ -169,6 +171,8 @@ impl core::fmt::Display for JSInternedStrRef<'_, '_> {
pub struct Interner {
utf8_interner: RawInterner<u8>,
utf16_interner: RawInterner<u16>,

source_text: SourceText,
}

impl Interner {
Expand All @@ -179,13 +183,35 @@ impl Interner {
Self::default()
}

/// Returns a shared reference to the collected [`SourceText`].
#[inline]
pub fn source_text(&self) -> &SourceText {
    &self.source_text
}

/// Returns an exclusive reference to the collected [`SourceText`].
#[inline]
pub fn source_text_mut(&mut self) -> &mut SourceText {
    &mut self.source_text
}

/// Collects a single source code point into the source text buffer.
///
/// Convenience forwarder for [`SourceText::collect_code_point`].
#[inline]
pub fn collect_code_point(&mut self, cp: u32) {
    self.source_text_mut().collect_code_point(cp);
}

/// Removes the last collected source code point from the source text buffer.
///
/// Convenience forwarder for [`SourceText::remove_last_code_point`].
#[inline]
pub fn remove_last_code_point(&mut self) {
    self.source_text_mut().remove_last_code_point();
}

/// Creates a new [`Interner`] with the specified capacity.
///
/// The same capacity is used for the UTF-8 interner, the UTF-16 interner,
/// and the source text buffer.
#[inline]
#[must_use]
pub fn with_capacity(capacity: usize) -> Self {
    Self {
        utf8_interner: RawInterner::with_capacity(capacity),
        utf16_interner: RawInterner::with_capacity(capacity),
        source_text: SourceText::with_capacity(capacity),
    }
}

Expand Down
80 changes: 80 additions & 0 deletions core/interner/src/source_text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
use alloc::vec::Vec;

/// Source text of the script being lexed, collected as UTF-16 code units so
/// that the original source of a function can later be recovered (e.g. for
/// `Function.prototype.toString`-style use cases).
#[derive(Debug)]
pub struct SourceText {
    // UTF-16 code units collected so far.
    source_text: Vec<u16>,
    // Nesting depth of "callable parses" currently in progress; while it is
    // non-zero the collected text must be kept alive.
    callable_parse: u32,
}

impl SourceText {
    /// Creates a new, empty `SourceText` with at least the given capacity,
    /// measured in UTF-16 code units.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            source_text: Vec::with_capacity(capacity),
            callable_parse: 0,
        }
    }

    /// Returns `true` while source text should be collected.
    ///
    /// NOTE(review): the real check (`self.callable_parse != 0`) is disabled
    /// and collection is unconditionally on — confirm whether gating on the
    /// counter is still intended before relying on it.
    #[inline]
    fn is_callable_parse(&self) -> bool {
        // self.callable_parse != 0
        true
    }

    /// Marks the start of a parse that may need the collected source text.
    pub fn inc_callable_parse(&mut self) {
        self.callable_parse += 1;
    }

    /// Marks the end of a parse started with [`Self::inc_callable_parse`].
    ///
    /// # Panics
    ///
    /// Panics if called more times than [`Self::inc_callable_parse`]. The
    /// original guard was dead (it consulted the hard-coded
    /// `is_callable_parse`), so in release builds the counter silently
    /// wrapped to `u32::MAX`; `checked_sub` makes the underflow an explicit
    /// error in every build profile.
    pub fn dec_callable_parse(&mut self) {
        self.callable_parse = self
            .callable_parse
            .checked_sub(1)
            .expect("`dec_callable_parse` called without a matching `inc_callable_parse`");
        if !self.is_callable_parse() {
            // Dead while `is_callable_parse` is hard-coded to `true`; kept so
            // the intended "free the buffer once no callable parse is active"
            // behavior returns when the check above is re-enabled.
            self.source_text.clear();
        }
    }

    /// Returns the current end position of the collected text, in UTF-16
    /// code units. Pair with [`Self::get_source_text_from_pos`].
    pub fn get_source_text_pos(&self) -> usize {
        self.source_text.len()
    }

    /// Returns the code units collected since `pos`, a value previously
    /// obtained from [`Self::get_source_text_pos`].
    ///
    /// # Panics
    ///
    /// Panics if `pos` is greater than the current length.
    pub fn get_source_text_from_pos(&self, pos: usize) -> &[u16] {
        &self.source_text[pos..]
    }

    /// Removes the most recently collected code point.
    ///
    /// Fix: removes a full code *point*, not a single code unit — if the last
    /// collected character was outside the BMP (stored as a surrogate pair),
    /// both units are removed instead of leaving a lone lead surrogate.
    #[inline]
    pub fn remove_last_code_point(&mut self) {
        if let Some(cu) = self.source_text.pop() {
            // If we just popped a trailing (low) surrogate and a leading
            // (high) surrogate precedes it, pop that one too.
            let followed_lead = self
                .source_text
                .last()
                .map_or(false, |&lead| (0xD800..=0xDBFF).contains(&lead));
            if (0xDC00..=0xDFFF).contains(&cu) && followed_lead {
                self.source_text.pop();
            }
        }
    }

    /// Appends the Unicode code point `cp` to the collected text, encoded as
    /// UTF-16.
    ///
    /// # Panics
    ///
    /// Panics if `cp` is above `U+10FFFF` (not a valid code point).
    #[inline]
    pub fn collect_code_point(&mut self, cp: u32) {
        if !self.is_callable_parse() {
            return;
        }
        // BMP code points (including lone surrogates) fit in a single unit.
        if let Ok(cu) = u16::try_from(cp) {
            self.push(cu);
            return;
        }
        // Supplementary-plane code points become a surrogate pair.
        let cp = cp - 0x10000;
        let lead = u16::try_from(cp / 0x400 + 0xD800).expect("Invalid code point");
        let trail = u16::try_from(cp % 0x400 + 0xDC00).expect("Invalid code point");
        self.push(lead);
        self.push(trail);
    }

    /// Pushes a single UTF-16 code unit onto the buffer.
    #[inline]
    fn push(&mut self, cu: u16) {
        self.source_text.push(cu);
    }
}

/// Default backing-buffer capacity, in UTF-16 code units.
const DEFAULT_CAPACITY: usize = 4 * 1024;

impl Default for SourceText {
    fn default() -> Self {
        Self::with_capacity(DEFAULT_CAPACITY)
    }
}
13 changes: 8 additions & 5 deletions core/parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ impl<R> Tokenizer<R> for SingleLineComment {
&mut self,
cursor: &mut Cursor<R>,
start_pos: Position,
_interner: &mut Interner,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: ReadChar,
Expand All @@ -37,7 +37,7 @@ impl<R> Tokenizer<R> for SingleLineComment {
Ok(c) if c == '\r' || c == '\n' || c == '\u{2028}' || c == '\u{2029}' => break,
_ => {}
};
cursor.next_char().expect("Comment character vanished");
cursor.next_char_collect(interner).expect("Comment character vanished");
}
Ok(Token::new(
TokenKind::Comment,
Expand All @@ -63,18 +63,19 @@ impl<R> Tokenizer<R> for MultiLineComment {
&mut self,
cursor: &mut Cursor<R>,
start_pos: Position,
_interner: &mut Interner,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: ReadChar,
{
let _timer = Profiler::global().start_event("MultiLineComment", "Lexing");

let mut new_line = false;
while let Some(ch) = cursor.next_char()? {
while let Some(ch) = cursor.next_char_collect(interner)? {
let tried_ch = char::try_from(ch);
match tried_ch {
Ok(c) if c == '*' && cursor.next_if(0x2F /* / */)? => {
interner.collect_code_point(0x2F);
return Ok(Token::new(
if new_line {
TokenKind::LineTerminator
Expand Down Expand Up @@ -103,11 +104,13 @@ impl<R> Tokenizer<R> for MultiLineComment {
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar
/// [spec]: https://tc39.es/ecma262/#sec-hashbang

pub(super) struct HashbangComment;

impl<R> Tokenizer<R> for HashbangComment {
/// No source code char collection is needed because this token is only valid
/// at the start of the script, and therefore no function declaration can precede it.
fn lex(
&mut self,
cursor: &mut Cursor<R>,
Expand Down
19 changes: 19 additions & 0 deletions core/parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use crate::source::{ReadChar, UTF8Input};
use boa_ast::Position;
use boa_interner::Interner;
use boa_profiler::Profiler;
use std::io::{self, Error, ErrorKind};

Expand Down Expand Up @@ -172,6 +173,24 @@ impl<R: ReadChar> Cursor<R> {
}
}

/// Like `next_char`, but additionally records the consumed character into the
/// interner's source text buffer.
///
/// A `'\r'` is recorded as `'\n'`, so the collected text carries normalized
/// line endings.
///
/// NOTE(review): the `#[cfg(windows)]` gate makes the collected text depend on
/// the *build* platform (the `'\r'` is kept only in Windows builds), not on the
/// input's actual line endings — confirm this is intended.
#[inline]
pub(crate) fn next_char_collect(&mut self, interner: &mut Interner) -> Result<Option<u32>, Error>
{
    let _timer = Profiler::global().start_event("cursor::next_char_collect", "Lexing");

    let ch = self.next_char()?;
    if let Some(ch) = ch {
        if ch == '\r' as u32 {
            // On Windows builds keep the carriage return so the source
            // round-trips as CRLF; elsewhere it is dropped.
            #[cfg(windows)]
            interner.collect_code_point('\r' as u32);
            interner.collect_code_point('\n' as u32);
        } else {
            interner.collect_code_point(ch);
        }
    }
    Ok(ch)
}

/// Retrieves the next UTF-8 character.
pub(crate) fn next_char(&mut self) -> Result<Option<u32>, Error> {
let _timer = Profiler::global().start_event("cursor::next_char()", "Lexing");
Expand Down
6 changes: 5 additions & 1 deletion core/parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ impl<R> Tokenizer<R> for Identifier {
let _timer = Profiler::global().start_event("Identifier", "Lexing");

let (identifier_name, contains_escaped_chars) =
Self::take_identifier_name(cursor, start_pos, self.init)?;
Self::take_identifier_name(cursor, interner, start_pos, self.init)?;

let token_kind = match identifier_name.parse() {
Ok(Keyword::True) => {
Expand All @@ -91,6 +91,7 @@ impl<R> Tokenizer<R> for Identifier {
impl Identifier {
pub(super) fn take_identifier_name<R>(
cursor: &mut Cursor<R>,
interner: &mut Interner,
start_pos: Position,
init: char,
) -> Result<(String, bool), Error>
Expand All @@ -102,6 +103,8 @@ impl Identifier {
let mut contains_escaped_chars = false;
let mut identifier_name = if init == '\\' && cursor.next_if(0x75 /* u */)? {
let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?;
interner.remove_last_code_point();
interner.collect_code_point(ch);

if Self::is_identifier_start(ch) {
contains_escaped_chars = true;
Expand Down Expand Up @@ -140,6 +143,7 @@ impl Identifier {
};

identifier_name.push(char::try_from(ch).expect("checked character value"));
interner.collect_code_point(ch);
}

Ok((identifier_name, contains_escaped_chars))
Expand Down
22 changes: 11 additions & 11 deletions core/parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ impl<R> Lexer<R> {
match c {
// /
0x002F => {
self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/'
self.cursor.next_char_collect(interner)?.expect("/ token vanished"); // Consume the '/'
SingleLineComment.lex(&mut self.cursor, start, interner)
}
// *
0x002A => {
self.cursor.next_char()?.expect("* token vanished"); // Consume the '*'
self.cursor.next_char_collect(interner)?.expect("* token vanished"); // Consume the '*'
MultiLineComment.lex(&mut self.cursor, start, interner)
}
ch => {
Expand Down Expand Up @@ -186,14 +186,14 @@ impl<R> Lexer<R> {
}

while self.cursor.peek_char()?.map_or(false, is_whitespace) {
let _next = self.cursor.next_char();
let _next = self.cursor.next_char_collect(interner);
}

// -->
if self.cursor.peek_n(3)?[..3] == [Some(0x2D), Some(0x2D), Some(0x3E)] {
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);

let start = self.cursor.pos();
SingleLineComment.lex(&mut self.cursor, start, interner)?;
Expand All @@ -215,7 +215,7 @@ impl<R> Lexer<R> {
let _timer = Profiler::global().start_event("next()", "Lexing");

let mut start = self.cursor.pos();
let Some(mut next_ch) = self.cursor.next_char()? else {
let Some(mut next_ch) = self.cursor.next_char_collect(interner)? else {
return Ok(None);
};

Expand All @@ -233,7 +233,7 @@ impl<R> Lexer<R> {
if is_whitespace(next_ch) {
loop {
start = self.cursor.pos();
let Some(next) = self.cursor.next_char()? else {
let Some(next) = self.cursor.next_char_collect(interner)? else {
return Ok(None);
};
if !is_whitespace(next) {
Expand Down Expand Up @@ -306,9 +306,9 @@ impl<R> Lexer<R> {
'<' if !self.module()
&& self.cursor.peek_n(3)?[..3] == [Some(0x21), Some(0x2D), Some(0x2D)] =>
{
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);
let start = self.cursor.pos();
SingleLineComment.lex(&mut self.cursor, start, interner)
}
Expand Down
Loading

0 comments on commit 9ee7627

Please sign in to comment.