Skip to content

Commit

Permalink
boa-dev#3780: seems like incorrect way
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikita-str committed Sep 30, 2024
1 parent 628e31c commit 9ee7627
Show file tree
Hide file tree
Showing 16 changed files with 383 additions and 134 deletions.
26 changes: 26 additions & 0 deletions core/interner/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ extern crate alloc;
mod fixed_string;
mod interned_str;
mod raw;
mod source_text;
mod sym;

#[cfg(test)]
mod tests;

use alloc::{borrow::Cow, format, string::String};
use raw::RawInterner;
use source_text::SourceText;

pub use sym::*;

Expand Down Expand Up @@ -169,6 +171,8 @@ impl core::fmt::Display for JSInternedStrRef<'_, '_> {
pub struct Interner {
utf8_interner: RawInterner<u8>,
utf16_interner: RawInterner<u16>,

source_text: SourceText,
}

impl Interner {
Expand All @@ -179,13 +183,35 @@ impl Interner {
Self::default()
}

/// Returns a shared reference to the collected [`SourceText`].
#[inline]
pub fn source_text(&self) -> &SourceText {
    &self.source_text
}

/// Returns an exclusive reference to the collected [`SourceText`].
#[inline]
pub fn source_text_mut(&mut self) -> &mut SourceText {
    &mut self.source_text
}

/// Collects a single source code point into the source text buffer.
///
/// Convenience forwarder for [`SourceText::collect_code_point`].
#[inline]
pub fn collect_code_point(&mut self, cp: u32) {
    self.source_text_mut().collect_code_point(cp);
}

/// Removes the last collected source code point from the source text buffer.
///
/// Convenience forwarder for [`SourceText::remove_last_code_point`].
#[inline]
pub fn remove_last_code_point(&mut self) {
    self.source_text_mut().remove_last_code_point();
}

/// Creates a new [`Interner`] with the specified capacity.
///
/// The same capacity is used for the UTF-8 interner, the UTF-16 interner,
/// and the source text buffer.
#[inline]
#[must_use]
pub fn with_capacity(capacity: usize) -> Self {
    Self {
        utf8_interner: RawInterner::with_capacity(capacity),
        utf16_interner: RawInterner::with_capacity(capacity),
        source_text: SourceText::with_capacity(capacity),
    }
}

Expand Down
80 changes: 80 additions & 0 deletions core/interner/src/source_text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
use alloc::vec::Vec;

/// Source text of the script being lexed, collected as UTF-16 code units so
/// that the original source of a function can later be recovered (e.g. for
/// `Function.prototype.toString`-style use cases).
#[derive(Debug)]
pub struct SourceText {
    // UTF-16 code units collected so far.
    source_text: Vec<u16>,
    // Nesting depth of "callable parses" currently in progress; while it is
    // non-zero the collected text must be kept alive.
    callable_parse: u32,
}

impl SourceText {
    /// Creates a new, empty `SourceText` with at least the given capacity,
    /// measured in UTF-16 code units.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            source_text: Vec::with_capacity(capacity),
            callable_parse: 0,
        }
    }

    /// Returns `true` while source text should be collected.
    ///
    /// NOTE(review): the real check (`self.callable_parse != 0`) is disabled
    /// and collection is unconditionally on — confirm whether gating on the
    /// counter is still intended before relying on it.
    #[inline]
    fn is_callable_parse(&self) -> bool {
        // self.callable_parse != 0
        true
    }

    /// Marks the start of a parse that may need the collected source text.
    pub fn inc_callable_parse(&mut self) {
        self.callable_parse += 1;
    }

    /// Marks the end of a parse started with [`Self::inc_callable_parse`].
    ///
    /// # Panics
    ///
    /// Panics if called more times than [`Self::inc_callable_parse`]. The
    /// original guard was dead (it consulted the hard-coded
    /// `is_callable_parse`), so in release builds the counter silently
    /// wrapped to `u32::MAX`; `checked_sub` makes the underflow an explicit
    /// error in every build profile.
    pub fn dec_callable_parse(&mut self) {
        self.callable_parse = self
            .callable_parse
            .checked_sub(1)
            .expect("`dec_callable_parse` called without a matching `inc_callable_parse`");
        if !self.is_callable_parse() {
            // Dead while `is_callable_parse` is hard-coded to `true`; kept so
            // the intended "free the buffer once no callable parse is active"
            // behavior returns when the check above is re-enabled.
            self.source_text.clear();
        }
    }

    /// Returns the current end position of the collected text, in UTF-16
    /// code units. Pair with [`Self::get_source_text_from_pos`].
    pub fn get_source_text_pos(&self) -> usize {
        self.source_text.len()
    }

    /// Returns the code units collected since `pos`, a value previously
    /// obtained from [`Self::get_source_text_pos`].
    ///
    /// # Panics
    ///
    /// Panics if `pos` is greater than the current length.
    pub fn get_source_text_from_pos(&self, pos: usize) -> &[u16] {
        &self.source_text[pos..]
    }

    /// Removes the most recently collected code point.
    ///
    /// Fix: removes a full code *point*, not a single code unit — if the last
    /// collected character was outside the BMP (stored as a surrogate pair),
    /// both units are removed instead of leaving a lone lead surrogate.
    #[inline]
    pub fn remove_last_code_point(&mut self) {
        if let Some(cu) = self.source_text.pop() {
            // If we just popped a trailing (low) surrogate and a leading
            // (high) surrogate precedes it, pop that one too.
            let followed_lead = self
                .source_text
                .last()
                .map_or(false, |&lead| (0xD800..=0xDBFF).contains(&lead));
            if (0xDC00..=0xDFFF).contains(&cu) && followed_lead {
                self.source_text.pop();
            }
        }
    }

    /// Appends the Unicode code point `cp` to the collected text, encoded as
    /// UTF-16.
    ///
    /// # Panics
    ///
    /// Panics if `cp` is above `U+10FFFF` (not a valid code point).
    #[inline]
    pub fn collect_code_point(&mut self, cp: u32) {
        if !self.is_callable_parse() {
            return;
        }
        // BMP code points (including lone surrogates) fit in a single unit.
        if let Ok(cu) = u16::try_from(cp) {
            self.push(cu);
            return;
        }
        // Supplementary-plane code points become a surrogate pair.
        let cp = cp - 0x10000;
        let lead = u16::try_from(cp / 0x400 + 0xD800).expect("Invalid code point");
        let trail = u16::try_from(cp % 0x400 + 0xDC00).expect("Invalid code point");
        self.push(lead);
        self.push(trail);
    }

    /// Pushes a single UTF-16 code unit onto the buffer.
    #[inline]
    fn push(&mut self, cu: u16) {
        self.source_text.push(cu);
    }
}

/// Default backing-buffer capacity, in UTF-16 code units.
const DEFAULT_CAPACITY: usize = 4 * 1024;

impl Default for SourceText {
    fn default() -> Self {
        Self::with_capacity(DEFAULT_CAPACITY)
    }
}
13 changes: 8 additions & 5 deletions core/parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ impl<R> Tokenizer<R> for SingleLineComment {
&mut self,
cursor: &mut Cursor<R>,
start_pos: Position,
_interner: &mut Interner,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: ReadChar,
Expand All @@ -37,7 +37,7 @@ impl<R> Tokenizer<R> for SingleLineComment {
Ok(c) if c == '\r' || c == '\n' || c == '\u{2028}' || c == '\u{2029}' => break,
_ => {}
};
cursor.next_char().expect("Comment character vanished");
cursor.next_char_collect(interner).expect("Comment character vanished");
}
Ok(Token::new(
TokenKind::Comment,
Expand All @@ -63,18 +63,19 @@ impl<R> Tokenizer<R> for MultiLineComment {
&mut self,
cursor: &mut Cursor<R>,
start_pos: Position,
_interner: &mut Interner,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: ReadChar,
{
let _timer = Profiler::global().start_event("MultiLineComment", "Lexing");

let mut new_line = false;
while let Some(ch) = cursor.next_char()? {
while let Some(ch) = cursor.next_char_collect(interner)? {
let tried_ch = char::try_from(ch);
match tried_ch {
Ok(c) if c == '*' && cursor.next_if(0x2F /* / */)? => {
interner.collect_code_point(0x2F);
return Ok(Token::new(
if new_line {
TokenKind::LineTerminator
Expand Down Expand Up @@ -103,11 +104,13 @@ impl<R> Tokenizer<R> for MultiLineComment {
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar
/// [spec]: https://tc39.es/ecma262/#sec-hashbang

pub(super) struct HashbangComment;

impl<R> Tokenizer<R> for HashbangComment {
/// No source code char collection is needed because this token is only valid
/// at the start of the script, and therefore no function declaration can precede it.
fn lex(
&mut self,
cursor: &mut Cursor<R>,
Expand Down
19 changes: 19 additions & 0 deletions core/parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use crate::source::{ReadChar, UTF8Input};
use boa_ast::Position;
use boa_interner::Interner;
use boa_profiler::Profiler;
use std::io::{self, Error, ErrorKind};

Expand Down Expand Up @@ -172,6 +173,24 @@ impl<R: ReadChar> Cursor<R> {
}
}

/// Like `next_char`, but additionally records the consumed character into the
/// interner's source text buffer.
///
/// A `'\r'` is recorded as `'\n'`, so the collected text carries normalized
/// line endings.
///
/// NOTE(review): the `#[cfg(windows)]` gate makes the collected text depend on
/// the *build* platform (the `'\r'` is kept only in Windows builds), not on the
/// input's actual line endings — confirm this is intended.
#[inline]
pub(crate) fn next_char_collect(&mut self, interner: &mut Interner) -> Result<Option<u32>, Error>
{
    let _timer = Profiler::global().start_event("cursor::next_char_collect", "Lexing");

    let ch = self.next_char()?;
    if let Some(ch) = ch {
        if ch == '\r' as u32 {
            // On Windows builds keep the carriage return so the source
            // round-trips as CRLF; elsewhere it is dropped.
            #[cfg(windows)]
            interner.collect_code_point('\r' as u32);
            interner.collect_code_point('\n' as u32);
        } else {
            interner.collect_code_point(ch);
        }
    }
    Ok(ch)
}

/// Retrieves the next UTF-8 character.
pub(crate) fn next_char(&mut self) -> Result<Option<u32>, Error> {
let _timer = Profiler::global().start_event("cursor::next_char()", "Lexing");
Expand Down
6 changes: 5 additions & 1 deletion core/parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ impl<R> Tokenizer<R> for Identifier {
let _timer = Profiler::global().start_event("Identifier", "Lexing");

let (identifier_name, contains_escaped_chars) =
Self::take_identifier_name(cursor, start_pos, self.init)?;
Self::take_identifier_name(cursor, interner, start_pos, self.init)?;

let token_kind = match identifier_name.parse() {
Ok(Keyword::True) => {
Expand All @@ -91,6 +91,7 @@ impl<R> Tokenizer<R> for Identifier {
impl Identifier {
pub(super) fn take_identifier_name<R>(
cursor: &mut Cursor<R>,
interner: &mut Interner,
start_pos: Position,
init: char,
) -> Result<(String, bool), Error>
Expand All @@ -102,6 +103,8 @@ impl Identifier {
let mut contains_escaped_chars = false;
let mut identifier_name = if init == '\\' && cursor.next_if(0x75 /* u */)? {
let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?;
interner.remove_last_code_point();
interner.collect_code_point(ch);

if Self::is_identifier_start(ch) {
contains_escaped_chars = true;
Expand Down Expand Up @@ -140,6 +143,7 @@ impl Identifier {
};

identifier_name.push(char::try_from(ch).expect("checked character value"));
interner.collect_code_point(ch);
}

Ok((identifier_name, contains_escaped_chars))
Expand Down
22 changes: 11 additions & 11 deletions core/parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ impl<R> Lexer<R> {
match c {
// /
0x002F => {
self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/'
self.cursor.next_char_collect(interner)?.expect("/ token vanished"); // Consume the '/'
SingleLineComment.lex(&mut self.cursor, start, interner)
}
// *
0x002A => {
self.cursor.next_char()?.expect("* token vanished"); // Consume the '*'
self.cursor.next_char_collect(interner)?.expect("* token vanished"); // Consume the '*'
MultiLineComment.lex(&mut self.cursor, start, interner)
}
ch => {
Expand Down Expand Up @@ -186,14 +186,14 @@ impl<R> Lexer<R> {
}

while self.cursor.peek_char()?.map_or(false, is_whitespace) {
let _next = self.cursor.next_char();
let _next = self.cursor.next_char_collect(interner);
}

// -->
if self.cursor.peek_n(3)?[..3] == [Some(0x2D), Some(0x2D), Some(0x3E)] {
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);

let start = self.cursor.pos();
SingleLineComment.lex(&mut self.cursor, start, interner)?;
Expand All @@ -215,7 +215,7 @@ impl<R> Lexer<R> {
let _timer = Profiler::global().start_event("next()", "Lexing");

let mut start = self.cursor.pos();
let Some(mut next_ch) = self.cursor.next_char()? else {
let Some(mut next_ch) = self.cursor.next_char_collect(interner)? else {
return Ok(None);
};

Expand All @@ -233,7 +233,7 @@ impl<R> Lexer<R> {
if is_whitespace(next_ch) {
loop {
start = self.cursor.pos();
let Some(next) = self.cursor.next_char()? else {
let Some(next) = self.cursor.next_char_collect(interner)? else {
return Ok(None);
};
if !is_whitespace(next) {
Expand Down Expand Up @@ -306,9 +306,9 @@ impl<R> Lexer<R> {
'<' if !self.module()
&& self.cursor.peek_n(3)?[..3] == [Some(0x21), Some(0x2D), Some(0x2D)] =>
{
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char();
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);
let _next = self.cursor.next_char_collect(interner);
let start = self.cursor.pos();
SingleLineComment.lex(&mut self.cursor, start, interner)
}
Expand Down
Loading

0 comments on commit 9ee7627

Please sign in to comment.