diff --git a/crates/dash_decompiler/src/decompiler.rs b/crates/dash_decompiler/src/decompiler.rs index 512d1538..83695494 100644 --- a/crates/dash_decompiler/src/decompiler.rs +++ b/crates/dash_decompiler/src/decompiler.rs @@ -443,7 +443,7 @@ impl fmt::Display for DisplayConstant<'_> { Constant::Function(fun) => write!(f, "", fun.name.as_deref().unwrap_or("")), Constant::Null => f.write_str("null"), Constant::Undefined => f.write_str("undefined"), - Constant::Regex(_, source) => write!(f, "{source}"), + Constant::Regex(_, _, source) => write!(f, "{source}"), } } } diff --git a/crates/dash_lexer/Cargo.toml b/crates/dash_lexer/Cargo.toml index 44fea8e1..f011c10e 100644 --- a/crates/dash_lexer/Cargo.toml +++ b/crates/dash_lexer/Cargo.toml @@ -8,3 +8,4 @@ edition = "2021" [dependencies] either = "1.6.1" dash_middle = { path = "../dash_middle" } +dash_regex = { path = "../dash_regex" } diff --git a/crates/dash_lexer/src/lib.rs b/crates/dash_lexer/src/lib.rs index eb315a6e..84fe7e3a 100644 --- a/crates/dash_lexer/src/lib.rs +++ b/crates/dash_lexer/src/lib.rs @@ -6,6 +6,7 @@ use dash_middle::lexer::token::{as_token, Token, TokenType}; use dash_middle::parser::error::Error; use dash_middle::sourcemap::Span; use dash_middle::util; +use dash_regex::flags::Flags; /// A JavaScript source code lexer #[derive(Debug)] @@ -398,8 +399,9 @@ impl<'a, 'interner> Lexer<'a, 'interner> { self.create_contextified_token(TokenType::TemplateLiteral(sym)); // TODO: check if the spans created by this call are right!! } - /// Reads an identifier and returns it as a node - fn read_identifier(&mut self) { + /// Assumes one character has already been read. + fn read_identifier_raw(&mut self) -> &'a str { + let start = self.idx - 1; while !self.is_eof() { let cur = self.current_real(); @@ -410,7 +412,13 @@ impl<'a, 'interner> Lexer<'a, 'interner> { self.advance(); } - let sym = self.interner.intern(self.get_lexeme()); + self.subslice(start..self.idx) + } + + /// Reads an identifier and returns it as a node + fn read_identifier(&mut self) { + let ident = self.read_identifier_raw(); + let sym = self.interner.intern(ident); self.create_contextified_token(as_token(sym)); } @@ -428,8 +436,19 @@ impl<'a, 'interner> Lexer<'a, 'interner> { } } - let sym = self.interner.intern(self.get_lexeme()); - self.create_contextified_token(TokenType::RegexLiteral(sym)); + let regex_sym = self.interner.intern(self.get_lexeme()); + + let flags = if self.current().is_some_and(util::is_alpha) { + self.advance(); // identifier reading requires one character to be read + self.read_identifier_raw().parse::().unwrap() // TODO: handle error + } else { + Flags::empty() + }; + + self.create_contextified_token(TokenType::RegexLiteral { + literal: regex_sym, + flags, + }); } /// Iterates through the input string and yields the next node diff --git a/crates/dash_middle/src/compiler/constant.rs b/crates/dash_middle/src/compiler/constant.rs index 63d73f02..615b53e6 100755 --- a/crates/dash_middle/src/compiler/constant.rs +++ b/crates/dash_middle/src/compiler/constant.rs @@ -104,7 +104,7 @@ pub enum Constant { Identifier(Rc), Boolean(bool), Function(Rc), - Regex(dash_regex::ParsedRegex, Rc), + Regex(dash_regex::ParsedRegex, dash_regex::Flags, Rc), Null, Undefined, } @@ -146,7 +146,9 @@ impl Constant { LiteralExpr::Boolean(b) => Self::Boolean(*b), LiteralExpr::Null => Self::Null, LiteralExpr::Undefined => Self::Undefined, - LiteralExpr::Regex(regex, source) => Self::Regex(regex.clone(), interner.resolve(*source).clone()), + LiteralExpr::Regex(regex, flags, source) => { + Self::Regex(regex.clone(), *flags, interner.resolve(*source).clone()) + } } } } diff --git a/crates/dash_middle/src/lexer/token.rs b/crates/dash_middle/src/lexer/token.rs index cf6dbea0..cffc2997 100644 --- a/crates/dash_middle/src/lexer/token.rs +++ b/crates/dash_middle/src/lexer/token.rs @@ -2,6 +2,7 @@ use std::fmt; use crate::interner::{sym, Symbol}; use crate::sourcemap::Span; +use dash_regex::flags::Flags; use derive_more::Display; /// The type of a token @@ -203,7 +204,7 @@ pub enum TokenType { /// Regex literal: /a+b/g #[display(fmt = "")] - RegexLiteral(Symbol), + RegexLiteral { literal: Symbol, flags: Flags }, #[display(fmt = "0x")] NumberHex(Symbol), diff --git a/crates/dash_middle/src/parser/expr.rs b/crates/dash_middle/src/parser/expr.rs index 27c4c422..c1568b55 100644 --- a/crates/dash_middle/src/parser/expr.rs +++ b/crates/dash_middle/src/parser/expr.rs @@ -159,8 +159,8 @@ impl ExprKind { Self::Literal(LiteralExpr::Undefined) } - pub fn regex_literal(regex: dash_regex::ParsedRegex, source: Symbol) -> Self { - Self::Literal(LiteralExpr::Regex(regex, source)) + pub fn regex_literal(regex: dash_regex::ParsedRegex, flags: dash_regex::Flags, source: Symbol) -> Self { + Self::Literal(LiteralExpr::Regex(regex, flags, source)) } /// Creates a function call expression @@ -498,8 +498,8 @@ pub enum LiteralExpr { #[display(fmt = "\"{_0}\"")] String(Symbol), - #[display(fmt = "/{_1}/")] - Regex(dash_regex::ParsedRegex, Symbol), + #[display(fmt = "/{_2}/")] + Regex(dash_regex::ParsedRegex, dash_regex::Flags, Symbol), #[display(fmt = "null")] Null, diff --git a/crates/dash_parser/src/expr.rs b/crates/dash_parser/src/expr.rs index b195c380..db7abccc 100644 --- a/crates/dash_parser/src/expr.rs +++ b/crates/dash_parser/src/expr.rs @@ -688,9 +688,9 @@ impl<'a, 'interner> Parser<'a, 'interner> { span, kind: ExprKind::function(f), })?, - TokenType::RegexLiteral(sym) => { + TokenType::RegexLiteral { literal, flags } => { // Trim / prefix and suffix - let full = self.interner.resolve(sym); + let full = self.interner.resolve(literal); let full = &full[1..full.len() - 1]; let nodes = match dash_regex::Parser::new(full.as_bytes()).parse_all() { Ok(nodes) => nodes, @@ -702,7 +702,7 @@ impl<'a, 'interner> Parser<'a, 'interner> { }; Expr { span: current.span, - kind: ExprKind::regex_literal(nodes, sym), + kind: ExprKind::regex_literal(nodes, flags, literal), } } other if other.is_identifier() => { diff --git a/crates/dash_regex/Cargo.toml b/crates/dash_regex/Cargo.toml index f8198cd5..dbe66c73 100644 --- a/crates/dash_regex/Cargo.toml +++ b/crates/dash_regex/Cargo.toml @@ -6,9 +6,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [features] -format = ["serde"] +format = ["serde", "bitflags/serde"] [dependencies] thiserror = "1.0.37" serde = { version = "1.0", features = ["derive", "rc"], optional = true } smallvec = { version = "1.9.0", features = ["const_generics"] } +bitflags = { version = "2.4.1", features = ["serde"] } diff --git a/crates/dash_regex/src/flags.rs b/crates/dash_regex/src/flags.rs new file mode 100644 index 00000000..568317a6 --- /dev/null +++ b/crates/dash_regex/src/flags.rs @@ -0,0 +1,33 @@ +use std::str::FromStr; + +use bitflags::bitflags; +use serde::{Deserialize, Serialize}; + +bitflags! { + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] + #[cfg_attr(feature = "format", derive(Serialize, Deserialize))] + pub struct Flags: u8 { + const GLOBAL = 1; + const IGNORE_CASE = 2; + } +} + +#[derive(Debug)] +pub enum Error { + UnknownFlag(char), +} + +impl FromStr for Flags { + type Err = Error; + fn from_str(s: &str) -> Result { + let mut flags = Flags::empty(); + for c in s.chars() { + match c { + 'g' => flags |= Flags::GLOBAL, + 'i' => flags |= Flags::IGNORE_CASE, + o => return Err(Error::UnknownFlag(o)), + } + } + Ok(flags) + } +} diff --git a/crates/dash_regex/src/lib.rs b/crates/dash_regex/src/lib.rs index e22b7731..4f5d539c 100644 --- a/crates/dash_regex/src/lib.rs +++ b/crates/dash_regex/src/lib.rs @@ -4,12 +4,14 @@ pub use node::Node; pub use parser::Parser; pub mod error; +pub mod flags; pub mod matcher; pub mod node; pub mod parser; mod stream; mod visitor; +pub use flags::Flags; pub use parser::ParsedRegex; #[cfg(test)] diff --git a/crates/dash_vm/src/js_std/regex.rs b/crates/dash_vm/src/js_std/regex.rs index f0936768..97a51338 100644 --- a/crates/dash_vm/src/js_std/regex.rs +++ b/crates/dash_vm/src/js_std/regex.rs @@ -7,16 +7,28 @@ use crate::value::regex::{RegExp, RegExpInner}; use crate::value::{Value, ValueContext}; use dash_regex::matcher::Matcher as RegexMatcher; use dash_regex::parser::Parser as RegexParser; +use dash_regex::Flags; pub fn constructor(cx: CallContext) -> Result { let pattern = cx.args.first().unwrap_or_undefined().to_string(cx.scope)?; + let flags = match cx + .args + .get(1) + .map(|v| v.to_string(cx.scope)) + .transpose()? + .map(|s| s.parse::()) + { + Some(Ok(flags)) => flags, + Some(Err(err)) => throw!(cx.scope, SyntaxError, "Invalid RegExp flags: {:?}", err), + None => Flags::empty(), + }; let nodes = match RegexParser::new(pattern.as_bytes()).parse_all() { Ok(nodes) => nodes, Err(err) => throw!(cx.scope, SyntaxError, "Regex parser error: {}", err), }; - let regex = RegExp::new(nodes, pattern, cx.scope); + let regex = RegExp::new(nodes, flags, pattern, cx.scope); Ok(Value::Object(cx.scope.register(regex))) } @@ -29,22 +41,33 @@ pub fn test(cx: CallContext) -> Result { None => throw!(cx.scope, TypeError, "Receiver must be a RegExp"), }; - let RegExpInner { regex, last_index, .. } = match regex.inner() { + let RegExpInner { + regex, + last_index, + flags, + .. + } = match regex.inner() { Some(nodes) => nodes, None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"), }; - if last_index.get() >= text.len() { + let is_global = flags.contains(Flags::GLOBAL); + + if is_global && last_index.get() >= text.len() { last_index.set(0); return Ok(Value::Boolean(false)); } let mut matcher = RegexMatcher::new(regex, text[last_index.get()..].as_bytes()); if matcher.matches() { - last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); + if is_global { + last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); + } Ok(Value::Boolean(true)) } else { - last_index.set(0); + if is_global { + last_index.set(0); + } Ok(Value::Boolean(false)) } } @@ -57,19 +80,29 @@ pub fn exec(cx: CallContext<'_, '_>) -> Result { None => throw!(cx.scope, TypeError, "Receiver must be a RegExp"), }; - let RegExpInner { regex, last_index, .. } = match regex.inner() { + let RegExpInner { + regex, + last_index, + flags, + .. + } = match regex.inner() { Some(nodes) => nodes, None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"), }; - if last_index.get() >= text.len() { + let is_global = flags.contains(Flags::GLOBAL); + + if is_global && last_index.get() >= text.len() { last_index.set(0); return Ok(Value::null()); } let mut matcher = RegexMatcher::new(regex, text[last_index.get()..].as_bytes()); if matcher.matches() { - last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); + if is_global { + last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); + } + let groups = Array::from_vec( cx.scope, matcher @@ -86,7 +119,10 @@ pub fn exec(cx: CallContext<'_, '_>) -> Result { ); Ok(Value::Object(cx.scope.register(groups))) } else { - last_index.set(0); + if is_global { + last_index.set(0); + } + Ok(Value::null()) } } diff --git a/crates/dash_vm/src/js_std/string.rs b/crates/dash_vm/src/js_std/string.rs index cdbeaa99..3eb694bd 100644 --- a/crates/dash_vm/src/js_std/string.rs +++ b/crates/dash_vm/src/js_std/string.rs @@ -310,7 +310,14 @@ pub fn substr(cx: CallContext) -> Result { let string = cx.this.to_string(cx.scope)?; let (start, end) = { let start = match cx.args.first() { - Some(arg) => arg.to_int32(cx.scope)? as usize, + Some(arg) => { + let num = arg.to_int32(cx.scope)?; + if num < 0 { + (num + string.len() as i32) as usize + } else { + num as usize + } + } None => 0, }; let end = match cx.args.get(1) { @@ -318,10 +325,10 @@ pub fn substr(cx: CallContext) -> Result { None => string.len(), }; - (start, end) + (start, start + end) }; - let bytes = string.as_bytes().get(start..start + end).unwrap_or(&[]); + let bytes = string.as_bytes().get(start..end.min(string.len())).unwrap_or(&[]); let result = String::from_utf8_lossy(bytes).into_owned(); Ok(Value::String(result.into())) @@ -329,19 +336,23 @@ pub fn substr(cx: CallContext) -> Result { pub fn substring(cx: CallContext) -> Result { let string = cx.this.to_string(cx.scope)?; - let (start, end) = { + let (mut start, mut end) = { let start = match cx.args.first() { - Some(arg) => arg.to_int32(cx.scope)? as usize, + Some(arg) => arg.to_int32(cx.scope)?.max(0) as usize, None => 0, }; let end = match cx.args.get(1) { - Some(arg) => arg.to_int32(cx.scope)? as usize, + Some(arg) => (arg.to_int32(cx.scope)? as usize).min(string.len()), None => string.len(), }; (start, end) }; + if start > end { + std::mem::swap(&mut start, &mut end); + } + let bytes = string.as_bytes().get(start..end).unwrap_or(&[]); let result = String::from_utf8_lossy(bytes).into_owned(); diff --git a/crates/dash_vm/src/value/mod.rs b/crates/dash_vm/src/value/mod.rs index 52582321..9a1ed081 100755 --- a/crates/dash_vm/src/value/mod.rs +++ b/crates/dash_vm/src/value/mod.rs @@ -297,8 +297,8 @@ impl Value { Constant::String(s) => Value::String(s), Constant::Undefined => Value::undefined(), Constant::Null => Value::null(), - Constant::Regex(nodes, source) => { - let regex = RegExp::new(nodes, source, vm); + Constant::Regex(nodes, flags, source) => { + let regex = RegExp::new(nodes, flags, source, vm); Value::Object(vm.register(regex)) } Constant::Function(f) => { diff --git a/crates/dash_vm/src/value/regex.rs b/crates/dash_vm/src/value/regex.rs index cca17f93..539185e8 100644 --- a/crates/dash_vm/src/value/regex.rs +++ b/crates/dash_vm/src/value/regex.rs @@ -2,7 +2,7 @@ use std::cell::Cell; use std::rc::Rc; use dash_proc_macro::Trace; -use dash_regex::ParsedRegex; +use dash_regex::{Flags, ParsedRegex}; use crate::{delegate, Vm}; @@ -11,8 +11,8 @@ use super::object::{NamedObject, Object}; #[derive(Debug)] pub struct RegExpInner { pub regex: ParsedRegex, + pub flags: Flags, pub source: Rc, - // TODO: this should only exist if the `g` flag is set (we currently don't even have regex flags) pub last_index: Cell, } @@ -23,13 +23,14 @@ pub struct RegExp { } impl RegExp { - pub fn new(regex: ParsedRegex, source: Rc, vm: &Vm) -> Self { + pub fn new(regex: ParsedRegex, flags: Flags, source: Rc, vm: &Vm) -> Self { let proto = vm.statics.regexp_prototype.clone(); let ctor = vm.statics.regexp_ctor.clone(); Self { inner: Some(RegExpInner { regex, + flags, source, last_index: Cell::new(0), }),