Skip to content

Commit

Permalink
implement regex flags
Browse files Browse the repository at this point in the history
  • Loading branch information
y21 committed Dec 25, 2023
1 parent 9cfb261 commit 9f57c14
Show file tree
Hide file tree
Showing 14 changed files with 144 additions and 37 deletions.
2 changes: 1 addition & 1 deletion crates/dash_decompiler/src/decompiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ impl fmt::Display for DisplayConstant<'_> {
Constant::Function(fun) => write!(f, "<function {}>", fun.name.as_deref().unwrap_or("<anonymous>")),
Constant::Null => f.write_str("null"),
Constant::Undefined => f.write_str("undefined"),
Constant::Regex(_, source) => write!(f, "{source}"),
Constant::Regex(_, _, source) => write!(f, "{source}"),
}
}
}
1 change: 1 addition & 0 deletions crates/dash_lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ edition = "2021"
[dependencies]
either = "1.6.1"
dash_middle = { path = "../dash_middle" }
dash_regex = { path = "../dash_regex" }
29 changes: 24 additions & 5 deletions crates/dash_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use dash_middle::lexer::token::{as_token, Token, TokenType};
use dash_middle::parser::error::Error;
use dash_middle::sourcemap::Span;
use dash_middle::util;
use dash_regex::flags::Flags;

/// A JavaScript source code lexer
#[derive(Debug)]
Expand Down Expand Up @@ -398,8 +399,9 @@ impl<'a, 'interner> Lexer<'a, 'interner> {
self.create_contextified_token(TokenType::TemplateLiteral(sym)); // TODO: check if the spans created by this call are right!!
}

/// Reads an identifier and returns it as a node
fn read_identifier(&mut self) {
/// Assumes one character has already been read.
fn read_identifier_raw(&mut self) -> &'a str {
let start = self.idx - 1;
while !self.is_eof() {
let cur = self.current_real();

Expand All @@ -410,7 +412,13 @@ impl<'a, 'interner> Lexer<'a, 'interner> {
self.advance();
}

let sym = self.interner.intern(self.get_lexeme());
self.subslice(start..self.idx)
}

/// Reads an identifier and emits it as a token.
///
/// Scanning is delegated to `read_identifier_raw`, which assumes that the
/// first character of the identifier has already been read. The resulting
/// slice is interned, converted with `as_token` and handed to
/// `create_contextified_token`; nothing is returned to the caller.
fn read_identifier(&mut self) {
let ident = self.read_identifier_raw();
let sym = self.interner.intern(ident);
self.create_contextified_token(as_token(sym));
}

Expand All @@ -428,8 +436,19 @@ impl<'a, 'interner> Lexer<'a, 'interner> {
}
}

let sym = self.interner.intern(self.get_lexeme());
self.create_contextified_token(TokenType::RegexLiteral(sym));
let regex_sym = self.interner.intern(self.get_lexeme());

let flags = if self.current().is_some_and(util::is_alpha) {
self.advance(); // identifier reading requires one character to be read
self.read_identifier_raw().parse::<Flags>().unwrap() // TODO: handle error
} else {
Flags::empty()
};

self.create_contextified_token(TokenType::RegexLiteral {
literal: regex_sym,
flags,
});
}

/// Iterates through the input string and yields the next node
Expand Down
6 changes: 4 additions & 2 deletions crates/dash_middle/src/compiler/constant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ pub enum Constant {
Identifier(Rc<str>),
Boolean(bool),
Function(Rc<Function>),
Regex(dash_regex::ParsedRegex, Rc<str>),
Regex(dash_regex::ParsedRegex, dash_regex::Flags, Rc<str>),
Null,
Undefined,
}
Expand Down Expand Up @@ -146,7 +146,9 @@ impl Constant {
LiteralExpr::Boolean(b) => Self::Boolean(*b),
LiteralExpr::Null => Self::Null,
LiteralExpr::Undefined => Self::Undefined,
LiteralExpr::Regex(regex, source) => Self::Regex(regex.clone(), interner.resolve(*source).clone()),
LiteralExpr::Regex(regex, flags, source) => {
Self::Regex(regex.clone(), *flags, interner.resolve(*source).clone())
}
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion crates/dash_middle/src/lexer/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::fmt;

use crate::interner::{sym, Symbol};
use crate::sourcemap::Span;
use dash_regex::flags::Flags;
use derive_more::Display;

/// The type of a token
Expand Down Expand Up @@ -203,7 +204,7 @@ pub enum TokenType {

/// Regex literal: /a+b/g
#[display(fmt = "<regex literal>")]
RegexLiteral(Symbol),
RegexLiteral { literal: Symbol, flags: Flags },

#[display(fmt = "0x")]
NumberHex(Symbol),
Expand Down
8 changes: 4 additions & 4 deletions crates/dash_middle/src/parser/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ impl ExprKind {
Self::Literal(LiteralExpr::Undefined)
}

pub fn regex_literal(regex: dash_regex::ParsedRegex, source: Symbol) -> Self {
Self::Literal(LiteralExpr::Regex(regex, source))
pub fn regex_literal(regex: dash_regex::ParsedRegex, flags: dash_regex::Flags, source: Symbol) -> Self {
Self::Literal(LiteralExpr::Regex(regex, flags, source))
}

/// Creates a function call expression
Expand Down Expand Up @@ -498,8 +498,8 @@ pub enum LiteralExpr {
#[display(fmt = "\"{_0}\"")]
String(Symbol),

#[display(fmt = "/{_1}/")]
Regex(dash_regex::ParsedRegex, Symbol),
#[display(fmt = "/{_2}/")]
Regex(dash_regex::ParsedRegex, dash_regex::Flags, Symbol),

#[display(fmt = "null")]
Null,
Expand Down
6 changes: 3 additions & 3 deletions crates/dash_parser/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -688,9 +688,9 @@ impl<'a, 'interner> Parser<'a, 'interner> {
span,
kind: ExprKind::function(f),
})?,
TokenType::RegexLiteral(sym) => {
TokenType::RegexLiteral { literal, flags } => {
// Trim / prefix and suffix
let full = self.interner.resolve(sym);
let full = self.interner.resolve(literal);
let full = &full[1..full.len() - 1];
let nodes = match dash_regex::Parser::new(full.as_bytes()).parse_all() {
Ok(nodes) => nodes,
Expand All @@ -702,7 +702,7 @@ impl<'a, 'interner> Parser<'a, 'interner> {
};
Expr {
span: current.span,
kind: ExprKind::regex_literal(nodes, sym),
kind: ExprKind::regex_literal(nodes, flags, literal),
}
}
other if other.is_identifier() => {
Expand Down
3 changes: 2 additions & 1 deletion crates/dash_regex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[features]
format = ["serde"]
format = ["serde", "bitflags/serde"]

[dependencies]
thiserror = "1.0.37"
serde = { version = "1.0", features = ["derive", "rc"], optional = true }
smallvec = { version = "1.9.0", features = ["const_generics"] }
bitflags = { version = "2.4.1", features = ["serde"] }
33 changes: 33 additions & 0 deletions crates/dash_regex/src/flags.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
use std::str::FromStr;

use bitflags::bitflags;
use serde::{Deserialize, Serialize};

bitflags! {
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[cfg_attr(feature = "format", derive(Serialize, Deserialize))]
pub struct Flags: u8 {
const GLOBAL = 1;
const IGNORE_CASE = 2;
}
}

/// Error produced when a string of regex flags cannot be parsed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Error {
    /// The flag string contained a character that is not a supported flag.
    UnknownFlag(char),
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnknownFlag(flag) => write!(f, "unknown regex flag `{flag}`"),
        }
    }
}

// Allows the error to be boxed, chained and propagated with `?` like any
// other standard error type.
impl std::error::Error for Error {}

impl FromStr for Flags {
type Err = Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut flags = Flags::empty();
for c in s.chars() {
match c {
'g' => flags |= Flags::GLOBAL,
'i' => flags |= Flags::IGNORE_CASE,
o => return Err(Error::UnknownFlag(o)),
}
}
Ok(flags)
}
}
2 changes: 2 additions & 0 deletions crates/dash_regex/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ pub use node::Node;
pub use parser::Parser;

pub mod error;
pub mod flags;
pub mod matcher;
pub mod node;
pub mod parser;
mod stream;
mod visitor;

pub use flags::Flags;
pub use parser::ParsedRegex;

#[cfg(test)]
Expand Down
54 changes: 45 additions & 9 deletions crates/dash_vm/src/js_std/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,28 @@ use crate::value::regex::{RegExp, RegExpInner};
use crate::value::{Value, ValueContext};
use dash_regex::matcher::Matcher as RegexMatcher;
use dash_regex::parser::Parser as RegexParser;
use dash_regex::Flags;

pub fn constructor(cx: CallContext) -> Result<Value, Value> {
let pattern = cx.args.first().unwrap_or_undefined().to_string(cx.scope)?;
let flags = match cx
.args
.get(1)
.map(|v| v.to_string(cx.scope))
.transpose()?
.map(|s| s.parse::<Flags>())
{
Some(Ok(flags)) => flags,
Some(Err(err)) => throw!(cx.scope, SyntaxError, "Invalid RegExp flags: {:?}", err),
None => Flags::empty(),
};

let nodes = match RegexParser::new(pattern.as_bytes()).parse_all() {
Ok(nodes) => nodes,
Err(err) => throw!(cx.scope, SyntaxError, "Regex parser error: {}", err),
};

let regex = RegExp::new(nodes, pattern, cx.scope);
let regex = RegExp::new(nodes, flags, pattern, cx.scope);

Ok(Value::Object(cx.scope.register(regex)))
}
Expand All @@ -29,22 +41,33 @@ pub fn test(cx: CallContext) -> Result<Value, Value> {
None => throw!(cx.scope, TypeError, "Receiver must be a RegExp"),
};

let RegExpInner { regex, last_index, .. } = match regex.inner() {
let RegExpInner {
regex,
last_index,
flags,
..
} = match regex.inner() {
Some(nodes) => nodes,
None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"),
};

if last_index.get() >= text.len() {
let is_global = flags.contains(Flags::GLOBAL);

if is_global && last_index.get() >= text.len() {
last_index.set(0);
return Ok(Value::Boolean(false));
}

let mut matcher = RegexMatcher::new(regex, text[last_index.get()..].as_bytes());
if matcher.matches() {
last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end);
if is_global {
last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end);
}
Ok(Value::Boolean(true))
} else {
last_index.set(0);
if is_global {
last_index.set(0);
}
Ok(Value::Boolean(false))
}
}
Expand All @@ -57,19 +80,29 @@ pub fn exec(cx: CallContext<'_, '_>) -> Result<Value, Value> {
None => throw!(cx.scope, TypeError, "Receiver must be a RegExp"),
};

let RegExpInner { regex, last_index, .. } = match regex.inner() {
let RegExpInner {
regex,
last_index,
flags,
..
} = match regex.inner() {
Some(nodes) => nodes,
None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"),
};

if last_index.get() >= text.len() {
let is_global = flags.contains(Flags::GLOBAL);

if is_global && last_index.get() >= text.len() {
last_index.set(0);
return Ok(Value::null());
}

let mut matcher = RegexMatcher::new(regex, text[last_index.get()..].as_bytes());
if matcher.matches() {
last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end);
if is_global {
last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end);
}

let groups = Array::from_vec(
cx.scope,
matcher
Expand All @@ -86,7 +119,10 @@ pub fn exec(cx: CallContext<'_, '_>) -> Result<Value, Value> {
);
Ok(Value::Object(cx.scope.register(groups)))
} else {
last_index.set(0);
if is_global {
last_index.set(0);
}

Ok(Value::null())
}
}
23 changes: 17 additions & 6 deletions crates/dash_vm/src/js_std/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -310,38 +310,49 @@ pub fn substr(cx: CallContext) -> Result<Value, Value> {
let string = cx.this.to_string(cx.scope)?;
let (start, end) = {
let start = match cx.args.first() {
Some(arg) => arg.to_int32(cx.scope)? as usize,
Some(arg) => {
let num = arg.to_int32(cx.scope)?;
if num < 0 {
(num + string.len() as i32) as usize
} else {
num as usize
}
}
None => 0,
};
let end = match cx.args.get(1) {
Some(arg) => arg.to_int32(cx.scope)? as usize,
None => string.len(),
};

(start, end)
(start, start + end)
};

let bytes = string.as_bytes().get(start..start + end).unwrap_or(&[]);
let bytes = string.as_bytes().get(start..end.min(string.len())).unwrap_or(&[]);
let result = String::from_utf8_lossy(bytes).into_owned();

Ok(Value::String(result.into()))
}

pub fn substring(cx: CallContext) -> Result<Value, Value> {
let string = cx.this.to_string(cx.scope)?;
let (start, end) = {
let (mut start, mut end) = {
let start = match cx.args.first() {
Some(arg) => arg.to_int32(cx.scope)? as usize,
Some(arg) => arg.to_int32(cx.scope)?.max(0) as usize,
None => 0,
};
let end = match cx.args.get(1) {
Some(arg) => arg.to_int32(cx.scope)? as usize,
Some(arg) => (arg.to_int32(cx.scope)? as usize).min(string.len()),
None => string.len(),
};

(start, end)
};

if start > end {
std::mem::swap(&mut start, &mut end);
}

let bytes = string.as_bytes().get(start..end).unwrap_or(&[]);
let result = String::from_utf8_lossy(bytes).into_owned();

Expand Down
4 changes: 2 additions & 2 deletions crates/dash_vm/src/value/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,8 @@ impl Value {
Constant::String(s) => Value::String(s),
Constant::Undefined => Value::undefined(),
Constant::Null => Value::null(),
Constant::Regex(nodes, source) => {
let regex = RegExp::new(nodes, source, vm);
Constant::Regex(nodes, flags, source) => {
let regex = RegExp::new(nodes, flags, source, vm);
Value::Object(vm.register(regex))
}
Constant::Function(f) => {
Expand Down
Loading

0 comments on commit 9f57c14

Please sign in to comment.