From 0032adf465397f26028ca9d16ca7d559e1b4bcaa Mon Sep 17 00:00:00 2001 From: Nick Reynolds Date: Mon, 1 Apr 2013 23:25:05 -0400 Subject: [PATCH] Improves RegExp literal lexing. (Fixes #7) - Whitespace within the regexp is now allowed. - Escaped forward slashes (/) are now allowed. - Use the previous token to decide between a division and a RegExp literal. This reasoning works in the majority of cases but there are exceptions that will need a revisit (e.g. ++). - Can now parse underscore.js and the test262 harness without errors. --- src/lexer.l | 54 +++++++++++++++++++++++++++------------------ test/test_regexp.js | 40 +++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 21 deletions(-) create mode 100644 test/test_regexp.js diff --git a/src/lexer.l b/src/lexer.l index 636d90b..9b94e5c 100644 --- a/src/lexer.l +++ b/src/lexer.l @@ -24,6 +24,7 @@ #include "y.tab.h" int yycolumn = 1; + int prev_token = 0; void yyuseraction(void); char * fh_extract_string(char *); @@ -87,6 +88,24 @@ E [Ee][+-]?{D}+ "false" return KEYWORD(FALSE); + /* COMMENTS */ + + /* single-line */ +[/][/].* ; /* ignore comments */ + + + /* multi-line (taken from the Flex manual) */ +<INITIAL>{ +"/*" BEGIN(ML_COMMENT); +} +<ML_COMMENT>{ +"*/" BEGIN(INITIAL); +[^*\n]+ // eat comment in chunks +"*" // eat the lone star +\n yylineno++; +} + + /* LITERALS */ /* double-quoted strings */ @@ -114,29 +133,22 @@ L?'(\\.|[^\\'])*' { yylval.val = fh_extract_string(yytext); return TOKEN("FLOAT", NUMBER); } /* regexps */ -L?\/[^\ \/]+\/([imgy]{0,4}) { /* FIXME: This is very incomplete */ +L?\/([^\*\n\/\\]|\\.|\\\/)([^\n\/\\]|\\.|\\\/)*\/([imgy]{0,4}) { + /* FIXME: This is incomplete. If the previous token can end an expression (listed below), REJECT this match so that the '/' is lexed as a division operator rather than the start of a RegExp literal. "++" and "--" are always assumed to be postfix here, so input such as ++/foo/.abc is mis-lexed. */ + if (prev_token == NUMBER || + prev_token == STRING || + prev_token == IDENT || + prev_token == REGEXP || + prev_token == TRUE || + prev_token == FALSE || + prev_token == NULLT || + prev_token == PLUSPLUS || + prev_token == MINUSMINUS) REJECT; + yylval.val = yytext; return TOKEN("REGEXP", REGEXP); } - /* COMMENTS */ - - /* single-line
*/ -[/][/].* ; /* ignore comments */ - - - /* multi-line (taken from the Flex manual) */ -<INITIAL>{ -"/*" BEGIN(ML_COMMENT); -} -<ML_COMMENT>{ -"*/" BEGIN(INITIAL); -[^*\n]+ // eat comment in chunks -"*" // eat the lone star -\n yylineno++; -} - - /* MULTI-CHARACTER PUNCTUATORS & OPERATORS */ "||" return OP(OR); @@ -169,8 +181,7 @@ L?\/[^\ \/]+\/([imgy]{0,4}) { /* FIXME: This is very incomplete */ [-+()\[\]=*/%<>,.:`;?!{}~&|^] { TOKEN("LITERAL", 0); - return *yytext; - } + return *yytext; } /* WHITESPACE & TABS */ @@ -227,6 +238,7 @@ fh_parse_error(char * val) int fh_token(char *name, int token) { + prev_token = token; if (fh->opt_print_tokens) { if (yytext) printf("(%s %s)\n", name, yytext); diff --git a/test/test_regexp.js b/test/test_regexp.js new file mode 100644 index 0000000..f1da7d2 --- /dev/null +++ b/test/test_regexp.js @@ -0,0 +1,40 @@ +// test_regexp.js +// -------------- +// Verify that these RegExp literals can all be parsed. + +/*this comment shouldn't cause problems*/ +/*orthisone/ */ + +var re; + +re = /abc/; +re = / abc /; +re = /\/abc/; +re = /\\/.exec("/"); +re = /ab\/c/; +re = /a{1,3}/; +re = /\\\\/; +re = /abc/igm; +re = /\\/g; +re = /^0/; +re = /(.)^/i; +re = /./g; +re = /x(?:...|(...))\1x/i; +re = /\\|'|\r|\n|\t|\u2028|\u2029/g; +re = /<%([\s\S]+?)%>/g; +re = /\(\d{2}.\w{3}.\d{4}\)[\s\-.,]{0,3}\b\w+\b/i; +re = /^(.+?)(\d+)\.(\S+)$/; + +if (true) /regex/.exec('regex'); + +// Division should not be confused with a RegExp literal. +var x = 12 / 3 / 2; +console.assert(x === 2); + +var a = 12, b = 2; +a++ / b; + +// FIXME: This final example doesn't work because when the preceding token is a +// ++, we assume that it is a division, but it could be either. +// +// ++/foo/.abc;