Skip to content

Commit

Permalink
Improves RegExp literal lexing. (Fixes #7)
Browse files Browse the repository at this point in the history
- Whitespace within the regexp is now allowed.
- Escaped forward slashes (/) are now allowed.
- Use the previous token to decide between a division and a RegExp
  literal. This heuristic works in the majority of cases, but there are
  exceptions that will need revisiting (e.g. a RegExp literal that
  directly follows a ++ operator).
- Can now parse underscore.js and the test262 harness without errors.
  • Loading branch information
ndreynolds committed Apr 2, 2013
1 parent a652cf5 commit 0032adf
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 21 deletions.
54 changes: 33 additions & 21 deletions src/lexer.l
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "y.tab.h"

int yycolumn = 1;
int prev_token = 0;
void yyuseraction(void);

char * fh_extract_string(char *);
Expand Down Expand Up @@ -87,6 +88,24 @@ E [Ee][+-]?{D}+
"false" return KEYWORD(FALSE);


/* COMMENTS */

/* single-line */
[/][/].* ; /* ignore comments */


/* multi-line (taken from the Flex manual) */
<INITIAL>{
"/*" BEGIN(ML_COMMENT);
}
<ML_COMMENT>{
"*/" BEGIN(INITIAL);
[^*\n]+ // eat comment in chunks
"*" // eat the lone star
\n yylineno++;
}


/* LITERALS */

/* double-quoted strings */
Expand Down Expand Up @@ -114,29 +133,22 @@ L?'(\\.|[^\\'])*' { yylval.val = fh_extract_string(yytext);
return TOKEN("FLOAT", NUMBER); }

/* regexps */
L?\/[^\ \/]+\/([imgy]{0,4}) { /* FIXME: This is very incomplete */
L?\/([^\*\n\/\\]|\\.|\\\/)([^\n\/\\]|\\.|\\\/)*\/([imgy]{0,4}) {
/* FIXME: This is incomplete */
if (prev_token == NUMBER ||
prev_token == STRING ||
prev_token == IDENT ||
prev_token == REGEXP ||
prev_token == TRUE ||
prev_token == FALSE ||
prev_token == NULLT ||
prev_token == PLUSPLUS ||
prev_token == MINUSMINUS) REJECT;

yylval.val = yytext;
return TOKEN("REGEXP", REGEXP); }


/* COMMENTS */

/* single-line */
[/][/].* ; /* ignore comments */


/* multi-line (taken from the Flex manual) */
<INITIAL>{
"/*" BEGIN(ML_COMMENT);
}
<ML_COMMENT>{
"*/" BEGIN(INITIAL);
[^*\n]+ // eat comment in chunks
"*" // eat the lone star
\n yylineno++;
}


/* MULTI-CHARACTER PUNCTUATORS & OPERATORS */

"||" return OP(OR);
Expand Down Expand Up @@ -169,8 +181,7 @@ L?\/[^\ \/]+\/([imgy]{0,4}) { /* FIXME: This is very incomplete */

[-+()\[\]=*/%<>,.:`;?!{}~&|^] {
TOKEN("LITERAL", 0);
return *yytext;
}
return *yytext; }


/* WHITESPACE & TABS */
Expand Down Expand Up @@ -227,6 +238,7 @@ fh_parse_error(char * val)
int
fh_token(char *name, int token)
{
prev_token = token;
if (fh->opt_print_tokens) {
if (yytext)
printf("(%s %s)\n", name, yytext);
Expand Down
40 changes: 40 additions & 0 deletions test/test_regexp.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// test_regexp.js
// --------------
// Verify that these RegExp literals can all be parsed.
//
// This file is a lexer fixture: the exact character sequences below are the
// test cases, so the statements should not be reformatted or rewritten.

// Block comments start with a slash as well; they must still be read as
// comments (the second one even contains a slash before its terminator).
/*this comment shouldn't cause problems*/
/*orthisone/ */

var re;

// RegExp literals on the right-hand side of an assignment. The cases cover a
// plain pattern, whitespace inside the pattern, escaped forward slashes,
// escaped backslashes, quantifiers, flags, groups, character classes and a
// member call made directly on a literal.
re = /abc/;
re = / abc /;
re = /\/abc/;
re = /\\/.exec("/");
re = /ab\/c/;
re = /a{1,3}/;
re = /\\\\/;
re = /abc/igm;
re = /\\/g;
re = /^0/;
re = /(.)^/i;
re = /./g;
re = /x(?:...|(...))\1x/i;
re = /\\|'|\r|\n|\t|\u2028|\u2029/g;
re = /<%([\s\S]+?)%>/g;
re = /\(\d{2}.\w{3}.\d{4}\)[\s\-.,]{0,3}\b\w+\b/i;
re = /^(.+?)(\d+)\.(\S+)$/;

// A RegExp literal in statement position, directly after a closing
// parenthesis.
if (true) /regex/.exec('regex');

// Division should not be confused with a RegExp literal.
var x = 12 / 3 / 2;
console.assert(x === 2);

// After a postfix ++ the slash is a division operator.
var a = 12, b = 2;
a++ / b;

// FIXME: The final example below is left commented out because it is not
// handled yet: when the preceding token is a ++, the lexer assumes that the
// slash is a division, but it could also be the start of a RegExp literal.
//
// ++/foo/.abc;

0 comments on commit 0032adf

Please sign in to comment.