From 0032adf465397f26028ca9d16ca7d559e1b4bcaa Mon Sep 17 00:00:00 2001
From: Nick Reynolds <ndreynolds@gmail.com>
Date: Mon, 1 Apr 2013 23:25:05 -0400
Subject: [PATCH] Improves RegExp literal lexing. (Fixes #7)

- Whitespace within the regexp is now allowed.
- Escaped forward slashes (\/) are now allowed.
- Use the previous token to decide between a division and a RegExp
  literal. This heuristic works in the majority of cases, but there are
  exceptions that will need revisiting (e.g. a RegExp literal after ++).
- Can now parse underscore.js and the test262 harness without errors.
---
 src/lexer.l         | 54 +++++++++++++++++++++++++++------------------
 test/test_regexp.js | 40 +++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 21 deletions(-)
 create mode 100644 test/test_regexp.js
diff --git a/src/lexer.l b/src/lexer.l
index 636d90b..9b94e5c 100644
--- a/src/lexer.l
+++ b/src/lexer.l
@@ -24,6 +24,7 @@
   #include "y.tab.h"
 
   int yycolumn = 1;
+  int prev_token = 0;  /* token code most recently recorded by fh_token() */
   void yyuseraction(void);
 
   char * fh_extract_string(char *);
@@ -87,6 +88,24 @@ E                            [Ee][+-]?{D}+
 "false"                      return KEYWORD(FALSE);
 
 
+    /* COMMENTS */
+
+    /* single-line */
+[/][/].*                     ; /* discard everything up to the end of the line */
+
+
+    /* multi-line (taken from the Flex manual) */
+<INITIAL>{
+"/*"                         BEGIN(ML_COMMENT);
+}
+<ML_COMMENT>{
+"*/"                         BEGIN(INITIAL);
+[^*\n]+                      // eat comment text in chunks, stopping at any star or newline
+"*"                          // eat a lone star that did not begin the closing delimiter
+\n                           yylineno++;
+}
+
+
     /* LITERALS */
 
     /* double-quoted strings */
@@ -114,29 +133,22 @@ L?'(\\.|[^\\'])*'            { yylval.val = fh_extract_string(yytext);
                                return TOKEN("FLOAT", NUMBER); }
 
     /* regexps */
-L?\/[^\ \/]+\/([imgy]{0,4})  { /* FIXME: This is very incomplete */
+L?\/([^\*\n\/\\]|\\.|\\\/)([^\n\/\\]|\\.|\\\/)*\/([imgy]{0,4}) { 
+                               /* FIXME: This is incomplete. A slash right after a token that can end an expression is treated as division: REJECT hands the input to the next-best rule. */
+                               if (prev_token == NUMBER     ||
+                                   prev_token == STRING     ||
+                                   prev_token == IDENT      ||
+                                   prev_token == REGEXP     ||
+                                   prev_token == TRUE       ||
+                                   prev_token == FALSE      ||
+                                   prev_token == NULLT      ||
+                                   prev_token == PLUSPLUS   ||  /* ++ and -- are assumed postfix here; a regexp after a prefix ++ or -- is not handled */
+                                   prev_token == MINUSMINUS) REJECT;
+                                    
                                yylval.val = yytext;
                                return TOKEN("REGEXP", REGEXP); }
 
 
-    /* COMMENTS */
-
-    /* single-line */
-[/][/].*                     ; /* ignore comments */
-
-
-    /* multi-line (taken from the Flex manual) */
-<INITIAL>{
-"/*"                         BEGIN(ML_COMMENT);
-}
-<ML_COMMENT>{
-"*/"                         BEGIN(INITIAL);
-[^*\n]+                      // eat comment in chunks
-"*"                          // eat the lone star
-\n                           yylineno++;
-}
-
-
     /* MULTI-CHARACTER PUNCTUATORS & OPERATORS */
 
 "||"                         return OP(OR); 
@@ -169,8 +181,7 @@ L?\/[^\ \/]+\/([imgy]{0,4})  { /* FIXME: This is very incomplete */
 
 [-+()\[\]=*/%<>,.:`;?!{}~&|^] {
                                 TOKEN("LITERAL", 0);
-                                return *yytext;
-                             }
+                                return *yytext; }
 
 
     /* WHITESPACE & TABS */
@@ -227,6 +238,7 @@ fh_parse_error(char * val)
 int
 fh_token(char *name, int token)
 {
+  prev_token = token;
   if (fh->opt_print_tokens) {
     if (yytext)
       printf("(%s %s)\n", name, yytext);
diff --git a/test/test_regexp.js b/test/test_regexp.js
new file mode 100644
index 0000000..f1da7d2
--- /dev/null
+++ b/test/test_regexp.js
@@ -0,0 +1,40 @@
+// test_regexp.js
+// --------------
+// Verify that these RegExp literals can all be parsed.
+
+/*this comment shouldn't cause problems*/
+/*orthisone/ */
+
+var re;
+
+re = /abc/;
+re = / abc /;
+re = /\/abc/;
+re = /\\/.exec("/");
+re = /ab\/c/;
+re = /a{1,3}/;
+re = /\\\\/;
+re = /abc/igm;
+re = /\\/g;
+re = /^0/;
+re = /(.)^/i;
+re = /./g;
+re = /x(?:...|(...))\1x/i;
+re = /\\|'|\r|\n|\t|\u2028|\u2029/g;
+re = /<%([\s\S]+?)%>/g;
+re = /\(\d{2}.\w{3}.\d{4}\)[\s\-.,]{0,3}\b\w+\b/i;
+re = /^(.+?)(\d+)\.(\S+)$/;
+
+if (true) /regex/.exec('regex'); // a RegExp literal directly after a closing parenthesis
+
+// Division should not be confused with a RegExp literal.
+var x = 12 / 3 / 2; // two divisions after numbers, not a RegExp between the slashes
+console.assert(x === 2);
+
+var a = 12, b = 2;
+a++ / b; // postfix ++ followed by a division
+
+// FIXME: This final example doesn't work because when the preceding token is a
+// ++, we assume that it is a division, but it could be either.
+//
+// ++/foo/.abc;