Skip to content

Commit

Permalink
Improve Syntax Check by New ANTLR Parser (#179)
Browse files Browse the repository at this point in the history
* Generate new parser by ANTLR

* Integrate ANTLR parser to existing code

* Fix string similarity and add visitor generated

* Use visitor to change default visiting order

* Add function arg type check

* Add compatibility check for basic operator

* Refactor inner classes to different upper class and subpackage

* Change default MySQL grammar file for our case

* Change default MySQL grammar file for our case

* Change default MySQL grammar file for our case

* Remove ANTLR plugin dependencies from runtime

* Initial commit for syntax analysis by new ANTLR parser

* Add missing function to grammar for unit test

* Add support for index name with - or /type

* Add more ES special functions to pass the unit test

* Add more ES special syntax to pass IT

* Add more syntax for MINUS to pass all IT

* Fix checkstyle violation

* Remove unsupported statements

* Remove SELECT INTO

* Remove more unused syntax

* Remove unused function

* Remove unused tokens

* Remove unused tokens

* Remove unused interval and charset syntax

* Remove unused interval and charset syntax

* Remove more unused syntax

* Add setting for enabling new parser

* Improve offending symbol location

* Improve offending symbol location

* Update 3rd party attribution with ANTLR

* Add supported functions missing in existing test code

* Add integration test

* Change grammar for new merged code from master

* Add integration test

* Add more test cases

* Add more test cases, enable/disable setting

* Add more test cases

* Add more test cases

* Simplify exception for now

* Move generated source back to build folder

* Rename analyze method and assert error message

* Fix typo

* Address more comments and fix broken tests
  • Loading branch information
dai-chen authored Sep 19, 2019
1 parent 3e0d611 commit cd95b20
Show file tree
Hide file tree
Showing 23 changed files with 2,018 additions and 512 deletions.
1,112 changes: 638 additions & 474 deletions THIRD-PARTY

Large diffs are not rendered by default.

25 changes: 24 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ version = "${opendistroVersion}.0"
apply plugin: 'elasticsearch.esplugin'
apply plugin: 'jacoco'
apply from: 'build-tools/sqlplugin-coverage.gradle'
apply plugin: 'antlr'

jacoco {
toolVersion = "0.8.3"
Expand All @@ -73,7 +74,12 @@ ext {
licenseFile = rootProject.file('LICENSE.TXT')
noticeFile = rootProject.file('NOTICE')
}
licenseHeaders.enabled = true

// The ANTLR-generated parser sources are too large for the licenseHeaders check,
// which caused the task to get stuck, so the generated parser package is excluded.
licenseHeaders {
enabled = true
excludes = ['com/amazon/opendistroforelasticsearch/sql/antlr/parser/**']
}

// TODO: need to fix java doc to enable JavaDoc
javadoc.enabled = false
Expand Down Expand Up @@ -131,6 +137,19 @@ integTestCluster {
distribution = "oss-zip"
}

// Configure the ANTLR code-generation task.
//   -visitor : also generate parse-tree visitor classes
//   -package : Java package declared in the generated sources
generateGrammarSource {
arguments += ['-visitor', '-package', 'com.amazon.opendistroforelasticsearch.sql.antlr.parser']
source = sourceSets.main.antlr
// The output path mirrors the package name passed via -package above.
outputDirectory = file("build/generated-src/antlr/main/com/amazon/opendistroforelasticsearch/sql/antlr/parser")
}

// The ANTLR plugin internally makes 'compile' extend the 'antlr' configuration, which
// would pull the ANTLR tool jars in as compile dependencies. Drop that parent here so
// only the explicitly declared dependencies remain on 'compile'.
configurations {
compile {
extendsFrom = extendsFrom.findAll { it != configurations.antlr }
}
}

check.dependsOn jacocoTestReport

// TODO: fix code style in main and test source code
Expand Down Expand Up @@ -175,6 +194,10 @@ dependencies {
compile group: 'com.google.guava', name: 'guava', version:'15.0'
compile group: 'org.json', name: 'json', version:'20180813'

// ANTLR gradle plugin and runtime dependency
antlr "org.antlr:antlr4:4.7.1"
compile "org.antlr:antlr4-runtime:4.7.1"

//compileOnly group: 'org.locationtech.jts', name: 'jts-core', version:'1.15.0'
// compileOnly group: 'org.elasticsearch', name: 'elasticsearch', version:'6.5.3'
// compileOnly group: 'com.unboundid', name: 'unboundid-ldapsdk', version:'3.2.0'
Expand Down
2 changes: 1 addition & 1 deletion config/checkstyle/checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<property name="charset" value="UTF-8" />

<module name="SuppressionFilter">
<property name="file" value="${suppressions}" />
<property name="file" value="${config_loc}/suppressions.xml" />
</module>

<!-- Checks Java files and forbids empty Javadoc comments -->
Expand Down
10 changes: 10 additions & 0 deletions config/checkstyle/suppressions.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0"?>
<!DOCTYPE suppressions PUBLIC
"-//Checkstyle//DTD SuppressionFilter Configuration 1.2//EN"
"https://checkstyle.org/dtds/suppressions_1_2.dtd">

<suppressions>

<!-- Disable every check (checks=".*") for files whose path contains the ANTLR parser
     package directory; [\\/] accepts either a backslash or a forward slash as separator. -->
<suppress files="com[\\/]amazon[\\/]opendistroforelasticsearch[\\/]sql[\\/]antlr[\\/]parser" checks=".*"/>

</suppressions>
310 changes: 310 additions & 0 deletions src/main/antlr/OpenDistroSqlLexer.g4
Original file line number Diff line number Diff line change
@@ -0,0 +1,310 @@
/*
MySQL (Positive Technologies) grammar
The MIT License (MIT).
Copyright (c) 2015-2017, Ivan Kochurkin ([email protected]), Positive Technologies.
Copyright (c) 2017, Ivan Khudyashev ([email protected])
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

lexer grammar OpenDistroSqlLexer;

// Extra token channels declared by this grammar; the comment and error rules route tokens to them.
channels { SQLCOMMENT, ERRORCHANNEL }


// SKIP
// Whitespace and comments are still tokenized, but routed off the default channel.

SPACE: [ \t\r\n]+ -> channel(HIDDEN);
// '/*! ... */' comments get their own channel. This rule is declared before COMMENT_INPUT
// so that it takes precedence when both rules match the same text.
SPEC_SQL_COMMENT: '/*!' .+? '*/' -> channel(SQLCOMMENT);
COMMENT_INPUT: '/*' .*? '*/' -> channel(HIDDEN);
// Line comments: '-- ' (note the trailing space) or '#' up to end of line / EOF,
// or a bare '--' immediately followed by end of line / EOF.
LINE_COMMENT: (
('-- ' | '#') ~[\r\n]* ('\r'? '\n' | EOF)
| '--' ('\r'? '\n' | EOF)
) -> channel(HIDDEN);


// Keywords
// Common Keywords
// All keyword literals are upper-case only.
// NOTE(review): presumably the input is upper-cased before it reaches the lexer - confirm.

ALL: 'ALL';
AND: 'AND';
AS: 'AS';
ASC: 'ASC';
BETWEEN: 'BETWEEN';
BY: 'BY';
CASE: 'CASE';
CROSS: 'CROSS';
DELETE: 'DELETE';
DESC: 'DESC';
DESCRIBE: 'DESCRIBE';
DISTINCT: 'DISTINCT';
ELSE: 'ELSE';
EXISTS: 'EXISTS';
FALSE: 'FALSE';
FROM: 'FROM';
GROUP: 'GROUP';
HAVING: 'HAVING';
IN: 'IN';
INNER: 'INNER';
IS: 'IS';
JOIN: 'JOIN';
LEFT: 'LEFT';
LIKE: 'LIKE';
LIMIT: 'LIMIT';
MATCH: 'MATCH';
NATURAL: 'NATURAL';
NOT: 'NOT';
NULL_LITERAL: 'NULL';
ON: 'ON';
OR: 'OR';
ORDER: 'ORDER';
OUTER: 'OUTER';
REGEXP: 'REGEXP';
RIGHT: 'RIGHT';
SELECT: 'SELECT';
SHOW: 'SHOW';
THEN: 'THEN';
TRUE: 'TRUE';
UNION: 'UNION';
USING: 'USING';
WHEN: 'WHEN';
WHERE: 'WHERE';


// OD SQL special keyword
MISSING: 'MISSING';
// The token is named EXCEPT but matches the keyword text MINUS; it is distinct from
// the arithmetic MINUS token ('-') declared in the operators section.
EXCEPT: 'MINUS';

// Group (aggregate) function keywords

AVG: 'AVG';
COUNT: 'COUNT';
MAX: 'MAX';
MIN: 'MIN';
SUM: 'SUM';


// Common function keywords

SUBSTRING: 'SUBSTRING';
TRIM: 'TRIM';
YEAR: 'YEAR';


// Keywords, but can be ID
// Common Keywords, but can be ID
// NOTE(review): the lexer always emits these as keyword tokens; accepting them as
// identifiers must be handled by the parser grammar, which is not visible here - confirm.

END: 'END';
FULL: 'FULL';
OFFSET: 'OFFSET';


// PRIVILEGES

TABLES: 'TABLES';

// Common function names
// These are declared before ID, so an input that exactly equals one of these literals
// (including the single letters E, D and T) lexes as the specific token rather than as ID.

ABS: 'ABS';
ACOS: 'ACOS';
ASIN: 'ASIN';
ATAN: 'ATAN';
ATAN2: 'ATAN2';
CBRT: 'CBRT';
CEIL: 'CEIL';
CONCAT: 'CONCAT';
CONCAT_WS: 'CONCAT_WS';
COS: 'COS';
COSH: 'COSH';
DATE_FORMAT: 'DATE_FORMAT';
DEGREES: 'DEGREES';
E: 'E';
EXP: 'EXP';
EXPM1: 'EXPM1';
FLOOR: 'FLOOR';
LOG: 'LOG';
LOG10: 'LOG10';
LOG2: 'LOG2';
LOWER: 'LOWER';
PI: 'PI';
POW: 'POW';
RADIANS: 'RADIANS';
RANDOM: 'RANDOM';
RINT: 'RINT';
ROUND: 'ROUND';
SIN: 'SIN';
SINH: 'SINH';
SQRT: 'SQRT';
TAN: 'TAN';
UPPER: 'UPPER';

// NOTE(review): D / T / TS together with the braces look like tokens for
// escape-style date, time and timestamp literals such as {d '...'} - confirm against the parser grammar.
D: 'D';
T: 'T';
TS: 'TS';
LEFT_BRACE: '{';
RIGHT_BRACE: '}';


// OD SQL special functions
// Function names specific to this grammar. Several come in two spellings, with and
// without an underscore (e.g. MATCHPHRASE / MATCH_PHRASE), each with its own token.
DATE_HISTOGRAM: 'DATE_HISTOGRAM';
DAY_OF_MONTH: 'DAY_OF_MONTH';
DAY_OF_YEAR: 'DAY_OF_YEAR';
DAY_OF_WEEK: 'DAY_OF_WEEK';
EXCLUDE: 'EXCLUDE';
EXTENDED_STATS: 'EXTENDED_STATS';
FIELD: 'FIELD';
FILTER: 'FILTER';
GEO_BOUNDING_BOX: 'GEO_BOUNDING_BOX';
GEO_DISTANCE: 'GEO_DISTANCE';
GEO_INTERSECTS: 'GEO_INTERSECTS';
GEO_POLYGON: 'GEO_POLYGON';
HISTOGRAM: 'HISTOGRAM';
HOUR_OF_DAY: 'HOUR_OF_DAY';
INCLUDE: 'INCLUDE';
IN_TERMS: 'IN_TERMS';
MATCHPHRASE: 'MATCHPHRASE';
MATCH_PHRASE: 'MATCH_PHRASE';
MATCHQUERY: 'MATCHQUERY';
MATCH_QUERY: 'MATCH_QUERY';
MINUTE_OF_DAY: 'MINUTE_OF_DAY';
MINUTE_OF_HOUR: 'MINUTE_OF_HOUR';
MONTH_OF_YEAR: 'MONTH_OF_YEAR';
MULTIMATCH: 'MULTIMATCH';
MULTI_MATCH: 'MULTI_MATCH';
NESTED: 'NESTED';
PERCENTILES: 'PERCENTILES';
REGEXP_QUERY: 'REGEXP_QUERY';
REVERSE_NESTED: 'REVERSE_NESTED';
QUERY: 'QUERY';
RANGE: 'RANGE';
SCORE: 'SCORE';
SECOND_OF_MINUTE: 'SECOND_OF_MINUTE';
STATS: 'STATS';
TERM: 'TERM';
TERMS: 'TERMS';
TOPHITS: 'TOPHITS';
WEEK_OF_YEAR: 'WEEK_OF_YEAR';
WILDCARDQUERY: 'WILDCARDQUERY';
WILDCARD_QUERY: 'WILDCARD_QUERY';


// Operators

// Operators. Arithmetic

STAR: '*';
DIVIDE: '/';
MODULE: '%';
PLUS: '+';
MINUS: '-';
DIV: 'DIV';
MOD: 'MOD';


// Operators. Comparison
// Only single-character symbols are tokenized here; multi-character comparison
// operators are not defined as lexer tokens in this grammar.

EQUAL_SYMBOL: '=';
GREATER_SYMBOL: '>';
LESS_SYMBOL: '<';
EXCLAMATION_SYMBOL: '!';


// Operators. Bit

BIT_NOT_OP: '~';
BIT_OR_OP: '|';
BIT_AND_OP: '&';
BIT_XOR_OP: '^';


// Constructor symbols

DOT: '.';
LR_BRACKET: '(';
RR_BRACKET: ')';
COMMA: ',';
SEMI: ';';
AT_SIGN: '@';
// Declared before DECIMAL_LITERAL, so a lone 0, 1 or 2 lexes as one of these tokens
// rather than as DECIMAL_LITERAL.
ZERO_DECIMAL: '0';
ONE_DECIMAL: '1';
TWO_DECIMAL: '2';
SINGLE_QUOTE_SYMB: '\'';
DOUBLE_QUOTE_SYMB: '"';
REVERSE_QUOTE_SYMB: '`';
COLON_SYMB: ':';


// Literal Primitives

// N'...' : a single-quoted string prefixed with N.
START_NATIONAL_STRING_LITERAL: 'N' SQUOTA_STRING;
// Double-quoted, single-quoted or backquoted text.
STRING_LITERAL: DQUOTA_STRING | SQUOTA_STRING | BQUOTA_STRING;
DECIMAL_LITERAL: DEC_DIGIT+;
// X'<pairs of hex digits>' or 0X<hex digits>.
HEXADECIMAL_LITERAL: 'X' '\'' (HEX_DIGIT HEX_DIGIT)+ '\''
| '0X' HEX_DIGIT+;

// Numbers with a fractional part and/or an exponent.
REAL_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+
| DEC_DIGIT+ '.' EXPONENT_NUM_PART
| (DEC_DIGIT+)? '.' (DEC_DIGIT+ EXPONENT_NUM_PART)
| DEC_DIGIT+ EXPONENT_NUM_PART;
// The two-character sequence backslash + N.
NULL_SPEC_LITERAL: '\\' 'N';
BIT_STRING: BIT_STRING_L;



// Hack for dotID
// Prevents the string .123somelatin from being recognized as ((.123), FLOAT_LITERAL), ((somelatin), ID);
// it must be recognized as ((.), DOT), ((123somelatin), ID).

DOT_ID: '.' ID_LITERAL;



// Identifiers

ID: ID_LITERAL;
// DOUBLE_QUOTE_ID: '"' ~'"'+ '"';
// NOTE(review): STRING_LITERAL above also accepts backquoted text (BQUOTA_STRING) and is
// declared earlier, so it appears to win whenever both rules match the same text - confirm intended.
REVERSE_QUOTE_ID: '`' ~'`'+ '`';
// <name>@<name>, where each side is a quoted string or an ID_LITERAL.
STRING_USER_NAME: (
SQUOTA_STRING | DQUOTA_STRING
| BQUOTA_STRING | ID_LITERAL
) '@'
(
SQUOTA_STRING | DQUOTA_STRING
| BQUOTA_STRING | ID_LITERAL
);


// Fragments for Literal primitives
// Fragments are building blocks for the token rules above and never produce tokens themselves.

fragment EXPONENT_NUM_PART: 'E' [-+]? DEC_DIGIT+;
// Identifier body: may start with digits, but must contain at least one character from
// [A-Z_$-]. '-' is permitted, so a hyphenated name is matched as a single identifier.
fragment ID_LITERAL: [A-Z_$0-9]*?[A-Z_$\-]+?[A-Z_$\-0-9]*;
// Quoted strings: a backslash escapes the following character, and a doubled quote
// character stands for the quote itself.
fragment DQUOTA_STRING: '"' ( '\\'. | '""' | ~('"'| '\\') )* '"';
fragment SQUOTA_STRING: '\'' ('\\'. | '\'\'' | ~('\'' | '\\'))* '\'';
fragment BQUOTA_STRING: '`' ( '\\'. | '``' | ~('`'|'\\'))* '`';
fragment HEX_DIGIT: [0-9A-F];
fragment DEC_DIGIT: [0-9];
// B'<one or more 0/1 digits>'
fragment BIT_STRING_L: 'B' '\'' [01]+ '\'';



// Catch-all, must stay last: any single character not matched by an earlier rule
// is emitted on ERRORCHANNEL.

ERROR_RECOGNITION: . -> channel(ERRORCHANNEL);
Loading

0 comments on commit cd95b20

Please sign in to comment.