
Commit

added skewed by
xnuinside committed Sep 18, 2021
1 parent 387a273 commit c5e2fe6
Showing 7 changed files with 41 additions and 13 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.txt
@@ -3,7 +3,9 @@ Fixes:
1. Add support for more special symbols to strings - https://github.com/xnuinside/simple-ddl-parser/issues/68

Features:
1. Added support for HQL statements: STORED AS INPUTFORMAT, OUTPUTFORMAT - https://github.com/xnuinside/simple-ddl-parser/issues/69
1. Added support for HQL statements:
STORED AS INPUTFORMAT, OUTPUTFORMAT - https://github.com/xnuinside/simple-ddl-parser/issues/69
SKEWED BY

**v0.19.6**
Fixes:
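For orientation, a rough usage sketch of the STORED AS INPUTFORMAT / OUTPUTFORMAT support listed above; the DDL below is reconstructed from the test_output_input_format expectations further down in this commit, so treat the exact statement as illustrative (the SKEWED BY side is sketched after the hql.py change below):

from simple_ddl_parser import DDLParser

ddl = """
CREATE EXTERNAL TABLE test (test STRING COMMENT 'xxxx')
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 'hdfs://xxxx'
"""
result = DDLParser(ddl).run(output_mode="hql")
# per the expected output in the test, result[0]["stored_as"] should hold both the
# inputformat and outputformat class names, and result[0]["location"] should be "'hdfs://xxxx'"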
2 changes: 1 addition & 1 deletion README.md
@@ -308,6 +308,7 @@ You also can provide a path where you want to have a dumps with schema with argu
- LOCATION
- FIELDS TERMINATED BY, LINES TERMINATED BY, COLLECTION ITEMS TERMINATED BY, MAP KEYS TERMINATED BY
- TBLPROPERTIES ('parquet.compression'='SNAPPY' & etc.)
- SKEWED BY

### MSSQL / MySQL/ Oracle

@@ -345,7 +346,6 @@ You also can provide a path where you want to have a dumps with schema with argu
0. Add support for ALTER TABLE ... ADD COLUMN
1. Add more support for CREATE type IS TABLE (example: CREATE OR REPLACE TYPE budget_tbl_typ IS TABLE OF NUMBER(8,2);
2. Add support (ignore correctly) ALTER TABLE ... DROP CONSTRAINT ..., ALTER TABLE ... DROP INDEX ...
4. Add support for SKEWED BY for HQL

## non-feature todo

15 changes: 11 additions & 4 deletions simple_ddl_parser/ddl_parser.py
@@ -48,24 +48,32 @@ def process_body_tokens(self, t):
        return t

    def t_STRING(self, t):
        r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?±§@~]*)(\')){1}"
        r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?;±§@~]*)(\')){1}"
        t.type = "STRING"
        print(t.type, t.value)
        return t

    def t_ID(self, t):
        r"([0-9]\.[0-9])\w|([a-zA-Z_,0-9:><\/\=\-\+\~\%$\*'\()!{}\[\]\"\`]+)"
        t.type = tok.symbol_tokens.get(t.value, "ID")
        skip_id_tokens = ["(", ")", ","]
        print(
            t.value not in skip_id_tokens
            and self.lexer.is_table
            and self.lexer.lp_open
            and (self.lexer.last_token == "COMMA" or self.lexer.last_token == "LP")
            and t.value.upper() not in tok.first_liners
        )
        if t.type == "LP":
            self.lexer.lp_open += 1
            self.lexer.columns_def = True
            self.lexer.last_token = "LP"
            return t

        elif (
            t.value not in skip_id_tokens
            and self.lexer.is_table
            and self.lexer.lp_open
            and self.lexer.last_token == "COMMA"
            and (self.lexer.last_token == "COMMA" or self.lexer.last_token == "LP")
            and t.value.upper() not in tok.first_liners
        ):
            t.type = "ID"
@@ -109,7 +117,6 @@ def set_last_token(self, t):
            self.lexer.is_table = False
        elif t.type == "TABLE" or t.type == "INDEX":
            self.lexer.is_table = True
        print(t.type, t.value)
        return t

    def t_newline(self, t):
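The semicolon added to the t_STRING character class above lets quoted values that contain ';' lex as a single STRING token. A minimal sketch of the intended effect, with a made-up table and comment (not taken from this commit):

from simple_ddl_parser import DDLParser

ddl = "CREATE TABLE demo (job_id STRING COMMENT 'value with ; inside');"
result = DDLParser(ddl).run(output_mode="hql")
# the column comment is expected to come back intact, semicolon included:
# result[0]["columns"][0]["comment"] == "'value with ; inside'"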
7 changes: 7 additions & 0 deletions simple_ddl_parser/dialects/hql.py
@@ -85,6 +85,13 @@ def p_expression_map_keys_terminated_by(self, p):
        p_list = list(p)
        p[0]["map_keys_terminated_by"] = check_spec(p_list[-1])

    def p_expression_skewed_by(self, p):
        """expr : expr SKEWED BY LP ID RP ON LP pid RP
        """
        p[0] = p[1]
        p_list = remove_par(list(p))
        p[0]["skewed_by"] = {'key': p_list[4], 'on': p_list[-1]}

    def p_expression_collection_terminated_by(self, p):
        """expr : expr COLLECTION ITEMS TERMINATED BY ID
        | expr COLLECTION ITEMS TERMINATED BY STRING
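The new p_expression_skewed_by rule reduces a SKEWED BY (...) ON (...) clause into a small dict on the parsed table; this mirrors the test_skewed_by case added at the bottom of this commit:

from simple_ddl_parser import DDLParser

ddl = """
CREATE TABLE list_bucket_single (key STRING, value STRING)
SKEWED BY (key) ON (1,5,6) STORED AS DIRECTORIES;
"""
result = DDLParser(ddl).run(output_mode="hql")
# result[0]["skewed_by"] == {'key': 'key', 'on': ['1', '5', '6']}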
8 changes: 5 additions & 3 deletions simple_ddl_parser/parser.py
@@ -95,12 +95,14 @@ def parse_data(self):
            if line.replace("\n", "").replace("\t", "") or num == len(lines) - 1:
                # to avoid issues when comma or parath are glued to column name
                if statement is not None:
                    statement += f" {line}"
                    statement += f" {line.strip()}"
                else:
                    statement = line
                if ";" not in statement and num != len(lines) - 1:
                    statement = line.strip()
                if not statement.endswith(';') and num != len(lines) - 1:
                    continue
                self.set_default_flags_in_lexer()
                if statement.endswith(';'):
                    statement = statement[:-1]
                _parse_result = yacc.parse(statement)

                if _parse_result:
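One effect of replacing the old ";" in statement check with statement.endswith(';') is that a semicolon inside a quoted value no longer ends statement accumulation early, and the trailing semicolon is now stripped before the statement reaches yacc.parse. A standalone sketch of that splitting behaviour (simplified, not the committed parse_data code):

lines = [
    "CREATE TABLE demo (",
    "  job_id STRING COMMENT 'a ; inside a string'",
    ");",
]
statement = None
for num, line in enumerate(lines):
    if line.replace("\n", "").replace("\t", "") or num == len(lines) - 1:
        if statement is not None:
            statement += f" {line.strip()}"
        else:
            statement = line.strip()
        if not statement.endswith(";") and num != len(lines) - 1:
            continue  # a ';' buried inside a quoted string no longer ends the statement
        if statement.endswith(";"):
            statement = statement[:-1]
        print(statement)  # CREATE TABLE demo ( job_id STRING COMMENT 'a ; inside a string' )
        statement = None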
3 changes: 2 additions & 1 deletion simple_ddl_parser/tokens.py
@@ -20,7 +20,6 @@
"KEY": "KEY",
"ADD": "ADD",
"AS": "AS",
"LIKE": "LIKE",
"CLONE": "CLONE",
"DEFERRABLE": "DEFERRABLE",
"INITIALLY": "INITIALLY",
@@ -49,6 +48,7 @@
"COMMENT": "COMMENT",
}
first_liners = {
"LIKE": "LIKE",
"CONSTRAINT": "CONSTRAINT",
"FOREIGN": "FOREIGN",
"PRIMARY": "PRIMARY",
@@ -77,6 +77,7 @@
"CLUSTER": "CLUSTER",
"SERDEPROPERTIES": "SERDEPROPERTIES",
"TBLPROPERTIES": "TBLPROPERTIES",
"SKEWED": "SKEWED",
# oracle
"STORAGE": "STORAGE",
"TABLESPACE": "TABLESPACE",
15 changes: 12 additions & 3 deletions tests/test_hql_output_mode.py
@@ -1606,7 +1606,7 @@ def test_special_characters_in_comment():
"columns": [
{
"check": None,
"comment": "'t# est | & * % $ // * 6 % !?;;±§@~^'",
"comment": "'t# est | & * % $ // * 6 % !?;;\\0b1\\0a7@~^'",
"default": None,
"name": "job_id",
"nullable": True,
@@ -1771,8 +1771,17 @@ def test_output_input_format():
    LOCATION
    'hdfs://xxxx'
    """
    from simple_ddl_parser import DDLParser
    parse_results = DDLParser(ddl).run(output_mode="hql")
    expected = [{'columns': [{'name': 'test', 'type': 'STRING', 'size': None, 'references': None, 'unique': False,
        'nullable': True, 'default': None, 'check': None, 'comment': "'xxxx'"}], 'primary_key': [], 'alter': {}, 'checks': [], 'index': [], 'partitioned_by': [], 'tablespace': None, 'stored_as': {'outputformat': "'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'", 'inputformat': "'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'"}, 'location': "'hdfs://xxxx'", 'comment': None, 'row_format': {'serde': True, 'java_class': "'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'"}, 'fields_terminated_by': None, 'lines_terminated_by': None, 'map_keys_terminated_by': None, 'collection_items_terminated_by': None, 'external': True, 'schema': None, 'table_name': 'test'}]
    assert expected == parse_results
    assert expected == parse_results


def test_skewed_by():
    ddl = """
    CREATE TABLE list_bucket_single (key STRING, value STRING)
    SKEWED BY (key) ON (1,5,6) STORED AS DIRECTORIES;
    """
    parse_results = DDLParser(ddl).run(output_mode="hql")
    expected =[{'columns': [{'name': 'key', 'type': 'STRING', 'size': None, 'references': None, 'unique': False, 'nullable': True, 'default': None, 'check': None}, {'name': 'value', 'type': 'STRING', 'size': None, 'references': None, 'unique': False, 'nullable': True, 'default': None, 'check': None}], 'primary_key': [], 'alter': {}, 'checks': [], 'index': [], 'partitioned_by': [], 'tablespace': None, 'stored_as': 'DIRECTORIES', 'location': None, 'comment': None, 'row_format': None, 'fields_terminated_by': None, 'lines_terminated_by': None, 'map_keys_terminated_by': None, 'collection_items_terminated_by': None, 'external': False, 'schema': None, 'table_name': 'list_bucket_single', 'skewed_by': {'key': 'key', 'on': ['1', '5', '6']}}]
    assert expected == parse_results
