From c5e2fe6ffd1a7e3a6445a1e175b34358878affdd Mon Sep 17 00:00:00 2001 From: xnuinside Date: Sat, 18 Sep 2021 21:22:18 +0300 Subject: [PATCH] added skewed by --- CHANGELOG.txt | 4 +++- README.md | 2 +- simple_ddl_parser/ddl_parser.py | 15 +++++++++++---- simple_ddl_parser/dialects/hql.py | 7 +++++++ simple_ddl_parser/parser.py | 8 +++++--- simple_ddl_parser/tokens.py | 3 ++- tests/test_hql_output_mode.py | 15 ++++++++++++--- 7 files changed, 41 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 17bc234..2f186f6 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -3,7 +3,9 @@ Fixes: 1. Add support for more special symbols to strings - https://github.com/xnuinside/simple-ddl-parser/issues/68 Features: -1. Added support for HQL statements: STORED AS INPUTFORMAT, OUTPUTFORMAT - https://github.com/xnuinside/simple-ddl-parser/issues/69 +1. Added support for HQL statements: + STORED AS INPUTFORMAT, OUTPUTFORMAT - https://github.com/xnuinside/simple-ddl-parser/issues/69 + SKEWED BY **v0.19.6** Fixes: diff --git a/README.md b/README.md index 5231434..b0362fc 100644 --- a/README.md +++ b/README.md @@ -308,6 +308,7 @@ You also can provide a path where you want to have a dumps with schema with argu - LOCATION - FIELDS TERMINATED BY, LINES TERMINATED BY, COLLECTION ITEMS TERMINATED BY, MAP KEYS TERMINATED BY - TBLPROPERTIES ('parquet.compression'='SNAPPY' & etc.) +- SKEWED BY ### MSSQL / MySQL/ Oracle @@ -345,7 +346,6 @@ You also can provide a path where you want to have a dumps with schema with argu 0. Add support for ALTER TABLE ... ADD COLUMN 1. Add more support for CREATE type IS TABLE (example: CREATE OR REPLACE TYPE budget_tbl_typ IS TABLE OF NUMBER(8,2); 2. Add support (ignore correctly) ALTER TABLE ... DROP CONSTRAINT ..., ALTER TABLE ... DROP INDEX ... -4. Add support for SKEWED BY for HQL ## non-feature todo diff --git a/simple_ddl_parser/ddl_parser.py b/simple_ddl_parser/ddl_parser.py index 57682e2..b036849 100755 --- a/simple_ddl_parser/ddl_parser.py +++ b/simple_ddl_parser/ddl_parser.py @@ -48,24 +48,32 @@ def process_body_tokens(self, t): return t def t_STRING(self, t): - r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?±§@~]*)(\')){1}" + r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?;±§@~]*)(\')){1}" t.type = "STRING" - print(t.type, t.value) return t def t_ID(self, t): r"([0-9]\.[0-9])\w|([a-zA-Z_,0-9:><\/\=\-\+\~\%$\*'\()!{}\[\]\"\`]+)" t.type = tok.symbol_tokens.get(t.value, "ID") skip_id_tokens = ["(", ")", ","] + print( + t.value not in skip_id_tokens + and self.lexer.is_table + and self.lexer.lp_open + and (self.lexer.last_token == "COMMA" or self.lexer.last_token == "LP") + and t.value.upper() not in tok.first_liners + ) if t.type == "LP": self.lexer.lp_open += 1 self.lexer.columns_def = True + self.lexer.last_token = "LP" return t + elif ( t.value not in skip_id_tokens and self.lexer.is_table and self.lexer.lp_open - and self.lexer.last_token == "COMMA" + and (self.lexer.last_token == "COMMA" or self.lexer.last_token == "LP") and t.value.upper() not in tok.first_liners ): t.type = "ID" @@ -109,7 +117,6 @@ def set_last_token(self, t): self.lexer.is_table = False elif t.type == "TABLE" or t.type == "INDEX": self.lexer.is_table = True - print(t.type, t.value) return t def t_newline(self, t): diff --git a/simple_ddl_parser/dialects/hql.py b/simple_ddl_parser/dialects/hql.py index f431291..b2d03b5 100644 --- a/simple_ddl_parser/dialects/hql.py +++ b/simple_ddl_parser/dialects/hql.py @@ -85,6 +85,13 @@ def p_expression_map_keys_terminated_by(self, p): p_list = list(p) p[0]["map_keys_terminated_by"] = check_spec(p_list[-1]) + def p_expression_skewed_by(self, p): + """expr : expr SKEWED BY LP ID RP ON LP pid RP + """ + p[0] = p[1] + p_list = remove_par(list(p)) + p[0]["skewed_by"] = {'key': p_list[4], 'on': p_list[-1]} + def p_expression_collection_terminated_by(self, p): """expr : expr COLLECTION ITEMS TERMINATED BY ID | expr COLLECTION ITEMS TERMINATED BY STRING diff --git a/simple_ddl_parser/parser.py b/simple_ddl_parser/parser.py index 38b5013..7aede04 100755 --- a/simple_ddl_parser/parser.py +++ b/simple_ddl_parser/parser.py @@ -95,12 +95,14 @@ def parse_data(self): if line.replace("\n", "").replace("\t", "") or num == len(lines) - 1: # to avoid issues when comma or parath are glued to column name if statement is not None: - statement += f" {line}" + statement += f" {line.strip()}" else: - statement = line - if ";" not in statement and num != len(lines) - 1: + statement = line.strip() + if not statement.endswith(';') and num != len(lines) - 1: continue self.set_default_flags_in_lexer() + if statement.endswith(';'): + statement = statement[:-1] _parse_result = yacc.parse(statement) if _parse_result: diff --git a/simple_ddl_parser/tokens.py b/simple_ddl_parser/tokens.py index 7bb9dea..1e60e04 100644 --- a/simple_ddl_parser/tokens.py +++ b/simple_ddl_parser/tokens.py @@ -20,7 +20,6 @@ "KEY": "KEY", "ADD": "ADD", "AS": "AS", - "LIKE": "LIKE", "CLONE": "CLONE", "DEFERRABLE": "DEFERRABLE", "INITIALLY": "INITIALLY", @@ -49,6 +48,7 @@ "COMMENT": "COMMENT", } first_liners = { + "LIKE": "LIKE", "CONSTRAINT": "CONSTRAINT", "FOREIGN": "FOREIGN", "PRIMARY": "PRIMARY", @@ -77,6 +77,7 @@ "CLUSTER": "CLUSTER", "SERDEPROPERTIES": "SERDEPROPERTIES", "TBLPROPERTIES": "TBLPROPERTIES", + "SKEWED": "SKEWED", # oracle "STORAGE": "STORAGE", "TABLESPACE": "TABLESPACE", diff --git a/tests/test_hql_output_mode.py b/tests/test_hql_output_mode.py index 34ff993..067281c 100644 --- a/tests/test_hql_output_mode.py +++ b/tests/test_hql_output_mode.py @@ -1606,7 +1606,7 @@ def test_special_characters_in_comment(): "columns": [ { "check": None, - "comment": "'t# est | & * % $ // * 6 % !?;;±§@~^'", + "comment": "'t# est | & * % $ // * 6 % !?;;\\0b1\\0a7@~^'", "default": None, "name": "job_id", "nullable": True, @@ -1771,8 +1771,17 @@ def test_output_input_format(): LOCATION 'hdfs://xxxx' """ - from simple_ddl_parser import DDLParser parse_results = DDLParser(ddl).run(output_mode="hql") expected = [{'columns': [{'name': 'test', 'type': 'STRING', 'size': None, 'references': None, 'unique': False, 'nullable': True, 'default': None, 'check': None, 'comment': "'xxxx'"}], 'primary_key': [], 'alter': {}, 'checks': [], 'index': [], 'partitioned_by': [], 'tablespace': None, 'stored_as': {'outputformat': "'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'", 'inputformat': "'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'"}, 'location': "'hdfs://xxxx'", 'comment': None, 'row_format': {'serde': True, 'java_class': "'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'"}, 'fields_terminated_by': None, 'lines_terminated_by': None, 'map_keys_terminated_by': None, 'collection_items_terminated_by': None, 'external': True, 'schema': None, 'table_name': 'test'}] - assert expected == parse_results \ No newline at end of file + assert expected == parse_results + + +def test_skewed_by(): + ddl = """ + CREATE TABLE list_bucket_single (key STRING, value STRING) + SKEWED BY (key) ON (1,5,6) STORED AS DIRECTORIES; + """ + parse_results = DDLParser(ddl).run(output_mode="hql") + expected =[{'columns': [{'name': 'key', 'type': 'STRING', 'size': None, 'references': None, 'unique': False, 'nullable': True, 'default': None, 'check': None}, {'name': 'value', 'type': 'STRING', 'size': None, 'references': None, 'unique': False, 'nullable': True, 'default': None, 'check': None}], 'primary_key': [], 'alter': {}, 'checks': [], 'index': [], 'partitioned_by': [], 'tablespace': None, 'stored_as': 'DIRECTORIES', 'location': None, 'comment': None, 'row_format': None, 'fields_terminated_by': None, 'lines_terminated_by': None, 'map_keys_terminated_by': None, 'collection_items_terminated_by': None, 'external': False, 'schema': None, 'table_name': 'list_bucket_single', 'skewed_by': {'key': 'key', 'on': ['1', '5', '6']}}] + assert expected == parse_results