
Commit

added skewed by
xnuinside committed Sep 18, 2021
1 parent 387a273 commit c5e2fe6
Showing 7 changed files with 41 additions and 13 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.txt
@@ -3,7 +3,9 @@ Fixes:
1. Add support for more special symbols to strings - https://github.com/xnuinside/simple-ddl-parser/issues/68

Features:
1. Added support for HQL statements: STORED AS INPUTFORMAT, OUTPUTFORMAT - https://github.com/xnuinside/simple-ddl-parser/issues/69
1. Added support for HQL statements:
STORED AS INPUTFORMAT, OUTPUTFORMAT - https://github.com/xnuinside/simple-ddl-parser/issues/69
SKEWED BY

**v0.19.6**
Fixes:
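For orientation, a rough usage sketch of the STORED AS INPUTFORMAT / OUTPUTFORMAT support listed above; the DDL below is reconstructed from the test_output_input_format expectations further down in this commit, so treat the exact statement as illustrative (the SKEWED BY side is sketched after the hql.py change below):

from simple_ddl_parser import DDLParser

ddl = """
CREATE EXTERNAL TABLE test (test STRING COMMENT 'xxxx')
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 'hdfs://xxxx'
"""
result = DDLParser(ddl).run(output_mode="hql")
# per the expected output in the test, result[0]["stored_as"] should hold both the
# inputformat and outputformat class names, and result[0]["location"] should be "'hdfs://xxxx'"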
2 changes: 1 addition & 1 deletion README.md
@@ -308,6 +308,7 @@ You also can provide a path where you want to have a dumps with schema with argu
- LOCATION
- FIELDS TERMINATED BY, LINES TERMINATED BY, COLLECTION ITEMS TERMINATED BY, MAP KEYS TERMINATED BY
- TBLPROPERTIES ('parquet.compression'='SNAPPY' & etc.)
- SKEWED BY

### MSSQL / MySQL/ Oracle

@@ -345,7 +346,6 @@ You also can provide a path where you want to have a dumps with schema with argu
0. Add support for ALTER TABLE ... ADD COLUMN
1. Add more support for CREATE type IS TABLE (example: CREATE OR REPLACE TYPE budget_tbl_typ IS TABLE OF NUMBER(8,2);
2. Add support (ignore correctly) ALTER TABLE ... DROP CONSTRAINT ..., ALTER TABLE ... DROP INDEX ...
4. Add support for SKEWED BY for HQL

## non-feature todo

15 changes: 11 additions & 4 deletions simple_ddl_parser/ddl_parser.py
@@ -48,24 +48,32 @@ def process_body_tokens(self, t):
        return t

    def t_STRING(self, t):
        r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?±§@~]*)(\')){1}"
        r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?;±§@~]*)(\')){1}"
        t.type = "STRING"
        print(t.type, t.value)
        return t

    def t_ID(self, t):
        r"([0-9]\.[0-9])\w|([a-zA-Z_,0-9:><\/\=\-\+\~\%$\*'\()!{}\[\]\"\`]+)"
        t.type = tok.symbol_tokens.get(t.value, "ID")
        skip_id_tokens = ["(", ")", ","]
        print(
            t.value not in skip_id_tokens
            and self.lexer.is_table
            and self.lexer.lp_open
            and (self.lexer.last_token == "COMMA" or self.lexer.last_token == "LP")
            and t.value.upper() not in tok.first_liners
        )
        if t.type == "LP":
            self.lexer.lp_open += 1
            self.lexer.columns_def = True
            self.lexer.last_token = "LP"
            return t

        elif (
            t.value not in skip_id_tokens
            and self.lexer.is_table
            and self.lexer.lp_open
            and self.lexer.last_token == "COMMA"
            and (self.lexer.last_token == "COMMA" or self.lexer.last_token == "LP")
            and t.value.upper() not in tok.first_liners
        ):
            t.type = "ID"
@@ -109,7 +117,6 @@ def set_last_token(self, t):
            self.lexer.is_table = False
        elif t.type == "TABLE" or t.type == "INDEX":
            self.lexer.is_table = True
        print(t.type, t.value)
        return t

    def t_newline(self, t):
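The semicolon added to the t_STRING character class above lets quoted values that contain ';' lex as a single STRING token. A minimal sketch of the intended effect, with a made-up table and comment (not taken from this commit):

from simple_ddl_parser import DDLParser

ddl = "CREATE TABLE demo (job_id STRING COMMENT 'value with ; inside');"
result = DDLParser(ddl).run(output_mode="hql")
# the column comment is expected to come back intact, semicolon included:
# result[0]["columns"][0]["comment"] == "'value with ; inside'"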
7 changes: 7 additions & 0 deletions simple_ddl_parser/dialects/hql.py
@@ -85,6 +85,13 @@ def p_expression_map_keys_terminated_by(self, p):
        p_list = list(p)
        p[0]["map_keys_terminated_by"] = check_spec(p_list[-1])

    def p_expression_skewed_by(self, p):
        """expr : expr SKEWED BY LP ID RP ON LP pid RP
        """
        p[0] = p[1]
        p_list = remove_par(list(p))
        p[0]["skewed_by"] = {'key': p_list[4], 'on': p_list[-1]}

    def p_expression_collection_terminated_by(self, p):
        """expr : expr COLLECTION ITEMS TERMINATED BY ID
        | expr COLLECTION ITEMS TERMINATED BY STRING
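The new p_expression_skewed_by rule reduces a SKEWED BY (...) ON (...) clause into a small dict on the parsed table; this mirrors the test_skewed_by case added at the bottom of this commit:

from simple_ddl_parser import DDLParser

ddl = """
CREATE TABLE list_bucket_single (key STRING, value STRING)
SKEWED BY (key) ON (1,5,6) STORED AS DIRECTORIES;
"""
result = DDLParser(ddl).run(output_mode="hql")
# result[0]["skewed_by"] == {'key': 'key', 'on': ['1', '5', '6']}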
8 changes: 5 additions & 3 deletions simple_ddl_parser/parser.py
@@ -95,12 +95,14 @@ def parse_data(self):
            if line.replace("\n", "").replace("\t", "") or num == len(lines) - 1:
                # to avoid issues when comma or parath are glued to column name
                if statement is not None:
                    statement += f" {line}"
                    statement += f" {line.strip()}"
                else:
                    statement = line
                if ";" not in statement and num != len(lines) - 1:
                    statement = line.strip()
                if not statement.endswith(';') and num != len(lines) - 1:
                    continue
                self.set_default_flags_in_lexer()
                if statement.endswith(';'):
                    statement = statement[:-1]
                _parse_result = yacc.parse(statement)

                if _parse_result:
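One effect of replacing the old ";" in statement check with statement.endswith(';') is that a semicolon inside a quoted value no longer ends statement accumulation early, and the trailing semicolon is now stripped before the statement reaches yacc.parse. A standalone sketch of that splitting behaviour (simplified, not the committed parse_data code):

lines = [
    "CREATE TABLE demo (",
    "  job_id STRING COMMENT 'a ; inside a string'",
    ");",
]
statement = None
for num, line in enumerate(lines):
    if line.replace("\n", "").replace("\t", "") or num == len(lines) - 1:
        if statement is not None:
            statement += f" {line.strip()}"
        else:
            statement = line.strip()
        if not statement.endswith(";") and num != len(lines) - 1:
            continue  # a ';' buried inside a quoted string no longer ends the statement
        if statement.endswith(";"):
            statement = statement[:-1]
        print(statement)  # CREATE TABLE demo ( job_id STRING COMMENT 'a ; inside a string' )
        statement = None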
3 changes: 2 additions & 1 deletion simple_ddl_parser/tokens.py
@@ -20,7 +20,6 @@
"KEY": "KEY",
"ADD": "ADD",
"AS": "AS",
"LIKE": "LIKE",
"CLONE": "CLONE",
"DEFERRABLE": "DEFERRABLE",
"INITIALLY": "INITIALLY",
@@ -49,6 +48,7 @@
"COMMENT": "COMMENT",
}
first_liners = {
"LIKE": "LIKE",
"CONSTRAINT": "CONSTRAINT",
"FOREIGN": "FOREIGN",
"PRIMARY": "PRIMARY",
@@ -77,6 +77,7 @@
"CLUSTER": "CLUSTER",
"SERDEPROPERTIES": "SERDEPROPERTIES",
"TBLPROPERTIES": "TBLPROPERTIES",
"SKEWED": "SKEWED",
# oracle
"STORAGE": "STORAGE",
"TABLESPACE": "TABLESPACE",
15 changes: 12 additions & 3 deletions tests/test_hql_output_mode.py
@@ -1606,7 +1606,7 @@ def test_special_characters_in_comment():
"columns": [
{
"check": None,
"comment": "'t# est | & * % $ // * 6 % !?;;±§@~^'",
"comment": "'t# est | & * % $ // * 6 % !?;;\\0b1\\0a7@~^'",
"default": None,
"name": "job_id",
"nullable": True,
@@ -1771,8 +1771,17 @@ def test_output_input_format():
    LOCATION
    'hdfs://xxxx'
    """
    from simple_ddl_parser import DDLParser
    parse_results = DDLParser(ddl).run(output_mode="hql")
    expected = [{'columns': [{'name': 'test', 'type': 'STRING', 'size': None, 'references': None, 'unique': False,
        'nullable': True, 'default': None, 'check': None, 'comment': "'xxxx'"}], 'primary_key': [], 'alter': {}, 'checks': [], 'index': [], 'partitioned_by': [], 'tablespace': None, 'stored_as': {'outputformat': "'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'", 'inputformat': "'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'"}, 'location': "'hdfs://xxxx'", 'comment': None, 'row_format': {'serde': True, 'java_class': "'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'"}, 'fields_terminated_by': None, 'lines_terminated_by': None, 'map_keys_terminated_by': None, 'collection_items_terminated_by': None, 'external': True, 'schema': None, 'table_name': 'test'}]
    assert expected == parse_results
    assert expected == parse_results


def test_skewed_by():
    ddl = """
    CREATE TABLE list_bucket_single (key STRING, value STRING)
    SKEWED BY (key) ON (1,5,6) STORED AS DIRECTORIES;
    """
    parse_results = DDLParser(ddl).run(output_mode="hql")
    expected =[{'columns': [{'name': 'key', 'type': 'STRING', 'size': None, 'references': None, 'unique': False, 'nullable': True, 'default': None, 'check': None}, {'name': 'value', 'type': 'STRING', 'size': None, 'references': None, 'unique': False, 'nullable': True, 'default': None, 'check': None}], 'primary_key': [], 'alter': {}, 'checks': [], 'index': [], 'partitioned_by': [], 'tablespace': None, 'stored_as': 'DIRECTORIES', 'location': None, 'comment': None, 'row_format': None, 'fields_terminated_by': None, 'lines_terminated_by': None, 'map_keys_terminated_by': None, 'collection_items_terminated_by': None, 'external': False, 'schema': None, 'table_name': 'list_bucket_single', 'skewed_by': {'key': 'key', 'on': ['1', '5', '6']}}]
    assert expected == parse_results
