Merge pull request #279 from xnuinside/v1.6.0_release

V1.6.0 release: refactoring & adding EQ token
xnuinside · Aug 11, 2024 · 35451ea · 35451ea
2 parents 548042a + 689a6df
commit 35451ea
Show file tree

Hide file tree

Showing 27 changed files with 63,859 additions and 679 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,8 +1,37 @@
+**v1.6.0**
+### IMPORTANT:
+In this version there are some output changes & fixes that can break your code.
+1. Now all arguments inside brackets are parsed as separate strings in the list.
+For example:
+ `file_format = (TYPE=JSON NULL_IF=('field'))` was previously parsed as 'NULL_IF': "('field')",
+ now it will be: 'NULL_IF': ["'field'"],
+
+2. Added separate tokens for EQ `=` and IN (previously they were also parsed as IDs) - for internal info, for contributors.
+
+3. Some check statements in columns are now parsed correctly; IN statements are also parsed as normal lists.
+So this statement include_exclude_ind CHAR(1) NOT NULL CONSTRAINT chk_metalistcombo_logicalopr
+  CHECK (include_exclude_ind IN ('I', 'E')),
+
+
+will produce this output:
+
+{'check': {'constraint_name': 'chk_metalistcombo_logicalopr',
+                         'statement': {'in_statement': {'in': ["'I'", "'E'"],
+                                                        'name': 'include_exclude_ind'}}},
+
+
+### Fixes
+1. The DEFAULT keyword no longer ends up in the 'default' key (previously it did in some cases)
+
+### New Features
+1. Added Athena output mode and initial support - https://github.com/datacontract/datacontract-cli/issues/332
+
+
 **v1.5.4**
 ### Improvements
 #### Snowflake :
 1. In Snowflake add `pattern` token for external table statement, and improve location rendering
-2.
+
 
 **v1.5.3**
 ### Fixes

diff --git a/README.md b/README.md
@@ -489,11 +489,40 @@ for help with debugging & testing support for BigQuery dialect DDLs:
 * https://github.com/kalyan939
 
 ## Changelog
+**v1.6.0**
+### IMPORTANT:
+In this version there are some output changes & fixes that can break your code.
+1. Now all arguments inside brackets are parsed as separate strings in the list.
+For example:
+ `file_format = (TYPE=JSON NULL_IF=('field'))` was previously parsed as 'NULL_IF': "('field')",
+ now it will be: 'NULL_IF': ["'field'"],
+
+2. Added separate tokens for EQ `=` and IN (previously they were also parsed as IDs) - for internal info, for contributors.
+
+3. Some check statements in columns are now parsed correctly; IN statements are also parsed as normal lists.
+So this statement include_exclude_ind CHAR(1) NOT NULL CONSTRAINT chk_metalistcombo_logicalopr
+  CHECK (include_exclude_ind IN ('I', 'E')),
+
+
+will produce this output:
+
+{'check': {'constraint_name': 'chk_metalistcombo_logicalopr',
+                         'statement': {'in_statement': {'in': ["'I'", "'E'"],
+                                                        'name': 'include_exclude_ind'}}},
+
+
+### Fixes
+1. The DEFAULT keyword no longer ends up in the 'default' key (previously it did in some cases)
+
+### New Features
+1. Added Athena output mode and initial support - https://github.com/datacontract/datacontract-cli/issues/332
+
+
 **v1.5.4**
 ### Improvements
 #### Snowflake :
 1. In Snowflake add `pattern` token for external table statement, and improve location rendering
-2.
+
 
 **v1.5.3**
 ### Fixes

diff --git a/docs/README.rst b/docs/README.rst
@@ -555,6 +555,46 @@ for help with debugging & testing support for BigQuery dialect DDLs:
 Changelog
 ---------
 
+**v1.6.0**
+
+IMPORTANT:
+^^^^^^^^^^
+
+In this version there are some output changes & fixes that can break your code.
+
+
+#. 
+   Now all arguments inside brackets are parsed as separate strings in the list.
+   For example:
+   ``file_format = (TYPE=JSON NULL_IF=('field'))`` was previously parsed as 'NULL_IF': "('field')",
+   now it will be: 'NULL_IF': ["'field'"],
+
+#. 
+   Added separate tokens for EQ ``=`` and IN (previously they were also parsed as IDs) - for internal info, for contributors.
+
+#. 
+   Some check statements in columns are now parsed correctly; IN statements are also parsed as normal lists.
+   So this statement include_exclude_ind CHAR(1) NOT NULL CONSTRAINT chk_metalistcombo_logicalopr
+   CHECK (include_exclude_ind IN ('I', 'E')),
+
+will produce this output:
+
+{'check': {'constraint_name': 'chk_metalistcombo_logicalopr',
+                         'statement': {'in_statement': {'in': ["'I'", "'E'"],
+                                                        'name': 'include_exclude_ind'}}},
+
+Fixes
+^^^^^
+
+
+#. The DEFAULT keyword no longer ends up in the 'default' key (previously it did in some cases)
+
+New Features
+^^^^^^^^^^^^
+
+
+#. Added Athena output mode and initial support - https://github.com/datacontract/datacontract-cli/issues/332
+
 **v1.5.4**
 
 Improvements
@@ -565,7 +605,6 @@ Snowflake :
 
 
 #. In Snowflake add ``pattern`` token for external table statement, and improve location rendering
-   2.
 
 **v1.5.3**
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "simple-ddl-parser"
-version = "1.5.4"
+version = "1.6.0"
 description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL (MSSQL), Oracle, AWS Redshift, Snowflake, MySQL, PostgreSQL, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl."
 authors = ["Iuliia Volkova <xnuinside@gmail.com>"]
 license = "MIT"

diff --git a/simple_ddl_parser/ddl_parser.py b/simple_ddl_parser/ddl_parser.py
@@ -7,6 +7,7 @@
     HQL,
     MSSQL,
     PSQL,
+    Athena,
     BaseSQL,
     BigQuery,
     IBMDb2,
@@ -37,6 +38,7 @@ class Dialects(
     BigQuery,
     IBMDb2,
     PSQL,
+    Athena,
 ):
     pass
 
@@ -115,8 +117,13 @@ def set_lexer_tags(self, t: LexToken) -> None:
         elif t.type == "CHECK":
             self.lexer.check = True
 
+    def t_EQ(self, t: LexToken) -> LexToken:
+        r"(=)+"
+        t.type = "EQ"
+        return self.set_last_token(t)
+
     def t_DOT(self, t: LexToken) -> LexToken:
-        r"\."
+        r"(\.)+"
         t.type = "DOT"
         return self.set_last_token(t)
 
@@ -154,14 +161,18 @@ def is_creation_name(self, t: LexToken) -> bool:
             "TYPE",
             "DOMAIN",
             "TABLESPACE",
-            "INDEX",
             "CONSTRAINT",
             "EXISTS",
         ]
         return (
             t.value not in skip_id_tokens
             and t.value.upper() not in ["IF"]
-            and self.lexer.last_token in exceptional_keys
+            and (
+                self.lexer.last_token in exceptional_keys
+                or (
+                    self.lexer.last_token == "INDEX" and self.lexer.is_table is not True
+                )
+            )
             and not self.exceptional_cases(t.value.upper())
         )
 
@@ -188,6 +199,8 @@ def t_AUTOINCREMENT(self, t: LexToken):
 
     def t_ID(self, t: LexToken):
         r"([0-9]+[.][0-9]*([e][+-]?[0-9]+)?|[0-9]\.[0-9])\w|([a-zA-Z_,0-9:><\/\\\=\-\+\~\%$@#\|&?;*\()!{}\[\]\`\[\]]+)"
+        if len(t.value) > 1 and t.value.endswith(","):
+            t.value = t.value[:-1]
         t.type = tok.symbol_tokens.get(t.value, "ID")
 
         if t.type == "LP":

diff --git a/simple_ddl_parser/dialects/__init__.py b/simple_ddl_parser/dialects/__init__.py
@@ -1,3 +1,4 @@
+from simple_ddl_parser.dialects.athena import Athena
 from simple_ddl_parser.dialects.bigquery import BigQuery
 from simple_ddl_parser.dialects.hql import HQL
 from simple_ddl_parser.dialects.ibm import IBMDb2
@@ -22,4 +23,5 @@
     "IBMDb2",
     "BaseSQL",
     "PSQL",
+    "Athena",
 ]
diff --git a/simple_ddl_parser/dialects/athena.py b/simple_ddl_parser/dialects/athena.py
@@ -0,0 +1,11 @@
+from typing import List
+
+
+class Athena:
+    def p_escaped_by(self, p: List) -> None:
+        """expr : expr ESCAPED BY STRING_BASE"""
+        p[0] = p[1]
+        p_list = list(p)
+        if "\\\\" in p_list[-1]:
+            p_list[-1] = "\\"
+        p[0]["escaped_by"] = p_list[-1]
diff --git a/simple_ddl_parser/dialects/bigquery.py b/simple_ddl_parser/dialects/bigquery.py
@@ -15,10 +15,11 @@ def p_multiple_options(self, p):
             p[0] = p[1]
 
     def p_options(self, p):
-        """options : OPTIONS LP id_equals RP"""
+        """options : OPTIONS LP multi_id_equals RP"""
         p_list = list(p)
         if not isinstance(p[1], dict):
-            p[0] = {"options": p[3]}
+            options = [{key: value} for key, value in p[3].items()]
+            p[0] = {"options": options}
         else:
             p[0] = p[1]
             if len(p) == 4:

diff --git a/simple_ddl_parser/dialects/hql.py b/simple_ddl_parser/dialects/hql.py
@@ -5,13 +5,22 @@
 
 class HQL:
     def p_expression_location(self, p: List) -> None:
-        """expr : expr LOCATION STRING
+        """expr : expr LOCATION EQ STRING
+        | expr LOCATION EQ DQ_STRING
+        | expr LOCATION EQ multi_id_or_string
         | expr LOCATION DQ_STRING
+        | expr LOCATION STRING
         | expr LOCATION multi_id_or_string
+        | expr LOCATION EQ ID EQ ID EQ ID
         """
+        # last expr for sample like location=@ADL_Azure_Storage_Account_Container_Name/year=2023/month=08/
         p[0] = p[1]
         p_list = list(p)
-        p[0]["location"] = p_list[-1]
+        if len(p_list) == 9:
+            location = "".join(p_list[4:])
+        else:
+            location = p_list[-1]
+        p[0]["location"] = location
 
     def p_expression_clustered(self, p: List) -> None:
         """expr : expr ID ON LP pid RP
@@ -73,10 +82,10 @@ def p_multi_assignments(self, p: List) -> None:
         p[0].update(p_list[-1])
 
     def p_assignment(self, p: List) -> None:
-        """assignment : id id id
-        |  STRING id STRING
-        |  id id STRING
-        |  STRING id id
+        """assignment : id EQ id
+        |  STRING EQ STRING
+        |  id EQ STRING
+        |  STRING EQ id
         |  STRING id"""
         p_list = remove_par(list(p))
         if "state" in self.lexer.__dict__:
@@ -142,6 +151,7 @@ def p_expression_partitioned_by_hql(self, p: List) -> None:
         """expr : expr PARTITIONED BY pid_with_type
         | expr PARTITIONED BY LP pid RP
         | expr PARTITIONED BY LP multiple_funct RP
+        | expr PARTITIONED BY funct
         """
         p[0] = p[1]
         p_list = remove_par(list(p))

diff --git a/simple_ddl_parser/dialects/ibm.py b/simple_ddl_parser/dialects/ibm.py
@@ -3,8 +3,7 @@
 
 class IBMDb2:
     def p_expr_index_in(self, p: List) -> None:
-        """expr : expr INDEX id id"""
+        """expr : expr INDEX IN id"""
         p_list = list(p)
-        if p_list[-2].upper() == "IN":
-            p[1].update({"index_in": p_list[-1]})
+        p[1].update({"index_in": p_list[-1]})
         p[0] = p[1]
diff --git a/simple_ddl_parser/dialects/mssql.py b/simple_ddl_parser/dialects/mssql.py
@@ -35,9 +35,9 @@ def p_with(self, p: List) -> None:
             p[0]["with"]["properties"] = p_list[-1]["properties"]
 
     def p_equals(self, p: List) -> None:
-        """equals : id id id
-        | id id ON
-        | id id id DOT id
+        """equals : id EQ id
+        | id EQ ON
+        | id EQ dot_id
         """
         p_list = list(p)
         if "." in p_list:

diff --git a/simple_ddl_parser/dialects/mysql.py b/simple_ddl_parser/dialects/mysql.py
@@ -3,11 +3,17 @@
 
 class MySQL:
     def p_engine(self, p: List) -> None:
-        """expr : expr ENGINE id id"""
+        """expr : expr ENGINE EQ id"""
         p_list = list(p)
         p[0] = p[1]
         p[0]["engine"] = p_list[-1]
 
+    def p_db_properties(self, p: List) -> None:
+        """expr : expr id EQ id_or_string"""
+        p_list = list(p)
+        p[0] = p[1]
+        p[0][p[2]] = p_list[-1]
+
     def p_on_update(self, p: List) -> None:
         """on_update : ON UPDATE id
         | ON UPDATE STRING

diff --git a/simple_ddl_parser/dialects/redshift.py b/simple_ddl_parser/dialects/redshift.py
@@ -16,6 +16,7 @@ def p_encode(self, p: List) -> None:
     def p_expression_diststyle(self, p: List) -> None:
         """expr : expr id id
         | expr id KEY
+        | expr IN id
         """
         p_list = list(p)
         if p_list[-2] == "IN":