Merge branch 'master' into feature_streaming_enhancments

ronanstokes-db · web-flow · commit 51ba8706aa4d · 2023-03-11T01:04:26.000-08:00
diff --git a/.github/workflows/onrelease.yml b/.github/workflows/onrelease.yml
@@ -30,6 +30,9 @@ jobs:
       - name: Install
         run: pip install pipenv
 
+      - name: Install dependencies
+        run: pipenv install --dev
+
       - name: Build dist
         run: pipenv run python setup.py sdist bdist_wheel
 
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
@@ -37,6 +37,9 @@ jobs:
       - name: Install 
         run: pip install pipenv
 
+      - name: Install dependencies
+        run: pipenv install --dev
+
       - name: Run tests
         run: make test
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,7 @@
 ## Change History
 All notable changes to the Databricks Labs Data Generator will be documented in this file.
 
-### Unreleased
+### Version 0.3.2
 
 #### Changed
 * Additional migration of tests to use of `pytest`
@@ -14,6 +14,7 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 * Changed build labelling to comply with PEP440
 
 #### Fixed 
+* Fixed compatibility of build with older versions of runtime that rely on `pyparsing` version 2.4.7
 
 #### Added 
 * Added support for additional streaming source types and for use of custom streaming sources
@@ -22,7 +23,8 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 * Parsing of SQL expressions to determine column dependencies
 
 #### Notes
-* This does not change actual order of column building - but adjusts which phase columns are built in 
+* The enhancements to build ordering do not change the actual order of column building -
+  but adjust which phase columns are built in
 
 
 ### Version 0.3.1
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -14,7 +14,8 @@ warrant that you have the legal authority to do so.
 # Building the code
 
 ## Package Dependencies
-See the contents of the file `python/require.txt` to see the Python package dependencies
+See the contents of the file `python/require.txt` to see the Python package dependencies. 
+Dependent packages are not installed automatically by the `dbldatagen` package.
 
 ## Python compatibility
 
diff --git a/Pipfile b/Pipfile
@@ -6,13 +6,6 @@ verify_ssl = true
 [dev-packages]
 pytest = "*"
 pytest-cov = "*"
-
-numpy = "1.22.0"
-pyspark = "3.1.3"
-pyarrow = "1.0.1"
-pandas = "1.1.3"
-pyparsing = ">=2.4.7,<3.0.9"
-
 sphinx = ">=2.0.0,<3.1.0"
 nbsphinx = "*"
 numpydoc = "0.8"
@@ -21,6 +14,16 @@ ipython = "7.31.1"
 pydata-sphinx-theme = "*"
 recommonmark = "*"
 sphinx-markdown-builder = "*"
+bumpversion = "*"
+
+[packages]
+numpy = "==1.22.0"
+pyspark = "==3.1.3"
+pyarrow = "==4.0.1"
+wheel = "==0.38.4"
+pandas = "==1.2.4"
+setuptools = "==65.6.3"
+pyparsing = "==2.4.7"
 
 [requires]
-python_version = "3.8"
+python_version = ">=3.8.10"
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ details of use and many examples.
 
 Release notes and details of the latest changes for this specific release
 can be found in the Github repository
-[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.2a0/CHANGELOG.md)
+[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.2/CHANGELOG.md)
 
 # Installation
 
@@ -126,6 +126,21 @@ examples.
 
 The Github repository also contains further examples in the examples directory
 
+## Spark and Databricks Runtime Compatibility
+The `dbldatagen` package is intended to be compatible with recent LTS versions of the Databricks runtime, including 
+older LTS versions from 10.4 LTS onwards. It also aims to be compatible with Delta Live Tables runtimes, 
+including `current` and `preview`. 
+
+While we don't specifically drop support for older runtimes, changes in PySpark APIs or
+APIs from dependent packages such as `numpy`, `pandas`, `pyarrow` and `pyparsing` may cause issues with older
+runtimes. 
+
+Installing `dbldatagen` deliberately does not install releases of dependent packages, so as to preserve the curated
+set of packages installed in any Databricks runtime environment.
+
+When building on local environments, the `Pipfile` and requirements files are used to determine the versions 
+tested against for releases and unit tests. 
+
 ## Project Support
 Please note that all projects released under [`Databricks Labs`](https://www.databricks.com/learn/labs)
  are provided for your exploration only, and are not formally supported by Databricks with Service Level Agreements 
diff --git a/dbldatagen/_version.py b/dbldatagen/_version.py
@@ -33,7 +33,7 @@ def get_version(version):
     return version_info
 
 
-__version__ = "0.3.2a0"  # DO NOT EDIT THIS DIRECTLY!  It is managed by bumpversion
+__version__ = "0.3.2"  # DO NOT EDIT THIS DIRECTLY!  It is managed by bumpversion
 __version_info__ = get_version(__version__)
 
 
diff --git a/dbldatagen/schema_parser.py b/dbldatagen/schema_parser.py
@@ -273,18 +273,18 @@ def _cleanseSQL(cls, sql_string):
 
         # skip over quoted identifiers even if they contain quotes
         quoted_ident = pp.QuotedString(quoteChar="`", escQuote="``")
-        quoted_ident.set_parse_action(lambda s, loc, toks: f"`{toks[0]}`")
+        quoted_ident.setParseAction(lambda s, loc, toks: f"`{toks[0]}`")
 
         stringForm1 = pp.Literal('r') + pp.QuotedString(quoteChar="'")
         stringForm2 = pp.Literal('r') + pp.QuotedString(quoteChar='"')
         stringForm3 = pp.QuotedString(quoteChar="'", escQuote=r"\'")
         stringForm4 = pp.QuotedString(quoteChar='"', escQuote=r'\"')
         stringForm = stringForm1 ^ stringForm2 ^ stringForm3 ^ stringForm4
-        stringForm.set_parse_action(lambda s, loc, toks: "' '")
+        stringForm.setParseAction(lambda s, loc, toks: "' '")
 
         parser = quoted_ident ^ stringForm
 
-        transformed_string = parser.transform_string(sql_string)
+        transformed_string = parser.transformString(sql_string)
 
         return transformed_string
 
@@ -312,7 +312,7 @@ def columnsReferencesFromSQLString(cls, sql_string, filter=None):
         ident = pp.Word(pp.alphas, pp.alphanums + "_") | pp.QuotedString(quoteChar="`", escQuote="``")
         parser = ident
 
-        references = parser.search_string(cleansed_sql_string)
+        references = parser.searchString(cleansed_sql_string)
 
         results = set([item for sublist in references for item in sublist])
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -28,7 +28,7 @@
 author = 'Databricks Inc'
 
 # The full version, including alpha/beta/rc tags
-release = "0.3.2a0"  # DO NOT EDIT THIS DIRECTLY!  It is managed by bumpversion
+release = "0.3.2"  # DO NOT EDIT THIS DIRECTLY!  It is managed by bumpversion
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/python/.bumpversion.cfg b/python/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.2a0
+current_version = 0.3.2
 commit = False
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+){0,1}(?P<release>\D*)(?P<build>\d*)
diff --git a/python/dev_require.txt b/python/dev_require.txt
@@ -1,14 +1,14 @@
 # The following packages are used in building the test data generator framework.
 # All packages used are already installed in the Databricks runtime environment for version 6.5 or later
-numpy==1.19.2
+numpy==1.22.0
 pandas==1.2.4
 pickleshare==0.7.5
 py4j==0.10.9
-pyarrow==4.0.0
-pyspark>=3.1.2
+pyarrow==4.0.1
+pyspark>=3.1.3
 python-dateutil==2.8.1
 six==1.15.0
-pyparsing>=2.4.7, <= 3.0.9
+pyparsing==2.4.7
 
 # The following packages are required for development only
 wheel==0.36.2
diff --git a/python/require.txt b/python/require.txt
@@ -4,11 +4,11 @@ numpy==1.22.0
 pandas==1.2.5
 pickleshare==0.7.5
 py4j==0.10.9
-pyarrow==4.0.0
-pyspark>=3.1.2
+pyarrow==4.0.1
+pyspark>=3.1.3
 python-dateutil==2.8.1
 six==1.15.0
-pyparsing>=2.4.7, <= 3.0.9
+pyparsing==2.4.7
 
 # The following packages are required for development only
 wheel==0.36.2
diff --git a/setup.py b/setup.py
@@ -31,7 +31,7 @@
 
 setuptools.setup(
     name="dbldatagen",
-    version="0.3.2a0",
+    version="0.3.2",
     author="Ronan Stokes, Databricks",
     description="Databricks Labs -  PySpark Synthetic Data Generator",
     long_description=long_description,