
Commit ab5908a

Merge branch 'master' into feature_streaming_enhancments
2 parents: 51ba870 + d5ddb8e

File tree: 6 files changed (+183, -17 lines)

CHANGELOG.md

Lines changed: 5 additions & 0 deletions

@@ -3,6 +3,11 @@
 ## Change History
 All notable changes to the Databricks Labs Data Generator will be documented in this file.
 
+### Unreleased
+
+#### Changed
+* Fixed use of logger in _version.py and in spark_singleton.py
+
 ### Version 0.3.2
 
 #### Changed

README.md

Lines changed: 24 additions & 14 deletions

@@ -9,8 +9,10 @@
 <!-- Dont remove: end exclude package -->
 
 [![build](https://github.com/databrickslabs/dbldatagen/workflows/build/badge.svg?branch=master)](https://github.com/databrickslabs/dbldatagen/actions?query=workflow%3Abuild+branch%3Amaster)
+[![PyPi package](https://img.shields.io/pypi/v/dbldatagen?color=green)](https://pypi.org/project/dbldatagen/)
 [![codecov](https://codecov.io/gh/databrickslabs/dbldatagen/branch/master/graph/badge.svg)](https://codecov.io/gh/databrickslabs/dbldatagen)
-[![PyPi downloads](https://img.shields.io/pypi/dm/dbldatagen?label=PyPi%20Downloads)](https://pypi.org/project/dbldatagen/)
+[![PyPi downloads](https://img.shields.io/pypi/dm/dbldatagen?label=PyPi%20Downloads)](https://pypistats.org/packages/dbldatagen)
+
 <!--
 [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/databrickslabs/dbldatagen.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/databrickslabs/dbldatagen/context:python)
 [![downloads](https://img.shields.io/github/downloads/databrickslabs/dbldatagen/total.svg)](https://hanadigital.github.io/grev/?user=databrickslabs&repo=dbldatagen)

@@ -89,6 +91,14 @@ release notes for library compatibility
 
 - https://docs.databricks.com/release-notes/runtime/releases.html
 
+When using the Databricks Labs Data Generator on Unity Catalog enabled environments, the Data Generator requires
+the use of `Single User` or `No Isolation Shared` access modes as some needed features are not available in `Shared`
+mode (for example, use of 3rd party libraries). Depending on settings, `Custom` access mode may be supported.
+
+See the following documentation for more information:
+
+- https://docs.databricks.com/data-governance/unity-catalog/compute.html
+
 ## Using the Data Generator
 To use the data generator, install the library using the `%pip install` method or install the Python wheel directly
 in your environment.

@@ -104,19 +114,19 @@ column_count = 10
 data_rows = 1000 * 1000
 df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=data_rows,
                             partitions=4)
-                  .withIdOutput()
-                  .withColumn("r", FloatType(),
-                              expr="floor(rand() * 350) * (86400 + 3600)",
-                              numColumns=column_count)
-                  .withColumn("code1", IntegerType(), minValue=100, maxValue=200)
-                  .withColumn("code2", IntegerType(), minValue=0, maxValue=10)
-                  .withColumn("code3", StringType(), values=['a', 'b', 'c'])
-                  .withColumn("code4", StringType(), values=['a', 'b', 'c'],
-                              random=True)
-                  .withColumn("code5", StringType(), values=['a', 'b', 'c'],
-                              random=True, weights=[9, 1, 1])
-
-                  )
+           .withIdOutput()
+           .withColumn("r", FloatType(),
+                       expr="floor(rand() * 350) * (86400 + 3600)",
+                       numColumns=column_count)
+           .withColumn("code1", IntegerType(), minValue=100, maxValue=200)
+           .withColumn("code2", IntegerType(), minValue=0, maxValue=10)
+           .withColumn("code3", StringType(), values=['a', 'b', 'c'])
+           .withColumn("code4", StringType(), values=['a', 'b', 'c'],
+                       random=True)
+           .withColumn("code5", StringType(), values=['a', 'b', 'c'],
+                       random=True, weights=[9, 1, 1])
+
+           )
 
 df = df_spec.build()
 num_rows=df.count()
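
Note: the README example above assumes that `dbldatagen` has been imported as `dg`, that the Spark SQL type classes have been imported, and that a SparkSession named `spark` already exists (as it does in Databricks notebooks). A minimal self-contained sketch under those assumptions, using the library's own SparkSingleton helper (see the spark_singleton.py diff below) to create a local session:

    import dbldatagen as dg
    from pyspark.sql.types import IntegerType

    # Outside Databricks there is no predefined `spark` session, so create one locally.
    spark = dg.SparkSingleton.getLocalInstance("readme example")

    # Abbreviated spec; the full README example adds more columns in the same style.
    df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=1000, partitions=4)
               .withIdOutput()
               .withColumn("code1", IntegerType(), minValue=100, maxValue=200))

    df = df_spec.build()
    print(df.count())  # expect 1000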

dbldatagen/_version.py

Lines changed: 2 additions & 1 deletion

@@ -44,7 +44,8 @@ def _get_spark_version(sparkVersion):
         spark_version_info = VersionInfo(int(major), int(minor), int(patch), release, build="0")
     except (RuntimeError, AttributeError):
         spark_version_info = VersionInfo(major=3, minor=0, patch=1, release="unknown", build="0")
-        logging.warning("Could not parse spark version - using assumed Spark Version : %s", spark_version_info)
+        logger = logging.getLogger(__name__)
+        logger.warning("Could not parse spark version - using assumed Spark Version : %s", spark_version_info)
 
     return spark_version_info
 
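
The fix here replaces the module-level convenience call `logging.warning(...)` with a logger named after the module. This matters beyond style: the root-level convenience functions implicitly call `logging.basicConfig()` when the root logger has no handlers, silently mutating the host application's logging setup, and their output cannot be filtered per module. With a named logger, an application can tune this module's verbosity directly; a minimal sketch (the logger name follows from `__name__` in dbldatagen/_version.py):

    import logging

    # Suppress only this module's warnings, without touching global logging state.
    logging.getLogger("dbldatagen._version").setLevel(logging.ERROR)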

dbldatagen/data_generator.py

Lines changed: 1 addition & 1 deletion

@@ -89,7 +89,7 @@ class DataGenerator:
 
     # restrict spurious messages from java gateway
     logging.getLogger("py4j").setLevel(logging.WARNING)
-    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.NOTSET)
+    #logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.NOTSET)
 
     def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
                  rows=1000000, startingId=0, randomSeed=None, partitions=None, verbose=False,
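
Commenting out this `logging.basicConfig(...)` call removes root-logger configuration that previously executed at class-definition time, i.e. whenever the library was imported, clobbering whatever logging setup the importing application had established. Configuration is now left to the application; a sketch of an application opting into a similar format itself (the INFO level here is an arbitrary application-side choice):

    import logging

    # The application, not an imported library, should configure the root logger.
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

    import dbldatagen as dg  # importing the library no longer overrides this setup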

dbldatagen/spark_singleton.py

Lines changed: 2 additions & 1 deletion

@@ -42,7 +42,8 @@ def getLocalInstance(cls, appName="new Spark session", useAllCores=True):
         else:
             spark_core_count = cpu_count - 1
 
-        logging.info("Spark core count: %d", spark_core_count)
+        logger = logging.getLogger(__name__)
+        logger.info("Spark core count: %d", spark_core_count)
 
         sparkSession = SparkSession.builder \
             .master(f"local[{spark_core_count}]") \
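
The same named-logger change is applied here. For context, `getLocalInstance` builds a local SparkSession sized from the machine's CPU count, using either all cores or all but one depending on `useAllCores`; a usage sketch matching the call in the new tests below:

    import dbldatagen as dg

    # Creates a SparkSession with master "local[n]", where n is derived
    # from the CPU count as shown in the diff above.
    spark = dg.SparkSingleton.getLocalInstance("unit tests")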

tests/test_logging.py

Lines changed: 149 additions & 0 deletions

@@ -0,0 +1,149 @@
+import logging
+import pytest
+
+from pyspark.sql import functions as F
+from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
+
+# import dbldatagen as dg
+
+
+@pytest.fixture(scope="class")
+def setupSpark():
+    import dbldatagen as dg
+    sparkSession = dg.SparkSingleton.getLocalInstance("unit tests")
+    return sparkSession
+
+
+@pytest.fixture(scope="class")
+def setupLogging():
+    # FORMAT = '%(asctime)-15s %(message)s'
+    # logging.basicConfig(format=FORMAT)
+    pass
+
+
+class TestLoggingOperation:
+    testDataSpec = None
+    dfTestData = None
+    SMALL_ROW_COUNT = 100000
+    TINY_ROW_COUNT = 1000
+    column_count = 10
+    row_count = SMALL_ROW_COUNT
+
+    def setup_log_capture(self, caplog_object):
+        """ set up log capture fixture
+
+        Sets up log capture fixture to only capture messages after setup and only
+        capture warnings and errors
+
+        """
+        caplog_object.set_level(logging.INFO)
+
+        # clear messages from setup
+        caplog_object.clear()
+
+    def get_log_capture_warnings_and_errors(self, caplog_object, textFlag):
+        """
+        gets count of warnings and errors containing specified text
+
+        :param caplog_object: log capture object from fixture
+        :param textFlag: text to search for to include error or warning in count
+        :return: count of warnings and errors containing text specified in `textFlag`
+        """
+        flagged_text_warnings_and_errors = 0
+        for r in caplog_object.records:
+            if (r.levelname == "WARNING" or r.levelname == "ERROR") and textFlag in r.message:
+                flagged_text_warnings_and_errors += 1
+
+        return flagged_text_warnings_and_errors
+
+    def get_log_capture_info(self, caplog_object, textFlag):
+        """
+        gets count of info messages containing specified text
+
+        :param caplog_object: log capture object from fixture
+        :param textFlag: text to search for to include info message in count
+        :return: count of info messages containing text specified in `textFlag`
+        """
+        flagged_text_info = 0
+        for r in caplog_object.records:
+            if (r.levelname == "INFO") and textFlag in r.message:
+                flagged_text_info += 1
+
+        return flagged_text_info
+
+
+    def test_logging_operation(self, caplog):
+        # caplog fixture captures log content
+        self.setup_log_capture(caplog)
+
+        date_format = "%Y-%m-%d %H:%M:%S"
+        log_format = "[%(name)s]%(asctime)s %(levelname)-8s [%(module)s][%(funcName)s] TESTING1 %(message)s"
+        formatter = logging.Formatter(log_format, date_format)
+        handler = logging.StreamHandler()
+        handler.setFormatter(formatter)
+        logger = logging.getLogger(__name__)
+        logger.setLevel(level=logging.INFO)
+        logger.addHandler(handler)
+
+        logger.warning("Info message 1")
+
+        # Prints: 2023-03-08 14:18:59 WARNING Info message 1
+
+        from dbldatagen import DataGenerator
+
+        logger.info("Info message 2")
+
+        for h in logger.handlers:
+            h.flush()
+
+        message1_count = self.get_log_capture_warnings_and_errors(caplog, "Info message 1")
+        assert message1_count == 1, "Should only have 1 message 1"
+
+        message2_count = self.get_log_capture_info(caplog, "Info message 2")
+        assert message2_count == 1, "Should only have 1 message 2"
+
+
+    def test_logging_operation2(self, setupSpark, caplog):
+        self.setup_log_capture(caplog)
+
+        spark = setupSpark
+
+        date_format = "%Y-%m-%d %H:%M:%S"
+        log_format = "%(asctime)s %(levelname)-8s TESTING2 %(message)s"
+        formatter1 = logging.Formatter(log_format, date_format)
+        handler1 = logging.StreamHandler()
+        handler1.setFormatter(formatter1)
+        logger2 = logging.getLogger("test1")
+        logger2.setLevel(level=logging.INFO)
+        logger2.addHandler(handler1)
+
+        logger2.warning("Info message 1")
+        # Prints: 2023-03-08 14:18:59 WARNING Info message 1
+
+        from dbldatagen import DataGenerator
+
+        spec = (DataGenerator(sparkSession=spark, name="test_data_set1", rows=10000, seedMethod='hash_fieldname')
+                .withIdOutput()
+                .withColumn("r", "float", expr="floor(rand() * 350) * (86400 + 3600)",
+                            numColumns=10)
+                .withColumn("code1", "int", min=100, max=200)
+                .withColumn("code2", "int", min=0, max=10)
+                .withColumn("code3", "string", values=['a', 'b', 'c'])
+                .withColumn("code4", "string", values=['a', 'b', 'c'], random=True)
+                .withColumn("code5", "string", values=['a', 'b', 'c'], random=True, weights=[9, 1, 1])
+                )
+
+        df = spec.build()
+
+
+        # Prints: INFO: Version : VersionInfo(major='0', minor='3', patch='1', release='', build='')
+        logger2.info("Info message 2")
+
+        for h in logger2.handlers:
+            h.flush()
+
+        message1_count = self.get_log_capture_warnings_and_errors(caplog, "Info message 1")
+        assert message1_count == 1, "Should only have 1 message 1"
+
+        message2_count = self.get_log_capture_info(caplog, "Info message 2")
+        assert message2_count == 1, "Should only have 1 message 2"
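
These tests rely on pytest's caplog fixture, which captures log records as they propagate from named loggers up toward the root logger; the helper methods then count the captured WARNING/ERROR or INFO records containing a marker string. A minimal sketch of the propagation behavior the capture depends on:

    import logging

    # Records emitted through any named logger propagate to ancestor loggers
    # by default (logger.propagate is True), which is how caplog's handler
    # observes them even though it is not attached to "test1" directly.
    logger = logging.getLogger("test1")
    logger.warning("Info message 1")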
