
Commit 5f51db1

wip
1 parent b363acc commit 5f51db1

File tree: 2 files changed (+180 / -11 lines changed)


dbldatagen/data_generator.py

Lines changed: 53 additions & 11 deletions
@@ -44,10 +44,11 @@
 _STREAM_SOURCE_START_TIMESTAMP = "startTimestamp"

 _STREAMING_SOURCE_TEXT = "text"
-_STREAMING_SOURCE_TEXT = "parquet"
-_STREAMING_SOURCE_TEXT = "csv"
-_STREAMING_SOURCE_TEXT = "json"
-_STREAMING_SOURCE_TEXT = "ord"
+_STREAMING_SOURCE_PARQUET = "parquet"
+_STREAMING_SOURCE_CSV = "csv"
+_STREAMING_SOURCE_JSON = "json"
+_STREAMING_SOURCE_ORC = "orc"
+_STREAMING_SOURCE_DELTA = "delta"

 class DataGenerator:
     """ Main Class for test data set generation
@@ -901,19 +902,60 @@ def _getBaseDataFrame(self, startId=0, streaming=False, options=None):

         else:
             self._applyStreamingDefaults(build_options, passthrough_options)
-            status = (
-                f"Generating streaming data frame with {id_partitions} partitions")
+
+            assert _STREAMING_SOURCE_OPTION in build_options.keys(), "There must be a source type specified"
+            streaming_source_format = build_options[_STREAMING_SOURCE_OPTION]
+
+            if streaming_source_format in [_STREAMING_SOURCE_RATE, _STREAMING_SOURCE_RATE_MICRO_BATCH]:
+                streaming_partitions = passthrough_options[_STREAMING_SOURCE_NUM_PARTITIONS]
+                status = (
+                    f"Generating streaming data frame with {streaming_partitions} partitions")
+            else:
+                status = (
+                    f"Generating streaming data frame with '{streaming_source_format}' streaming source")

             self.logger.info(status)
             self.executionHistory.append(status)

             df1 = (self.sparkSession.readStream
-                   .format("rate"))
+                   .format(streaming_source_format))

             for k, v in passthrough_options.items():
                 df1 = df1.option(k, v)
-            df1 = (df1.load()
-                   .withColumnRenamed("value", self._seedColumnName)
-                   )
+
+            file_formats = [_STREAMING_SOURCE_TEXT, _STREAMING_SOURCE_JSON, _STREAMING_SOURCE_CSV,
+                            _STREAMING_SOURCE_PARQUET, _STREAMING_SOURCE_DELTA, _STREAMING_SOURCE_ORC]
+
+            data_path = None
+            source_table = None
+            id_column = "value"
+
+            if _STREAMING_ID_FIELD_OPTION in build_options:
+                id_column = build_options[_STREAMING_ID_FIELD_OPTION]
+
+            if _STREAMING_TABLE_OPTION in build_options:
+                source_table = build_options[_STREAMING_TABLE_OPTION]
+
+            if _STREAMING_SCHEMA_OPTION in build_options:
+                source_schema = build_options[_STREAMING_SCHEMA_OPTION]
+                df1 = df1.schema(source_schema)
+
+            # get path for file based reads
+            if _STREAMING_PATH_OPTION in build_options:
+                data_path = build_options[_STREAMING_PATH_OPTION]
+            elif streaming_source_format in file_formats:
+                if "path" in passthrough_options:
+                    data_path = passthrough_options["path"]
+
+            if data_path is not None:
+                df1 = df1.load(data_path)
+            elif source_table is not None:
+                df1 = df1.table(source_table)
+            else:
+                df1 = df1.load()
+
+            if id_column != self._seedColumnName:
+                df1 = df1.withColumnRenamed(id_column, self._seedColumnName)

         return df1

@@ -1174,7 +1216,7 @@ def _applyStreamingDefaults(self, build_options, passthrough_options):
             build_options[_STREAMING_SOURCE_OPTION] = _STREAMING_SOURCE_RATE

         # setup `numPartitions` if not specified
-        if build_options[_STREAMING_SOURCE_OPTION] in [_STREAMING_SOURCE_RATE,_STREAMING_SOURCE_RATE_MICRO_BATCH]:
+        if build_options[_STREAMING_SOURCE_OPTION] in [_STREAMING_SOURCE_RATE, _STREAMING_SOURCE_RATE_MICRO_BATCH]:
             if _STREAMING_SOURCE_NUM_PARTITIONS not in passthrough_options:
                 passthrough_options[_STREAMING_SOURCE_NUM_PARTITIONS] = self.partitions
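
The net effect of these changes: a streaming data spec still defaults to Spark's "rate" source, but build options can now point it at a file-based source or a table instead. A minimal usage sketch follows; the option keys are the ones exercised by test_text_streaming in this commit's tests, while the source path is only a placeholder.

import dbldatagen as dg
from pyspark.sql.types import IntegerType

spark = dg.SparkSingleton.getLocalInstance("streaming usage sketch")

# the data spec itself is unchanged; only the build() options differ
spec = (dg.DataGenerator(sparkSession=spark, name="usage_sketch", rows=100000, partitions=4)
        .withColumn("code1", IntegerType(), minValue=100, maxValue=200))

# no streaming options: _applyStreamingDefaults selects the "rate" source and
# defaults numPartitions to the spec's partition count
df_rate = spec.build(withStreaming=True)

# with this commit, an existing file-based source can drive the stream instead;
# option keys as used in test_text_streaming, the path is a hypothetical placeholder
df_from_files = spec.build(withStreaming=True, options={
    'dbldatagen.streaming.source': 'text',
    'dbldatagen.streaming.sourcePath': '/tmp/testdata/data',
})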

tests/test_streaming.py

Lines changed: 127 additions & 0 deletions
@@ -2,20 +2,55 @@
 import shutil
 import time
 import pytest
+import logging

 from pyspark.sql.types import IntegerType, StringType, FloatType
+import pyspark.sql.functions as F

 import dbldatagen as dg

 spark = dg.SparkSingleton.getLocalInstance("streaming tests")


+@pytest.fixture(scope="class")
+def setupLogging():
+    FORMAT = '%(asctime)-15s %(message)s'
+    logging.basicConfig(format=FORMAT)
+
+
 class TestStreaming():
     row_count = 100000
     column_count = 10
     time_to_run = 10
     rows_per_second = 5000

+    def setup_log_capture(self, caplog_object):
+        """ set up log capture fixture
+
+        Sets up log capture fixture to only capture messages after setup and only
+        capture warnings and errors
+
+        """
+        caplog_object.set_level(logging.WARNING)
+
+        # clear messages from setup
+        caplog_object.clear()
+
+    def get_log_capture_warnings_and_errors(self, caplog_object, textFlag):
+        """
+        gets count of errors containing specified text
+
+        :param caplog_object: log capture object from fixture
+        :param textFlag: text to search for to include error or warning in count
+        :return: count of errors containing text specified in `textFlag`
+        """
+        streaming_warnings_and_errors = 0
+        for r in caplog_object.records:
+            if (r.levelname == "WARNING" or r.levelname == "ERROR") and textFlag in r.message:
+                streaming_warnings_and_errors += 1
+
+        return streaming_warnings_and_errors
+
     @pytest.fixture
     def getStreamingDirs(self):
         time_now = int(round(time.time() * 1000))
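
The two helpers above wrap pytest's built-in caplog fixture. A self-contained, hypothetical example of the same pattern (the warning text is made up; the real usage appears in test_text_streaming further down):

import logging

def test_log_capture_pattern(caplog):
    # capture only WARNING/ERROR records emitted after this point
    caplog.set_level(logging.WARNING)
    caplog.clear()

    # hypothetical warning, standing in for the library's own log output
    logging.getLogger("example").warning("text source is experimental")

    # count captured warnings/errors mentioning a marker string, as
    # get_log_capture_warnings_and_errors does with r.message
    matches = sum(1 for r in caplog.records
                  if r.levelname in ("WARNING", "ERROR") and "text" in r.getMessage())
    assert matches == 1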
@@ -32,6 +67,23 @@ def getStreamingDirs(self):
             shutil.rmtree(base_dir, ignore_errors=True)
             print(f"\n\n*** test dir [{base_dir}] deleted")

+    @pytest.fixture
+    def getDataDir(self):
+        time_now = int(round(time.time() * 1000))
+        base_dir = "/tmp/testdata_{}".format(time_now)
+        data_dir = os.path.join(base_dir, "data")
+        print(f"test data dir created '{base_dir}'")
+
+        # don't need to create the data dir itself, only the base dir
+        os.makedirs(base_dir)
+
+        try:
+            yield data_dir
+        finally:
+            shutil.rmtree(base_dir, ignore_errors=True)
+            print(f"\n\n*** test data dir [{base_dir}] deleted")
+
+
     @pytest.mark.parametrize("seedColumnName", ["id",
                                                 "_id",
                                                 None])
@@ -342,4 +394,79 @@ def test_default_options(self, options, optionsExpected):
         assert datagen_options == expected_datagen_options
         assert passthrough_options == expected_passthrough_options

+    def test_text_streaming(self, getDataDir, caplog, getStreamingDirs):
+        datadir = getDataDir
+        base_dir, test_dir, checkpoint_dir = getStreamingDirs
+
+        # caplog fixture captures log content
+        self.setup_log_capture(caplog)
+
+        df = spark.range(10000).select(F.expr("cast(id as string)").alias("id"))
+        df.write.format("text").save(datadir)
+
+        testDataSpec = (dg.DataGenerator(sparkSession=spark, name="test_data_set1", rows=self.row_count,
+                                         partitions=10, seedMethod='hash_fieldname')
+                        .withColumn("code1", IntegerType(), minValue=100, maxValue=200)
+                        .withColumn("code2", IntegerType(), minValue=0, maxValue=10)
+                        .withColumn("code3", StringType(), values=['a', 'b', 'c'])
+                        .withColumn("code4", StringType(), values=['a', 'b', 'c'], random=True)
+                        .withColumn("code5", StringType(), values=['a', 'b', 'c'], random=True, weights=[9, 1, 1])
+                        )
+
+        streamingOptions = {
+            'dbldatagen.streaming.source': 'text',
+            'dbldatagen.streaming.sourcePath': datadir,
+        }
+        df_streaming = testDataSpec.build(withStreaming=True, options=streamingOptions)
+
+        # check that there are warnings about the `text` format
+        text_format_warnings_and_errors = self.get_log_capture_warnings_and_errors(caplog, "text")
+        assert text_format_warnings_and_errors > 0, "Should have error or warning messages about text format"
+
+        # loop until we get one second's worth of data
+        start_time = time.time()
+        elapsed_time = 0
+        rows_retrieved = 0
+        time_limit = 10.0
+
+        while elapsed_time < time_limit and rows_retrieved < self.rows_per_second:
+            sq = (df_streaming
+                  .writeStream
+                  .format("parquet")
+                  .outputMode("append")
+                  .option("path", test_dir)
+                  .option("checkpointLocation", checkpoint_dir)
+                  .trigger(once=True)
+                  .start())
+
+            # wait for trigger once to terminate
+            sq.awaitTermination(5)
+
+            elapsed_time = time.time() - start_time
+
+            try:
+                df2 = spark.read.format("parquet").load(test_dir)
+                rows_retrieved = df2.count()
+
+            # ignore file or metadata not found issues arising from read before stream has written first batch
+            except Exception as exc:
+                print("Exception:", exc)
+
+        if sq.isActive:
+            sq.stop()
+
+        end_time = time.time()
+
+        print("*** Done ***")
+        print("read {} rows from newly written data".format(rows_retrieved))
+        print("elapsed time (seconds)", end_time - start_time)
+
+        # check that we have at least one second of data
+        assert rows_retrieved >= self.rows_per_second

