Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,11 @@ _Note: in this initial beta version, all engines have only been tested inside Mi
To run any LakeBench benchmark, first do a one time generation of the data required for the benchmark and scale of interest. LakeBench provides datagen classes to quickly generate parquet datasets required by the benchmarks.

### Data Generation
Data generation is provided via the DuckDB [TPC-DS](https://duckdb.org/docs/stable/core_extensions/tpcds) and [TPC-H](https://duckdb.org/docs/stable/core_extensions/tpch) extensions. The LakeBench wrapper around DuckDB adds support for writing out parquet files with a provided row-group target file size as normally the files generated by DuckDB are atypically small (i.e. 10MB) and are most suitable for ultra-small scale scenarios. LakeBench defaults to target 128MB row groups but can be configured via the `target_row_group_size_mb` parameter of both TPC-H and TPC-DS DataGenerator classes.
- **TPC-H** data generation is provided via the [tpchgen-rs](https://github.com/clflushopt/tpchgen-rs) project. The project is currently about 10x faster than the next closest method of generating TPC-H datasets. _The TPC-DS version of the project is currently under development._
- **TPC-DS** data generation is provided via the DuckDB [TPC-DS](https://duckdb.org/docs/stable/core_extensions/tpcds) extension. The LakeBench wrapper around DuckDB adds support for writing out parquet files with a provided row-group target file size as normally the files generated by DuckDB are atypically small (i.e. 10MB) and are most suitable for ultra-small scale scenarios. LakeBench defaults to target 128MB row groups but can be configured via the `target_row_group_size_mb` parameter of both TPC-H and TPC-DS DataGenerator classes.
- **ClickBench** data is downloaded directly from the Clickhouse host site.

_Generating scale factor 1 data takes about 1 minute on a 2vCore VM._
_Generating TPC-H scale factor 1 data takes about 14 seconds on a 2vCore VM._

#### TPC-H Data Generation
```python
Expand All @@ -156,9 +158,8 @@ datagen.run()
```

_Notes:_
- TPC-H data can be generated up to SF100 however I hit OOM issues when targeting generating SF1000 on a 64-vCore machine.
- TPC-DS data up to SF1000 can be generated on a 32-vCore machine.
- TPC-H and TPC-DS datasets up to SF10 will complete in minutes on a 2-vCore machine.
- TPC-H datasets are generated extremely fast (i.e. SF1000 in 20 minutes on an 8-vCore machine).
- The ClickBench dataset (only 1 size) should download in ~1 minute as partitioned files, or in ~6 minutes as a single file.

#### Is BYO Data Supported?
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ duckdb = ["duckdb==1.4.1", "deltalake==1.2.0", "pyarrow>=15.0.0"]
polars = ["polars==1.34.0", "deltalake==1.2.0", "pyarrow>=15.0.0"]
daft = ["daft==0.6.6", "deltalake==1.0.2", "pyarrow>=15.0.0"]
tpcds_datagen = ["duckdb==1.4.1", "pyarrow>=15.0.0"]
tpch_datagen = ["duckdb==1.4.1", "pyarrow>=15.0.0"]
tpch_datagen = ["tpchgen-cli>=2.0.1"]
sparkmeasure = ["sparkmeasure==0.24.0"]
sail = ["pysail>=0.3.7", "pyspark[connect]>=4.0.0", "deltalake>=1.2.0", "pyarrow>=15.0.0"]

Expand Down
1 change: 1 addition & 0 deletions src/lakebench/datagen/_tpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class _TPCDataGenerator:
subclasses instead.
"""
GEN_UTIL = ''
GEN_TYPE = ''

def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128) -> None:
"""
Expand Down
89 changes: 89 additions & 0 deletions src/lakebench/datagen/_tpc_rs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import posixpath
import importlib.util
import fsspec
from fsspec import AbstractFileSystem
import subprocess
from lakebench.utils.path_utils import to_unix_path
from urllib.parse import urlparse

class _TPCRsDataGenerator:
    """
    Base class for TPC Rust based data generation. PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the TPCHDataGenerator and TPCDSDataGenerator
    subclasses instead.
    """
    GEN_UTIL = ''
    # Benchmark prefix used to locate the generator CLI, e.g. 'tpch' -> 'tpchgen-cli'.
    GEN_TYPE = ''

    # Object-store schemes the rust CLI cannot write to directly.
    _CLOUD_SCHEMES = frozenset({'s3', 'gs', 'gcs', 'abfs', 'abfss', 'adl', 'wasb', 'wasbs'})

    def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128) -> None:
        """
        Initialize the TPC data generator with a scale factor.

        Parameters
        ----------
        scale_factor: int
            The scale factor for the data generation.
        target_folder_uri: str
            Test data will be written to this location where tables are represented as folders containing parquet files.
            Must be local: no scheme, ``file://``, or a Windows drive letter. Cloud URIs are rejected.
        target_row_group_size_mb: int, default=128
            Desired row group size for the generated parquet files.

        Raises
        ------
        ValueError
            If `target_folder_uri` uses a cloud storage scheme.
        ImportError
            If the ``<GEN_TYPE>gen-cli`` executable cannot be located.
        """
        self.scale_factor = scale_factor
        uri_scheme = urlparse(target_folder_uri).scheme

        # Allow local file systems only. NOTE: a Windows drive letter ("C:\...")
        # parses as scheme 'c', which is intentionally absent from the cloud set.
        if uri_scheme in self._CLOUD_SCHEMES:
            raise ValueError(f"{uri_scheme} protocol is not currently supported for TPC-RS data generation. Please use a local file path.")

        self.fs: AbstractFileSystem = fsspec.filesystem("file")
        self.target_folder_uri = to_unix_path(target_folder_uri)
        self.target_row_group_size_mb = target_row_group_size_mb
        self.tpcgen_exe = self._resolve_generator_cli()

    def _resolve_generator_cli(self) -> str:
        """Return the path to the ``<GEN_TYPE>gen-cli`` executable, raising ImportError if absent."""
        import shutil
        # shutil.which searches PATH and is the most reliable lookup.
        path = shutil.which(f"{self.GEN_TYPE}gen-cli")
        if path:
            return path

        # Fallback for Windows user-level pip installs whose Scripts folder is
        # not on PATH. Derive the exe name from GEN_TYPE (the previous code
        # hardcoded "tpchgen-cli.exe", which broke non-TPC-H subclasses).
        from pathlib import Path
        import sys
        user_scripts = (
            Path.home() / "AppData" / "Roaming" / "Python"
            / f"Python{sys.version_info.major}{sys.version_info.minor}"
            / "Scripts" / f"{self.GEN_TYPE}gen-cli.exe"
        )
        if user_scripts.exists():
            return str(user_scripts)

        raise ImportError(f"{self.GEN_TYPE}gen-cli is used for data generation but is not installed. Install using `%pip install {self.GEN_TYPE}gen-cli`")

    def _build_command(self) -> list:
        """Assemble the CLI argument list for one generator invocation."""
        return [
            self.tpcgen_exe,
            "--scale-factor", str(self.scale_factor),
            "--output-dir", self.target_folder_uri,
            "--parts", "1",
            "--format", "parquet",
            # The CLI takes bytes; convert from the MB-based public parameter.
            "--parquet-row-group-bytes", str(self.target_row_group_size_mb * 1024 * 1024),
            "--parquet-compression", "SNAPPY"
        ]

    def run(self) -> None:
        """
        This method uses a rust based TPC data generation utility to generate Parquet files
        based on the specified scale factor. The generated tables are written to the target folder.

        Raises
        ------
        subprocess.CalledProcessError
            If the generator CLI exits with a non-zero status (stdout/stderr are printed first).
        """
        # cleanup target directory so stale tables from a prior run never mix in
        if self.fs.exists(self.target_folder_uri):
            self.fs.rm(self.target_folder_uri, recursive=True)
        self.fs.mkdirs(self.target_folder_uri, exist_ok=True)

        try:
            result = subprocess.run(self._build_command(), capture_output=True, text=True, check=True)
            if result.stdout:
                print(result.stdout)
        except subprocess.CalledProcessError as e:
            # Surface the CLI's own diagnostics, then re-raise: the previous
            # version swallowed the error, so run() reported success even when
            # generation failed and the target folder was left empty.
            print(f"stdout: {e.stdout}")
            print(f"stderr: {e.stderr}")
            raise

# Explicit public API of this module (leading underscore marks it as
# package-internal, but sibling datagen modules import it by name).
__all__ = ["_TPCRsDataGenerator"]
3 changes: 2 additions & 1 deletion src/lakebench/datagen/tpcds.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ class TPCDSDataGenerator(_TPCDataGenerator):
run()
Generates TPC-DS data in Parquet format based on the input scale factor and writes it to the target folder.
"""
GEN_UTIL = 'dsdgen'
GEN_UTIL = 'dsdgen'
GEN_TYPE = 'tpds'
9 changes: 5 additions & 4 deletions src/lakebench/datagen/tpch.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ._tpc import _TPCDataGenerator
class TPCHDataGenerator(_TPCDataGenerator):
from ._tpc_rs import _TPCRsDataGenerator
class TPCHDataGenerator(_TPCRsDataGenerator):
"""
This class is a wrapper for the DuckDB TPC-H data generation utility. It generates TPC-H data in Parquet format
This class is a wrapper of the rust-based TPC-H data generator, `tpchgen-rs`. It generates TPC-H data in Parquet format
based on the specified scale factor and target row group size in MB.

Attributes
Expand All @@ -18,4 +18,5 @@ class TPCHDataGenerator(_TPCDataGenerator):
run()
Generates TPC-H data in Parquet format based on the input scale factor and writes it to the target folder.
"""
GEN_UTIL = 'dbgen'
GEN_UTIL = 'dbgen'
GEN_TYPE = 'tpch'