Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,11 @@ _Note: in this initial beta version, all engines have only been tested inside Mi
To run any LakeBench benchmark, first do a one time generation of the data required for the benchmark and scale of interest. LakeBench provides datagen classes to quickly generate parquet datasets required by the benchmarks.

### Data Generation
Data generation is provided via the DuckDB [TPC-DS](https://duckdb.org/docs/stable/core_extensions/tpcds) and [TPC-H](https://duckdb.org/docs/stable/core_extensions/tpch) extensions. The LakeBench wrapper around DuckDB adds support for writing out parquet files with a provided row-group target file size as normally the files generated by DuckDB are atypically small (i.e. 10MB) and are most suitable for ultra-small scale scenarios. LakeBench defaults to target 128MB row groups but can be configured via the `target_row_group_size_mb` parameter of both TPC-H and TPC-DS DataGenerator classes.
- **TPC-H** data generation is provided via the [tpchgen-rs](https://github.com/clflushopt/tpchgen-rs) project. The project is currently about 10x faster than the next closest method of generating TPC-H datasets. _The TPC-DS version of the project is currently under development._
- **TPC-DS** data generation is provided via the DuckDB [TPC-DS](https://duckdb.org/docs/stable/core_extensions/tpcds) extension. The LakeBench wrapper around DuckDB adds support for writing out parquet files with a provided row-group target file size as normally the files generated by DuckDB are atypically small (i.e. 10MB) and are most suitable for ultra-small scale scenarios. LakeBench defaults to target 128MB row groups but can be configured via the `target_row_group_size_mb` parameter of both TPC-H and TPC-DS DataGenerator classes.
- **ClickBench** data is downloaded directly from the Clickhouse host site.

_Generating scale factor 1 data takes about 1 minute on a 2vCore VM._
_Generating TPC-H scale factor 1 data takes about 14 seconds on a 2vCore VM._

#### TPC-H Data Generation
```python
Expand All @@ -156,9 +158,8 @@ datagen.run()
```

_Notes:_
- TPC-H data can be generated up to SF100 however I hit OOM issues when targeting generating SF1000 on a 64-vCore machine.
- TPC-DS data up to SF1000 can be generated on a 32-vCore machine.
- TPC-H and TPC-DS datasets up to SF10 will complete in minutes on a 2-vCore machine.
- TPC-H datasets are generated extremely fast (i.e. SF1000 in 20 minutes on an 8-vCore machine).
- The ClickBench dataset (only 1 size) should download in ~1 minute as partitioned files, or in ~6 minutes as a single file.

#### Is BYO Data Supported?
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ duckdb = ["duckdb==1.4.1", "deltalake==1.2.0", "pyarrow>=15.0.0"]
polars = ["polars==1.34.0", "deltalake==1.2.0", "pyarrow>=15.0.0"]
daft = ["daft==0.6.6", "deltalake==1.0.2", "pyarrow>=15.0.0"]
tpcds_datagen = ["duckdb==1.4.1", "pyarrow>=15.0.0"]
tpch_datagen = ["duckdb==1.4.1", "pyarrow>=15.0.0"]
tpch_datagen = ["tpchgen-cli>=2.0.1"]
sparkmeasure = ["sparkmeasure==0.24.0"]
sail = ["pysail>=0.3.7", "pyspark[connect]>=4.0.0", "deltalake>=1.2.0", "pyarrow>=15.0.0"]

Expand Down
1 change: 1 addition & 0 deletions src/lakebench/datagen/_tpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class _TPCDataGenerator:
subclasses instead.
"""
GEN_UTIL = ''
GEN_TYPE = ''

def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128) -> None:
"""
Expand Down
89 changes: 89 additions & 0 deletions src/lakebench/datagen/_tpc_rs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import posixpath
import importlib.util
import fsspec
from fsspec import AbstractFileSystem
import subprocess
from lakebench.utils.path_utils import to_unix_path
from urllib.parse import urlparse

class _TPCRsDataGenerator:
    """
    Base class for TPC Rust based data generation. PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the TPCHDataGenerator and TPCDSDataGenerator
    subclasses instead.
    """
    GEN_UTIL = ''
    # Benchmark prefix used to locate the generator CLI, e.g. 'tpch' -> 'tpchgen-cli'.
    GEN_TYPE = ''

    # Object-store schemes the rust CLI cannot write to directly.
    _CLOUD_SCHEMES = frozenset({'s3', 'gs', 'gcs', 'abfs', 'abfss', 'adl', 'wasb', 'wasbs'})

    def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128) -> None:
        """
        Initialize the TPC data generator with a scale factor.

        Parameters
        ----------
        scale_factor: int
            The scale factor for the data generation.
        target_folder_uri: str
            Test data will be written to this location where tables are represented as folders containing parquet files.
            Must be local: no scheme, ``file://``, or a Windows drive letter. Cloud URIs are rejected.
        target_row_group_size_mb: int, default=128
            Desired row group size for the generated parquet files.

        Raises
        ------
        ValueError
            If `target_folder_uri` uses a cloud storage scheme.
        ImportError
            If the ``<GEN_TYPE>gen-cli`` executable cannot be located.
        """
        self.scale_factor = scale_factor
        uri_scheme = urlparse(target_folder_uri).scheme

        # Allow local file systems only. NOTE: a Windows drive letter ("C:\...")
        # parses as scheme 'c', which is intentionally absent from the cloud set.
        if uri_scheme in self._CLOUD_SCHEMES:
            raise ValueError(f"{uri_scheme} protocol is not currently supported for TPC-RS data generation. Please use a local file path.")

        self.fs: AbstractFileSystem = fsspec.filesystem("file")
        self.target_folder_uri = to_unix_path(target_folder_uri)
        self.target_row_group_size_mb = target_row_group_size_mb
        self.tpcgen_exe = self._resolve_generator_cli()

    def _resolve_generator_cli(self) -> str:
        """Return the path to the ``<GEN_TYPE>gen-cli`` executable, raising ImportError if absent."""
        import shutil
        # shutil.which searches PATH and is the most reliable lookup.
        path = shutil.which(f"{self.GEN_TYPE}gen-cli")
        if path:
            return path

        # Fallback for Windows user-level pip installs whose Scripts folder is
        # not on PATH. Derive the exe name from GEN_TYPE (the previous code
        # hardcoded "tpchgen-cli.exe", which broke non-TPC-H subclasses).
        from pathlib import Path
        import sys
        user_scripts = (
            Path.home() / "AppData" / "Roaming" / "Python"
            / f"Python{sys.version_info.major}{sys.version_info.minor}"
            / "Scripts" / f"{self.GEN_TYPE}gen-cli.exe"
        )
        if user_scripts.exists():
            return str(user_scripts)

        raise ImportError(f"{self.GEN_TYPE}gen-cli is used for data generation but is not installed. Install using `%pip install {self.GEN_TYPE}gen-cli`")

    def _build_command(self) -> list:
        """Assemble the CLI argument list for one generator invocation."""
        return [
            self.tpcgen_exe,
            "--scale-factor", str(self.scale_factor),
            "--output-dir", self.target_folder_uri,
            "--parts", "1",
            "--format", "parquet",
            # The CLI takes bytes; convert from the MB-based public parameter.
            "--parquet-row-group-bytes", str(self.target_row_group_size_mb * 1024 * 1024),
            "--parquet-compression", "SNAPPY"
        ]

    def run(self) -> None:
        """
        This method uses a rust based TPC data generation utility to generate Parquet files
        based on the specified scale factor. The generated tables are written to the target folder.

        Raises
        ------
        subprocess.CalledProcessError
            If the generator CLI exits with a non-zero status (stdout/stderr are printed first).
        """
        # cleanup target directory so stale tables from a prior run never mix in
        if self.fs.exists(self.target_folder_uri):
            self.fs.rm(self.target_folder_uri, recursive=True)
        self.fs.mkdirs(self.target_folder_uri, exist_ok=True)

        try:
            result = subprocess.run(self._build_command(), capture_output=True, text=True, check=True)
            if result.stdout:
                print(result.stdout)
        except subprocess.CalledProcessError as e:
            # Surface the CLI's own diagnostics, then re-raise: the previous
            # version swallowed the error, so run() reported success even when
            # generation failed and the target folder was left empty.
            print(f"stdout: {e.stdout}")
            print(f"stderr: {e.stderr}")
            raise

# Explicit public API of this module (leading underscore marks it as
# package-internal, but sibling datagen modules import it by name).
__all__ = ["_TPCRsDataGenerator"]
3 changes: 2 additions & 1 deletion src/lakebench/datagen/tpcds.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ class TPCDSDataGenerator(_TPCDataGenerator):
run()
Generates TPC-DS data in Parquet format based on the input scale factor and writes it to the target folder.
"""
GEN_UTIL = 'dsdgen'
GEN_UTIL = 'dsdgen'
GEN_TYPE = 'tpds'
9 changes: 5 additions & 4 deletions src/lakebench/datagen/tpch.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ._tpc import _TPCDataGenerator
class TPCHDataGenerator(_TPCDataGenerator):
from ._tpc_rs import _TPCRsDataGenerator
class TPCHDataGenerator(_TPCRsDataGenerator):
"""
This class is a wrapper for the DuckDB TPC-H data generation utility. It generates TPC-H data in Parquet format
This class is a wrapper of the rust-based TPC-H data generator, `tpchgen-rs`. It generates TPC-H data in Parquet format
based on the specified scale factor and target row group size in MB.

Attributes
Expand All @@ -18,4 +18,5 @@ class TPCHDataGenerator(_TPCDataGenerator):
run()
Generates TPC-H data in Parquet format based on the input scale factor and writes it to the target folder.
"""
GEN_UTIL = 'dbgen'
GEN_UTIL = 'dbgen'
GEN_TYPE = 'tpch'