Skip to content

Commit 0165e4f

Browse files
committed
Enable setting {page,row-group} limit
1 parent ba2fe43 commit 0165e4f

File tree

3 files changed

+16
-1
lines changed

3 files changed

+16
-1
lines changed

mkdocs/docs/configuration.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,9 @@ Iceberg tables support table properties to configure table behavior.
5757
| `write.parquet.compression-codec` | `{uncompressed,zstd,gzip,snappy}` | zstd | Sets the Parquet compression codec. |
5858
| `write.parquet.compression-level` | Integer | null | Parquet compression level for the codec. If not set, it is up to PyIceberg |
5959
| `write.parquet.page-size-bytes` | Size in bytes | 1MB | Set a target threshold for the approximate encoded size of data pages within a column chunk |
60+
| `write.parquet.page-row-limit` | Number of rows | 20000 | Set a target threshold for the maximum number of rows within a data page |
6061
| `write.parquet.dict-size-bytes` | Size in bytes | 2MB | Set the dictionary page size limit per row group |
62+
| `write.parquet.row-group-limit` | Number of rows | 122880 | Set a target threshold for the maximum number of rows within a row group |
6163

6264
# FileIO
6365

pyiceberg/io/pyarrow.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1730,9 +1730,14 @@ def write_file(table: Table, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
17301730
file_schema = schema_to_pyarrow(table.schema())
17311731

17321732
fo = table.io.new_output(file_path)
1733+
row_group_size = PropertyUtil.property_as_int(
1734+
properties=table.properties,
1735+
property_name=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES,
1736+
default=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT,
1737+
)
17331738
with fo.create(overwrite=True) as fos:
17341739
with pq.ParquetWriter(fos, schema=file_schema, **parquet_writer_kwargs) as writer:
1735-
writer.write_table(task.df)
1740+
writer.write_table(task.df, row_group_size=row_group_size)
17361741

17371742
data_file = DataFile(
17381743
content=DataFileContent.DATA,
@@ -1795,4 +1800,9 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
17951800
property_name=TableProperties.PARQUET_DICT_SIZE_BYTES,
17961801
default=TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT,
17971802
),
1803+
"write_batch_size": PropertyUtil.property_as_int(
1804+
properties=table_properties,
1805+
property_name=TableProperties.PARQUET_PAGE_ROW_LIMIT,
1806+
default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT,
1807+
),
17981808
}

pyiceberg/table/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ class TableProperties:
137137
PARQUET_ROW_GROUP_SIZE_BYTES = "write.parquet.row-group-size-bytes"
138138
PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 # 128 MB
139139

140+
PARQUET_ROW_GROUP_LIMIT = "write.parquet.row-group-limit"
141+
PARQUET_ROW_GROUP_LIMIT_DEFAULT = 128 * 1024 * 1024 # 128 MB
142+
140143
PARQUET_PAGE_SIZE_BYTES = "write.parquet.page-size-bytes"
141144
PARQUET_PAGE_SIZE_BYTES_DEFAULT = 1024 * 1024 # 1 MB
142145

0 commit comments

Comments
 (0)