huggingface · wengh · May 28, 2025 · May 30, 2025 · May 30, 2025 · May 30, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,52 @@
+name: CI
+
+# Run the test suite on every push to main and on pull requests targeting main.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.9', '3.13']
+        # Each entry is the list of requirement specifiers that the
+        # "Install dependencies" step passes to `uv add --dev` for that cell.
+        packages: [['pyspark>=4.0.0'], ['pyspark==3.5.6', 'numpy<2.0.0']]
+        # Skip the pyspark 3.5.6 / numpy<2 combination on Python 3.13.
+        exclude:
+          - python-version: '3.13'
+            packages: ['pyspark==3.5.6', 'numpy<2.0.0']
+      # Let the remaining matrix cells finish when one of them fails.
+      fail-fast: false
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install uv
+      run: |
+        curl -LsSf https://astral.sh/uv/install.sh | sh
+        # The standalone installer places uv in ~/.local/bin (uv >= 0.5.0);
+        # ~/.cargo/bin is kept so older installer versions still work.
+        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+        echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+    - name: Install dependencies
+      run: |
+        # Overwrite the checked-in .python-version with this cell's interpreter.
+        echo "${{ matrix.python-version }}" > .python-version
+        # The '" "' separator turns the list into separately quoted arguments.
+        uv add --dev "${{ join(matrix.packages, '" "') }}"
+        uv sync
+
+    - name: Run tests
+      run: |
+        uv run pytest
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.9
diff --git a/README.md b/README.md
@@ -43,7 +43,7 @@ Save to Hugging Face:
 df.write.format("huggingface").save("username/my_dataset")
 # Or pass a token manually
 df.write.format("huggingface").option("token", "hf_xxx").save("username/my_dataset")
-``` 
+```
 
 ## Advanced
 
@@ -91,3 +91,14 @@ huggingface datasource enabled for pyspark 3.x.x (backport from pyspark 4)
 
 The import is only necessary on Spark 3.x to enable the backport.
 Spark 4 automatically imports `pyspark_huggingface` as soon as it is installed, and registers the "huggingface" data source.
+
+
+## Development
+
+[Install uv](https://docs.astral.sh/uv/getting-started/installation/) if not already done.
+
+Then, from the project root directory, sync dependencies and run tests.
+```
+uv sync
+uv run pytest
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,24 +1,33 @@
-[tool.poetry]
+[project]
 name = "pyspark_huggingface"
 version = "1.0.0"
 description = "A DataSource for reading and writing HuggingFace Datasets in Spark"
-authors = ["allisonwang-db <allison.wang@databricks.com>", "lhoestq <quentin@huggingface.co>"]
-license = "Apache License 2.0"
+authors = [
+    {name = "allisonwang-db", email = "allison.wang@databricks.com"},
+    {name = "lhoestq", email = "quentin@huggingface.co"},
+    {name = "wengh", email = "wenghy02@gmail.com"},
+]
+license = {text = "Apache License 2.0"}
 readme = "README.md"
-packages = [
-    { include = "pyspark_huggingface" },
+requires-python = ">=3.9"
+dependencies = [
+    "datasets>=3.2",
+    "huggingface-hub>=0.27.1",
 ]
 
-[tool.poetry.dependencies]
-python = "^3.9"
-datasets = "^3.2"
-huggingface_hub = "^0.27.1"
-
-[tool.poetry.group.dev.dependencies]
-pytest = "^8.0.0"
-pytest-dotenv = "^0.5.2"
-pytest-mock = "^3.14.0"
+[dependency-groups]
+dev = [
+    "ipykernel>=6.29.5",
+    "pyarrow-stubs>=19.4",
+    "pyspark>=4.0.0",
+    "pytest>=8.0.0",
+    "pytest-dotenv>=0.5.2",
+    "pytest-mock>=3.14.0",
+]
 
 [build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+requires = ["uv_build>=0.7.3,<0.8"]
+build-backend = "uv_build"
+
+[tool.uv.build-backend]
+module-root = ""
diff --git a/tests/test_huggingface.py b/tests/test_huggingface.py
@@ -1,6 +1,9 @@
 import pytest
 from pyspark.sql import SparkSession
 
+import pyspark_huggingface  # noqa: F401
+
+
 @pytest.fixture
 def spark():
     spark = SparkSession.builder.getOrCreate()

diff --git a/tests/test_huggingface_writer.py b/tests/test_huggingface_writer.py
@@ -1,11 +1,15 @@
 import os
 import uuid
 
+import pyarrow as pa
 import pytest
 from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.pandas.types import to_arrow_schema
 from pyspark.testing import assertDataFrameEqual
 from pytest_mock import MockerFixture
 
+import pyspark_huggingface  # noqa: F401
+
 # ============== Fixtures & Helpers ==============
 
 
@@ -16,6 +20,8 @@ def spark():
 
 
 def token():
+    if "HF_TOKEN" not in os.environ:
+        pytest.skip("HF_TOKEN environment variable is not set")
     return os.environ["HF_TOKEN"]
 
 
@@ -110,7 +116,7 @@ def test_revision(repo, random_df, api):
     )
 
 
-def test_max_bytes_per_file(spark, mocker: MockerFixture):
+def test_max_bytes_per_file(spark: SparkSession, mocker: MockerFixture):
     from pyspark_huggingface.huggingface_sink import HuggingFaceDatasetsWriter
 
     repo = "user/test"
@@ -124,5 +130,9 @@ def test_max_bytes_per_file(spark, mocker: MockerFixture):
         token="token",
         max_bytes_per_file=1,
     )
-    writer.write(iter(df.toArrow().to_batches(max_chunksize=1)))
+    # Don't use toArrow() because it's not available in pyspark 3.x
+    arrow_table = pa.Table.from_pylist(
+        [row.asDict() for row in df.collect()], schema=to_arrow_schema(df.schema)
+    )
+    writer.write(iter(arrow_table.to_batches(max_chunksize=1)))
     assert api.preupload_lfs_files.call_count == 10
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		3.9
wengh marked this conversation as resolved.

Copilot AI (May 30, 2025): The `.python-version` file statically specifies Python 3.9, yet the CI workflow also includes Python 3.13. It would be helpful to document the intended supported Python versions, or to update `.python-version` accordingly for consistency.

(Copilot uses AI. Check for mistakes.)