Skip to content

Commit 1c8ab3f

Browse files
authored
Upd/hf-dataset-get-format (#522)
* fix: update row retrieval to return rows as dictionaries in ParquetLoader
* fix: update tests to assert dictionary structure for parquet dataset rows
1 parent 3602a36 commit 1c8ab3f

File tree

2 files changed

+25
-28
lines changed

2 files changed

+25
-28
lines changed

src/litdata/streaming/item_loader.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,8 @@ def _get_item_with_low_memory(self, chunk_index: int, chunk_filepath: str, row_i
674674
del self._chunk_row_group_item_read_count[chunk_index][row_group_index]
675675

676676
# Return the specific row from the dataframe
677-
return row_group_df.row(row_index_within_group) # type: ignore
677+
# Note: The `named=True` argument is used to return the row as a dictionary
678+
return row_group_df.row(row_index_within_group, named=True) # type: ignore
678679

679680
def _get_item(self, chunk_index: int, chunk_filepath: str, index: int) -> Any:
680681
"""Retrieve a dataframe row from a parquet chunk by loading the entire chunk into memory.
@@ -695,7 +696,10 @@ def _get_item(self, chunk_index: int, chunk_filepath: str, index: int) -> Any:
695696

696697
if chunk_index not in self._df:
697698
self._df[chunk_index] = pl.scan_parquet(chunk_filepath, low_memory=True).collect()
698-
return self._df[chunk_index].row(index)
699+
700+
# Retrieve the specific row from the dataframe
701+
# Note: The `named=True` argument is used to return the row as a dictionary
702+
return self._df[chunk_index].row(index, named=True)
699703

700704
def delete(self, chunk_index: int, chunk_filepath: str) -> None:
701705
"""Delete a chunk from the local filesystem."""

tests/streaming/test_parquet.py

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,10 @@ def test_parquet_index_write(
8181

8282
for i, _ds in enumerate(ds):
8383
idx = i % 5
84-
assert len(_ds) == 3
85-
assert _ds[0] == pq_data["name"][idx]
86-
assert _ds[1] == pq_data["weight"][idx]
87-
assert _ds[2] == pq_data["height"][idx]
84+
assert isinstance(_ds, dict)
85+
assert _ds["name"] == pq_data["name"][idx]
86+
assert _ds["weight"] == pq_data["weight"][idx]
87+
assert _ds["height"] == pq_data["height"][idx]
8888

8989

9090
@pytest.mark.skipif(condition=sys.platform == "win32", reason="Fails on windows and test gets cancelled")
@@ -168,7 +168,9 @@ def test_get_parquet_indexer_cls(pq_url, cls, expectation, monkeypatch, fsspec_m
168168
@pytest.mark.usefixtures("clean_pq_index_cache")
169169
@patch("litdata.utilities.parquet._HF_HUB_AVAILABLE", True)
170170
@patch("litdata.streaming.downloader._HF_HUB_AVAILABLE", True)
171-
def test_stream_hf_parquet_dataset(monkeypatch, huggingface_hub_fs_mock, pq_data):
171+
@pytest.mark.parametrize(("pre_load_chunk"), [False, True])
172+
@pytest.mark.parametrize(("low_memory"), [False, True])
173+
def test_stream_hf_parquet_dataset(monkeypatch, huggingface_hub_fs_mock, pq_data, pre_load_chunk, low_memory):
172174
hf_url = "hf://datasets/some_org/some_repo/some_path"
173175

174176
# Test case 1: Invalid item_loader
@@ -180,27 +182,18 @@ def test_stream_hf_parquet_dataset(monkeypatch, huggingface_hub_fs_mock, pq_data
180182
assert len(ds) == 25 # 5 datasets for 5 loops
181183
for i, _ds in enumerate(ds):
182184
idx = i % 5
183-
assert len(_ds) == 3
184-
assert _ds[0] == pq_data["name"][idx]
185-
assert _ds[1] == pq_data["weight"][idx]
186-
assert _ds[2] == pq_data["height"][idx]
187-
188-
# Test case 3: Streaming with ParquetLoader as item_loader and low_memory=False
189-
ds = StreamingDataset(hf_url, item_loader=ParquetLoader(low_memory=False))
190-
assert len(ds) == 25
191-
for i, _ds in enumerate(ds):
192-
idx = i % 5
193-
assert len(_ds) == 3
194-
assert _ds[0] == pq_data["name"][idx]
195-
assert _ds[1] == pq_data["weight"][idx]
196-
assert _ds[2] == pq_data["height"][idx]
197-
198-
# Test case 4: Streaming with ParquetLoader and low_memory=True
199-
ds = StreamingDataset(hf_url, item_loader=ParquetLoader(low_memory=True))
185+
assert isinstance(_ds, dict)
186+
assert _ds["name"] == pq_data["name"][idx]
187+
assert _ds["weight"] == pq_data["weight"][idx]
188+
assert _ds["height"] == pq_data["height"][idx]
189+
190+
# Test case 3: Streaming with passing item_loader
191+
print("pre_load_chunk", pre_load_chunk, "low_memory", low_memory)
192+
ds = StreamingDataset(hf_url, item_loader=ParquetLoader(pre_load_chunk, low_memory))
200193
assert len(ds) == 25
201194
for i, _ds in enumerate(ds):
202195
idx = i % 5
203-
assert len(_ds) == 3
204-
assert _ds[0] == pq_data["name"][idx]
205-
assert _ds[1] == pq_data["weight"][idx]
206-
assert _ds[2] == pq_data["height"][idx]
196+
assert isinstance(_ds, dict)
197+
assert _ds["name"] == pq_data["name"][idx]
198+
assert _ds["weight"] == pq_data["weight"][idx]
199+
assert _ds["height"] == pq_data["height"][idx]

0 commit comments

Comments (0)