[data] - if column is empty skip the sampling step in pandas_block (#57740)

goutamvenkat-anyscale · elliot-barn · commit 739cca1efba9 · 2025-10-23T05:46:44.000Z
## Why are these changes needed? If a pandas column is empty, skip the sampling step during size estimation in `pandas_block` instead of sampling zero rows; otherwise the size calculation logs the warning "Error calculating size for column ...: cannot call `vectorize` on size 0 inputs unless `otypes` is set". ## Related issue number  ## Checks - [x] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR. - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Unit tests - [ ] Release tests - [ ] This PR is not tested :( Signed-off-by: Goutam <goutam@anyscale.com> Signed-off-by: elliot-barn <elliot.barnwell@anyscale.com>
diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py
@@ -508,6 +508,9 @@ def get_deep_size(obj):
 
                 # Determine the sample size based on max_sample_count
                 sample_size = min(total_size, max_sample_count)
+                # Skip size calculation for empty columns
+                if sample_size == 0:
+                    continue
                 # Following codes can also handel case that sample_size == total_size
                 sampled_data = self._table[column].sample(n=sample_size).values
 
diff --git a/python/ray/data/tests/test_pandas_block.py b/python/ray/data/tests/test_pandas_block.py
@@ -466,5 +466,34 @@ def test_iter_rows_with_na(ray_start_regular_shared):
     assert list(rows) == [{"col": None}]
 
 
+def test_empty_dataframe_with_object_columns(ray_start_regular_shared):
+    """Check that size_bytes() on a zero-row block logs no warning.
+
+    Regression test: with empty object/string columns, the warning
+    "Error calculating size for column 'parent': cannot call `vectorize`
+    on size 0 inputs unless `otypes` is set"
+    must not be logged, and a non-negative size must still be returned.
+    """
+    from unittest.mock import patch
+
+    # Zero-row DataFrame whose columns still carry explicit object/string dtypes
+    block = pd.DataFrame(
+        {
+            "parent": pd.Series([], dtype=object),
+            "child": pd.Series([], dtype="string"),
+            "data": pd.Series([], dtype=object),
+        }
+    )
+
+    block_accessor = PandasBlockAccessor.for_block(block)
+
+    # Patch logger.warning so any warning emitted by size_bytes() is recorded
+    with patch("ray.data._internal.pandas_block.logger.warning") as mock_warning:
+        bytes_size = block_accessor.size_bytes()
+        mock_warning.assert_not_called()
+
+    assert bytes_size >= 0
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))