ray-project
diff --git a/python/ray/data/BUILD.bazel b/python/ray/data/BUILD.bazel
Lines changed: 58 additions & 2 deletions
diff --git a/python/ray/data/tests/test_consumption.py b/python/ray/data/tests/test_consumption.py
Lines changed: 74 additions & 66 deletions
diff --git a/python/ray/data/tests/test_datasink.py b/python/ray/data/tests/test_datasink.py
Lines changed: 50 additions & 50 deletions
@@ -54,6 +54,19 @@ py_test_module_list(
     ],
 )
 
+py_test_module_list(
+    size = "small",
+    files = glob(["tests/anyscale/test_*.py"]),
+    tags = [
+        "exclusive",
+        "team:data",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
 py_test(
     name = "test_formats",
     size = "medium",
@@ -650,6 +663,7 @@ py_test(
     size = "medium",
     srcs = ["tests/test_json.py"],
     tags = [
+        "data_non_parallel",
         "exclusive",
         "team:data",
     ],
@@ -942,9 +956,51 @@ py_test(
 )
 
 py_test(
-    name = "test_execution_optimizer",
+    name = "test_execution_optimizer_basic",
+    size = "medium",
+    srcs = ["tests/test_execution_optimizer_basic.py"],
+    tags = [
+        "exclusive",
+        "team:data",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
+py_test(
+    name = "test_execution_optimizer_advanced",
+    size = "medium",
+    srcs = ["tests/test_execution_optimizer_advanced.py"],
+    tags = [
+        "exclusive",
+        "team:data",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
+py_test(
+    name = "test_execution_optimizer_integrations",
+    size = "medium",
+    srcs = ["tests/test_execution_optimizer_integrations.py"],
+    tags = [
+        "exclusive",
+        "team:data",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
+py_test(
+    name = "test_execution_optimizer_limit_pushdown",
     size = "medium",
-    srcs = ["tests/test_execution_optimizer.py"],
+    srcs = ["tests/test_execution_optimizer_limit_pushdown.py"],
     tags = [
         "exclusive",
         "team:data",
 
@@ -1185,73 +1185,81 @@ def sort(r):
     assert r1 == ds.take()
 
 
-def test_iter_batches_grid(ray_start_regular_shared):
+@pytest.mark.parametrize(
+    "block_sizes,batch_size,drop_last",
+    [
+        # Single block, batch smaller than block, keep partial
+        ([10], 3, False),
+        # Single block, batch smaller than block, drop partial
+        ([10], 3, True),
+        # Single block, exact division
+        ([10], 5, False),
+        # Multiple equal-sized blocks, batch doesn't divide evenly, keep partial
+        ([5, 5, 5], 7, False),
+        # Multiple equal-sized blocks, batch doesn't divide evenly, drop partial
+        ([5, 5, 5], 7, True),
+        # Multiple unequal-sized blocks, keep partial
+        ([1, 5, 10], 4, False),
+        # Multiple unequal-sized blocks, drop partial
+        ([1, 5, 10], 4, True),
+        # Edge case: batch_size = 1
+        ([5, 3, 7], 1, False),
+        # Edge case: batch larger than total rows
+        ([2, 3, 4], 100, False),
+        # Exact division across multiple blocks
+        ([6, 12, 18], 6, False),
+    ],
+)
+def test_iter_batches_grid(
+    ray_start_regular_shared,
+    block_sizes,
+    batch_size,
+    drop_last,
+):
     # Tests slicing, batch combining, and partial batch dropping logic over
-    # a grid of dataset, batching, and dropping configurations.
-    # Grid: num_blocks x num_rows_block_1 x ... x num_rows_block_N x
-    #       batch_size x drop_last
-    seed = int(time.time())
-    print(f"Seeding RNG for test_iter_batches_grid with: {seed}")
-    random.seed(seed)
-    max_num_blocks = 20
-    max_num_rows_per_block = 20
-    num_blocks_samples = 3
-    block_sizes_samples = 3
-    batch_size_samples = 3
-
-    for num_blocks in np.random.randint(1, max_num_blocks + 1, size=num_blocks_samples):
-        block_sizes_list = [
-            np.random.randint(1, max_num_rows_per_block + 1, size=num_blocks)
-            for _ in range(block_sizes_samples)
-        ]
-        for block_sizes in block_sizes_list:
-            # Create the dataset with the given block sizes.
-            dfs = []
-            running_size = 0
-            for block_size in block_sizes:
-                dfs.append(
-                    pd.DataFrame(
-                        {"value": list(range(running_size, running_size + block_size))}
-                    )
-                )
-                running_size += block_size
-            num_rows = running_size
-            ds = ray.data.from_blocks(dfs)
-            for batch_size in np.random.randint(
-                1, num_rows + 1, size=batch_size_samples
-            ):
-                for drop_last in (False, True):
-                    batches = list(
-                        ds.iter_batches(
-                            batch_size=batch_size,
-                            drop_last=drop_last,
-                            batch_format="pandas",
-                        )
-                    )
-                    if num_rows % batch_size == 0 or not drop_last:
-                        # Number of batches should be equal to
-                        # num_rows / batch_size,  rounded up.
-                        assert len(batches) == math.ceil(num_rows / batch_size)
-                        # Concatenated batches should equal the DataFrame
-                        # representation of the entire dataset.
-                        assert pd.concat(batches, ignore_index=True).equals(
-                            ds.to_pandas()
-                        )
-                    else:
-                        # Number of batches should be equal to
-                        # num_rows / batch_size, rounded down.
-                        assert len(batches) == num_rows // batch_size
-                        # Concatenated batches should equal the DataFrame
-                        # representation of the dataset with the partial batch
-                        # remainder sliced off.
-                        assert pd.concat(batches, ignore_index=True).equals(
-                            ds.to_pandas()[: batch_size * (num_rows // batch_size)]
-                        )
-                    if num_rows % batch_size == 0 or drop_last:
-                        assert all(len(batch) == batch_size for batch in batches)
-                    else:
-                        assert all(len(batch) == batch_size for batch in batches[:-1])
-                        assert len(batches[-1]) == num_rows % batch_size
+    # specific dataset, batching, and dropping configurations.
+    # Create the dataset with the given block sizes.
+    dfs = []
+    running_size = 0
+    for block_size in block_sizes:
+        dfs.append(
+            pd.DataFrame(
+                {"value": list(range(running_size, running_size + block_size))}
+            )
+        )
+        running_size += block_size
+    num_rows = running_size
+    ds = ray.data.from_blocks(dfs)
+
+    batches = list(
+        ds.iter_batches(
+            batch_size=batch_size,
+            drop_last=drop_last,
+            batch_format="pandas",
+        )
+    )
+    if num_rows % batch_size == 0 or not drop_last:
+        # Number of batches should be equal to
+        # num_rows / batch_size,  rounded up.
+        assert len(batches) == math.ceil(num_rows / batch_size)
+        # Concatenated batches should equal the DataFrame
+        # representation of the entire dataset.
+        assert pd.concat(batches, ignore_index=True).equals(ds.to_pandas())
+    else:
+        # Number of batches should be equal to
+        # num_rows / batch_size, rounded down.
+        assert len(batches) == num_rows // batch_size
+        # Concatenated batches should equal the DataFrame
+        # representation of the dataset with the partial batch
+        # remainder sliced off.
+        assert pd.concat(batches, ignore_index=True).equals(
+            ds.to_pandas()[: batch_size * (num_rows // batch_size)]
+        )
+    if num_rows % batch_size == 0 or drop_last:
+        assert all(len(batch) == batch_size for batch in batches)
+    else:
+        assert all(len(batch) == batch_size for batch in batches[:-1])
+        assert len(batches[-1]) == num_rows % batch_size
 
 
 def test_union(ray_start_regular_shared):
 
@@ -28,54 +28,6 @@ def test_write_datasink(ray_start_regular_shared):
     assert ray.get(output.data_sink.get_rows_written.remote()) == 10
 
 
-class NodeLoggerOutputDatasink(Datasink[None]):
-    """A writable datasource that logs node IDs of write tasks, for testing."""
-
-    def __init__(self, node_id: str):
-
-        self.num_ok = 0
-        self.num_failed = 0
-        self.node_id = node_id
-        self.num_rows_written = 0
-
-    def write(
-        self,
-        blocks: Iterable[Block],
-        ctx: TaskContext,
-    ) -> None:
-
-        node_id = ray.get_runtime_context().get_node_id()
-        assert node_id == self.node_id
-
-    def on_write_complete(self, write_result: WriteResult[None]):
-        self.num_ok += 1
-        self.num_rows_written += write_result.num_rows
-
-    def on_write_failed(self, error: Exception) -> None:
-        self.num_failed += 1
-
-
-def test_write_datasink_ray_remote_args(ray_start_cluster):
-    ray.shutdown()
-    cluster = ray_start_cluster
-    cluster.add_node(
-        resources={"foo": 100},
-        num_cpus=1,
-    )
-    bar_worker = cluster.add_node(resources={"bar": 100}, num_cpus=1)
-    bar_node_id = bar_worker.node_id
-
-    ray.init(cluster.address)
-
-    output = NodeLoggerOutputDatasink(bar_node_id)
-    ds = ray.data.range(100, override_num_blocks=10)
-    # Pin write tasks to node with "bar" resource.
-    ds.write_datasink(output, ray_remote_args={"resources": {"bar": 1}})
-    assert output.num_ok == 1
-    assert output.num_failed == 0
-    assert output.num_rows_written == 100
-
-
 @pytest.mark.parametrize("min_rows_per_write", [25, 50])
 def test_min_rows_per_write(tmp_path, ray_start_regular_shared, min_rows_per_write):
     class MockDatasink(Datasink[None]):
@@ -122,8 +74,8 @@ def on_write_complete(self, write_result: WriteResult[CustomWriteResult]):
             self.num_rows = write_result.num_rows
             self.size_bytes = write_result.size_bytes
 
-    num_items = 100
-    size_bytes_per_row = 1000
+    num_items = 10
+    size_bytes_per_row = 500
 
     def map_fn(row):
         row["data"] = numpy.zeros(size_bytes_per_row, dtype=numpy.int8)
@@ -139,6 +91,54 @@ def map_fn(row):
     assert datasink.size_bytes == pytest.approx(num_items * size_bytes_per_row, rel=0.1)
 
 
+class NodeLoggerOutputDatasink(Datasink[None]):
+    """A writable datasource that logs node IDs of write tasks, for testing."""
+
+    def __init__(self, node_id: str):
+
+        self.num_ok = 0
+        self.num_failed = 0
+        self.node_id = node_id
+        self.num_rows_written = 0
+
+    def write(
+        self,
+        blocks: Iterable[Block],
+        ctx: TaskContext,
+    ) -> None:
+
+        node_id = ray.get_runtime_context().get_node_id()
+        assert node_id == self.node_id
+
+    def on_write_complete(self, write_result: WriteResult[None]):
+        self.num_ok += 1
+        self.num_rows_written += write_result.num_rows
+
+    def on_write_failed(self, error: Exception) -> None:
+        self.num_failed += 1
+
+
+def test_write_datasink_ray_remote_args(ray_start_cluster):
+    ray.shutdown()
+    cluster = ray_start_cluster
+    cluster.add_node(
+        resources={"foo": 100},
+        num_cpus=1,
+    )
+    bar_worker = cluster.add_node(resources={"bar": 100}, num_cpus=1)
+    bar_node_id = bar_worker.node_id
+
+    ray.init(cluster.address)
+
+    output = NodeLoggerOutputDatasink(bar_node_id)
+    ds = ray.data.range(100, override_num_blocks=10)
+    # Pin write tasks to node with "bar" resource.
+    ds.write_datasink(output, ray_remote_args={"resources": {"bar": 1}})
+    assert output.num_ok == 1
+    assert output.num_failed == 0
+    assert output.num_rows_written == 100
+
+
 if __name__ == "__main__":
     import sys