Commit f665170

Optimize while scans when only last state is needed
1 parent eaedaef commit f665170
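To make the intent of the commit concrete, here is a minimal, hypothetical sketch of the pattern it optimizes (the variable names and the step function are illustrative, not taken from this commit): a while Scan whose recurrent output is only consumed through its last element, so the full per-step trace never needs to be stored.

import pytensor
import pytensor.tensor as at
from pytensor.scan.utils import until

x0 = at.scalar("x0")
n_steps = at.scalar("n_steps", dtype="int64")

# A while Scan: the state counts up from x0 and the loop stops via `until`
# once the state reaches 99, or after n_steps iterations, whichever is first.
ys, _ = pytensor.scan(
    lambda xtm1: (xtm1 + 1, {}, until(xtm1 >= 99)),
    outputs_info=[x0],
    n_steps=n_steps,
)

# Only the last state is requested. With this commit the Scan buffer can be
# shrunk, because the trailing Subtensor only ever reads the final entry.
f = pytensor.function([x0, n_steps], ys[-1])
print(f(0.0, 200))  # 100.0, mirroring the new test added below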

File tree

3 files changed: +187 -26 lines changed

pytensor/scan/rewriting.py

Lines changed: 135 additions & 26 deletions
@@ -28,10 +28,18 @@
 from pytensor.graph.fg import FunctionGraph
 from pytensor.graph.op import compute_test_value
 from pytensor.graph.replace import clone_replace
-from pytensor.graph.rewriting.basic import GraphRewriter, in2out, node_rewriter
+from pytensor.graph.rewriting.basic import (
+    GraphRewriter,
+    copy_stack_trace,
+    in2out,
+    node_rewriter,
+)
 from pytensor.graph.rewriting.db import EquilibriumDB, SequenceDB
+from pytensor.graph.rewriting.utils import get_clients_at_depth
 from pytensor.graph.type import HasShape
 from pytensor.graph.utils import InconsistencyError
+from pytensor.raise_op import Assert
+from pytensor.scalar import ScalarConstant
 from pytensor.scan.op import Scan, ScanInfo
 from pytensor.scan.utils import (
     ScanArgs,
@@ -1115,6 +1123,61 @@ def sanitize(x):
         return at.as_tensor_variable(x)
 
 
+@node_rewriter([Scan])
+def merge_while_scan_subtensor_last_element(fgraph, scan_node):
+    """
+    Replace while_scan_out[1:][-1] by while_scan_out[-1], for recurrent outputs,
+    asserting that at least one step will happen. Only the first step can be ensured
+    by the inputs alone (i.e., `n_steps > 0` and non-empty sequences), as
+    the while scan could abort at any point after that. This means it is not
+    generally safe to replace while_scan[1:][-i] by while_scan[-i] for -i != -1.
+    """
+    if not scan_node.op.info.as_while:
+        return None
+
+    recurrent_outputs = scan_node.outputs[: scan_node.op.n_outs]
+
+    n_steps = scan_node.inputs[0]
+    sequences = scan_node.inputs[1 : 1 + scan_node.op.info.n_seqs]
+    non_zero_steps_cond = at.all([n_steps > 0] + [seq.shape[0] for seq in sequences])
+    assert_non_zero_steps_op = Assert("n_steps > 0 and all(len(sequences) > 0)")
+
+    subtensor_merge_replacements = {}
+
+    # Iterate over all nodes that are two computations below the while scan
+    for node in get_clients_at_depth(fgraph, scan_node, depth=2):
+        if not isinstance(node.op, Subtensor):
+            continue
+
+        u = node.inputs[0]
+        if not (u.owner and isinstance(u.owner.op, Subtensor)):
+            continue
+
+        x = u.owner.inputs[0]
+        if x not in recurrent_outputs:
+            continue
+
+        slice1 = get_idx_list(u.owner.inputs, u.owner.op.idx_list)
+        slice2 = get_idx_list(node.inputs, node.op.idx_list)
+
+        if (
+            len(slice1) == 1
+            and isinstance(slice1[0], slice)
+            and isinstance(slice1[0].start, aes.ScalarConstant)
+            and slice1[0].start.data == 1
+            and slice1[0].stop is None
+            and slice1[0].step is None
+            and len(slice2) == 1
+            and isinstance(slice2[0], aes.ScalarConstant)
+            and slice2[0].data == -1
+        ):
+            out = assert_non_zero_steps_op(x[-1], non_zero_steps_cond)
+            copy_stack_trace([node.outputs[0], node.inputs[0]], out)
+            subtensor_merge_replacements[node.outputs[0]] = out
+
+    return subtensor_merge_replacements
+
+
 @node_rewriter([Scan])
 def save_mem_new_scan(fgraph, node):
     r"""Graph optimizer that reduces scan memory consumption.
@@ -1136,6 +1199,17 @@ def save_mem_new_scan(fgraph, node):
     that SITSOT output. Only the most recently computed timestep ever needs to
     be kept in memory.
 
+    There are two ways in which the Scan buffer size is controlled:
+    1. Each recurrent output is saved in an input empty tensor x with the initial
+       state written at x[0]. The remaining x[1:] positions determine how many
+       intermediate results should be stored.
+       This rewrite shortens x[1:] to the smallest possible size.
+    2. Each non-recurrent output (nit-sot) is associated with a scalar integer
+       input that determines how many steps should be saved in the perform method.
+       This rewrite reduces this number to the smallest possible.
+
+    The scan perform implementation takes the output sizes into consideration,
+    saving the newest results over the oldest ones whenever the buffer is filled.
     """
     if not isinstance(node.op, Scan):
         return False
@@ -1184,13 +1258,16 @@ def save_mem_new_scan(fgraph, node):
     # index(step) for any output scan actually needs to compute
     # In other words n_steps should be equal to this maximal !
     # Note: if we have a shared variable that gets updated at every step
-    # of the loop, reducing the number of steps will affect the the
-    # value of the shared variable after the loop so we need not to
+    # of the loop, reducing the number of steps will affect the
+    # value of the shared variable after the loop so we cannot
     # change the number of steps in that case. To do this we set
     # global_nsteps to None which is seen as a flag that nothing needs
-    # to be done
+    # to be done.
+    # Note: For simplicity while Scans also have global_nsteps set to None.
+    # All step optimizations require knowing the shape of the output, which
+    # cannot be determined from the inputs alone.
     assert len(node.outputs) >= c_outs
-    if len(node.outputs) == c_outs:
+    if len(node.outputs) == c_outs and not op.info.as_while:
         global_nsteps = {"real": -1, "sym": []}
     else:
         global_nsteps = None
@@ -1298,9 +1375,9 @@ def save_mem_new_scan(fgraph, node):
 
     # 2.3. Analyze global_nsteps to figure out for how many steps scan
     # needs to iterate
-    if global_nsteps is not None:
+    if global_nsteps is None:
         nw_steps = node.inputs[0]
-
+    else:
         # there are some symbolic tensors that limit the number of
         # steps
         if len(global_nsteps["sym"]) == 0:
@@ -1316,16 +1393,14 @@ def save_mem_new_scan(fgraph, node):
             real_steps = None
         nw_steps = select_min(select_max(sym_steps, real_steps), node.inputs[0])
 
+        # FIXME: This is not correct. Scan with 0 steps seems to be supported
         # Make sure the ScanSaveMem optimization never makes the new
         # number of steps to be 0 (this could happen, for instance, if
         # the optimization detects that the outputs of the Scan go through
         # subtensor nodes that end up taking no elements) because Scan with
         # 0 iterations are not supported. Make sure the new number of steps
         # is at least 1.
         nw_steps = select_max(nw_steps, 1)
-    else:
-        nw_steps = node.inputs[0]
-        global_nsteps = None
 
     # 2.4 Loop over the clients again now looking just to see how many
     # intermediate steps to store
@@ -1348,19 +1423,33 @@ def save_mem_new_scan(fgraph, node):
                     store_steps[i] = 0
                     break
 
-                    if i > op_info.n_mit_mot:
-                        length = node.inputs[0] + init_l[i]
+                    # Special case for recurrent outputs where only the last result
+                    # is requested. This is needed for this rewrite to apply to
+                    # While Scans at all. Otherwise, `get_canonical_form_slice` in
+                    # the `else` branch would reintroduce a shape dependency on the
+                    # original While Scan which would lead this rewrite to abort.
+                    if (
+                        i <= op.info.n_mit_mot
+                        and isinstance(this_slice[0], ScalarConstant)
+                        and this_slice[0].value == -1
+                    ):
+                        start = nw_steps
                     else:
-                        try:
-                            length = shape_of[out][0]
-                        except KeyError:
-                            length = out.shape[0]
-                    cf_slice = get_canonical_form_slice(this_slice[0], length)
+                        if i <= op.info.n_mit_mot:
+                            try:
+                                length = shape_of[out][0]
+                            except KeyError:
+                                length = out.shape[0]
+                        else:
+                            length = node.inputs[0] + init_l[i]
+
+                        cf_slice = get_canonical_form_slice(this_slice[0], length)
+
+                        if isinstance(cf_slice[0], slice):
+                            start = at.extract_constant(cf_slice[0].start)
+                        else:
+                            start = at.extract_constant(cf_slice[0])
 
-                    if isinstance(cf_slice[0], slice):
-                        start = at.extract_constant(cf_slice[0].start)
-                    else:
-                        start = at.extract_constant(cf_slice[0])
                     if start == 0 or store_steps[i] == 0:
                         store_steps[i] = 0
                     else:
@@ -1514,6 +1603,7 @@ def save_mem_new_scan(fgraph, node):
                     nw_input = expand_empty(_nw_input, nw_steps)
                     nw_inputs[in_idx] = nw_input
                 else:
+                    # FIXME: This is never used
                     nw_input = nw_inputs[in_idx][: (initl + nw_steps)]
 
             elif (
@@ -1569,9 +1659,16 @@ def save_mem_new_scan(fgraph, node):
                         sanitize(cnf_slice[0].step),
                     )
                 else:
-                    fslice = sanitize(cnf_slice[0])
+                    if (
+                        isinstance(old_slices[0], ScalarConstant)
+                        and this_slice[0].value == -1
+                    ):
+                        fslice = old_slices[0]
+                    else:
+                        fslice = sanitize(cnf_slice[0])
+
+                nw_slice = (fslice,) + tuple(old_slices[1:])
 
-                nw_slice = (fslice,) + tuple(old_slices[1:])
                 nw_pos = inv_compress_map[idx]
 
                 subtens = Subtensor(nw_slice)
@@ -1620,9 +1717,15 @@ def save_mem_new_scan(fgraph, node):
                     ) + tuple(old_slices[1:])
 
                 else:
-                    position = (
-                        cnf_slice[0] - nw_steps - init_l[pos] + store_steps[pos]
-                    )
+                    if (
+                        isinstance(old_slices[0], ScalarConstant)
+                        and this_slice[0].value == -1
+                    ):
+                        position = old_slices[0]
+                    else:
+                        position = (
+                            cnf_slice[0] - nw_steps - init_l[pos] + store_steps[pos]
+                        )
 
                 nw_slice = (sanitize(position),) + tuple(old_slices[1:])
                 subtens = Subtensor(nw_slice)
@@ -2424,6 +2527,12 @@ def push_out_dot1_scan(fgraph, node):
     position=5,
 )
 
+scan_eqopt2.register(
+    "merge_while_scan_subtensor_last_element",
+    in2out(merge_while_scan_subtensor_last_element, ignore_newtrees=True),
+    "fast_run",
+    "scan",
+)
 
 scan_eqopt2.register(
     "constant_folding_for_scan2",

pytensor/tensor/rewriting/subtensor.py

Lines changed: 11 additions & 0 deletions
@@ -479,6 +479,7 @@ def local_subtensor_merge(fgraph, node):
     expresses all slices in a canonical form, and then merges them together.
 
     """
+    from pytensor.scan.op import Scan
 
     if isinstance(node.op, Subtensor):
         u = node.inputs[0]
@@ -489,6 +490,16 @@ def local_subtensor_merge(fgraph, node):
             # slices of the first applied subtensor
             slices1 = get_idx_list(u.owner.inputs, u.owner.op.idx_list)
             slices2 = get_idx_list(node.inputs, node.op.idx_list)
+
+            # Don't try to do the optimization on While scan nodes,
+            # as it will create a dependency on the shape of the outputs
+            if (
+                x.owner is not None
+                and isinstance(x.owner.op, Scan)
+                and x.owner.op.info.as_while
+            ):
+                return None
+
             # Get the shapes of the vectors !
             try:
                 # try not to introduce new shape into the graph
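The guard added to `local_subtensor_merge` can be motivated with a small sketch (illustrative only, not code from the commit): merging `x[1:][-1]` into a single index goes through `get_canonical_form_slice`, which expresses the index in terms of the length of `x`; for a while Scan that length is only known once the loop has finished, so the merged graph would keep a dependency on the full output shape and defeat the memory-saving rewrite above.

import pytensor.tensor as at

x = at.vector("x")  # stands in for a while-Scan output trace (hypothetical)

# Roughly what a naive merge of x[1:][-1] amounts to after canonicalization:
# the index is written through x.shape[0], i.e. through the output's length.
merged = x[x.shape[0] - 1]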

tests/scan/test_rewriting.py

Lines changed: 41 additions & 0 deletions
@@ -1397,6 +1397,47 @@ def f_pow2(x_tm1):
         rng = np.random.default_rng(utt.fetch_seed())
         my_f(rng.uniform(size=(3,)), 4, np.int64([2, 2, 3]))
 
+    def test_while_scan(self):
+        x0 = scalar("x0")
+        seq = vector("seq")
+        n_steps = scalar("n_steps", dtype="int64")
+
+        # while loop
+        (ys, zs), _, = pytensor.scan(
+            lambda s, xtm1: ((xtm1 + 1, xtm1 + 1 + s), {}, until(xtm1 >= 99)),
+            sequences=[seq],
+            outputs_info=[x0, None],
+            n_steps=n_steps,
+            strict=True,
+        )
+        # Save memory is triggered by choosing only the last value
+        y = ys[-1]
+        z = zs[-1]
+
+        f = pytensor.function([x0, seq, n_steps], [y, z])
+
+        [scan_node] = (n for n in f.maker.fgraph.apply_nodes if isinstance(n.op, Scan))
+        print(scan_node.inputs)
+        _, _, ys_trace, len_zs = scan_node.inputs
+
+        # Evaluate the shape of ys_trace and len_zs to confirm the rewrite worked correctly.
+        debug_fn = pytensor.function(
+            [n_steps], [ys_trace.shape[0], len_zs], accept_inplace=True
+        )
+        stored_ys_steps, stored_zs_steps = debug_fn(n_steps=200)
+        assert stored_ys_steps == 2
+        assert stored_zs_steps == 1
+
+        test_seq = np.zeros(200)
+        np.testing.assert_allclose(f(x0=0, seq=test_seq, n_steps=200), 100)
+        np.testing.assert_allclose(f(x0=1, seq=test_seq, n_steps=20), 21)
+        np.testing.assert_allclose(f(x0=np.e, seq=test_seq, n_steps=1), np.e + 1)
+        with pytest.raises(AssertionError, match="n_steps > 0 and all"):
+            f(x0=0, seq=test_seq, n_steps=0)
+        # This fails too early inside Scan due to https://github.com/pymc-devs/pytensor/issues/215
+        # with pytest.raises(AssertionError, match="n_steps > 0 and all"):
+        #     f(x0=0, seq=[], n_steps=200)
+
 
 def test_inner_replace_dot():
     """
