
Commit d242d47

Add special optimization for While Scan where only last state is used

1 parent 5521d82 · commit d242d47

5 files changed: +91 / -26 lines

pytensor/scan/op.py

Lines changed: 0 additions & 1 deletion
@@ -677,7 +677,6 @@ def __init__(
         typeConstructor: Optional[TensorConstructorType] = None,
         truncate_gradient: int = -1,
         name: Optional[str] = None,
-        as_while: bool = False,
         profile: Optional[Union[str, bool]] = None,
         allow_gc: bool = True,
         strict: bool = True,
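
The deleted `as_while` keyword was redundant: whether a `Scan` is a while loop is already carried by its `ScanInfo` and exposed as `op.info.as_while`, which the rewrite below relies on. At the user level, while-ness comes from returning an `until` condition from the step function. A minimal sketch, mirroring the test added in this commit:

import pytensor
from pytensor.scan.utils import until
from pytensor.tensor import scalar

x = scalar("x")
# Returning an `until` condition marks the Scan as a while loop;
# no separate constructor flag is needed.
ys, _ = pytensor.scan(
    lambda xtm1: (xtm1 + 1, {}, until(xtm1 >= 100)),
    outputs_info=[x],
    n_steps=100,  # upper bound on the number of iterations
)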

pytensor/scan/rewriting.py

Lines changed: 49 additions & 25 deletions
@@ -32,6 +32,7 @@
 from pytensor.graph.rewriting.db import EquilibriumDB, SequenceDB
 from pytensor.graph.type import HasShape
 from pytensor.graph.utils import InconsistencyError
+from pytensor.scalar import ScalarConstant
 from pytensor.scan.op import Scan, ScanInfo
 from pytensor.scan.utils import (
     ScanArgs,
@@ -1184,13 +1185,16 @@ def save_mem_new_scan(fgraph, node):
     # index(step) for any output scan actually needs to compute
     # In other words n_steps should be equal to this maximal !
     # Note: if we have a shared variable that gets updated at every step
-    # of the loop, reducing the number of steps will affect the the
-    # value of the shared variable after the loop so we need not to
+    # of the loop, reducing the number of steps will affect the
+    # value of the shared variable after the loop so we cannot
     # change the number of steps in that case. To do this we set
     # global_nsteps to None which is seen as a flag that nothing needs
-    # to be done
+    # to be done.
+    # Note: For simplicity while Scans also have global_nsteps set to None.
+    # All step optimizations require knowing the shape of the output, which
+    # cannot be determined from the inputs alone.
     assert len(node.outputs) >= c_outs
-    if len(node.outputs) == c_outs:
+    if len(node.outputs) == c_outs and not op.info.as_while:
         global_nsteps = {"real": -1, "sym": []}
     else:
         global_nsteps = None
@@ -1298,9 +1302,9 @@ def save_mem_new_scan(fgraph, node):
 
     # 2.3. Analyze global_nsteps to figure out for how many steps scan
     # needs to iterate
-    if global_nsteps is not None:
+    if global_nsteps is None:
         nw_steps = node.inputs[0]
-
+    else:
         # there are some symbolic tensors that limit the number of
         # steps
         if len(global_nsteps["sym"]) == 0:
@@ -1316,16 +1320,15 @@ def save_mem_new_scan(fgraph, node):
             real_steps = None
         nw_steps = select_min(select_max(sym_steps, real_steps), node.inputs[0])
 
+        # FIXME: This is not correct. Scan with 0 steps seem to be supported again
         # Make sure the ScanSaveMem optimization never makes the new
         # number of steps to be 0 (this could happen, for instance, if
         # the optimization detects that the outputs of the Scan go through
         # subtensor nodes that end up taking no elements) because Scan with
         # 0 iterations are not supported. Make sure the new number of steps
         # is at least 1.
         nw_steps = select_max(nw_steps, 1)
-    else:
-        nw_steps = node.inputs[0]
-        global_nsteps = None
+
 
     # 2.4 Loop over the clients again now looking just to see how many
     # intermediate steps to store
@@ -1348,19 +1351,26 @@ def save_mem_new_scan(fgraph, node):
                     store_steps[i] = 0
                     break
 
-                if i > op_info.n_mit_mot:
-                    length = node.inputs[0] + init_l[i]
+                if (
+                    isinstance(this_slice[0], ScalarConstant)
+                    and this_slice[0].value == -1
+                ):
+                    start = nw_steps
                 else:
-                    try:
-                        length = shape_of[out][0]
-                    except KeyError:
-                        length = out.shape[0]
-                cf_slice = get_canonical_form_slice(this_slice[0], length)
+                    if i > op_info.n_mit_mot:
+                        length = node.inputs[0] + init_l[i]
+                    else:
+                        try:
+                            length = shape_of[out][0]
+                        except KeyError:
+                            length = out.shape[0]
+                    cf_slice = get_canonical_form_slice(this_slice[0], length)
+
+                    if isinstance(cf_slice[0], slice):
+                        start = at.extract_constant(cf_slice[0].start)
+                    else:
+                        start = at.extract_constant(cf_slice[0])
 
-                if isinstance(cf_slice[0], slice):
-                    start = at.extract_constant(cf_slice[0].start)
-                else:
-                    start = at.extract_constant(cf_slice[0])
                 if start == 0 or store_steps[i] == 0:
                     store_steps[i] = 0
                 else:
@@ -1514,6 +1524,7 @@ def save_mem_new_scan(fgraph, node):
                     nw_input = expand_empty(_nw_input, nw_steps)
                     nw_inputs[in_idx] = nw_input
                 else:
+                    # FIXME: This is never used
                     nw_input = nw_inputs[in_idx][: (initl + nw_steps)]
 
             elif (
@@ -1569,9 +1580,16 @@ def save_mem_new_scan(fgraph, node):
                             sanitize(cnf_slice[0].step),
                         )
                     else:
-                        fslice = sanitize(cnf_slice[0])
+                        if (
+                            isinstance(old_slices[0], ScalarConstant)
+                            and this_slice[0].value == -1
+                        ):
+                            fslice = old_slices[0]
+                        else:
+                            fslice = sanitize(cnf_slice[0])
+
+                    nw_slice = (fslice,) + tuple(old_slices[1:])
 
-                    nw_slice = (fslice,) + tuple(old_slices[1:])
                     nw_pos = inv_compress_map[idx]
 
                     subtens = Subtensor(nw_slice)
@@ -1620,9 +1638,15 @@ def save_mem_new_scan(fgraph, node):
                         ) + tuple(old_slices[1:])
 
                     else:
-                        position = (
-                            cnf_slice[0] - nw_steps - init_l[pos] + store_steps[pos]
-                        )
+                        if (
+                            isinstance(old_slices[0], ScalarConstant)
+                            and this_slice[0].value == -1
+                        ):
+                            position = old_slices[0]
+                        else:
+                            position = (
+                                cnf_slice[0] - nw_steps - init_l[pos] + store_steps[pos]
+                            )
 
                     nw_slice = (sanitize(position),) + tuple(old_slices[1:])
                     subtens = Subtensor(nw_slice)
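
The three `ScalarConstant`/`-1` branches above all exist for the same reason: a while loop can exit before `n_steps`, so the true trace length is unknown at rewrite time, and a constant `-1` index must be kept as-is rather than canonicalized into an offset from the front of the buffer. A plain NumPy sketch of the pitfall (illustration only, not part of the commit):

import numpy as np

n_steps = 100          # compile-time upper bound on the iterations
trace = np.arange(37)  # suppose the until() condition fired after 37 steps

# Indexing from the back is correct no matter where the loop stopped:
assert trace[-1] == 36

# The for-loop canonicalization would turn [-1] into [n_steps - 1],
# which is out of bounds here:
# trace[n_steps - 1]  # IndexError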

pytensor/tensor/rewriting/subtensor.py

Lines changed: 20 additions & 0 deletions
@@ -479,6 +479,7 @@ def local_subtensor_merge(fgraph, node):
     expresses all slices in a canonical form, and then merges them together.

     """
+    from pytensor.scan.op import Scan

     if isinstance(node.op, Subtensor):
         u = node.inputs[0]
@@ -489,6 +490,25 @@ def local_subtensor_merge(fgraph, node):
             # slices of the first applied subtensor
             slices1 = get_idx_list(u.owner.inputs, u.owner.op.idx_list)
             slices2 = get_idx_list(node.inputs, node.op.idx_list)
+
+            # Special case for scan[1:][-1] = scan[-1]
+            # FIXME: This assumes scan[1] always exists which is not True,
+            # because Scans can have 0 steps or 0-length sequences.
+            # We can fix it by adding an assert that n_steps is
+            # not zero and no sequence is empty. This generalizes for
+            # any negative scalar index, although -1 is the most common case.
+            # TODO: Check that slices1 is indeed [1:]
+            if (
+                isinstance(x.owner.op, Scan)
+                and isinstance(slices2, tuple)
+                and len(slices2) == 1
+                and isinstance(slices2[0], aes.ScalarConstant)
+                and slices2[0].data == -1
+            ):
+                out = x[-1]
+                copy_stack_trace([node.outputs[0], node.inputs[0]], out)
+                return [out]
+
             # Get the shapes of the vectors !
             try:
                 # try not to introduce new shape into the graph
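
This is the piece that lets the optimization fire: user code indexes the scan output as `ys[-1]`, but `ys` itself is already a `[1:]` slice of the internal buffer (the initial state is stripped), so the graph contains `scan(...)[1:][-1]`, which `local_subtensor_merge` now collapses to `scan(...)[-1]`. The identity, and the empty-buffer corner case the FIXME warns about, in plain NumPy (illustration only):

import numpy as np

x = np.arange(10)          # stands in for a scan output buffer
assert x[1:][-1] == x[-1]  # the merged form; holds whenever len(x) >= 2

y = np.arange(1)           # buffer of a 0-step scan: only the initial state
# y[1:][-1] raises IndexError while y[-1] returns the initial state,
# which is exactly the corner case the FIXME above points out.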

pytensor/tensor/subtensor.py

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@
 )


+# FIXME: This type hint is wrong, it returns tuples
 def indices_from_subtensor(
     op_indices: Iterable[ScalarConstant],
     idx_list: Optional[List[Union[Type, slice, Variable]]],
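
For context on the FIXME: `indices_from_subtensor` reassembles the concrete index tuple of a `Subtensor` from the node's symbolic index inputs and its `idx_list` template, so it returns a tuple rather than what the annotation claims. A hedged usage sketch (variable names are illustrative):

import pytensor.tensor as at
from pytensor.tensor.subtensor import indices_from_subtensor

x = at.vector("x")
node = x[1:].owner  # a Subtensor Apply node
# First input is the tensor being indexed; the rest are the index inputs.
idx = indices_from_subtensor(node.inputs[1:], node.op.idx_list)
assert isinstance(idx, tuple)  # a 1-tuple holding the rebuilt slice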

tests/scan/test_rewriting.py

Lines changed: 21 additions & 0 deletions
@@ -1397,6 +1397,27 @@ def f_pow2(x_tm1):
         rng = np.random.default_rng(utt.fetch_seed())
         my_f(rng.uniform(size=(3,)), 4, np.int64([2, 2, 3]))

+    def test_while_scan(self):
+        x = scalar("x")
+
+        ys, _, = pytensor.scan(
+            # lambda xtm1: xtm1 + 1,  # for loop
+            lambda xtm1: (xtm1 + 1, {}, until(xtm1 >= 100)),  # while loop
+            outputs_info=[x],
+            n_steps=100,
+            strict=True,
+        )
+        # Save memory is triggered by choosing only last value
+        y = ys[-1]
+
+        f = pytensor.function([x], y)
+        assert f(0) == 100
+
+        [scan_node] = (n for n in f.maker.fgraph.apply_nodes if isinstance(n.op, Scan))
+        _, scan_trace = scan_node.inputs
+        # This means scan is only saving the last 2 states
+        assert scan_trace.type.shape == (2,)
+

 def test_inner_replace_dot():
     """
