Make matrix mod 2 conversion to numpy faster

user202729 · user202729 · commit b1d65a824f16 · 2025-01-06T16:33:58.000+07:00
diff --git a/src/bin/sage-fixdoctests b/src/bin/sage-fixdoctests
@@ -36,6 +36,7 @@ import re
 import shlex
 import subprocess
 import sys
+import textwrap
 
 from argparse import ArgumentParser
 from collections import defaultdict
@@ -303,7 +304,7 @@ def process_block(block, src_in_lines, file_optional_tags, venv_explainer=''):
         got = re.sub(r'(doctest:warning).*^( *DeprecationWarning:)',
                      r'\1...\n\2',
                      got, 1, re.DOTALL | re.MULTILINE)
-        got = got.splitlines()      # got can't be the empty string
+        got = textwrap.dedent(got).splitlines()      # got can't be the empty string
 
     if args.only_tags:
         if args.verbose:
@@ -329,7 +330,7 @@ def process_block(block, src_in_lines, file_optional_tags, venv_explainer=''):
     if not expected:
         indent = (len(src_in_lines[first_line_num - 1]) - len(src_in_lines[first_line_num - 1].lstrip()))
         append_to_line(line_num - 1,
-                       '\n' + '\n'.join('%s%s' % (' ' * indent, line.lstrip()) for line in got),
+                       '\n' + '\n'.join(' ' * indent + line for line in got),
                        message="Adding the new output")
         return
 
@@ -338,8 +339,7 @@ def process_block(block, src_in_lines, file_optional_tags, venv_explainer=''):
     # has the smallest indentation after lstrip(). Note that the amount of indentation
     # required could be negative if the ``got`` block is indented. In this case
     # ``indent`` is set to zero.
-    indent = max(0, (len(src_in_lines[line_num]) - len(src_in_lines[line_num].lstrip())
-                     - min(len(got[j]) - len(got[j].lstrip()) for j in range(len(got)))))
+    indent = len(src_in_lines[line_num]) - len(src_in_lines[line_num].lstrip())
 
     # Double check that what was expected was indeed in the source file and if
     # it is not then then print a warning for the user which contains the
@@ -358,7 +358,7 @@ def process_block(block, src_in_lines, file_optional_tags, venv_explainer=''):
         update_line(line_num, None,
                     message="Expected something, got nothing; deleting the old output")
     else:
-        update_line(line_num, '\n'.join((' ' * indent + got[i]) for i in range(len(got))),
+        update_line(line_num, '\n'.join(' ' * indent + line for line in got),
                     message="Replacing the old expected output with the new output")
 
     # Mark any remaining `expected` lines as ``None`` so as to preserve the line numbering
diff --git a/src/sage/matrix/matrix1.pyx b/src/sage/matrix/matrix1.pyx
@@ -25,6 +25,9 @@ import sage.modules.free_module
 from sage.structure.coerce cimport coercion_model
 
 
+_MISSING = object()
+
+
 cdef class Matrix(Matrix0):
     ###################################################
     # Coercion to Various Systems
@@ -670,7 +673,7 @@ cdef class Matrix(Matrix0):
             entries = [[sib(v, 2) for v in row] for row in self.rows()]
             return sib.name('matrix')(self.base_ring(), entries)
 
-    def numpy(self, dtype=None, copy=True):
+    def numpy(self, dtype=None, copy=_MISSING):
         """
         Return the Numpy matrix associated to this matrix.
 
@@ -680,10 +683,6 @@ cdef class Matrix(Matrix0):
           then the type will be determined as the minimum type required
           to hold the objects in the sequence.
 
-        - ``copy`` -- if `self` is already an `ndarray`, then this flag
-          determines whether the data is copied (the default), or whether
-          a view is constructed.
-
         EXAMPLES::
 
             sage: # needs numpy
@@ -708,14 +707,14 @@ cdef class Matrix(Matrix0):
         Type ``numpy.typecodes`` for a list of the possible
         typecodes::
 
-            sage: import numpy                            # needs numpy
-            sage: numpy.typecodes.items()                 # needs numpy # random
+            sage: import numpy                                                          # needs numpy
+            sage: numpy.typecodes.items()                                               # needs numpy  # random
             [('All', '?bhilqpBHILQPefdgFDGSUVOMm'), ('AllFloat', 'efdgFDG'),
             ...
 
         For instance, you can see possibilities for real floating point numbers::
 
-            sage: numpy.typecodes['Float']                # needs numpy
+            sage: numpy.typecodes['Float']                                              # needs numpy
             'efdg'
 
         Alternatively, numpy automatically calls this function (via
@@ -733,15 +732,70 @@ cdef class Matrix(Matrix0):
             dtype('int64')  # 64-bit
             sage: b.shape
             (3, 4)
+
+        TESTS::
+
+            sage: # needs numpy
+            sage: matrix(3, range(12)).numpy(copy=False)
+            doctest:warning...
+            DeprecationWarning: passing copy argument to numpy() is deprecated
+            See https://github.com/sagemath/sage/issues/39152 for details.
+            array([[ 0,  1,  2,  3],
+                   [ 4,  5,  6,  7],
+                   [ 8,  9, 10, 11]])
         """
+        if copy is not _MISSING:
+            from sage.misc.superseded import deprecation
+            deprecation(39152, "passing copy argument to numpy() is deprecated")
         import numpy
-        A = numpy.matrix(self.list(), dtype=dtype, copy=copy)
-        return numpy.resize(A,(self.nrows(), self.ncols()))
+        return numpy.asarray(self.list(), dtype=dtype).reshape(self.nrows(), self.ncols())
+
+    def __array__(self, dtype=None, copy=None):
+        """
+        Define the magic ``__array__`` function so that ``numpy.array(m)`` can convert
+        a matrix ``m`` to a numpy array. See
+        `Interoperability with NumPy <https://numpy.org/doc/1.26/user/basics.interoperability.html>`_.
 
-    # Define the magic "__array__" function so that numpy.array(m) can convert
-    # a matrix m to a numpy array.
-    # See http://docs.scipy.org/doc/numpy/user/c-info.how-to-extend.html#converting-an-arbitrary-sequence-object
-    __array__=numpy
+        Note that subclasses should override :meth:`numpy`, but usually not this method.
+
+        INPUT:
+
+        - ``dtype`` -- the desired data-type for the array. If not given,
+          then the type will be determined automatically.
+
+        - ``copy`` -- required for numpy 2.0 compatibility.
+          See <https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword>`_.
+          Note that ``copy=False`` is not supported.
+
+        TESTS::
+
+            sage: # needs numpy
+            sage: import numpy as np
+            sage: a = matrix(3, range(12))
+            sage: if np.lib.NumpyVersion(np.__version__) >= '2.0.0':
+            ....:     try:
+            ....:         np.array(a, copy=False)  # in numpy 2.0, this raises an error
+            ....:     except ValueError:
+            ....:         pass
+            ....:     else:
+            ....:         assert False
+            ....: else:
+            ....:     b = np.array(a, copy=False)  # in numpy 1.26, this means "avoid copy if possible"
+            ....:     # https://numpy.org/doc/1.26/reference/generated/numpy.array.html#numpy.array
+            ....:     # but no-copy is not supported so it will copy anyway
+            ....:     a[0,0] = 1
+            ....:     assert b[0,0] == 0
+            ....:     b = np.asarray(a)
+            ....:     a[0,0] = 2
+            ....:     assert b[0,0] == 1
+        """
+        import numpy as np
+        if np.lib.NumpyVersion(np.__version__) >= '2.0.0':
+            if copy is False:
+                raise ValueError("Sage matrix cannot be converted to numpy array without copying")
+        else:
+            assert copy is None  # numpy versions before 2.0 should not pass copy argument
+        return self.numpy(dtype)
 
     ###################################################
     # Construction functions
diff --git a/src/sage/matrix/matrix_mod2_dense.pyx b/src/sage/matrix/matrix_mod2_dense.pyx
@@ -110,6 +110,7 @@ from cysignals.signals cimport sig_on, sig_str, sig_off
 cimport sage.matrix.matrix_dense as matrix_dense
 from sage.matrix.args cimport SparseEntry, MatrixArgs_init, MA_ENTRIES_NDARRAY
 from libc.stdio cimport *
+from libc.stdint cimport uintptr_t
 from sage.structure.element cimport (Matrix, Vector)
 from sage.modules.free_module_element cimport FreeModuleElement
 from sage.libs.gmp.random cimport *
@@ -581,6 +582,33 @@ cdef class Matrix_mod2_dense(matrix_dense.Matrix_dense):   # dense or sparse
             return list(C)
         return C
 
+    def numpy(self, dtype=None):
+        """
+        Return the Numpy matrix associated to this matrix.
+        See :meth:`.matrix1.Matrix.numpy`.
+
+        EXAMPLES::
+
+            sage: # needs numpy
+            sage: import numpy as np
+            sage: np.array(matrix.identity(GF(2), 5))
+            array([[1, 0, 0, 0, 0],
+                   [0, 1, 0, 0, 0],
+                   [0, 0, 1, 0, 0],
+                   [0, 0, 0, 1, 0],
+                   [0, 0, 0, 0, 1]], dtype=uint8)
+
+        TESTS:
+
+        Make sure it's reasonably fast (the temporary numpy array is immediately
+        destroyed otherwise it consumes 400MB memory)::
+
+            sage: np.sum(np.array(matrix.identity(GF(2), 2*10^4)))  # around 2s walltime            # needs numpy
+            20000
+        """
+        from ..modules.numpy_util import mzd_matrix_to_numpy
+        return mzd_matrix_to_numpy(<uintptr_t>self._entries, dtype)
+
     ########################################################################
     # LEVEL 2 functionality
     #   * def _pickle
diff --git a/src/sage/matrix/matrix_numpy_dense.pyx b/src/sage/matrix/matrix_numpy_dense.pyx
@@ -366,15 +366,12 @@ cdef class Matrix_numpy_dense(Matrix_dense):
 
     def numpy(self, dtype=None):
         """
-        Return a copy of the matrix as a numpy array.
-
-        It uses the numpy C/api so is very fast.
+        Return the Numpy matrix associated to this matrix.
 
         INPUT:
 
         - ``dtype`` -- the desired data-type for the array. If not given,
-          then the type will be determined as the minimum type required
-          to hold the objects in the sequence.
+          then the type will be determined automatically.
 
         EXAMPLES::
 
@@ -424,12 +421,71 @@ cdef class Matrix_numpy_dense(Matrix_dense):
             []
             sage: m.numpy()
             array([], shape=(5, 0), dtype=float64)
+
+        Test that a copy is always made::
+
+            sage: import numpy as np
+            sage: m = matrix(RDF,2); m
+            [0.0 0.0]
+            [0.0 0.0]
+            sage: m[0,0]=1
+            sage: n=m.numpy()  # should copy
+            sage: m[0,0]=2
+            sage: n
+            array([[1., 0.],
+                   [0., 0.]])
+            sage: n=numpy.array(m)  # should copy
+            sage: m[0,0]=3
+            sage: n
+            array([[2., 0.],
+                   [0., 0.]])
+            sage: n=numpy.asarray(m)  # should still copy
+            sage: m[0,0]=4
+            sage: n
+            array([[3., 0.],
+                   [0., 0.]])
+            sage: n=numpy.asarray(m, dtype=numpy.int64)  # should copy
+            sage: m[0,0]=5
+            sage: n
+            array([[4, 0],
+                   [0, 0]])
+
+        ::
+
+            sage: import numpy as np
+            sage: a = matrix(RDF, 3, range(12))
+            sage: if np.lib.NumpyVersion(np.__version__) >= '2.0.0':
+            ....:     try:
+            ....:         np.array(a, copy=False)  # in numpy 2.0, this raises an error
+            ....:     except ValueError:
+            ....:         pass
+            ....:     else:
+            ....:         assert False
+            ....: else:
+            ....:     b = np.array(a, copy=False)  # in numpy 1.26, this means "avoid copy if possible"
+            ....:     # https://numpy.org/doc/1.26/reference/generated/numpy.array.html#numpy.array
+            ....:     # but no-copy is not supported so it will copy anyway
+            ....:     a[0,0] = 1
+            ....:     assert b[0,0] == 0
+            ....:     b = np.asarray(a)
+            ....:     a[0,0] = 2
+            ....:     assert b[0,0] == 1
+
+        Make sure it's reasonably fast (the temporary numpy array is immediately
+        destroyed otherwise it consumes 200MB memory)::
+
+            sage: import numpy as np
+            sage: np.sum(np.array(matrix.identity(RDF, 5*10^3))).item()  # around 2s each
+            5000.0
+            sage: np.sum(np.asarray(matrix.identity(RDF, 5*10^3))).item()
+            5000.0
+            sage: np.sum(np.asarray(matrix.identity(RDF, 5*10^3), dtype=np.uint8)).item()
+            5000
+            sage: np.sum(np.array(matrix.identity(CDF, 3*10^3))).item()
+            (3000+0j)
         """
         import numpy as np
-        if dtype is None or self._numpy_dtype == np.dtype(dtype):
-            return self._matrix_numpy.copy()
-        else:
-            return Matrix_dense.numpy(self, dtype=dtype)
+        return np.array(self._matrix_numpy, dtype=dtype)
 
     def _replace_self_with_numpy(self, numpy_matrix):
         """
diff --git a/src/sage/modules/numpy_util.pyx b/src/sage/modules/numpy_util.pyx
@@ -133,3 +133,28 @@ cpdef int set_matrix_mod2_from_numpy(Matrix_mod2_dense a, b) except -1:
         return (<object>_set_matrix_mod2_from_numpy_helper)(a, b)  # https://github.com/cython/cython/issues/6588
     except TypeError:
         return False
+
+
+cpdef object mzd_matrix_to_numpy(uintptr_t entries_addr, object dtype):
+    """
+    Convert ``<mzd_t*>entries_addr`` to a numpy array.
+
+    INPUT:
+
+    - ``entries_addr`` -- must be a ``mzd_t*`` casted to ``uintptr_t``; the casting
+      is necessary to pass it through Python boundary because of lazy import.
+      Do not pass arbitrary integer value here, will crash the program.
+
+    - ``dtype`` -- numpy dtype. If ``None``, the result will have some convenient dtype.
+
+    OUTPUT: a 2-dimensional array.
+    """
+    if dtype is not None:
+        return mzd_matrix_to_numpy(entries_addr, None).astype(dtype)
+    cdef mzd_t* entries = <mzd_t*>entries_addr
+    cdef np.ndarray[np.uint8_t, ndim=2] result = np.empty((entries.nrows, entries.ncols), dtype=np.uint8)
+    cdef Py_ssize_t i, j
+    for i in range(entries.nrows):
+        for j in range(entries.ncols):
+            result[i, j] = mzd_read_bit(entries, i, j)
+    return result