Skip to content

Commit 798daf7 — "updated setup.py" (1 parent: 68a51a1)

File tree

5 files changed: +77 additions, −38 deletions

setup.py

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,81 @@
1-
from setuptools import setup
1+
from setuptools import setup, Extension
22
import pathlib
3+
import os
34

45
# The directory containing this file
56
HERE = pathlib.Path(__file__).parent
67

78
# The text of the README file
89
README = (HERE / "README.md").read_text()
910

11+
# workaround for numpy and Cython install dependency
12+
# the solution is from https://stackoverflow.com/a/54138355
13+
def my_build_ext(pars):
14+
# import delayed:
15+
from setuptools.command.build_ext import build_ext as _build_ext
16+
class build_ext(_build_ext):
17+
def finalize_options(self):
18+
_build_ext.finalize_options(self)
19+
# Prevent numpy from thinking it is still in its setup process:
20+
__builtins__.__NUMPY_SETUP__ = False
21+
import numpy
22+
self.include_dirs.append(numpy.get_include())
23+
24+
#object returned:
25+
return build_ext(pars)
26+
27+
if os.name == 'nt':
28+
extra_compile_args = ["-Ox"]
29+
else:
30+
extra_compile_args = ['-std=c++0x', '-pthread', '-O3']
31+
32+
original_ext = Extension('sparse_dot_topn.sparse_dot_topn',
33+
sources=['./sparse_dot_topn/sparse_dot_topn.pyx',
34+
'./sparse_dot_topn/sparse_dot_topn_source.cpp'],
35+
extra_compile_args=extra_compile_args,
36+
language='c++')
37+
38+
threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded',
39+
sources=[
40+
'./sparse_dot_topn/sparse_dot_topn_threaded.pyx',
41+
'./sparse_dot_topn/sparse_dot_topn_source.cpp',
42+
'./sparse_dot_topn/sparse_dot_topn_parallel.cpp'],
43+
extra_compile_args=extra_compile_args,
44+
language='c++')
45+
1046
setup(
1147
name='string_grouper',
1248
version='0.4.0',
13-
packages=['string_grouper'],
49+
packages=[
50+
'string_grouper'
51+
, 'string_grouper_utils'
52+
, 'sparse_dot_topn'
53+
],
1454
license='MIT License',
1555
description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. '
1656
'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html',
57+
keywords='cosine-similarity sparse-matrix sparse-graph scipy cython',
1758
author='Chris van den Berg',
1859
long_description=README,
1960
long_description_content_type="text/markdown",
2061
author_email='fake_email@gmail.com',
2162
url='https://github.com/Bergvca/string_grouper',
2263
zip_safe=False,
2364
python_requires='>3.7',
24-
install_requires=['pandas>=0.25.3'
65+
setup_requires=[# Setuptools 18.0 properly handles Cython extensions.
66+
'setuptools>=18.0'
67+
, 'cython>=0.29.15'
68+
, 'numpy'
69+
, 'scipy'
70+
],
71+
install_requires=[# Setuptools 18.0 properly handles Cython extensions.
72+
'setuptools>=18.0'
73+
, 'cython>=0.29.15'
74+
, 'numpy'
2575
, 'scipy'
2676
, 'scikit-learn'
27-
, 'numpy'
28-
, 'sparse_dot_topn>=0.2.6'
29-
]
77+
, 'pandas>=0.25.3'
78+
],
79+
cmdclass={'build_ext': my_build_ext},
80+
ext_modules=[original_ext, threaded_ext]
3081
)

sparse_dot_topn/__init__.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,2 @@
11
# flake8: noqa
2-
import sys
3-
4-
if sys.version_info[0] >= 3:
5-
from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only
6-
else:
7-
from awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only
2+
from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only

sparse_dot_topn/awesome_cossim_topn.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
1-
import sys
21
import numpy as np
32
from scipy.sparse import csr_matrix
43
from scipy.sparse import isspmatrix_csr
54

6-
if sys.version_info[0] >= 3:
7-
from sparse_dot_topn import sparse_dot_topn as ct
8-
from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread
9-
else:
10-
import sparse_dot_topn as ct
11-
import sparse_dot_topn_threaded as ct_thread
5+
from sparse_dot_topn import sparse_dot_topn as ct
6+
from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread
127

138

149
def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1):

string_grouper/string_grouper.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -249,8 +249,8 @@ def fit(self) -> 'StringGrouper':
249249
master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
250250
# Calculate the matches using the cosine similarity
251251
matches = self._build_matches(master_matrix, duplicate_matrix)
252-
if self._duplicates is None:
253-
# the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
252+
if self._duplicates is None and self._max_n_matches < self._true_max_n_matches:
253+
# the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
254254
matches = StringGrouper._symmetrize_matrix(matches)
255255
# build list from matrix
256256
self._matches_list = self._get_matches_list(matches)
@@ -439,16 +439,16 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
439439
'n_jobs': self._config.number_of_processes
440440
}
441441

442-
# if min_similarity <= 0 compute the true maximum number of matches over all strings in master:
443-
if self._config.min_similarity <= 0:
444-
self._true_max_n_matches = awesome_cossim_true_minmax_topn_only(
445-
tf_idf_matrix_1,
446-
tf_idf_matrix_2,
447-
**optional_kwargs
448-
)
449-
# if kwarg max_n_matches was not set then set it now to true value
450-
if self._config.max_n_matches is None:
451-
self._max_n_matches = self._true_max_n_matches
442+
# compute the true maximum number of matches over all strings in master:
443+
self._true_max_n_matches = awesome_cossim_true_minmax_topn_only(
444+
tf_idf_matrix_1,
445+
tf_idf_matrix_2,
446+
**optional_kwargs
447+
)
448+
449+
if self._config.min_similarity <= 0 and self._config.max_n_matches is None:
450+
# if kwarg max_n_matches was not set when min_similarity <= 0 then set it now to its true value
451+
self._max_n_matches = self._true_max_n_matches
452452

453453
return awesome_cossim_topn(
454454
tf_idf_matrix_1, tf_idf_matrix_2,

string_grouper/test/test_string_grouper.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,11 @@
33
import numpy as np
44
from scipy.sparse.csr import csr_matrix
55
from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
6-
DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \
7-
DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
6+
DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
87
StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
9-
match_most_similar, group_similar_strings, match_strings,\
8+
match_most_similar, group_similar_strings, match_strings, \
109
compute_pairwise_similarities
1110
from unittest.mock import patch
12-
from scipy.sparse.csgraph._flow import csr_matrix
1311

1412
def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix:
1513
return A
@@ -383,7 +381,7 @@ def test_get_matches_single(self):
383381
left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
384382
right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
385383
left_index = [0, 0, 1, 2, 3, 3]
386-
right_index = [0, 3, 1, 2, 0, 3]
384+
right_index = [3, 0, 1, 2, 3, 0]
387385
similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
388386
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
389387
'similarity': similarity,
@@ -399,8 +397,8 @@ def test_get_matches_1_series_1_id_series(self):
399397
left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3']
400398
left_index = [0, 0, 1, 2, 3, 3]
401399
right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
402-
right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3']
403-
right_index = [0, 3, 1, 2, 0, 3]
400+
right_side_id = ['A3', 'A0', 'A1', 'A2', 'A3', 'A0']
401+
right_index = [3, 0, 1, 2, 3, 0]
404402
similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
405403
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
406404
'similarity': similarity,

Comments (0)