Skip to content

Commit 798daf7 — "updated setup.py" (1 parent: 68a51a1)

File tree

5 files changed: +77 additions, −38 deletions

setup.py

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,81 @@
1-
from setuptools import setup
1+
from setuptools import setup, Extension
22
import pathlib
3+
import os
34

45
# The directory containing this file
56
HERE = pathlib.Path(__file__).parent
67

78
# The text of the README file
89
README = (HERE / "README.md").read_text()
910

11+
# workaround for numpy and Cython install dependency
12+
# the solution is from https://stackoverflow.com/a/54138355
13+
def my_build_ext(pars):
14+
# import delayed:
15+
from setuptools.command.build_ext import build_ext as _build_ext
16+
class build_ext(_build_ext):
17+
def finalize_options(self):
18+
_build_ext.finalize_options(self)
19+
# Prevent numpy from thinking it is still in its setup process:
20+
__builtins__.__NUMPY_SETUP__ = False
21+
import numpy
22+
self.include_dirs.append(numpy.get_include())
23+
24+
#object returned:
25+
return build_ext(pars)
26+
27+
if os.name == 'nt':
28+
extra_compile_args = ["-Ox"]
29+
else:
30+
extra_compile_args = ['-std=c++0x', '-pthread', '-O3']
31+
32+
original_ext = Extension('sparse_dot_topn.sparse_dot_topn',
33+
sources=['./sparse_dot_topn/sparse_dot_topn.pyx',
34+
'./sparse_dot_topn/sparse_dot_topn_source.cpp'],
35+
extra_compile_args=extra_compile_args,
36+
language='c++')
37+
38+
threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded',
39+
sources=[
40+
'./sparse_dot_topn/sparse_dot_topn_threaded.pyx',
41+
'./sparse_dot_topn/sparse_dot_topn_source.cpp',
42+
'./sparse_dot_topn/sparse_dot_topn_parallel.cpp'],
43+
extra_compile_args=extra_compile_args,
44+
language='c++')
45+
1046
setup(
1147
name='string_grouper',
1248
version='0.4.0',
13-
packages=['string_grouper'],
49+
packages=[
50+
'string_grouper'
51+
, 'string_grouper_utils'
52+
, 'sparse_dot_topn'
53+
],
1454
license='MIT License',
1555
description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. '
1656
'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html',
57+
keywords='cosine-similarity sparse-matrix sparse-graph scipy cython',
1758
author='Chris van den Berg',
1859
long_description=README,
1960
long_description_content_type="text/markdown",
2061
author_email='fake_email@gmail.com',
2162
url='https://github.com/Bergvca/string_grouper',
2263
zip_safe=False,
2364
python_requires='>3.7',
24-
install_requires=['pandas>=0.25.3'
65+
setup_requires=[# Setuptools 18.0 properly handles Cython extensions.
66+
'setuptools>=18.0'
67+
, 'cython>=0.29.15'
68+
, 'numpy'
69+
, 'scipy'
70+
],
71+
install_requires=[# Setuptools 18.0 properly handles Cython extensions.
72+
'setuptools>=18.0'
73+
, 'cython>=0.29.15'
74+
, 'numpy'
2575
, 'scipy'
2676
, 'scikit-learn'
27-
, 'numpy'
28-
, 'sparse_dot_topn>=0.2.6'
29-
]
77+
, 'pandas>=0.25.3'
78+
],
79+
cmdclass={'build_ext': my_build_ext},
80+
ext_modules=[original_ext, threaded_ext]
3081
)

sparse_dot_topn/__init__.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,2 @@
11
# flake8: noqa
2-
import sys
3-
4-
if sys.version_info[0] >= 3:
5-
from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only
6-
else:
7-
from awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only
2+
from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only

sparse_dot_topn/awesome_cossim_topn.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
1-
import sys
21
import numpy as np
32
from scipy.sparse import csr_matrix
43
from scipy.sparse import isspmatrix_csr
54

6-
if sys.version_info[0] >= 3:
7-
from sparse_dot_topn import sparse_dot_topn as ct
8-
from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread
9-
else:
10-
import sparse_dot_topn as ct
11-
import sparse_dot_topn_threaded as ct_thread
5+
from sparse_dot_topn import sparse_dot_topn as ct
6+
from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread
127

138

149
def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1):

string_grouper/string_grouper.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -249,8 +249,8 @@ def fit(self) -> 'StringGrouper':
249249
master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
250250
# Calculate the matches using the cosine similarity
251251
matches = self._build_matches(master_matrix, duplicate_matrix)
252-
if self._duplicates is None:
253-
# the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
252+
if self._duplicates is None and self._max_n_matches < self._true_max_n_matches:
253+
# the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
254254
matches = StringGrouper._symmetrize_matrix(matches)
255255
# build list from matrix
256256
self._matches_list = self._get_matches_list(matches)
@@ -439,16 +439,16 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
439439
'n_jobs': self._config.number_of_processes
440440
}
441441

442-
# if min_similarity <= 0 compute the true maximum number of matches over all strings in master:
443-
if self._config.min_similarity <= 0:
444-
self._true_max_n_matches = awesome_cossim_true_minmax_topn_only(
445-
tf_idf_matrix_1,
446-
tf_idf_matrix_2,
447-
**optional_kwargs
448-
)
449-
# if kwarg max_n_matches was not set then set it now to true value
450-
if self._config.max_n_matches is None:
451-
self._max_n_matches = self._true_max_n_matches
442+
# compute the true maximum number of matches over all strings in master:
443+
self._true_max_n_matches = awesome_cossim_true_minmax_topn_only(
444+
tf_idf_matrix_1,
445+
tf_idf_matrix_2,
446+
**optional_kwargs
447+
)
448+
449+
if self._config.min_similarity <= 0 and self._config.max_n_matches is None:
450+
# if kwarg max_n_matches was not set when min_similarity <= 0 then set it now to its true value
451+
self._max_n_matches = self._true_max_n_matches
452452

453453
return awesome_cossim_topn(
454454
tf_idf_matrix_1, tf_idf_matrix_2,

string_grouper/test/test_string_grouper.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,11 @@
33
import numpy as np
44
from scipy.sparse.csr import csr_matrix
55
from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
6-
DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \
7-
DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
6+
DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
87
StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
9-
match_most_similar, group_similar_strings, match_strings,\
8+
match_most_similar, group_similar_strings, match_strings, \
109
compute_pairwise_similarities
1110
from unittest.mock import patch
12-
from scipy.sparse.csgraph._flow import csr_matrix
1311

1412
def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix:
1513
return A
@@ -383,7 +381,7 @@ def test_get_matches_single(self):
383381
left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
384382
right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
385383
left_index = [0, 0, 1, 2, 3, 3]
386-
right_index = [0, 3, 1, 2, 0, 3]
384+
right_index = [3, 0, 1, 2, 3, 0]
387385
similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
388386
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
389387
'similarity': similarity,
@@ -399,8 +397,8 @@ def test_get_matches_1_series_1_id_series(self):
399397
left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3']
400398
left_index = [0, 0, 1, 2, 3, 3]
401399
right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
402-
right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3']
403-
right_index = [0, 3, 1, 2, 0, 3]
400+
right_side_id = ['A3', 'A0', 'A1', 'A2', 'A3', 'A0']
401+
right_index = [3, 0, 1, 2, 3, 0]
404402
similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
405403
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
406404
'similarity': similarity,

Comments (0)