44import multiprocessing
55from sklearn .feature_extraction .text import TfidfVectorizer
66from scipy .sparse .csr import csr_matrix
7+ from scipy .sparse .lil import lil_matrix
78from scipy .sparse .csgraph import connected_components
89from typing import Tuple , NamedTuple , List , Optional , Union
910from string_grouper_topn import awesome_cossim_topn
1718DEFAULT_IGNORE_CASE : bool = True # ignores case by default
1819DEFAULT_DROP_INDEX : bool = False # includes index-columns in output
1920DEFAULT_REPLACE_NA : bool = False # when finding the most similar strings, does not replace NaN values in most
20- # similar string index-columns with corresponding duplicates-index values
21- DEFAULT_INCLUDE_ZEROES : bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
22- # matches appear in the output
21+ # similar string index-columns with corresponding duplicates-index values
22+ DEFAULT_INCLUDE_ZEROES : bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
23+ # matches appear in the output
2324GROUP_REP_CENTROID : str = 'centroid' # Option value to select the string in each group with the largest
24- # similarity aggregate as group-representative:
25+ # similarity aggregate as group-representative:
2526GROUP_REP_FIRST : str = 'first' # Option value to select the first string in each group as group-representative:
26- DEFAULT_GROUP_REP : str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
27+ DEFAULT_GROUP_REP : str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
2728
2829# The following string constants are used by (but aren't [yet] options passed to) StringGrouper
2930DEFAULT_COLUMN_NAME : str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches
30- DEFAULT_ID_NAME : str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
31+ DEFAULT_ID_NAME : str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
3132LEFT_PREFIX : str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches
3233RIGHT_PREFIX : str = 'right_' # used to prefix columns on the right of the output of StringGrouper.get_matches
3334MOST_SIMILAR_PREFIX : str = 'most_similar_' # used to prefix columns of the output of
34- # StringGrouper._get_nearest_matches
35- DEFAULT_MASTER_NAME : str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
35+ # StringGrouper._get_nearest_matches
36+ DEFAULT_MASTER_NAME : str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
3637DEFAULT_MASTER_ID_NAME : str = f'{ DEFAULT_MASTER_NAME } _{ DEFAULT_ID_NAME } ' # used to name id-column of the output of
37- # StringGrouper.get_nearest_matches
38+ # StringGrouper.get_nearest_matches
3839GROUP_REP_PREFIX : str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate
3940
4041# High level functions
@@ -147,9 +148,9 @@ class StringGrouperConfig(NamedTuple):
147148 Defaults to number of cores on a machine - 1.
148149 :param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case).
149150 :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False.
150- :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
151+ :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
151152 appear in the output. Defaults to True.
152- :param replace_na: whether or not to replace NaN values in most similar string index-columns with
153+ :param replace_na: whether or not to replace NaN values in most similar string index-columns with
153154 corresponding duplicates-index values. Defaults to False.
154155 :param group_rep: str. The scheme to select the group-representative. Default is 'centroid'.
155156 The other choice is 'first'.
@@ -231,8 +232,8 @@ def __init__(self, master: pd.Series,
231232 self ._vectorizer = TfidfVectorizer (min_df = 1 , analyzer = self .n_grams )
232233 # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches
233234 self ._matches_list : pd .DataFrame = pd .DataFrame ()
234- # _true_max_n_matches will contain the true maximum number of matches over all strings in master if
235- # self._config.min_similarity <= 0
235+ # _true_max_n_matches will contain the true maximum number of matches over all strings in master if
236+ # self._config.min_similarity <= 0
236237 self ._true_max_n_matches = None
237238
238239 def n_grams (self , string : str ) -> List [str ]:
@@ -251,21 +252,21 @@ def n_grams(self, string: str) -> List[str]:
251252 def fit (self ) -> 'StringGrouper' :
252253 """Builds the _matches list which contains string matches indices and similarity"""
253254 master_matrix , duplicate_matrix = self ._get_tf_idf_matrices ()
254-
255+
255256 # Calculate the matches using the cosine similarity
256257 matches , self ._true_max_n_matches = self ._build_matches (master_matrix , duplicate_matrix )
257-
258+
258259 if self ._duplicates is None :
259260 # convert to lil format for best efficiency when setting matrix-elements
260- matches = matches .tolil ()
261- # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by
261+ matches = matches .tolil ()
262+ # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by
262263 # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results)
263264 matches = StringGrouper ._fix_diagonal (matches )
264265 if self ._max_n_matches < self ._true_max_n_matches :
265266 # the list of matches must be symmetric! (i.e., if A != B and A matches B; then B matches A)
266267 matches = StringGrouper ._symmetrize_matrix (matches )
267268 matches = matches .tocsr ()
268-
269+
269270 # build list from matrix
270271 self ._matches_list = self ._get_matches_list (matches )
271272 self .is_build = True
@@ -283,14 +284,14 @@ def dot(self) -> pd.Series:
283284 @validate_is_fit
284285 def get_matches (self ,
285286 ignore_index : Optional [bool ] = None ,
286- include_zeroes : Optional [bool ]= None ) -> pd .DataFrame :
287+ include_zeroes : Optional [bool ] = None ) -> pd .DataFrame :
287288 """
288289 Returns a DataFrame with all the matches and their cosine similarity.
289290 If optional IDs are used, returned as extra columns with IDs matched to respective data rows
290291
291- :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
292+ :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
292293 self._config.ignore_index.
293- :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
294+ :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
294295 appear in the output. Defaults to self._config.include_zeroes.
295296 """
296297 def get_both_sides (master : pd .Series ,
@@ -313,18 +314,20 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str):
313314 else :
314315 return data .rename (f"{ prefix } { data .name } " )
315316
316- if ignore_index is None : ignore_index = self ._config .ignore_index
317- if include_zeroes is None : include_zeroes = self ._config .include_zeroes
317+ if ignore_index is None :
318+ ignore_index = self ._config .ignore_index
319+ if include_zeroes is None :
320+ include_zeroes = self ._config .include_zeroes
318321 if self ._config .min_similarity > 0 or not include_zeroes :
319322 matches_list = self ._matches_list
320323 elif include_zeroes :
321324 # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic):
322- # the fix includes zero-similarity matches that are missing by default
323- # in _matches_list due to our use of sparse matrices
325+ # the fix includes zero-similarity matches that are missing by default
326+ # in _matches_list due to our use of sparse matrices
324327 non_matches_list = self ._get_non_matches_list ()
325328 matches_list = self ._matches_list if non_matches_list .empty else \
326329 pd .concat ([self ._matches_list , non_matches_list ], axis = 0 , ignore_index = True )
327-
330+
328331 left_side , right_side = get_both_sides (self ._master , self ._duplicates , drop_index = ignore_index )
329332 similarity = matches_list .similarity .reset_index (drop = True )
330333 if self ._master_id is None :
@@ -366,16 +369,18 @@ def get_groups(self,
366369 If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs
367370 above are returned as well altogether in a DataFrame.
368371
369- :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
372+ :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
370373 self._config.ignore_index.
371- :param replace_na: whether or not to replace NaN values in most similar string index-columns with
374+ :param replace_na: whether or not to replace NaN values in most similar string index-columns with
372375 corresponding duplicates-index values. Defaults to self._config.replace_na.
373376 """
374- if ignore_index is None : ignore_index = self ._config .ignore_index
377+ if ignore_index is None :
378+ ignore_index = self ._config .ignore_index
375379 if self ._duplicates is None :
376380 return self ._deduplicate (ignore_index = ignore_index )
377381 else :
378- if replace_na is None : replace_na = self ._config .replace_na
382+ if replace_na is None :
383+ replace_na = self ._config .replace_na
379384 return self ._get_nearest_matches (ignore_index = ignore_index , replace_na = replace_na )
380385
381386 @validate_is_fit
@@ -445,7 +450,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
445450 """Builds the cosine similarity matrix of two csr matrices"""
446451 tf_idf_matrix_1 = master_matrix
447452 tf_idf_matrix_2 = duplicate_matrix .transpose ()
448-
453+
449454 optional_kwargs = {
450455 'return_best_ntop' : True ,
451456 'use_threads' : self ._config .number_of_processes > 1 ,
@@ -465,7 +470,8 @@ def _get_non_matches_list(self) -> pd.DataFrame:
465470 all_pairs = pd .MultiIndex .from_product ([range (m_sz ), range (d_sz )], names = ['master_side' , 'dupe_side' ])
466471 matched_pairs = pd .MultiIndex .from_frame (self ._matches_list [['master_side' , 'dupe_side' ]])
467472 missing_pairs = all_pairs .difference (matched_pairs )
468- if missing_pairs .empty : return pd .DataFrame ()
473+ if missing_pairs .empty :
474+ return pd .DataFrame ()
469475 if (self ._max_n_matches < self ._true_max_n_matches ):
470476 raise Exception (f'\n ERROR: Cannot return zero-similarity matches since \n '
471477 f'\t \t max_n_matches={ self ._max_n_matches } is too small!\n '
@@ -483,8 +489,8 @@ def _get_nearest_matches(self,
483489 master_label = f'{ prefix } { self ._master .name if self ._master .name else DEFAULT_MASTER_NAME } '
484490 master = self ._master .rename (master_label ).reset_index (drop = ignore_index )
485491 dupes = self ._duplicates .rename ('duplicates' ).reset_index (drop = ignore_index )
486-
487- # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
492+
493+ # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
488494 if isinstance (dupes , pd .DataFrame ):
489495 master .rename (
490496 columns = {col : f'{ prefix } { col } ' for col in master .columns if str (col ) != master_label },
@@ -514,14 +520,14 @@ def _get_nearest_matches(self,
514520 if self ._master_id is not None :
515521 # Also update the master_id-series with the duplicates_id in cases where there is no match
516522 dupes_max_sim .loc [rows_to_update , master_id_label ] = dupes_max_sim [rows_to_update ].duplicates_id
517-
523+
518524 # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values
519525 # appear within them. So here we change them back to their original datatypes if possible:
520526 if dupes_max_sim [master_id_label ].dtype != self ._master_id .dtype and \
521- self ._duplicates_id .dtype == self ._master_id .dtype :
527+ self ._duplicates_id .dtype == self ._master_id .dtype :
522528 dupes_max_sim .loc [:, master_id_label ] = \
523- dupes_max_sim .loc [:, master_id_label ].astype (self ._master_id .dtype )
524-
529+ dupes_max_sim .loc [:, master_id_label ].astype (self ._master_id .dtype )
530+
525531 # Prepare the output:
526532 required_column_list = [master_label ] if self ._master_id is None else [master_id_label , master_label ]
527533 index_column_list = \
@@ -531,13 +537,13 @@ def _get_nearest_matches(self,
531537 # Update the master index-columns with the duplicates index-column values in cases where there is no match
532538 dupes_index_columns = [col for col in dupes .columns if str (col ) != 'duplicates' ]
533539 dupes_max_sim .loc [rows_to_update , index_column_list ] = \
534- dupes_max_sim .loc [rows_to_update , dupes_index_columns ].values
535-
540+ dupes_max_sim .loc [rows_to_update , dupes_index_columns ].values
541+
536542 # Restore their original datatypes if possible:
537543 for m , d in zip (index_column_list , dupes_index_columns ):
538544 if dupes_max_sim [m ].dtype != master [m ].dtype and dupes [d ].dtype == master [m ].dtype :
539545 dupes_max_sim .loc [:, m ] = dupes_max_sim .loc [:, m ].astype (master [m ].dtype )
540-
546+
541547 # Make sure to keep same order as duplicates
542548 dupes_max_sim = dupes_max_sim .sort_values ('dupe_side' ).set_index ('dupe_side' )
543549 output = dupes_max_sim [index_column_list + required_column_list ]
@@ -608,7 +614,7 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
608614 master_indices = master_strings [master_strings == master_side ].index .to_series ().reset_index (drop = True )
609615 dupe_indices = dupe_strings [dupe_strings == dupe_side ].index .to_series ().reset_index (drop = True )
610616 return master_indices , dupe_indices
611-
617+
612618 def _validate_group_rep_specs (self ):
613619 group_rep_options = (GROUP_REP_FIRST , GROUP_REP_CENTROID )
614620 if self ._config .group_rep not in group_rep_options :
@@ -626,16 +632,16 @@ def _validate_replace_na_and_drop(self):
626632 )
627633
628634 @staticmethod
629- def _fix_diagonal (A ) -> csr_matrix :
630- r = np .arange (A .shape [0 ])
631- A [r , r ] = 1
632- return A
635+ def _fix_diagonal (m : lil_matrix ) -> csr_matrix :
636+ r = np .arange (m .shape [0 ])
637+ m [r , r ] = 1
638+ return m
633639
634640 @staticmethod
635- def _symmetrize_matrix (A ) -> csr_matrix :
636- r , c = A .nonzero ()
637- A [c , r ] = A [r , c ]
638- return A
641+ def _symmetrize_matrix (m_symmetric : lil_matrix ) -> csr_matrix :
642+ r , c = m_symmetric .nonzero ()
643+ m_symmetric [c , r ] = m_symmetric [r , c ]
644+ return m_symmetric
639645
640646 @staticmethod
641647 def _get_matches_list (matches : csr_matrix ) -> pd .DataFrame :
0 commit comments