ParticularMiner
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
Lines changed: 56 additions & 50 deletions
@@ -4,6 +4,7 @@
 import multiprocessing
 from sklearn.feature_extraction.text import TfidfVectorizer
 from scipy.sparse.csr import csr_matrix
+from scipy.sparse.lil import lil_matrix
 from scipy.sparse.csgraph import connected_components
 from typing import Tuple, NamedTuple, List, Optional, Union
 from string_grouper_topn import awesome_cossim_topn
@@ -17,24 +18,24 @@
 DEFAULT_IGNORE_CASE: bool = True  # ignores case by default
 DEFAULT_DROP_INDEX: bool = False  # includes index-columns in output
 DEFAULT_REPLACE_NA: bool = False    # when finding the most similar strings, does not replace NaN values in most
-                                    # similar string index-columns with corresponding duplicates-index values
-DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
-                                    # matches appear in the output 
+# similar string index-columns with corresponding duplicates-index values
+DEFAULT_INCLUDE_ZEROES: bool = True  # when the minimum cosine similarity <=0, determines whether zero-similarity
+# matches appear in the output
 GROUP_REP_CENTROID: str = 'centroid'    # Option value to select the string in each group with the largest
-                                        # similarity aggregate as group-representative:
+# similarity aggregate as group-representative:
 GROUP_REP_FIRST: str = 'first'  # Option value to select the first string in each group as group-representative:
-DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
+DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID  # chooses group centroid as group-representative by default
 
 # The following string constants are used by (but aren't [yet] options passed to) StringGrouper
 DEFAULT_COLUMN_NAME: str = 'side'   # used to name non-index columns of the output of StringGrouper.get_matches
-DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
+DEFAULT_ID_NAME: str = 'id'  # used to name id-columns in the output of StringGrouper.get_matches
 LEFT_PREFIX: str = 'left_'  # used to prefix columns on the left of the output of StringGrouper.get_matches
 RIGHT_PREFIX: str = 'right_'    # used to prefix columns on the right of the output of StringGrouper.get_matches
 MOST_SIMILAR_PREFIX: str = 'most_similar_'  # used to prefix columns of the output of
-                                            # StringGrouper._get_nearest_matches
-DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
+# StringGrouper._get_nearest_matches
+DEFAULT_MASTER_NAME: str = 'master'  # used to name non-index column of the output of StringGrouper.get_nearest_matches
 DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}'    # used to name id-column of the output of
-                                                                            # StringGrouper.get_nearest_matches
+# StringGrouper.get_nearest_matches
 GROUP_REP_PREFIX: str = 'group_rep_'    # used to prefix and name columns of the output of StringGrouper._deduplicate
 
 # High level functions
@@ -147,9 +148,9 @@ class StringGrouperConfig(NamedTuple):
     Defaults to number of cores on a machine - 1.
     :param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case).
     :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to False.
-    :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches 
+    :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
     appear in the output.  Defaults to True.
-    :param replace_na: whether or not to replace NaN values in most similar string index-columns with 
+    :param replace_na: whether or not to replace NaN values in most similar string index-columns with
     corresponding duplicates-index values. Defaults to False.
     :param group_rep: str.  The scheme to select the group-representative.  Default is 'centroid'.
     The other choice is 'first'.
@@ -231,8 +232,8 @@ def __init__(self, master: pd.Series,
         self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams)
         # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches
         self._matches_list: pd.DataFrame = pd.DataFrame()
-        # _true_max_n_matches will contain the true maximum number of matches over all strings in master if 
-        # self._config.min_similarity <= 0 
+        # _true_max_n_matches will contain the true maximum number of matches over all strings in master if
+        # self._config.min_similarity <= 0
         self._true_max_n_matches = None
 
     def n_grams(self, string: str) -> List[str]:
@@ -251,21 +252,21 @@ def n_grams(self, string: str) -> List[str]:
     def fit(self) -> 'StringGrouper':
         """Builds the _matches list which contains string matches indices and similarity"""
         master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
-        
+
         # Calculate the matches using the cosine similarity
         matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix)
-        
+
         if self._duplicates is None:
             # convert to lil format for best efficiency when setting matrix-elements
-            matches = matches.tolil() 
-            # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by 
+            matches = matches.tolil()
+            # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by
             # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results)
             matches = StringGrouper._fix_diagonal(matches)
             if self._max_n_matches < self._true_max_n_matches:
                 # the list of matches must be symmetric! (i.e., if A != B and A matches B; then B matches A)
                 matches = StringGrouper._symmetrize_matrix(matches)
             matches = matches.tocsr()
-        
+
         # build list from matrix
         self._matches_list = self._get_matches_list(matches)
         self.is_build = True
@@ -283,14 +284,14 @@ def dot(self) -> pd.Series:
     @validate_is_fit
     def get_matches(self,
                     ignore_index: Optional[bool] = None,
-                    include_zeroes: Optional[bool]=None) -> pd.DataFrame:
+                    include_zeroes: Optional[bool] = None) -> pd.DataFrame:
         """
         Returns a DataFrame with all the matches and their cosine similarity.
         If optional IDs are used, returned as extra columns with IDs matched to respective data rows
 
-        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to 
+        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to
         self._config.ignore_index.
-        :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches 
+        :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
         appear in the output.  Defaults to self._config.include_zeroes.
         """
         def get_both_sides(master: pd.Series,
@@ -313,18 +314,20 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str):
             else:
                 return data.rename(f"{prefix}{data.name}")
 
-        if ignore_index is None: ignore_index = self._config.ignore_index
-        if include_zeroes is None: include_zeroes = self._config.include_zeroes
+        if ignore_index is None:
+            ignore_index = self._config.ignore_index
+        if include_zeroes is None:
+            include_zeroes = self._config.include_zeroes
         if self._config.min_similarity > 0 or not include_zeroes:
             matches_list = self._matches_list
         elif include_zeroes:
             # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic):
-            # the fix includes zero-similarity matches that are missing by default 
-            # in _matches_list due to our use of sparse matrices 
+            # the fix includes zero-similarity matches that are missing by default
+            # in _matches_list due to our use of sparse matrices
             non_matches_list = self._get_non_matches_list()
             matches_list = self._matches_list if non_matches_list.empty else \
                 pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True)
-            
+
         left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index)
         similarity = matches_list.similarity.reset_index(drop=True)
         if self._master_id is None:
@@ -366,16 +369,18 @@ def get_groups(self,
          If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs
          above are returned as well altogether in a DataFrame.
 
-        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to 
+        :param ignore_index: whether or not to exclude string Series index-columns in output.  Defaults to
         self._config.ignore_index.
-        :param replace_na: whether or not to replace NaN values in most similar string index-columns with 
+        :param replace_na: whether or not to replace NaN values in most similar string index-columns with
         corresponding duplicates-index values. Defaults to self._config.replace_na.
          """
-        if ignore_index is None: ignore_index = self._config.ignore_index
+        if ignore_index is None:
+            ignore_index = self._config.ignore_index
         if self._duplicates is None:
             return self._deduplicate(ignore_index=ignore_index)
         else:
-            if replace_na is None: replace_na = self._config.replace_na
+            if replace_na is None:
+                replace_na = self._config.replace_na
             return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na)
 
     @validate_is_fit
@@ -445,7 +450,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
         """Builds the cossine similarity matrix of two csr matrices"""
         tf_idf_matrix_1 = master_matrix
         tf_idf_matrix_2 = duplicate_matrix.transpose()
-        
+
         optional_kwargs = {
             'return_best_ntop': True,
             'use_threads': self._config.number_of_processes > 1,
@@ -465,7 +470,8 @@ def _get_non_matches_list(self) -> pd.DataFrame:
         all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side'])
         matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']])
         missing_pairs = all_pairs.difference(matched_pairs)
-        if missing_pairs.empty: return pd.DataFrame()
+        if missing_pairs.empty:
+            return pd.DataFrame()
         if (self._max_n_matches < self._true_max_n_matches):
             raise Exception(f'\nERROR: Cannot return zero-similarity matches since \n'
                             f'\t\t max_n_matches={self._max_n_matches} is too small!\n'
@@ -483,8 +489,8 @@ def _get_nearest_matches(self,
         master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}'
         master = self._master.rename(master_label).reset_index(drop=ignore_index)
         dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index)
-        
-        # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging 
+
+        # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
         if isinstance(dupes, pd.DataFrame):
             master.rename(
                 columns={col: f'{prefix}{col}' for col in master.columns if str(col) != master_label},
@@ -514,14 +520,14 @@ def _get_nearest_matches(self,
         if self._master_id is not None:
             # Also update the master_id-series with the duplicates_id in cases were there is no match
             dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id
-            
+
             # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values
             # appear within them. So here we change them back to their original datatypes if possible:
             if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \
-                self._duplicates_id.dtype == self._master_id.dtype:
+                    self._duplicates_id.dtype == self._master_id.dtype:
                 dupes_max_sim.loc[:, master_id_label] = \
-                dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
-            
+                    dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
+
         # Prepare the output:
         required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label]
         index_column_list = \
@@ -531,13 +537,13 @@ def _get_nearest_matches(self,
             # Update the master index-columns with the duplicates index-column values in cases were there is no match
             dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates']
             dupes_max_sim.loc[rows_to_update, index_column_list] = \
-            dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
-            
+                dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
+
             # Restore their original datatypes if possible:
             for m, d in zip(index_column_list, dupes_index_columns):
                 if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype:
                     dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype)
-                    
+
         # Make sure to keep same order as duplicates
         dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side')
         output = dupes_max_sim[index_column_list + required_column_list]
@@ -608,7 +614,7 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
         master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True)
         dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
         return master_indices, dupe_indices
-    
+
     def _validate_group_rep_specs(self):
         group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID)
         if self._config.group_rep not in group_rep_options:
@@ -626,16 +632,16 @@ def _validate_replace_na_and_drop(self):
             )
 
     @staticmethod
-    def _fix_diagonal(A) -> csr_matrix:
-        r = np.arange(A.shape[0])
-        A[r, r] = 1
-        return A
+    def _fix_diagonal(m: lil_matrix) -> lil_matrix:
+        r = np.arange(m.shape[0])  # 0 .. n_rows-1: row == column index of each diagonal entry
+        m[r, r] = 1  # overwrite every diagonal entry, in place, with exactly 1
+        return m  # the same (mutated) matrix object that was passed in, not a CSR copy
 
     @staticmethod
-    def _symmetrize_matrix(A) -> csr_matrix:
-        r, c = A.nonzero()
-        A[c, r] = A[r, c]
-        return A
+    def _symmetrize_matrix(m_symmetric: lil_matrix) -> lil_matrix:
+        r, c = m_symmetric.nonzero()  # row and column indices of the non-zero entries
+        m_symmetric[c, r] = m_symmetric[r, c]  # copy each entry to its transposed position, in place
+        return m_symmetric  # the same (mutated) matrix object that was passed in, not a CSR copy
 
     @staticmethod
     def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: