Skip to content

Commit

Permalink
Solved bug in column name to column signature mapping for categorical columns. Solved deprecation warnings.
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-bogatu committed Nov 18, 2021
1 parent 8d74152 commit ce3874b
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def transform(self, input_values: Iterable[str]) -> Set[str]:
except ValueError:
return set()

weight_map = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
weight_map = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
tokenset = set()
tokenizer = vectorizer.build_tokenizer()
for value in input_values:
Expand Down
20 changes: 10 additions & 10 deletions d3l/indexing/similarity_indexes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ def create_index(self) -> LSHIndex:
lsh_index.add(input_id=str(table), input_set=table_signature)
column_data = self.dataloader.get_columns(table_name=table)

column_signatures = [self.transformer.transform(c) for c in column_data]
for c, signature in zip(column_data, column_signatures):
column_signatures = [(c, self.transformer.transform(c)) for c in column_data]
for c, signature in column_signatures:
if len(signature) > 0:
lsh_index.add(input_id=str(table) + "." + str(c), input_set=signature)

Expand Down Expand Up @@ -249,11 +249,11 @@ def create_index(self) -> LSHIndex:
table_data = self.dataloader.read_table(table_name=table)

column_signatures = [
self.transformer.transform(table_data[c].tolist())
(c, self.transformer.transform(table_data[c].tolist()))
for c in table_data.columns
if not is_numeric(table_data[c]) and table_data[c].count() > 0
]
for c, signature in zip(table_data.columns.tolist(), column_signatures):
for c, signature in column_signatures:
if len(signature) > 0:
lsh_index.add(input_id=str(table) + "." + str(c), input_set=signature)
return lsh_index
Expand Down Expand Up @@ -374,11 +374,11 @@ def create_index(self) -> LSHIndex:
table_data = self.dataloader.read_table(table_name=table)

column_signatures = [
self.transformer.transform(table_data[c].tolist())
(c, self.transformer.transform(table_data[c].tolist()))
for c in table_data.columns
if not is_numeric(table_data[c]) and table_data[c].count() > 0
]
for c, signature in zip(table_data.columns.tolist(), column_signatures):
for c, signature in column_signatures:
if len(signature) > 0:
lsh_index.add(input_id=str(table) + "." + str(c), input_set=signature)

Expand Down Expand Up @@ -519,11 +519,11 @@ def create_index(self) -> LSHIndex:
table_data = self.dataloader.read_table(table_name=table)

column_signatures = [
self.transformer.transform(table_data[c].tolist())
(c, self.transformer.transform(table_data[c].tolist()))
for c in table_data.columns
if not is_numeric(table_data[c]) and table_data[c].count() > 0
]
for c, signature in zip(table_data.columns.tolist(), column_signatures):
for c, signature in column_signatures:
if len(signature) > 0:
lsh_index.add(input_id=str(table) + "." + str(c), input_set=signature)

Expand Down Expand Up @@ -641,11 +641,11 @@ def create_index(self) -> LSHIndex:
table_data = self.dataloader.read_table(table_name=table)

column_signatures = [
self.transformer.transform(table_data[c].tolist())
(c, self.transformer.transform(table_data[c].tolist()))
for c in table_data.columns
if is_numeric(table_data[c]) and table_data[c].count() > 0
]
for c, signature in zip(table_data.columns.tolist(), column_signatures):
for c, signature in column_signatures:
if len(signature) > 0:
lsh_index.add(input_id=str(table) + "." + str(c), input_set=signature)

Expand Down
8 changes: 4 additions & 4 deletions d3l/input_output/dataloaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,15 +491,15 @@ def read_table(
usecols=table_columns,
chunksize=chunk_size,
low_memory=False,
error_bad_lines=False,
warn_bad_lines=False,
# error_bad_lines=False, # Deprecated in future versions
# warn_bad_lines=False, # Deprecated in future versions
**self.loading_kwargs
)
return pd.read_csv(
file_path,
chunksize=chunk_size,
low_memory=False,
error_bad_lines=False,
warn_bad_lines=False,
# error_bad_lines=False, # Deprecated in future versions
# warn_bad_lines=False, # Deprecated in future versions
**self.loading_kwargs
)

0 comments on commit ce3874b

Please sign in to comment.