Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rev fr rules #32

Merged
merged 15 commits into from
Feb 21, 2024
Previous commit
Next commit
Update search_columns.py
  • Loading branch information
mfebrizio committed Feb 15, 2024
commit 64c02b635c53515fb72bff2e3517011ddd3b689b
30 changes: 14 additions & 16 deletions data/fr_rules/code/search_columns.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
# import dependencies
import itertools
import re
import numpy as np

from numpy import array
from pandas import DataFrame


# Defining a function to search for string patterns within dataframe columns
def search_columns(df,
def search_columns(df: DataFrame,
patterns: list,
columns: list,
return_as: str = "indicator_column",
return_column: str = "indicator",
re_flags = re.I|re.X):
re_flags = re.I | re.X):
"""Search columns for string patterns within dataframe columns.

Args:
df (DataFrame): Input data in format of pandas dataframe.
patterns (list): List of string patterns to input, compatible with regex.
columns (list): List of column names to search for input patterns.
return_as (str, optional): Choose whether to return a DataFrame with indicator column ("indicator_column") or a DataFrame filtered by the search terms ("filtered_df"). Defaults to "indicator_column".
return_as (str, optional): Return a DataFrame with indicator column ("indicator_column") or filtered by the search terms ("filtered_df"). Defaults to "indicator_column".
re_flags (optional): Regex flags to use. Defaults to re.I | re.X.

Raises:
Expand All @@ -32,9 +33,7 @@ def search_columns(df,
bool_list = []

# ensure that input patterns and columns are formatted as lists
if type(patterns) == list and type(columns) == list:
pass
else:
if not (isinstance(patterns, list) and isinstance(columns, list)):
raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')

if len(patterns) == len(columns):
Expand All @@ -44,7 +43,7 @@ def search_columns(df,
# loop over list of inputs
for i in inputs:
searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
searchbool = np.array([True if n is True else False for n in searchre])
searchbool = array([True if n is True else False for n in searchre])
bool_list.append(searchbool)

elif (len(patterns) == 1) and (len(patterns) != len(columns)):
Expand All @@ -54,30 +53,29 @@ def search_columns(df,
# loop over list of inputs
for i in inputs:
searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
searchbool = np.array([True if n is True else False for n in searchre])
searchbool = array([True if n is True else False for n in searchre])
bool_list.append(searchbool)

else: # e.g., patterns has more than one element but its length does not match len(columns)
raise ValueError("Length of inputs are incorrect. Lengths of 'patterns' and 'columns' must match or a single pattern can map to multiple columns.")
raise ValueError("Length of inputs are incorrect. Lengths of 'patterns' and 'columns' can either match or a single pattern can map to multiple columns.")

# combine each "searchbool" array elementwise
# we want a positive match for any column to evaluate as True
# equivalent to (bool_list[0] | bool_list[1] | bool_list[2] | ... | bool_list[n-1])
filter_bool = np.array(bool_list).any(axis=0)
filter_bool = array(bool_list).any(axis=0)

if return_as == "indicator_column":
dfResults = df.copy(deep=True)
dfResults.loc[:, return_column] = 0
dfResults.loc[filter_bool, return_column] = 1
print(f"Count {return_column}: {sum(dfResults[return_column].values)}")
#print(f"Count {return_column}: {sum(dfResults[return_column].values)}")
return dfResults

elif return_as == "filtered_df":
# filter results
dfResults = df.loc[filter_bool,:].copy(deep=True)
print(f"Count {return_column}: {len(dfResults)}")
dfResults = df.loc[filter_bool, :].copy(deep=True)
#print(f"Count {return_column}: {len(dfResults)}")
return dfResults

else:
raise ValueError("Incorrect input for 'return_as' parameter.")