Skip to content
120 changes: 43 additions & 77 deletions choice_learn/data/choice_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Main classes to handle assortment data."""

import logging
import re

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -884,11 +885,9 @@ def from_single_wide_df(
df,
items_id,
shared_features_columns=None,
items_features_suffixes=None,
items_features_prefixes=None,
available_items_suffix=None,
available_items_prefix=None,
delimiter="_",
items_features_patterns=None,
available_items_pattern=None,
patterns_ignore_chars="[^a-zA-Z0-9]",
choices_column="choice",
choice_format="items_id",
):
Expand All @@ -902,21 +901,19 @@ def from_single_wide_df(
List of items ids
shared_features_columns : list, optional
List of columns of the dataframe that are shared_features_by_choice, default is None
items_features_prefixes : list, optional
Prefixes of the columns of the dataframe that are items_features_by_choice,
items_features_patterns : list of str, optional
Patterns of the columns of the dataframe that are items_features_by_choice,
given as "*suffix" or "prefix*" where "*" is replaced by items_id in df columns.
It is possible to specify characters to be ignored by including them between [^ and ].
default is None
items_features_suffixes : list, optional
Suffixes of the columns of the dataframe that are items_features_by_choice,
available_items_pattern: str or list of str, optional
Pattern of the columns of the dataframe that are available_items_by_choice,
given as "*suffix" or "prefix*" where "*" is replaced by items_id in df columns.
A list of column names can also be given, one per item and in the same order as items_id.
default is None
available_items_prefix: str, optional
Prefix of the columns of the dataframe that precise available_items_by_choice,
default is None
available_items_suffix: str, optional
Suffix of the columns of the dataframe that precise available_items_by_choice,
default is None
delimiter: str, optional
Delimiter used to separate the given prefix or suffixes and the features names,
default is "_"
patterns_ignore_chars: str or list, optional
Characters to be ignored in the patterns matching, given as a regex string
(e.g. "[^a-zA-Z0-9_]") or as a list of characters (e.g. [" ", "-", "/"]),
default is "[^a-zA-Z0-9]"
choices_column: str, optional
Name of the column containing the choices, default is "choice"
choice_format: str, optional
Expand All @@ -928,11 +925,6 @@ def from_single_wide_df(
ChoiceDataset
corresponding ChoiceDataset
"""
if available_items_prefix is not None and available_items_suffix is not None:
raise ValueError(
"You cannot give both available_items_prefix and\
available_items_suffix."
)
if choice_format not in ["items_index", "items_id"]:
logging.warning("choice_format not understood, defaulting to 'items_index'")

Expand All @@ -943,43 +935,12 @@ def from_single_wide_df(
shared_features_by_choice = None
shared_features_by_choice_names = None

if items_features_suffixes is not None and items_features_prefixes is not None:
# The list of features names is the concatenation of the two lists of
# prefixes and suffixes
items_features_names = items_features_prefixes + items_features_suffixes
items_features_by_choice = []
for item in items_id:
columns = [f"{feature}{delimiter}{item}" for feature in items_features_prefixes] + [
f"{item}{delimiter}{feature}" for feature in items_features_suffixes
]
for col in columns:
if col not in df.columns:
logging.warning(
f"Column {col} was not in DataFrame,\
dummy creation of the feature with zeros."
)
df[col] = 0
items_features_by_choice.append(df[columns].to_numpy())
items_features_by_choice = np.stack(items_features_by_choice, axis=1)
elif items_features_suffixes is not None:
items_features_names = items_features_suffixes
items_features_by_choice = []
for item in items_id:
columns = [f"{item}{delimiter}{feature}" for feature in items_features_suffixes]
for col in columns:
if col not in df.columns:
logging.warning(
f"Column {col} was not in DataFrame,\
dummy creation of the feature with zeros."
)
df[col] = 0
items_features_by_choice.append(df[columns].to_numpy())
items_features_by_choice = np.stack(items_features_by_choice, axis=1)
elif items_features_prefixes is not None:
items_features_names = items_features_prefixes
if items_features_patterns is not None:
if not all(["*" in pattern for pattern in items_features_patterns]):
raise ValueError("items_features_patterns should all contain '*' character.")
items_features_by_choice = []
for item in items_id:
columns = [f"{feature}{delimiter}{item}" for feature in items_features_prefixes]
columns = [feature.replace("*", item) for feature in items_features_patterns]
for col in columns:
if col not in df.columns:
logging.warning(
Expand All @@ -989,35 +950,40 @@ def from_single_wide_df(
df[col] = 0
items_features_by_choice.append(df[columns].to_numpy())
items_features_by_choice = np.stack(items_features_by_choice, axis=1)
items_features_names = [
features.replace("*", "") for features in items_features_patterns
]
if isinstance(patterns_ignore_chars, list):
for char in patterns_ignore_chars:
items_features_names = [name.replace(char, "") for name in items_features_names]
elif isinstance(patterns_ignore_chars, str):
regex = re.compile(patterns_ignore_chars)
items_features_names = [regex.sub("", name) for name in items_features_names]
print(">>>", items_features_names)
elif items_features_patterns is not None:
raise ValueError(
f"""patterns_ignore_chars should either be a list of characters,
a regex string or None, got {type(patterns_ignore_chars)}"""
)
else:
items_features_by_choice = None
items_features_names = None

if available_items_suffix is not None:
if isinstance(available_items_suffix, list):
if not len(available_items_suffix) == len(items_id):
raise ValueError(
"You have given a list of columns for availabilities."
"We consider that it is one for each item however lenghts do not match"
)
logging.info("You have given a list of columns for availabilities.")
logging.info("Each column will be matched to an item, given their order")
available_items_by_choice = df[available_items_suffix].to_numpy()
else:
columns = [f"{item}{delimiter}{available_items_suffix}" for item in items_id]
available_items_by_choice = df[columns].to_numpy()
elif available_items_prefix is not None:
if isinstance(available_items_prefix, list):
if not len(available_items_prefix) == len(items_id):
if available_items_pattern is not None:
if isinstance(available_items_pattern, list):
if not len(available_items_pattern) == len(items_id):
raise ValueError(
"You have given a list of columns for availabilities."
"We consider that it is one for each item however lenghts do not match"
"We consider that it is one for each item however lengths do not match"
)
logging.info("You have given a list of columns for availabilities.")
logging.info("Each column will be matched to an item, given their order")
available_items_by_choice = df[available_items_prefix].to_numpy()
available_items_by_choice = df[available_items_pattern].to_numpy()
else:
columns = [f"{available_items_prefix}{delimiter}{item}" for item in items_id]
if "*" not in available_items_pattern:
raise ValueError("available_items_pattern should contain '*' character.")
columns = [available_items_pattern.replace("*", item) for item in items_id]
print(">>>", columns, available_items_pattern, items_id)
available_items_by_choice = df[columns].to_numpy()
else:
available_items_by_choice = None
Expand Down
Loading