Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 102 additions & 24 deletions src/petab_gui/controllers/table_controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,34 +769,112 @@ def _detect_time_column(self, df) -> str | None:
return c
return None

def _rank_dose_candidates(self, df) -> list[str]:
"""Lightweight ranking of dose-like columns (regex + numeric + cardinality)."""
def _rank_dose_candidates(self, df: pd.DataFrame) -> list[str]:
"""Rank DataFrame columns by likelihood of containing dose/concentration data.

This method implements a lightweight scoring system to identify and rank
columns that are most likely to contain dose, concentration, or drug-related
data. The ranking is based on multiple heuristics including column naming
patterns, data types, value ranges, and statistical properties.

Parameters
----------
df : pd.DataFrame
Input DataFrame containing columns to be evaluated and ranked.
Must contain at least one column with data.

Returns
-------
list[str]
Column names sorted by descending likelihood of containing dose data.
Columns with higher scores appear first. In case of tied scores,
columns with fewer unique values are ranked higher.

Notes
-----
The scoring algorithm considers the following criteria:

- **Name matching** (+2.0 points): Column names containing keywords like
'dose', 'conc', 'concentration', 'drug', 'compound', 'stim', 'input',
or patterns like 'u<digit>' (case-insensitive).

- **Numeric data type** (+1.0 points): Columns with integer or float dtype.

- **Reasonable cardinality** (+0.8 points): Columns with 2-30 unique
non-null values, which is typical for dose series.

- **Non-negative values** (+0.3 points): All values are >= 0 when converted
to numeric (dose/concentration values are typically non-negative).

- **Monotonic tendency** (+0.2 points): At least 70% of consecutive numeric
differences are non-decreasing, indicating potential dose escalation
patterns. Requires at least 5 non-null numeric values.

Raises
------
AttributeError
If df does not have the expected pandas DataFrame interface.

ValueError
If df is empty or contains no valid columns for evaluation.

See Also
--------
pandas.DataFrame.nunique : Count unique values in each column
pandas.to_numeric : Convert argument to numeric type
numpy.diff : Calculate discrete differences along array

Warning
-------
This function uses broad exception handling to ensure robustness when
processing diverse data types. Individual column evaluation errors are
silently ignored to prevent failure on edge cases like mixed data types
or missing values.
"""
# Compile pattern for dose-related column names
patt = re.compile(
r"\b(dose|conc|concentration|drug|compound|stim|input|u\d+)\b",
re.IGNORECASE,
)
scores = {}
# FIXME: https://github.com/PaulJonasJost/PEtab_GUI/issues/159
for col in df.columns: # noqa: B007

scores: dict[str, float] = {}
for col in df.columns:
s = 0.0
if patt.search(col or ""):
s += 2.0
try:
if df[col].dtype.kind in "if":
s += 1.0
uniq = df[col].nunique(dropna=True)
if 2 <= uniq <= 30:
s += 0.8
if np.all(pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0):
s += 0.3
ser = pd.to_numeric(df[col], errors="coerce").dropna()
if len(ser) >= 5:
diffs = np.diff(ser.values)
if np.mean(diffs >= 0) >= 0.7:
s += 0.2
except Exception:
pass
scores[col] = s

# Score based on column name pattern matching
if patt.search(col or ""):
s += 2.0

try:
# Score based on data type (numeric preferred)
if df[col].dtype.kind in "if": # integer or float
s += 1.0

# Score based on reasonable number of unique values
uniq = df[col].nunique(dropna=True)
if 2 <= uniq <= 30: # Reasonable range for dose series?
s += 0.8

# Score based on non-negative values (typical for doses)
if np.all(
pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0
):
s += 0.3

# Score based on monotonic tendency (dose escalation pattern)
ser = pd.to_numeric(df[col], errors="coerce").dropna()
if len(ser) >= 5:
diffs = np.diff(ser.values)
if np.mean(diffs >= 0) >= 0.7: # 70% non-decreasing
s += 0.2

except Exception: # noqa: S110
# Silently handle any data processing errors
pass

scores[col] = s

# Sort by score (descending) then by unique count (ascending) for ties
return [
c
for c, _ in sorted(
Expand All @@ -806,7 +884,7 @@ def _rank_dose_candidates(self, df) -> list[str]:
]

def _resolve_dose_and_time(self, df) -> tuple[str | None, str | None, str]:
"""Open dialog with ranked dose suggestions and time choices (incl. steady state)."""
"""Open dialog with ranked dose suggestions and time choices."""
header_key = str(hash(tuple(df.columns)))
settings = settings_manager.settings
# TODO: rename settings location
Expand Down