Skip to content

Commit 9f63435

Browse files
authored
refactor: add structured logging, fix error handling, and validate inputs (#63)
- Add hashprep/utils/logging.py with package-level structured logger
- Replace 8 bare `except Exception` blocks with specific exception types (ValueError, LinAlgError, RuntimeWarning, OSError) and debug logging
- Add input validation in DatasetAnalyzer.__init__:
  - TypeError if input is not a DataFrame
  - ValueError for duplicate column names
  - ValueError if target_col not found in DataFrame
  - TypeError if comparison_df is not a DataFrame
- Add input validation in check_drift() for both DataFrames
1 parent 96eed0e commit 9f63435

File tree

7 files changed

+73
-11
lines changed

7 files changed

+73
-11
lines changed

hashprep/checks/drift.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
from .core import Issue
66
from ..config import DEFAULT_CONFIG
7+
from ..utils.logging import get_logger
8+
9+
_log = get_logger("checks.drift")
710

811
_DRIFT = DEFAULT_CONFIG.drift
912
CRITICAL_P_VALUE = _DRIFT.critical_p_value
@@ -19,6 +22,9 @@ def check_drift(
1922
Check for distribution shift between two datasets.
2023
Uses Kolmogorov-Smirnov test for numeric columns and Chi-square for categorical.
2124
"""
25+
if not isinstance(df_train, pd.DataFrame) or not isinstance(df_test, pd.DataFrame):
26+
raise TypeError("Both df_train and df_test must be pandas DataFrames")
27+
2228
issues = []
2329

2430
issues.extend(_check_numeric_drift(df_train, df_test, threshold))
@@ -132,7 +138,7 @@ def _check_categorical_drift(
132138
quick_fix="Options:\n- Re-train model with recent data.\n- Investigate category distribution changes.\n- Consider rebalancing categories.",
133139
)
134140
)
135-
except (ValueError, RuntimeWarning):
136-
pass
141+
except (ValueError, RuntimeWarning) as e:
142+
_log.debug("Chi-square drift test failed for '%s': %s", col, e)
137143

138144
return issues

hashprep/checks/leakage.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
from scipy.stats import chi2_contingency, f_oneway
44
import numpy as np
55
from ..config import DEFAULT_CONFIG
6+
from ..utils.logging import get_logger
67

78
_LEAK = DEFAULT_CONFIG.leakage
9+
_log = get_logger("checks.leakage")
810

911
def _check_data_leakage(analyzer):
1012
issues = []
@@ -91,7 +93,8 @@ def _check_target_leakage_patterns(analyzer):
9193
quick_fix=quick_fix,
9294
)
9395
)
94-
except Exception:
96+
except (ValueError, np.linalg.LinAlgError) as e:
97+
_log.debug("Chi-square leakage test failed for '%s': %s", col, e)
9598
continue
9699
numeric_cols = analyzer.df.select_dtypes(include="number").drop(
97100
columns=[analyzer.target_col], errors="ignore"
@@ -127,6 +130,7 @@ def _check_target_leakage_patterns(analyzer):
127130
quick_fix=quick_fix,
128131
)
129132
)
130-
except Exception:
133+
except (ValueError, RuntimeWarning) as e:
134+
_log.debug("F-test leakage check failed for '%s': %s", col, e)
131135
continue
132136
return issues

hashprep/checks/missing_values.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
import pandas as pd
44
from collections import defaultdict
55
import numpy as np
6+
from numpy.linalg import LinAlgError
67
from ..config import DEFAULT_CONFIG
8+
from ..utils.logging import get_logger
9+
10+
_log = get_logger("checks.missing_values")
711

812
_THRESHOLDS = DEFAULT_CONFIG.missing_values
913

@@ -117,7 +121,8 @@ def cramers_v(table):
117121
cramers = cramers_v(table)
118122
if p_val < threshold and cramers > _THRESHOLDS.pattern_cramers_v_min:
119123
cat_patterns[col].append((other_col, p_val, cramers))
120-
except Exception:
124+
except (ValueError, LinAlgError) as e:
125+
_log.debug("Chi-square test failed for '%s' vs '%s': %s", col, other_col, e)
121126
continue
122127

123128
for other_col in analyzer.df.select_dtypes(
@@ -140,7 +145,8 @@ def cramers_v(table):
140145

141146
if p_val < threshold and cohens_d > _THRESHOLDS.pattern_cohens_d_min:
142147
num_patterns[col].append((other_col, p_val, cohens_d))
143-
except Exception:
148+
except (ValueError, RuntimeWarning) as e:
149+
_log.debug("Mann-Whitney U test failed for '%s' vs '%s': %s", col, other_col, e)
144150
continue
145151

146152
# Generate grouped issues

hashprep/core/analyzer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ def __init__(
6969
sampling_config: Optional[SamplingConfig] = None,
7070
auto_sample: bool = True,
7171
):
72+
if not isinstance(df, pd.DataFrame):
73+
raise TypeError(f"Expected pandas DataFrame, got {type(df).__name__}")
74+
if df.columns.duplicated().any():
75+
raise ValueError(f"DataFrame has duplicate column names: {list(df.columns[df.columns.duplicated()])}")
76+
if target_col is not None and target_col not in df.columns:
77+
raise ValueError(f"Target column '{target_col}' not found in DataFrame")
78+
if comparison_df is not None and not isinstance(comparison_df, pd.DataFrame):
79+
raise TypeError(f"comparison_df must be a pandas DataFrame, got {type(comparison_df).__name__}")
80+
7281
self.comparison_df = comparison_df
7382
self.target_col = target_col
7483
self.selected_checks = selected_checks

hashprep/reports/markdown.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
from typing import Dict, List
44

55
import pandas as pd
6+
from ..utils.logging import get_logger
7+
8+
_log = get_logger("reports.markdown")
69

710
import hashprep
811

@@ -205,7 +208,8 @@ def generate(self, summary, full=False, output_file=None):
205208
img_f.write(base64.b64decode(plot_data))
206209
rel_path = os.path.join(f"{report_name}_images", img_filename)
207210
content += f"![{plot_name}]({rel_path})\n\n"
208-
except Exception:
211+
except (OSError, ValueError) as e:
212+
_log.warning("Failed to save plot '%s': %s", plot_name, e)
209213
content += f"*(Error saving plot {plot_name})*\n\n"
210214

211215
content += "---\n\n"
@@ -224,8 +228,8 @@ def generate(self, summary, full=False, output_file=None):
224228
img_f.write(base64.b64decode(plot_data))
225229
rel_path = os.path.join(f"{report_name}_images", img_filename)
226230
content += f"![{method} Correlation]({rel_path})\n\n"
227-
except Exception:
228-
pass
231+
except (OSError, ValueError) as e:
232+
_log.warning("Failed to save correlation plot '%s': %s", method, e)
229233

230234
pairs = []
231235
for c1, corrs in num_corr["pearson"].items():

hashprep/summaries/interactions.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import pandas as pd
22
from scipy.stats import chi2_contingency, f_oneway
33
import numpy as np
4+
from ..utils.logging import get_logger
5+
6+
_log = get_logger("summaries.interactions")
47

58

69
def summarize_interactions(df):
@@ -45,7 +48,8 @@ def _compute_categorical_correlations(df):
4548
r, k = table.shape
4649
cramers_v = (phi2 / min(k - 1, r - 1)) ** 0.5
4750
results[f"{c1}__{c2}"] = float(cramers_v)
48-
except Exception:
51+
except (ValueError, np.linalg.LinAlgError) as e:
52+
_log.debug("Categorical correlation failed for '%s' vs '%s': %s", c1, c2, e)
4953
continue
5054
return results
5155

@@ -69,6 +73,7 @@ def _compute_mixed_correlations(df):
6973
"f_stat": float(f_stat),
7074
"p_value": float(p_val),
7175
}
72-
except Exception as e:
76+
except (ValueError, RuntimeWarning) as e:
77+
_log.debug("Mixed correlation failed for '%s' vs '%s': %s", cat, num, e)
7378
mixed_corr[f"{cat}__{num}"] = {"error": str(e)}
7479
return mixed_corr

hashprep/utils/logging.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""Structured logging for HashPrep.
2+
3+
Provides a package-level logger that callers can use via get_logger().
4+
By default, logs at WARNING level so end users don't see noise.
5+
Library consumers can adjust via standard logging configuration.
6+
"""
7+
8+
import logging
9+
10+
LOGGER_NAME = "hashprep"
11+
12+
13+
def get_logger(module_name: str = "") -> logging.Logger:
14+
"""Get a logger scoped to a hashprep submodule.
15+
16+
Args:
17+
module_name: Dot-separated submodule path (e.g. "checks.correlations").
18+
If empty, returns the root hashprep logger.
19+
"""
20+
name = f"{LOGGER_NAME}.{module_name}" if module_name else LOGGER_NAME
21+
return logging.getLogger(name)
22+
23+
24+
# Configure root hashprep logger with NullHandler (library best practice).
25+
# This prevents "No handlers could be found" warnings when hashprep is used
26+
# as a library. End users or the CLI can attach their own handlers.
27+
_root_logger = logging.getLogger(LOGGER_NAME)
28+
_root_logger.addHandler(logging.NullHandler())

0 commit comments

Comments
 (0)