Skip to content

Commit 9f63435

Browse files
authored
refactor: add structured logging, fix error handling, and validate inputs (#63)
- Add hashprep/utils/logging.py with package-level structured logger
- Replace 8 bare `except Exception` blocks with specific exception types (ValueError, LinAlgError, RuntimeWarning, OSError) and debug logging
- Add input validation in DatasetAnalyzer.__init__:
  - TypeError if input is not a DataFrame
  - ValueError for duplicate column names
  - ValueError if target_col not found in DataFrame
  - TypeError if comparison_df is not a DataFrame
- Add input validation in check_drift() for both DataFrames
1 parent 96eed0e commit 9f63435

File tree

7 files changed

+73
-11
lines changed

7 files changed

+73
-11
lines changed

hashprep/checks/drift.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
from .core import Issue
66
from ..config import DEFAULT_CONFIG
7+
from ..utils.logging import get_logger
8+
9+
_log = get_logger("checks.drift")
710

811
_DRIFT = DEFAULT_CONFIG.drift
912
CRITICAL_P_VALUE = _DRIFT.critical_p_value
@@ -19,6 +22,9 @@ def check_drift(
1922
Check for distribution shift between two datasets.
2023
Uses Kolmogorov-Smirnov test for numeric columns and Chi-square for categorical.
2124
"""
25+
if not isinstance(df_train, pd.DataFrame) or not isinstance(df_test, pd.DataFrame):
26+
raise TypeError("Both df_train and df_test must be pandas DataFrames")
27+
2228
issues = []
2329

2430
issues.extend(_check_numeric_drift(df_train, df_test, threshold))
@@ -132,7 +138,7 @@ def _check_categorical_drift(
132138
quick_fix="Options:\n- Re-train model with recent data.\n- Investigate category distribution changes.\n- Consider rebalancing categories.",
133139
)
134140
)
135-
except (ValueError, RuntimeWarning):
136-
pass
141+
except (ValueError, RuntimeWarning) as e:
142+
_log.debug("Chi-square drift test failed for '%s': %s", col, e)
137143

138144
return issues

hashprep/checks/leakage.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
from scipy.stats import chi2_contingency, f_oneway
44
import numpy as np
55
from ..config import DEFAULT_CONFIG
6+
from ..utils.logging import get_logger
67

78
_LEAK = DEFAULT_CONFIG.leakage
9+
_log = get_logger("checks.leakage")
810

911
def _check_data_leakage(analyzer):
1012
issues = []
@@ -91,7 +93,8 @@ def _check_target_leakage_patterns(analyzer):
9193
quick_fix=quick_fix,
9294
)
9395
)
94-
except Exception:
96+
except (ValueError, np.linalg.LinAlgError) as e:
97+
_log.debug("Chi-square leakage test failed for '%s': %s", col, e)
9598
continue
9699
numeric_cols = analyzer.df.select_dtypes(include="number").drop(
97100
columns=[analyzer.target_col], errors="ignore"
@@ -127,6 +130,7 @@ def _check_target_leakage_patterns(analyzer):
127130
quick_fix=quick_fix,
128131
)
129132
)
130-
except Exception:
133+
except (ValueError, RuntimeWarning) as e:
134+
_log.debug("F-test leakage check failed for '%s': %s", col, e)
131135
continue
132136
return issues

hashprep/checks/missing_values.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
import pandas as pd
44
from collections import defaultdict
55
import numpy as np
6+
from numpy.linalg import LinAlgError
67
from ..config import DEFAULT_CONFIG
8+
from ..utils.logging import get_logger
9+
10+
_log = get_logger("checks.missing_values")
711

812
_THRESHOLDS = DEFAULT_CONFIG.missing_values
913

@@ -117,7 +121,8 @@ def cramers_v(table):
117121
cramers = cramers_v(table)
118122
if p_val < threshold and cramers > _THRESHOLDS.pattern_cramers_v_min:
119123
cat_patterns[col].append((other_col, p_val, cramers))
120-
except Exception:
124+
except (ValueError, LinAlgError) as e:
125+
_log.debug("Chi-square test failed for '%s' vs '%s': %s", col, other_col, e)
121126
continue
122127

123128
for other_col in analyzer.df.select_dtypes(
@@ -140,7 +145,8 @@ def cramers_v(table):
140145

141146
if p_val < threshold and cohens_d > _THRESHOLDS.pattern_cohens_d_min:
142147
num_patterns[col].append((other_col, p_val, cohens_d))
143-
except Exception:
148+
except (ValueError, RuntimeWarning) as e:
149+
_log.debug("Mann-Whitney U test failed for '%s' vs '%s': %s", col, other_col, e)
144150
continue
145151

146152
# Generate grouped issues

hashprep/core/analyzer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ def __init__(
6969
sampling_config: Optional[SamplingConfig] = None,
7070
auto_sample: bool = True,
7171
):
72+
if not isinstance(df, pd.DataFrame):
73+
raise TypeError(f"Expected pandas DataFrame, got {type(df).__name__}")
74+
if df.columns.duplicated().any():
75+
raise ValueError(f"DataFrame has duplicate column names: {list(df.columns[df.columns.duplicated()])}")
76+
if target_col is not None and target_col not in df.columns:
77+
raise ValueError(f"Target column '{target_col}' not found in DataFrame")
78+
if comparison_df is not None and not isinstance(comparison_df, pd.DataFrame):
79+
raise TypeError(f"comparison_df must be a pandas DataFrame, got {type(comparison_df).__name__}")
80+
7281
self.comparison_df = comparison_df
7382
self.target_col = target_col
7483
self.selected_checks = selected_checks

hashprep/reports/markdown.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
from typing import Dict, List
44

55
import pandas as pd
6+
from ..utils.logging import get_logger
7+
8+
_log = get_logger("reports.markdown")
69

710
import hashprep
811

@@ -205,7 +208,8 @@ def generate(self, summary, full=False, output_file=None):
205208
img_f.write(base64.b64decode(plot_data))
206209
rel_path = os.path.join(f"{report_name}_images", img_filename)
207210
content += f"![{plot_name}]({rel_path})\n\n"
208-
except Exception:
211+
except (OSError, ValueError) as e:
212+
_log.warning("Failed to save plot '%s': %s", plot_name, e)
209213
content += f"*(Error saving plot {plot_name})*\n\n"
210214

211215
content += "---\n\n"
@@ -224,8 +228,8 @@ def generate(self, summary, full=False, output_file=None):
224228
img_f.write(base64.b64decode(plot_data))
225229
rel_path = os.path.join(f"{report_name}_images", img_filename)
226230
content += f"![{method} Correlation]({rel_path})\n\n"
227-
except Exception:
228-
pass
231+
except (OSError, ValueError) as e:
232+
_log.warning("Failed to save correlation plot '%s': %s", method, e)
229233

230234
pairs = []
231235
for c1, corrs in num_corr["pearson"].items():

hashprep/summaries/interactions.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import pandas as pd
22
from scipy.stats import chi2_contingency, f_oneway
33
import numpy as np
4+
from ..utils.logging import get_logger
5+
6+
_log = get_logger("summaries.interactions")
47

58

69
def summarize_interactions(df):
@@ -45,7 +48,8 @@ def _compute_categorical_correlations(df):
4548
r, k = table.shape
4649
cramers_v = (phi2 / min(k - 1, r - 1)) ** 0.5
4750
results[f"{c1}__{c2}"] = float(cramers_v)
48-
except Exception:
51+
except (ValueError, np.linalg.LinAlgError) as e:
52+
_log.debug("Categorical correlation failed for '%s' vs '%s': %s", c1, c2, e)
4953
continue
5054
return results
5155

@@ -69,6 +73,7 @@ def _compute_mixed_correlations(df):
6973
"f_stat": float(f_stat),
7074
"p_value": float(p_val),
7175
}
72-
except Exception as e:
76+
except (ValueError, RuntimeWarning) as e:
77+
_log.debug("Mixed correlation failed for '%s' vs '%s': %s", cat, num, e)
7378
mixed_corr[f"{cat}__{num}"] = {"error": str(e)}
7479
return mixed_corr

hashprep/utils/logging.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""Structured logging for HashPrep.
2+
3+
Provides a package-level logger that callers can use via get_logger().
4+
By default, logs at WARNING level so end users don't see noise.
5+
Library consumers can adjust via standard logging configuration.
6+
"""
7+
8+
import logging
9+
10+
LOGGER_NAME = "hashprep"
11+
12+
13+
def get_logger(module_name: str = "") -> logging.Logger:
14+
"""Get a logger scoped to a hashprep submodule.
15+
16+
Args:
17+
module_name: Dot-separated submodule path (e.g. "checks.correlations").
18+
If empty, returns the root hashprep logger.
19+
"""
20+
name = f"{LOGGER_NAME}.{module_name}" if module_name else LOGGER_NAME
21+
return logging.getLogger(name)
22+
23+
24+
# Configure root hashprep logger with NullHandler (library best practice).
25+
# This prevents "No handlers could be found" warnings when hashprep is used
26+
# as a library. End users or the CLI can attach their own handlers.
27+
_root_logger = logging.getLogger(LOGGER_NAME)
28+
_root_logger.addHandler(logging.NullHandler())

0 commit comments

Comments
 (0)