Skip to content

Commit cee2bd1

Browse files
jucor and claude committed
Vectorize _compute_vote_stats and make benchmark standalone
- _compute_vote_stats: Replace per-row/per-column loops with numpy vectorized
  operations using boolean masks and axis-based sums. This eliminates
  O(rows + cols) Python loops.
- bench_update_votes.py: Make standalone by accepting CSV path directly
  instead of depending on datasets package. Add TODO for using datasets
  package once PR compdemocracy#2312 is merged.

Combined with pivot_table optimization, achieves ~10x speedup on bg2050
dataset (1M votes): 18-30s -> 2.5s (~400k votes/sec).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent b8b1ea4 commit cee2bd1

File tree

2 files changed

+103
-79
lines changed

2 files changed

+103
-79
lines changed

delphi/polismath/benchmarks/bench_update_votes.py

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,78 @@
44
55
Usage:
66
cd delphi
7-
../.venv/bin/python -m polismath.benchmarks.bench_update_votes [dataset_name] [--runs N]
7+
../.venv/bin/python -m polismath.benchmarks.bench_update_votes <votes_csv_path> [--runs N]
88
99
Example:
10-
../.venv/bin/python -m polismath.benchmarks.bench_update_votes bg2050 --runs 3
10+
../.venv/bin/python -m polismath.benchmarks.bench_update_votes real_data/.local/r7wehfsmutrwndviddnii-bg2050/2025-11-25-1909-r7wehfsmutrwndviddnii-votes.csv --runs 3
1111
"""
12+
# TODO(datasets): Once PR https://github.com/compdemocracy/polis/pull/2312 is merged,
13+
# use the datasets package with include_local=True instead of requiring a path argument.
14+
1215
import argparse
1316
import time
1417
import sys
18+
from pathlib import Path
19+
20+
import pandas as pd
1521

1622

17-
def benchmark_update_votes(dataset_name: str = 'bg2050', runs: int = 3) -> dict:
23+
def load_votes_from_csv(votes_csv: Path) -> dict:
    """
    Load votes from a CSV file into the format expected by Conversation.update_votes().

    Args:
        votes_csv: Path to votes CSV file with columns: voter-id, comment-id, vote,
            and optionally timestamp.

    Returns:
        Dictionary with 'votes' (list of per-vote dicts with keys 'pid', 'tid',
        'vote', 'created') and 'lastVoteTimestamp'.
    """
    df = pd.read_csv(votes_csv)

    # Fixed timestamp for reproducibility across benchmark runs.
    fixed_timestamp = 1700000000000

    # Hoist the column-existence check out of the per-row work; evaluating
    # `'timestamp' in df.columns` once per row is loop-invariant waste.
    if 'timestamp' in df.columns:
        created = [int(t) for t in df['timestamp'].tolist()]
    else:
        created = [fixed_timestamp] * len(df)

    # Traverse column-wise instead of DataFrame.iterrows(): iterrows builds a
    # Series object per row and is by far the slowest way to walk a frame —
    # a poor fit for a loader feeding a performance benchmark.
    votes_list = [
        {'pid': pid, 'tid': tid, 'vote': vote, 'created': ts}
        for pid, tid, vote, ts in zip(
            df['voter-id'].tolist(),
            df['comment-id'].tolist(),
            df['vote'].tolist(),
            created,
        )
    ]

    return {
        'votes': votes_list,
        'lastVoteTimestamp': fixed_timestamp,
    }
51+
52+
53+
def benchmark_update_votes(votes_csv: str, runs: int = 3) -> dict:
1854
"""
1955
Benchmark update_votes on a dataset.
2056
2157
Args:
22-
dataset_name: Name of the dataset to benchmark
58+
votes_csv: Path to votes CSV file
2359
runs: Number of runs to average
2460
2561
Returns:
2662
Dictionary with benchmark results
2763
"""
2864
from polismath.conversation import Conversation
29-
from polismath.regression.utils import prepare_votes_data
3065

31-
print(f"Loading dataset '{dataset_name}'...")
32-
votes_dict, metadata = prepare_votes_data(dataset_name)
66+
votes_path = Path(votes_csv)
67+
if not votes_path.exists():
68+
raise FileNotFoundError(f"Votes CSV not found: {votes_csv}")
69+
70+
# Extract dataset name from path (e.g., "r7wehfsmutrwndviddnii-bg2050" -> "bg2050")
71+
parent_name = votes_path.parent.name
72+
if '-' in parent_name:
73+
dataset_name = parent_name.split('-', 1)[1]
74+
else:
75+
dataset_name = parent_name
76+
77+
print(f"Loading votes from '{votes_csv}'...")
78+
votes_dict = load_votes_from_csv(votes_path)
3379
n_votes = len(votes_dict['votes'])
3480
print(f"Loaded {n_votes:,} votes")
3581
print()
@@ -69,14 +115,13 @@ def benchmark_update_votes(dataset_name: str = 'bg2050', runs: int = 3) -> dict:
69115

70116
def main():
71117
parser = argparse.ArgumentParser(description='Benchmark update_votes performance')
72-
parser.add_argument('dataset', nargs='?', default='bg2050',
73-
help='Dataset name (default: bg2050)')
118+
parser.add_argument('votes_csv', help='Path to votes CSV file')
74119
parser.add_argument('--runs', type=int, default=3,
75120
help='Number of benchmark runs (default: 3)')
76121
args = parser.parse_args()
77122

78123
try:
79-
benchmark_update_votes(args.dataset, args.runs)
124+
benchmark_update_votes(args.votes_csv, args.runs)
80125
except Exception as e:
81126
print(f"Error: {e}", file=sys.stderr)
82127
sys.exit(1)

delphi/polismath/conversation/conversation.py

Lines changed: 48 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -310,12 +310,10 @@ def _apply_moderation(self) -> None:
310310

311311
def _compute_vote_stats(self) -> None:
312312
"""
313-
Compute statistics on votes.
313+
Compute statistics on votes using vectorized operations.
314314
"""
315-
# Make sure pandas is imported
316315
import numpy as np
317-
import pandas as pd
318-
316+
319317
# Initialize stats
320318
self.vote_stats = {
321319
'n_votes': 0,
@@ -325,84 +323,65 @@ def _compute_vote_stats(self) -> None:
325323
'comment_stats': {},
326324
'participant_stats': {}
327325
}
328-
329-
# Get matrix values and ensure they are numeric
326+
330327
try:
331-
# Make a clean copy that's definitely numeric
328+
# Get clean numeric matrix
332329
clean_mat = self._get_clean_matrix()
333-
# TODO: we can probably count without needing to convert to numpy array
334330
values = clean_mat.to_numpy()
335331

336-
# Count votes safely
332+
# Create boolean masks once for the entire matrix.
333+
# These are 2D arrays of the same shape as values.
334+
non_null_mask = ~np.isnan(values)
335+
agree_mask = np.abs(values - 1.0) < 0.001 # Close to 1
336+
disagree_mask = np.abs(values + 1.0) < 0.001 # Close to -1
337+
338+
# Global stats: sum over entire matrix
337339
try:
338-
# Create masks, handling non-numeric data
339-
non_null_mask = ~np.isnan(values)
340-
agree_mask = np.abs(values - 1.0) < 0.001 # Close to 1
341-
disagree_mask = np.abs(values + 1.0) < 0.001 # Close to -1
342-
343340
self.vote_stats['n_votes'] = int(np.sum(non_null_mask))
344341
self.vote_stats['n_agree'] = int(np.sum(agree_mask))
345342
self.vote_stats['n_disagree'] = int(np.sum(disagree_mask))
346-
self.vote_stats['n_pass'] = int(np.sum(np.isnan(values)))
343+
self.vote_stats['n_pass'] = int(np.sum(~non_null_mask))
347344
except Exception as e:
348-
logger.error(f"Error counting votes: {e}")
349-
# Set defaults if counting fails
350-
self.vote_stats['n_votes'] = 0
351-
self.vote_stats['n_agree'] = 0
352-
self.vote_stats['n_disagree'] = 0
353-
self.vote_stats['n_pass'] = 0
354-
355-
# Compute comment stats
356-
for i, cid in enumerate(clean_mat.columns):
357-
if i >= values.shape[1]:
358-
continue
359-
360-
try:
361-
col = values[:, i]
362-
n_votes = np.sum(~np.isnan(col))
363-
n_agree = np.sum(np.abs(col - 1.0) < 0.001)
364-
n_disagree = np.sum(np.abs(col + 1.0) < 0.001)
365-
366-
self.vote_stats['comment_stats'][cid] = {
367-
'n_votes': int(n_votes),
368-
'n_agree': int(n_agree),
369-
'n_disagree': int(n_disagree),
370-
'agree_ratio': float(n_agree / max(n_votes, 1))
371-
}
372-
except Exception as e:
373-
logger.error(f"Error computing stats for comment {cid}: {e}")
345+
logger.error(f"Error counting global votes: {e}")
346+
347+
# Per-comment stats: sum along axis=0 (columns).
348+
# axis=0 sums over rows, giving one value per column (comment).
349+
try:
350+
comment_n_votes = np.sum(non_null_mask, axis=0)
351+
comment_n_agree = np.sum(agree_mask, axis=0)
352+
comment_n_disagree = np.sum(disagree_mask, axis=0)
353+
# Avoid division by zero: use np.maximum to ensure denominator >= 1
354+
comment_agree_ratio = comment_n_agree / np.maximum(comment_n_votes, 1)
355+
356+
# Build comment_stats dict from the arrays.
357+
for i, cid in enumerate(clean_mat.columns):
374358
self.vote_stats['comment_stats'][cid] = {
375-
'n_votes': 0,
376-
'n_agree': 0,
377-
'n_disagree': 0,
378-
'agree_ratio': 0.0
359+
'n_votes': int(comment_n_votes[i]),
360+
'n_agree': int(comment_n_agree[i]),
361+
'n_disagree': int(comment_n_disagree[i]),
362+
'agree_ratio': float(comment_agree_ratio[i])
379363
}
380-
381-
# Compute participant stats
382-
for i, pid in enumerate(clean_mat.index):
383-
if i >= values.shape[0]:
384-
continue
385-
386-
try:
387-
row = values[i, :]
388-
n_votes = np.sum(~np.isnan(row))
389-
n_agree = np.sum(np.abs(row - 1.0) < 0.001)
390-
n_disagree = np.sum(np.abs(row + 1.0) < 0.001)
391-
392-
self.vote_stats['participant_stats'][pid] = {
393-
'n_votes': int(n_votes),
394-
'n_agree': int(n_agree),
395-
'n_disagree': int(n_disagree),
396-
'agree_ratio': float(n_agree / max(n_votes, 1))
397-
}
398-
except Exception as e:
399-
logger.error(f"Error computing stats for participant {pid}: {e}")
364+
except Exception as e:
365+
logger.error(f"Error computing comment stats: {e}")
366+
367+
# Per-participant stats: sum along axis=1 (rows).
368+
# axis=1 sums over columns, giving one value per row (participant).
369+
try:
370+
ptpt_n_votes = np.sum(non_null_mask, axis=1)
371+
ptpt_n_agree = np.sum(agree_mask, axis=1)
372+
ptpt_n_disagree = np.sum(disagree_mask, axis=1)
373+
ptpt_agree_ratio = ptpt_n_agree / np.maximum(ptpt_n_votes, 1)
374+
375+
# Build participant_stats dict from the arrays.
376+
for i, pid in enumerate(clean_mat.index):
400377
self.vote_stats['participant_stats'][pid] = {
401-
'n_votes': 0,
402-
'n_agree': 0,
403-
'n_disagree': 0,
404-
'agree_ratio': 0.0
378+
'n_votes': int(ptpt_n_votes[i]),
379+
'n_agree': int(ptpt_n_agree[i]),
380+
'n_disagree': int(ptpt_n_disagree[i]),
381+
'agree_ratio': float(ptpt_agree_ratio[i])
405382
}
383+
except Exception as e:
384+
logger.error(f"Error computing participant stats: {e}")
406385
except Exception as e:
407386
logger.error(f"Error in vote stats computation: {e}")
408387
# Initialize with empty stats if computation fails

0 commit comments

Comments
 (0)