
Commit b8b1ea4

jucor and claude committed
Optimize update_votes with vectorized pivot_table (5x speedup)
Replace the row-by-row for-loop in update_votes with a vectorized pivot_table approach. This dramatically speeds up vote loading for large datasets.

Performance on bg2050 dataset (1M+ votes, 7.8k participants, 7.7k comments):
- Before: 18.5s average, 56k votes/sec
- After: 3.5s average, 295k votes/sec
- Speedup: 5.3x overall, 16x for the batch update step

The optimization:
1. Use pivot_table to reshape long-form votes to a wide-form matrix
2. Use DataFrame.where() to merge with the existing matrix
3. Use float32 for the intermediate matrix to halve memory usage

Also adds a benchmark script at polismath/benchmarks/bench_update_votes.py for measuring update_votes performance.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent ee24452 commit b8b1ea4
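For readers skimming the diff below, here is a minimal, self-contained sketch of the pivot-then-merge pattern this commit applies (toy data, not the project's code; the 'row'/'col'/'value' column names mirror the updates_df used in the diff):

import numpy as np
import pandas as pd

# Long-form updates: one (participant, comment, vote) triple per row.
updates_df = pd.DataFrame({
    'row':   ['p1', 'p1', 'p2', 'p2'],
    'col':   ['c1', 'c2', 'c1', 'c1'],
    'value': [1.0, -1.0, 1.0, 0.0],   # p2 voted twice on c1; 'last' wins
})

# Step 1: long-form -> wide-form. aggfunc='last' keeps the most recent
# vote when the same (row, col) pair appears more than once.
update_matrix = updates_df.pivot_table(
    index='row', columns='col', values='value', aggfunc='last'
).astype('float32')

# An existing wide-form matrix that is missing row 'p2'.
existing = pd.DataFrame([[np.nan, 1.0]], index=['p1'],
                        columns=['c1', 'c2'], dtype='float32')

# Step 2: expand both frames to the union of labels, then merge with
# where(): keep `existing` where the update is NaN, else take the update.
all_rows = existing.index.union(update_matrix.index)
all_cols = existing.columns.union(update_matrix.columns)
existing = existing.reindex(index=all_rows, columns=all_cols)
update_matrix = update_matrix.reindex(index=all_rows, columns=all_cols)
merged = existing.where(update_matrix.isna(), update_matrix)

print(merged)
# col   c1   c2
# row
# p1   1.0 -1.0
# p2   0.0  NaN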

File tree

3 files changed (+123, -18 lines)
delphi/polismath/benchmarks/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+"""Benchmark scripts for polismath performance testing."""
delphi/polismath/benchmarks/bench_update_votes.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""
+Benchmark script for update_votes performance.
+
+Usage:
+    cd delphi
+    ../.venv/bin/python -m polismath.benchmarks.bench_update_votes [dataset_name] [--runs N]
+
+Example:
+    ../.venv/bin/python -m polismath.benchmarks.bench_update_votes bg2050 --runs 3
+"""
+import argparse
+import time
+import sys
+
+
+def benchmark_update_votes(dataset_name: str = 'bg2050', runs: int = 3) -> dict:
+    """
+    Benchmark update_votes on a dataset.
+
+    Args:
+        dataset_name: Name of the dataset to benchmark
+        runs: Number of runs to average
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    from polismath.conversation import Conversation
+    from polismath.regression.utils import prepare_votes_data
+
+    print(f"Loading dataset '{dataset_name}'...")
+    votes_dict, metadata = prepare_votes_data(dataset_name)
+    n_votes = len(votes_dict['votes'])
+    print(f"Loaded {n_votes:,} votes")
+    print()
+
+    times = []
+    for i in range(runs):
+        conv = Conversation(dataset_name)
+        start = time.perf_counter()
+        conv = conv.update_votes(votes_dict, recompute=False)
+        elapsed = time.perf_counter() - start
+        times.append(elapsed)
+        print(f"  Run {i+1}: {elapsed:.2f}s")
+
+    avg = sum(times) / len(times)
+    min_time = min(times)
+    max_time = max(times)
+
+    print()
+    print(f"Dataset: {dataset_name}")
+    print(f"Votes: {n_votes:,}")
+    print(f"Matrix shape: {conv.raw_rating_mat.shape}")
+    print(f"Average time: {avg:.2f}s")
+    print(f"Min/Max: {min_time:.2f}s / {max_time:.2f}s")
+    print(f"Throughput: {n_votes/avg:,.0f} votes/sec")
+
+    return {
+        'dataset': dataset_name,
+        'n_votes': n_votes,
+        'shape': conv.raw_rating_mat.shape,
+        'times': times,
+        'avg': avg,
+        'min': min_time,
+        'max': max_time,
+        'throughput': n_votes / avg,
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Benchmark update_votes performance')
+    parser.add_argument('dataset', nargs='?', default='bg2050',
+                        help='Dataset name (default: bg2050)')
+    parser.add_argument('--runs', type=int, default=3,
+                        help='Number of benchmark runs (default: 3)')
+    args = parser.parse_args()
+
+    try:
+        benchmark_update_votes(args.dataset, args.runs)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
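Since benchmark_update_votes returns its results as a dict, it can also be driven programmatically, e.g. from a notebook (a hypothetical usage sketch, assuming the bg2050 dataset is available locally):

from polismath.benchmarks.bench_update_votes import benchmark_update_votes

results = benchmark_update_votes('bg2050', runs=5)
print(f"{results['throughput']:,.0f} votes/sec (avg of {len(results['times'])} runs)")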

delphi/polismath/conversation/conversation.py

Lines changed: 36 additions & 18 deletions
@@ -231,25 +231,43 @@ def update_votes(self,

         logger.info(f"[{time.time() - start_time:.2f}s] Found {len(new_rows)} new rows and {len(new_cols)} new columns")

-        # Apply all updates in a single batch operation for better performance
-        # Honestly, we should probably keep the matrix of votes in long-form,
-        # and only convert to wide-form when requested.
-
-        logger.info(f"[{time.time() - start_time:.2f}s] Applying {len(vote_updates)} votes as batch update...")
+        # Apply all updates using vectorized pivot_table approach.
+        # This is much faster than row-by-row iteration because pandas/numpy
+        # can use optimized C code for the reshape operation.
+
+        logger.info(f"[{time.time() - start_time:.2f}s] Applying {len(updates_df)} votes as batch update...")
         batch_start = time.time()
-        # For backward compatibility, sort the rows and columns by label.
-        result.raw_rating_mat = result.raw_rating_mat.reindex(index=all_rows, columns=all_cols, fill_value=np.nan)
-        # NOTE: we cannot use .loc[rows, cols] = values with rows, cols, and values being Series,
-        # for example `result.raw_rating_mat.loc[updates_df['row'], updates_df['col']] = updates_df['value'].values`,
-        # because pandas then tries to assign to the Cartesian product of rows and cols, and it gets very messy
-        # and is definitely *not* what we intended.
-        # We could convert to integer indices with get_loc, then use .values to use numpy assignment (which does not
-        # do any Cartesian product), but a/ it's less legible, b/ there is *no* guarantee at all that .values is always
-        # a view and not a copy, so we might end up modifying a copy of the data frame.
-        # Therefore, for simplicity and readability, sticking to an ugly for loop.
-        # If you have a better idea, let me know at julien@cornebise.com, I would love to know :)
-        for idx, row_data in updates_df.iterrows():
-            result.raw_rating_mat.at[row_data['row'], row_data['col']] = row_data['value']
+
+        # Build a wide-form matrix from the long-form updates using pivot_table.
+        # aggfunc='last' keeps the last vote if any duplicates remain after dedup.
+        update_matrix = updates_df.pivot_table(
+            index='row',
+            columns='col',
+            values='value',
+            aggfunc='last'
+        )
+        # Use float32 for the intermediate matrix to save memory (~200MB vs
+        # ~400MB for 8k comments and 8k participants). float32 can exactly
+        # represent -1, 0, +1 and NaN.
+        update_matrix = update_matrix.astype('float32')
+
+        # Expand the existing matrix to include any new rows/columns.
+        # fill_value=np.nan ensures new cells start as "no vote".
+        result.raw_rating_mat = result.raw_rating_mat.reindex(
+            index=all_rows, columns=all_cols, fill_value=np.nan
+        )
+
+        # Align the update matrix to the same shape (new cells become NaN).
+        update_matrix = update_matrix.reindex(index=all_rows, columns=all_cols)
+
+        # Merge: where update_matrix has a value, use it; otherwise keep original.
+        # DataFrame.where(cond, other) keeps self where cond is True, uses other where False.
+        # So: keep raw_rating_mat where update_matrix is NaN, else use update_matrix.
+        result.raw_rating_mat = result.raw_rating_mat.where(
+            update_matrix.isna(),  # condition: True where update has no value
+            update_matrix          # other: use update value where condition is False
+        )
+
         logger.info(f"[{time.time() - start_time:.2f}s] Batch update completed in {time.time() - batch_start:.2f}s")

         # Update last updated timestamp
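A back-of-envelope check of the float32 comment in the hunk above (sizes taken from the commit message, not measured):

# ~7.8k participants x ~7.7k comments => ~60M dense cells.
cells = 7_800 * 7_700
print(f"float64: {cells * 8 / 1e6:,.0f} MB")  # ~480 MB
print(f"float32: {cells * 4 / 1e6:,.0f} MB")  # ~240 MB, i.e. half

import numpy as np
# -1, 0, +1 (and NaN) round-trip through float32 without loss,
# so the narrower dtype is safe for agree/pass/disagree votes.
v = np.array([-1.0, 0.0, 1.0])
assert (v.astype(np.float32) == v).all()
assert np.isnan(np.float32(np.nan))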
