Skip to content

Commit cee2bd1

Browse files
jucor and claude committed
Vectorize _compute_vote_stats and make benchmark standalone
- _compute_vote_stats: Replace per-row/per-column loops with numpy vectorized
  operations using boolean masks and axis-based sums. This eliminates
  O(rows + cols) Python loops.
- bench_update_votes.py: Make standalone by accepting CSV path directly
  instead of depending on datasets package. Add TODO for using datasets
  package once PR compdemocracy#2312 is merged.

Combined with pivot_table optimization, achieves ~10x speedup on bg2050
dataset (1M votes): 18-30s -> 2.5s (~400k votes/sec).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent b8b1ea4 commit cee2bd1

File tree

2 files changed

+103
-79
lines changed

2 files changed

+103
-79
lines changed

delphi/polismath/benchmarks/bench_update_votes.py

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,78 @@
44
55
Usage:
66
cd delphi
7-
../.venv/bin/python -m polismath.benchmarks.bench_update_votes [dataset_name] [--runs N]
7+
../.venv/bin/python -m polismath.benchmarks.bench_update_votes <votes_csv_path> [--runs N]
88
99
Example:
10-
../.venv/bin/python -m polismath.benchmarks.bench_update_votes bg2050 --runs 3
10+
../.venv/bin/python -m polismath.benchmarks.bench_update_votes real_data/.local/r7wehfsmutrwndviddnii-bg2050/2025-11-25-1909-r7wehfsmutrwndviddnii-votes.csv --runs 3
1111
"""
12+
# TODO(datasets): Once PR https://github.com/compdemocracy/polis/pull/2312 is merged,
13+
# use the datasets package with include_local=True instead of requiring a path argument.
14+
1215
import argparse
1316
import time
1417
import sys
18+
from pathlib import Path
19+
20+
import pandas as pd
1521

1622

17-
def benchmark_update_votes(dataset_name: str = 'bg2050', runs: int = 3) -> dict:
23+
def load_votes_from_csv(votes_csv: Path) -> dict:
    """
    Load votes from a CSV file into the format expected by Conversation.update_votes().

    Args:
        votes_csv: Path to votes CSV file with columns: voter-id, comment-id, vote,
            and optionally timestamp.

    Returns:
        Dictionary with 'votes' (list of per-vote dicts with keys 'pid', 'tid',
        'vote', 'created') and 'lastVoteTimestamp'.
    """
    df = pd.read_csv(votes_csv)

    # Fixed timestamp for reproducibility across benchmark runs.
    fixed_timestamp = 1700000000000

    # Hoist the column-existence check out of the per-row work; evaluating
    # `'timestamp' in df.columns` once per row is loop-invariant waste.
    if 'timestamp' in df.columns:
        created = [int(t) for t in df['timestamp'].tolist()]
    else:
        created = [fixed_timestamp] * len(df)

    # Traverse column-wise instead of DataFrame.iterrows(): iterrows builds a
    # Series object per row and is by far the slowest way to walk a frame —
    # a poor fit for a loader feeding a performance benchmark.
    votes_list = [
        {'pid': pid, 'tid': tid, 'vote': vote, 'created': ts}
        for pid, tid, vote, ts in zip(
            df['voter-id'].tolist(),
            df['comment-id'].tolist(),
            df['vote'].tolist(),
            created,
        )
    ]

    return {
        'votes': votes_list,
        'lastVoteTimestamp': fixed_timestamp,
    }
51+
52+
53+
def benchmark_update_votes(votes_csv: str, runs: int = 3) -> dict:
1854
"""
1955
Benchmark update_votes on a dataset.
2056
2157
Args:
22-
dataset_name: Name of the dataset to benchmark
58+
votes_csv: Path to votes CSV file
2359
runs: Number of runs to average
2460
2561
Returns:
2662
Dictionary with benchmark results
2763
"""
2864
from polismath.conversation import Conversation
29-
from polismath.regression.utils import prepare_votes_data
3065

31-
print(f"Loading dataset '{dataset_name}'...")
32-
votes_dict, metadata = prepare_votes_data(dataset_name)
66+
votes_path = Path(votes_csv)
67+
if not votes_path.exists():
68+
raise FileNotFoundError(f"Votes CSV not found: {votes_csv}")
69+
70+
# Extract dataset name from path (e.g., "r7wehfsmutrwndviddnii-bg2050" -> "bg2050")
71+
parent_name = votes_path.parent.name
72+
if '-' in parent_name:
73+
dataset_name = parent_name.split('-', 1)[1]
74+
else:
75+
dataset_name = parent_name
76+
77+
print(f"Loading votes from '{votes_csv}'...")
78+
votes_dict = load_votes_from_csv(votes_path)
3379
n_votes = len(votes_dict['votes'])
3480
print(f"Loaded {n_votes:,} votes")
3581
print()
@@ -69,14 +115,13 @@ def benchmark_update_votes(dataset_name: str = 'bg2050', runs: int = 3) -> dict:
69115

70116
def main():
71117
parser = argparse.ArgumentParser(description='Benchmark update_votes performance')
72-
parser.add_argument('dataset', nargs='?', default='bg2050',
73-
help='Dataset name (default: bg2050)')
118+
parser.add_argument('votes_csv', help='Path to votes CSV file')
74119
parser.add_argument('--runs', type=int, default=3,
75120
help='Number of benchmark runs (default: 3)')
76121
args = parser.parse_args()
77122

78123
try:
79-
benchmark_update_votes(args.dataset, args.runs)
124+
benchmark_update_votes(args.votes_csv, args.runs)
80125
except Exception as e:
81126
print(f"Error: {e}", file=sys.stderr)
82127
sys.exit(1)

delphi/polismath/conversation/conversation.py

Lines changed: 48 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -310,12 +310,10 @@ def _apply_moderation(self) -> None:
310310

311311
def _compute_vote_stats(self) -> None:
312312
"""
313-
Compute statistics on votes.
313+
Compute statistics on votes using vectorized operations.
314314
"""
315-
# Make sure pandas is imported
316315
import numpy as np
317-
import pandas as pd
318-
316+
319317
# Initialize stats
320318
self.vote_stats = {
321319
'n_votes': 0,
@@ -325,84 +323,65 @@ def _compute_vote_stats(self) -> None:
325323
'comment_stats': {},
326324
'participant_stats': {}
327325
}
328-
329-
# Get matrix values and ensure they are numeric
326+
330327
try:
331-
# Make a clean copy that's definitely numeric
328+
# Get clean numeric matrix
332329
clean_mat = self._get_clean_matrix()
333-
# TODO: we can probably count without needing to convert to numpy array
334330
values = clean_mat.to_numpy()
335331

336-
# Count votes safely
332+
# Create boolean masks once for the entire matrix.
333+
# These are 2D arrays of the same shape as values.
334+
non_null_mask = ~np.isnan(values)
335+
agree_mask = np.abs(values - 1.0) < 0.001 # Close to 1
336+
disagree_mask = np.abs(values + 1.0) < 0.001 # Close to -1
337+
338+
# Global stats: sum over entire matrix
337339
try:
338-
# Create masks, handling non-numeric data
339-
non_null_mask = ~np.isnan(values)
340-
agree_mask = np.abs(values - 1.0) < 0.001 # Close to 1
341-
disagree_mask = np.abs(values + 1.0) < 0.001 # Close to -1
342-
343340
self.vote_stats['n_votes'] = int(np.sum(non_null_mask))
344341
self.vote_stats['n_agree'] = int(np.sum(agree_mask))
345342
self.vote_stats['n_disagree'] = int(np.sum(disagree_mask))
346-
self.vote_stats['n_pass'] = int(np.sum(np.isnan(values)))
343+
self.vote_stats['n_pass'] = int(np.sum(~non_null_mask))
347344
except Exception as e:
348-
logger.error(f"Error counting votes: {e}")
349-
# Set defaults if counting fails
350-
self.vote_stats['n_votes'] = 0
351-
self.vote_stats['n_agree'] = 0
352-
self.vote_stats['n_disagree'] = 0
353-
self.vote_stats['n_pass'] = 0
354-
355-
# Compute comment stats
356-
for i, cid in enumerate(clean_mat.columns):
357-
if i >= values.shape[1]:
358-
continue
359-
360-
try:
361-
col = values[:, i]
362-
n_votes = np.sum(~np.isnan(col))
363-
n_agree = np.sum(np.abs(col - 1.0) < 0.001)
364-
n_disagree = np.sum(np.abs(col + 1.0) < 0.001)
365-
366-
self.vote_stats['comment_stats'][cid] = {
367-
'n_votes': int(n_votes),
368-
'n_agree': int(n_agree),
369-
'n_disagree': int(n_disagree),
370-
'agree_ratio': float(n_agree / max(n_votes, 1))
371-
}
372-
except Exception as e:
373-
logger.error(f"Error computing stats for comment {cid}: {e}")
345+
logger.error(f"Error counting global votes: {e}")
346+
347+
# Per-comment stats: sum along axis=0 (columns).
348+
# axis=0 sums over rows, giving one value per column (comment).
349+
try:
350+
comment_n_votes = np.sum(non_null_mask, axis=0)
351+
comment_n_agree = np.sum(agree_mask, axis=0)
352+
comment_n_disagree = np.sum(disagree_mask, axis=0)
353+
# Avoid division by zero: use np.maximum to ensure denominator >= 1
354+
comment_agree_ratio = comment_n_agree / np.maximum(comment_n_votes, 1)
355+
356+
# Build comment_stats dict from the arrays.
357+
for i, cid in enumerate(clean_mat.columns):
374358
self.vote_stats['comment_stats'][cid] = {
375-
'n_votes': 0,
376-
'n_agree': 0,
377-
'n_disagree': 0,
378-
'agree_ratio': 0.0
359+
'n_votes': int(comment_n_votes[i]),
360+
'n_agree': int(comment_n_agree[i]),
361+
'n_disagree': int(comment_n_disagree[i]),
362+
'agree_ratio': float(comment_agree_ratio[i])
379363
}
380-
381-
# Compute participant stats
382-
for i, pid in enumerate(clean_mat.index):
383-
if i >= values.shape[0]:
384-
continue
385-
386-
try:
387-
row = values[i, :]
388-
n_votes = np.sum(~np.isnan(row))
389-
n_agree = np.sum(np.abs(row - 1.0) < 0.001)
390-
n_disagree = np.sum(np.abs(row + 1.0) < 0.001)
391-
392-
self.vote_stats['participant_stats'][pid] = {
393-
'n_votes': int(n_votes),
394-
'n_agree': int(n_agree),
395-
'n_disagree': int(n_disagree),
396-
'agree_ratio': float(n_agree / max(n_votes, 1))
397-
}
398-
except Exception as e:
399-
logger.error(f"Error computing stats for participant {pid}: {e}")
364+
except Exception as e:
365+
logger.error(f"Error computing comment stats: {e}")
366+
367+
# Per-participant stats: sum along axis=1 (rows).
368+
# axis=1 sums over columns, giving one value per row (participant).
369+
try:
370+
ptpt_n_votes = np.sum(non_null_mask, axis=1)
371+
ptpt_n_agree = np.sum(agree_mask, axis=1)
372+
ptpt_n_disagree = np.sum(disagree_mask, axis=1)
373+
ptpt_agree_ratio = ptpt_n_agree / np.maximum(ptpt_n_votes, 1)
374+
375+
# Build participant_stats dict from the arrays.
376+
for i, pid in enumerate(clean_mat.index):
400377
self.vote_stats['participant_stats'][pid] = {
401-
'n_votes': 0,
402-
'n_agree': 0,
403-
'n_disagree': 0,
404-
'agree_ratio': 0.0
378+
'n_votes': int(ptpt_n_votes[i]),
379+
'n_agree': int(ptpt_n_agree[i]),
380+
'n_disagree': int(ptpt_n_disagree[i]),
381+
'agree_ratio': float(ptpt_agree_ratio[i])
405382
}
383+
except Exception as e:
384+
logger.error(f"Error computing participant stats: {e}")
406385
except Exception as e:
407386
logger.error(f"Error in vote stats computation: {e}")
408387
# Initialize with empty stats if computation fails

0 commit comments

Comments
 (0)