I recently replicated the PURE PRM ProcessBench evaluation using the provided standalone evaluation script and the HuggingFace checkpoint. My results came out higher than those reported in the paper.
Details:
Evaluation script used: a slightly modified version of https://github.com/CJReinforce/PURE/blob/0628af60c3fe80b824700f667843c20afdbcc108/PRM/eval/process_bench.py (full script below)
Checkpoint used: jinachris/PURE-PRM-7B from HuggingFace
Observed results (per ProcessBench split):
GSM8K → Error Acc: 72.5, Correct Acc: 95.3, F1: 82.3
MATH → Error Acc: 85.5, Correct Acc: 83.5, F1: 84.5
OlympiadBench → Error Acc: 70.2, Correct Acc: 80.2, F1: 74.9
OmniMath → Error Acc: 66.7, Correct Acc: 79.7, F1: 72.6
Average F1: 78.6
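For reference, the per-split F1 printed by the script is just the harmonic mean of the error accuracy and the correct accuracy (same formula as in the code below). A quick check against the MATH numbers above, as a tiny standalone snippet:

acc_error, acc_correct = 85.5, 83.5  # MATH split, from the results above
f1 = 2 * acc_error * acc_correct / (acc_error + acc_correct)
print(f"{f1:.1f}")  # 84.5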
To reproduce:
import json
import os
import random
from copy import deepcopy
import numpy as np
import torch
import transformers
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader, DistributedSampler
from tqdm import tqdm
def collate_fn(batch, tokenizer, separator='\n'):
    """Concatenate problem + solution steps into one sequence per sample and
    record the token position of each step separator, where the PRM is scored."""
    input_ids = []
    score_ids = []
    labels = []
    separator_ids = tokenizer.encode(separator, add_special_tokens=False, return_tensors='pt')
    for sample in batch:
        prompt_ids = tokenizer(sample['problem'], add_special_tokens=False, return_tensors='pt')['input_ids']
        score_ids.append([])
        for completion in sample['steps']:
            completion_ids = tokenizer(completion, add_special_tokens=False, return_tensors='pt')['input_ids']
            prompt_ids = torch.cat([prompt_ids, completion_ids, separator_ids], dim=-1)
            # the last separator token marks the end of this step
            score_ids[-1].append(prompt_ids.size(-1) - 1)
        labels.append(sample['label'])
        input_ids.append(prompt_ids)
    # right-pad every sequence to the longest one in the batch
    pad_token_id = tokenizer.pad_token_id
    max_len = max(ids.size(-1) for ids in input_ids)
    for idx, ids in enumerate(input_ids):
        input_ids[idx] = torch.cat([
            ids.squeeze(),
            torch.LongTensor([pad_token_id] * (max_len - ids.size(-1)))
        ])
    input_ids = torch.stack(input_ids)
    return dict(input_ids=input_ids, labels=labels, score_ids=score_ids)
def gather_objects(data, accelerator):
    """Gather arbitrary Python objects from all processes onto the main process."""
    world_size = accelerator.num_processes
    if world_size == 1:
        return data
    all_data = [None] * world_size
    torch.distributed.all_gather_object(all_data, data)
    if accelerator.is_main_process:
        result = []
        for process_data in all_data:
            result.extend(process_data)
        return result
    return None
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
def main(
    model_path,
    batch_size=24,
    num_of_workers=4,
    separator="\n",
    temperature=0.1,
):
    model_name = model_path.split('/')[-1]
    # expected [#erroneous, #fully-correct] samples per ProcessBench split
    configs = {
        'gsm8k': [207, 193],
        'math': [594, 406],
        'olympiadbench': [661, 339],
        'omnimath': [759, 241],
    }
    all_f1_scores = []
    save_dir = f'outputs/{model_name}'
    os.makedirs(save_dir, exist_ok=True)

    accelerator = Accelerator()
    print(f'Loading model from {model_path}')
    model = transformers.AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
    model = accelerator.prepare(model)
    model.eval()

    for config, num in configs.items():
        dataset = load_dataset("Qwen/ProcessBench", split=config)
        sampler = None
        if accelerator.distributed_type == "MULTI_GPU":
            sampler = DistributedSampler(
                dataset,
                num_replicas=accelerator.num_processes,
                rank=accelerator.process_index,
                shuffle=False,
            )
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            collate_fn=lambda x: x,  # keep raw dicts; tokenization happens below
            num_workers=num_of_workers,
            sampler=sampler,
            drop_last=False,
        )

        res_data = []
        for batch_ in tqdm(dataloader, disable=not accelerator.is_main_process):
            new_batch = deepcopy(batch_)
            batch = collate_fn(batch_, tokenizer, separator)
            input_ids = batch['input_ids'].to(accelerator.device)
            labels = batch['labels']
            score_ids = batch['score_ids']
            with accelerator.autocast(), torch.no_grad():
                outputs = model(input_ids)
                logits = outputs.logits
            for i, score_id in enumerate(score_ids):
                label = labels[i]
                # per-step score at each separator token: P(label 1) - P(label 0),
                # i.e. correct minus error
                prob = logits[i, score_id].softmax(dim=-1)
                score = prob[:, 1] - prob[:, 0]
                # temperature-weighted soft minimum over steps
                # (weights favor the lowest step score)
                pred_or = ((-score / temperature).softmax(dim=-1) * score).sum()
                # label == -1 means all steps are correct; otherwise it is the
                # index of the first erroneous step
                if (label != -1 and pred_or <= 0) or (label == -1 and pred_or > 0):
                    new_batch[i]['match'] = True
                else:
                    new_batch[i]['match'] = False
            res_data.extend(new_batch)

        accelerator.wait_for_everyone()
        gathered_data = gather_objects(res_data, accelerator)

        if accelerator.is_main_process:
            data1 = [e for e in gathered_data if e['label'] != -1]
            data2 = [e for e in gathered_data if e['label'] == -1]
            if len(data1) != num[0]:
                print(f'{config} error num mismatch: {len(data1)} != {num[0]}')
            if len(data2) != num[1]:
                print(f'{config} correct num mismatch: {len(data2)} != {num[1]}')
            acc1 = np.mean([e['match'] for e in data1]) * 100
            acc2 = np.mean([e['match'] for e in data2]) * 100
            # F1 = harmonic mean of error accuracy and correct accuracy
            f1 = 2 * acc1 * acc2 / (acc1 + acc2)
            print(f'{config} error acc: {acc1:.1f}, correct acc: {acc2:.1f}, f1: {f1:.1f}')
            all_f1_scores.append(f1)

    if accelerator.is_main_process:
        print(f'ProcessBench. Average F1: {np.mean(all_f1_scores):.1f}')

    if accelerator.distributed_type == "MULTI_GPU":
        import torch.distributed as dist
        dist.destroy_process_group()
if __name__ == "__main__":
    set_seed(42)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    main(
        model_path="jinachris/PURE-PRM-7B",
        batch_size=4,
        num_of_workers=4,
        separator="\n",
        temperature=0.1,
    )
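One note on launching, in case it matters for reproduction (assuming a standard accelerate install; the filename is whatever you save the script as): a single-GPU run with plain python and a multi-GPU run with accelerate launch should both work, since the DistributedSampler and the process-group teardown only activate when accelerator.distributed_type == "MULTI_GPU".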