[ci] add fullbench testcase #1766

Merged
merged 29 commits into from
Dec 18, 2024

Changes from all commits
84 changes: 52 additions & 32 deletions .github/scripts/eval_regression_base_fullbench.py
@@ -99,61 +99,66 @@
]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
{
'name': 'Mathbench',
'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}, )

summarizer = dict(
dataset_abbrs=[
'Language',
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
'',
'General Reasoning',
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['winogrande', 'accuracy'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
'GaokaoBench_2010-2022_Math_II_MCQs',
'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
['math', 'accuracy'],
['Mathbench', 'naive_average'],
'',
'Knowledge',
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['openai_humaneval_v2', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
'',
['dingo_en_192', 'score'],
['dingo_zh_170', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
['mmlu-other', 'accuracy'],
'',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
['cmmlu-china-specific', 'accuracy'],
'',
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
@@ -169,9 +174,24 @@
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
summary_groups=summary_groups,
)

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
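Note: the fullbench scripts in this PR assemble their run configuration by scanning locals() for a naming suffix (*_datasets, *_summary_groups, *_model) and concatenating whatever the imports brought into scope. A minimal, self-contained sketch of that pattern follows; the demo_* variables are illustrative placeholders, not real OpenCompass configs.

# Minimal sketch of the locals()-scan aggregation used in these scripts.
# The demo_* variables are placeholders, not real OpenCompass configs.
demo_race_datasets = [{'abbr': 'race-high'}]
demo_gsm8k_datasets = [{'abbr': 'gsm8k'}]
demo_mmlu_summary_groups = [{'name': 'mmlu', 'subsets': ['mmlu-stem', 'mmlu-other']}]

# At module scope, locals() is the module namespace, so every variable whose
# name ends with '_datasets' is concatenated into a single list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# The same idea collects the per-benchmark summary groups.
summary_groups = sum(
    (v for k, v in locals().items() if k.endswith('_summary_groups')), [])

print([d['abbr'] for d in datasets])        # ['race-high', 'gsm8k']
print([g['name'] for g in summary_groups])  # ['mmlu']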
23 changes: 19 additions & 4 deletions .github/scripts/eval_regression_chat_objective_fullbench.py
@@ -7,8 +7,14 @@
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
# removed because of OOM
# from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \
bigcodebench_hard_complete_datasets # noqa: F401, E501
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \
bigcodebench_hard_instruct_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
@@ -26,15 +32,17 @@
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \
from opencompass.configs.datasets.humanevalx.humanevalx_gen_3d84a3 import \
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
korbench_0shot_single_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
LCB_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets # noqa: F401, E501
@@ -71,6 +79,7 @@
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
@@ -81,6 +90,8 @@
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.humanevalx import \
humanevalx_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.korbench import \
korbench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
@@ -185,6 +196,8 @@
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['musr_average', 'naive_average'],
['korbench_single', 'naive_average'],
['ARC_Prize_Public_Evaluation', 'accuracy'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
@@ -208,6 +221,8 @@
['lcb_code_generation', 'pass@1'],
['lcb_code_execution', 'pass@1'],
['lcb_test_output', 'pass@1'],
['bigcodebench_hard_instruct', 'pass@1'],
['bigcodebench_hard_complete', 'pass@1'],
'',
'Agent',
['teval', 'naive_average'],
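The subjective script below opens with the same read_base() import convention used throughout these regression configs: dataset and model definitions are imported inside a "with read_base():" block so they land in the module namespace, and the suffix scan shown earlier then collects them. A minimal sketch, assuming the standard mmengine/OpenCompass layout and reusing two config modules that already appear in this PR:

# Sketch of the read_base() import convention (assumes the standard
# mmengine/OpenCompass layout; the two config modules are taken from this PR).
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])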
149 changes: 138 additions & 11 deletions .github/scripts/eval_regression_chat_subjective_fullbench.py
@@ -4,35 +4,37 @@

from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
csimpleqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
simpleqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
alignbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
alpacav2_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
arenahard_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
compassarena_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import \
fofo_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
followbench_llmeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
mtbench101_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
wildbench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501

summarizer = dict(type=SubjectiveSummarizer, function='subjective')

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets # noqa: F401, E501
@@ -68,3 +70,128 @@
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)

summary_groups = []
summary_groups.append({
'name':
'compassarena_language',
'subsets': [
['compassarena_language', '内容总结'],
['compassarena_language', '情感分析'],
['compassarena_language', 'Information Retrival'],
['compassarena_language', '综合问答'],
['compassarena_language', '中华文化'],
],
})
summary_groups.append({
'name':
'compassarena_knowledge',
'subsets': [
['compassarena_knowledge', '生活常识_ZH'],
['compassarena_knowledge', '自然科学工科_ZH'],
['compassarena_knowledge', '人文科学_ZH'],
['compassarena_knowledge', '自然科学理科_ZH'],
['compassarena_knowledge', '社会科学_ZH'],
],
})
summary_groups.append({
'name': 'compassarena_reason_v2',
'subsets': [
['compassarena_reason_v2', 'reasoning'],
],
})
summary_groups.append({
'name':
'compassarena_math_v2',
'subsets': [
['compassarena_math_v2', '高等数学_ZH'],
['compassarena_math_v2', '初等数学_ZH'],
['compassarena_math_v2', '中等数学_ZH'],
],
})
summary_groups.append({
'name':
'compassarena_creationv2_zh',
'subsets': [
['compassarena_creationv2_zh', '内容扩写_ZH'],
['compassarena_creationv2_zh', '内容续写_ZH'],
['compassarena_creationv2_zh', '内容改写_ZH'],
],
})
summary_groups.append({
'name':
'CompassArena',
'subsets': [
'compassarena_language',
'compassarena_knowledge',
'compassarena_reason_v2',
'compassarena_math_v2',
'compassarena_creationv2_zh',
],
})
summary_groups.append({
'name':
'FoFo',
'subsets': [['fofo_test_prompts', 'overall'],
['fofo_test_prompts_cn', 'overall']],
})
summary_groups.append({
'name':
'Followbench',
'subsets': [
['followbench_llmeval_en', 'HSR_AVG'],
['followbench_llmeval_en', 'SSR_AVG'],
],
})

# Summarizer
summarizer = dict(
dataset_abbrs=[
['alignment_bench_v1_1', '总分'],
['alpaca_eval', 'total'],
['arenahard', 'score'],
['Followbench', 'naive_average'],
['CompassArena', 'naive_average'],
['FoFo', 'naive_average'],
['mtbench101', 'avg'],
['wildbench', 'average'],
['simpleqa', 'accuracy_given_attempted'],
['chinese_simpleqa', 'given_attempted_accuracy'],
'',
['alignment_bench_v1_1', '专业能力'],
['alignment_bench_v1_1', '数学计算'],
['alignment_bench_v1_1', '基本任务'],
['alignment_bench_v1_1', '逻辑推理'],
['alignment_bench_v1_1', '中文理解'],
['alignment_bench_v1_1', '文本写作'],
['alignment_bench_v1_1', '角色扮演'],
['alignment_bench_v1_1', '综合问答'],
['alpaca_eval', 'helpful_base'],
['alpaca_eval', 'koala'],
['alpaca_eval', 'oasst'],
['alpaca_eval', 'selfinstruct'],
['alpaca_eval', 'vicuna'],
['compassarena_language', 'naive_average'],
['compassarena_knowledge', 'naive_average'],
['compassarena_reason_v2', 'naive_average'],
['compassarena_math_v2', 'naive_average'],
['compassarena_creationv2_zh', 'naive_average'],
['fofo_test_prompts', 'overall'],
['fofo_test_prompts_cn', 'overall'],
['followbench_llmeval_en', 'HSR_AVG'],
['followbench_llmeval_en', 'SSR_AVG'],
['followbench_llmeval_en', 'HSR_L1'],
['followbench_llmeval_en', 'HSR_L2'],
['followbench_llmeval_en', 'HSR_L3'],
['followbench_llmeval_en', 'HSR_L4'],
['followbench_llmeval_en', 'HSR_L5'],
['followbench_llmeval_en', 'SSR_L1'],
['followbench_llmeval_en', 'SSR_L2'],
['followbench_llmeval_en', 'SSR_L3'],
['followbench_llmeval_en', 'SSR_L4'],
['followbench_llmeval_en', 'SSR_L5'],
['simpleqa', 'f1'],
],
type=DefaultSubjectiveSummarizer,
summary_groups=summary_groups,
)
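For reference, each summary group defined above rolls its subset scores into a single reported number, which the dataset_abbrs entries (for example ['CompassArena', 'naive_average']) then pick up. A small sketch of the naive-average roll-up, with made-up scores for illustration:

# Sketch of a summary group's naive-average roll-up, mirroring the
# 'CompassArena' group above; the subset scores are made up for illustration.
scores = {
    'compassarena_language': 61.2,
    'compassarena_knowledge': 58.7,
    'compassarena_reason_v2': 55.0,
    'compassarena_math_v2': 49.8,
    'compassarena_creationv2_zh': 63.4,
}

group = {'name': 'CompassArena', 'subsets': list(scores)}
naive_average = sum(scores[s] for s in group['subsets']) / len(group['subsets'])
print(f"{group['name']}: naive_average = {naive_average:.2f}")  # 57.62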