Merged
Changes from all commits
22 commits
b66a500  [Benchmark] Support MMReason (#1316) (HJYao00, Dec 8, 2025)
5f7aa1a  Add UniSVG dataset support (#1349) (JoeLeelyf, Dec 11, 2025)
18ce87c  Add support for SArena_MINI (#1353) (JoeLeelyf, Dec 11, 2025)
3dcd3f0  Update hipho physics dataset (#1341) (MarkovChain-why, Dec 15, 2025)
2be212c  [FIX BUG] Fix bug for SArena-MINI support (#1360) (JoeLeelyf, Dec 17, 2025)
b77b16f  [Fix] Fix MMVP metric (#1369) (kennymckormick, Dec 20, 2025)
db11569  [Feat] Add telemm2.0 (#1365) (CoreMeteor, Dec 22, 2025)
240f1d7  Support MMSI-Video-Bench (#1368) (rbler1234, Dec 23, 2025)
63fa14e  [Fix] Fix SArena evaluation and parallelize UniSVG evaluation. (#1374) (mzr1996, Dec 25, 2025)
837ae0a  update (zhulinJulia24, Dec 25, 2025)
3fb8347  Merge pull request #1376 from zhulinJulia24/fix_timeout (zhulinJulia24, Dec 25, 2025)
418054f  [Fix] Remove judge model restriction and add proxy support for GPT4V … (mzr1996, Dec 25, 2025)
a039b67  [Fix] Fix evaluation of Physics and MM-IFEval (#1378) (mzr1996, Dec 25, 2025)
f438f87  [Fix] Avoid deciding whether to use a judge model based on OPENAI_API_KEY (#1379) (mzr1996, Dec 25, 2025)
26aee58  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
03c76c0  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
a42d914  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
3d2438f  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
70af89b  Change pip install to use --user flag (zhulinJulia24, Dec 29, 2025)
4b9ee53  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
242f993  Merge pull request #1387 from zhulinJulia24/add_param (zhulinJulia24, Dec 29, 2025)
8573857  [Feat] Add tele2thinking (#1375) (CoreMeteor, Dec 30, 2025)
48 changes: 18 additions & 30 deletions .github/workflows/pr-run-test.yml
@@ -20,30 +20,14 @@ env:
HF_HUB_CACHE: /mnt/shared-storage-user/large-model-center-share-weights/hf_hub
HF_HUB_OFFLINE: 1
CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
WORK_PATH: /mnt/shared-storage-user/mllm/qa-llm-cicd/pr_wkdir/VLMEvalKit/VLMEvalKit
CONDA_ENV: vlm_pr_test
KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
KUBEBRAIN_NAMESPACE: ailab-opencompass

jobs:
prepare_env:
if: ${{!cancelled()}}
runs-on: [yidian_cu12_mllm]
steps:
- name: clone_repo
uses: actions/checkout@v3
- name: reinstall vlmeval
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall vlmeval -y
pip install . -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir
pip install numpy==1.23.0 transformers==4.57.1 -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir

vlm_test:
if: ${{!cancelled()}}
runs-on: [yidian_cu12_mllm]
needs: [prepare_env]
strategy:
fail-fast: false
matrix:
@@ -61,43 +45,47 @@
- dataset: OCRBench_MINI
dataset_name: ocrbench
steps:
- name: Clean workdir
run: sudo git clean -ffdx
- name: clone_repo
uses: actions/checkout@v3
- name: reinstall vlmeval
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall vlmeval -y
pip install .
pip install numpy==1.23.0 transformers==4.57.1
- name: evaluation_model
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip list

rjob submit --name=vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/auto-eval-pipeline/vlmeval/LMUData --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{env.WORK_PATH}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse 2>&1'
rjob submit --metadata-name=vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/auto-eval-pipeline/vlmeval/LMUData --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{github.workspace}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse --judge exact_matching 2>&1'

for i in {1..300}; do
for i in {1..1200}; do
current_status=$(rjob get vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} | grep -oP 'rjob [^:]+: \K[^ ]+')
echo "Current status: $current_status, stop checking"
if [[ $current_status == "Succeeded" ]]; then
echo "任务成功完成"
break
echo "Task succeeded"
exit 0
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
echo "任务失败"
echo "Task failed or stopped, fetching logs"
rjob logs job vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
exit 1
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
echo "任务被停止"
echo "Task failed or stopped"
exit 1
fi
sleep 6
done
rjob stop vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
echo "Task timeout"
exit 1
- name: assert_result
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
cp -r /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} outputs
python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score $BASE_SCORE --model-name ${{matrix.model}}
- name: Change code permission
if: always()
run: |
sudo chmod -R 777 .
- name: Stop job
if: cancelled()
run: |
rjob stop vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
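For reference, the updated polling loop gives the submitted job a 2-hour budget (1200 polls at a 6-second interval, up from 300 polls, i.e. 30 minutes), exits 0 on Succeeded, dumps logs and exits 1 on Failed or Stopped, and explicitly stops the job on timeout. A minimal Python sketch of the same pattern, with a hypothetical `rjob` wrapper standing in for the real CLI:

```python
import subprocess
import sys
import time


def rjob(*args: str) -> str:
    """Hypothetical thin wrapper around the `rjob` CLI; returns its stdout."""
    result = subprocess.run(['rjob', *args], capture_output=True, text=True)
    return result.stdout


def wait_for_job(name: str, attempts: int = 1200, interval: float = 6.0) -> int:
    """Poll the job every `interval` seconds for up to `attempts` tries."""
    for _ in range(attempts):
        status = rjob('get', name)  # the workflow greps the status token out of this output
        if 'Succeeded' in status:
            print('Task succeeded')
            return 0
        if 'Failed' in status or 'Stopped' in status:
            print('Task failed or stopped, fetching logs')
            print(rjob('logs', 'job', name))
            return 1
        time.sleep(interval)
    rjob('stop', name)  # stop the job so it does not outlive the CI run
    print('Task timeout')
    return 1


if __name__ == '__main__':
    sys.exit(wait_for_job(sys.argv[1]))
```

Exiting nonzero on timeout, instead of silently breaking out of the loop, is what lets a hung job surface as a CI failure (the fix_timeout change merged in #1376).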
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -35,7 +35,6 @@ repos:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
26 changes: 25 additions & 1 deletion requirements.txt
@@ -13,7 +13,7 @@ math-verify
matplotlib
nltk
numpy
omegaconf
omegaconf>=2.4.0.dev4
openai
opencv-python>=4.7.0.72
openpyxl
@@ -42,3 +42,27 @@ transformers
typing_extensions
validators
xlsxwriter
scikit-learn
datasets
apted>=1.0.3
colormath>=3.0.0
decord>=0.6.0
distance>=0.1.3
lxml>=6.0.2
pdf2image>=1.17.0
zss>=1.2.0
polygon3>=3.0.9.1
levenshtein>=0.27.1
jieba>=0.42.1
editdistance>=0.8.1
anls>=0.0.2
antlr4-python3-runtime==4.11.1
# For SArena
torchmetrics
scikit-image
lpips
openai-clip
cairosvg
# For UniSVG
sentence_transformers
bert_score
2 changes: 2 additions & 0 deletions run.py
@@ -402,6 +402,8 @@ def main():
judge_kwargs['model'] = 'gpt-4.1'
elif listinstr(['MathCanvas'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1-2025-04-14'
elif listinstr(['MMReason'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1'

if args.use_verifier:
judge_kwargs['use_verifier'] = True
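The run.py hunk above extends the per-dataset judge-model dispatch so that MMReason datasets also default to the gpt-4.1 judge. A minimal sketch of the dispatch pattern, assuming `listinstr` behaves like the substring helper in vlmeval.smp:

```python
def listinstr(keywords, s):
    # Mirrors vlmeval's helper: True if any keyword occurs in s.
    return any(k in s for k in keywords)


def pick_judge_model(dataset_name: str, judge_kwargs: dict) -> dict:
    """Sketch of the per-dataset judge defaults touched by this PR."""
    if listinstr(['MathCanvas'], dataset_name):
        judge_kwargs['model'] = 'gpt-4.1-2025-04-14'
    elif listinstr(['MMReason'], dataset_name):
        judge_kwargs['model'] = 'gpt-4.1'
    return judge_kwargs


assert pick_judge_model('MMReason_TEST', {})['model'] == 'gpt-4.1'
```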
4 changes: 3 additions & 1 deletion vlmeval/api/__init__.py
@@ -9,6 +9,8 @@
from .cloudwalk import CWWrapper
from .sensechat_vision import SenseChatVisionAPI
from .siliconflow import SiliconFlowAPI, TeleMMAPI
from .telemm import TeleMM2_API
from .telemm_thinking import TeleMM2Thinking_API
from .hunyuan import HunyuanVision
from .bailingmm import bailingMMAPI
from .bluelm_api import BlueLMWrapper, BlueLM_API
@@ -33,5 +35,5 @@
'bailingMMAPI', 'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI', 'ARM_thinker',
'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI", 'KimiVLAPIWrapper', 'KimiVLAPI',
'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API',
'VideoChatOnlineV2API'
'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API'
]
17 changes: 12 additions & 5 deletions vlmeval/api/gpt.py
@@ -105,10 +105,6 @@ def __init__(self,
env_key = os.environ.get('OPENAI_API_KEY', '')
if key is None:
key = env_key
assert isinstance(key, str) and key.startswith('sk-'), (
f'Illegal openai_key {key}. '
'Please set the environment variable OPENAI_API_KEY to your openai key. '
)

self.key = key
assert img_size > 0 or img_size == -1
@@ -228,9 +224,20 @@ def generate_inner(self, inputs, **kwargs) -> str:
payload.pop('n')
payload['reasoning_effort'] = 'high'

proxies = {}
if os.getenv('http_proxy'):
proxies['http'] = os.getenv('http_proxy')
if os.getenv('https_proxy'):
proxies['https'] = os.getenv('https_proxy')
proxies = proxies or None

response = requests.post(
self.api_base,
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
headers=headers,
data=json.dumps(payload),
proxies=proxies,
timeout=self.timeout * 1.1,
)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
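Taken together, the gpt.py changes drop the requirement that the API key start with 'sk-' and forward the standard http_proxy/https_proxy environment variables to requests.post. A hedged usage sketch (OpenAIWrapper is the class defined in vlmeval/api/gpt.py; the proxy URL and key below are placeholders):

```python
import os

# Placeholder proxy endpoint and key; the wrapper now passes these proxies
# through to requests.post, and the key no longer has to start with 'sk-'.
os.environ['http_proxy'] = 'http://proxy.example.com:8080'
os.environ['https_proxy'] = 'http://proxy.example.com:8080'
os.environ['OPENAI_API_KEY'] = 'my-gateway-key'

from vlmeval.api import OpenAIWrapper

judge = OpenAIWrapper('gpt-4.1', timeout=60)
print(judge.generate('Reply with the single word: ready'))
```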