Merged
Changes from all commits
22 commits
b66a500  [Benchmark] Support MMReason (#1316) (HJYao00, Dec 8, 2025)
5f7aa1a  Add UniSVG dataset support (#1349) (JoeLeelyf, Dec 11, 2025)
18ce87c  Add support for SArena_MINI (#1353) (JoeLeelyf, Dec 11, 2025)
3dcd3f0  Update hipho physics dataset (#1341) (MarkovChain-why, Dec 15, 2025)
2be212c  [FIX BUG] Fix bug for SArena-MINI support (#1360) (JoeLeelyf, Dec 17, 2025)
b77b16f  [Fix] Fix MMVP metric (#1369) (kennymckormick, Dec 20, 2025)
db11569  [Feat] Add telemm2.0 (#1365) (CoreMeteor, Dec 22, 2025)
240f1d7  Support MMSI-Video-Bench (#1368) (rbler1234, Dec 23, 2025)
63fa14e  [Fix] Fix SArena evaluation and parallelize UniSVG evaluation. (#1374) (mzr1996, Dec 25, 2025)
837ae0a  update (zhulinJulia24, Dec 25, 2025)
3fb8347  Merge pull request #1376 from zhulinJulia24/fix_timeout (zhulinJulia24, Dec 25, 2025)
418054f  [Fix] Remove judge model restriction and add proxy support for GPT4V … (mzr1996, Dec 25, 2025)
a039b67  [Fix] Fix evaluation of Physics and MM-IFEval (#1378) (mzr1996, Dec 25, 2025)
f438f87  [Fix] Avoid deciding whether to use a judge model based on OPENAI_API_KEY (#1379) (mzr1996, Dec 25, 2025)
26aee58  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
03c76c0  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
a42d914  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
3d2438f  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
70af89b  Change pip install to use --user flag (zhulinJulia24, Dec 29, 2025)
4b9ee53  Update pr-run-test.yml (zhulinJulia24, Dec 29, 2025)
242f993  Merge pull request #1387 from zhulinJulia24/add_param (zhulinJulia24, Dec 29, 2025)
8573857  [Feat] Add tele2thinking (#1375) (CoreMeteor, Dec 30, 2025)
48 changes: 18 additions & 30 deletions .github/workflows/pr-run-test.yml
@@ -20,30 +20,14 @@ env:
HF_HUB_CACHE: /mnt/shared-storage-user/large-model-center-share-weights/hf_hub
HF_HUB_OFFLINE: 1
CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
WORK_PATH: /mnt/shared-storage-user/mllm/qa-llm-cicd/pr_wkdir/VLMEvalKit/VLMEvalKit
CONDA_ENV: vlm_pr_test
KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
KUBEBRAIN_NAMESPACE: ailab-opencompass

jobs:
prepare_env:
if: ${{!cancelled()}}
runs-on: [yidian_cu12_mllm]
steps:
- name: clone_repo
uses: actions/checkout@v3
- name: reinstall vlmeval
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall vlmeval -y
pip install . -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir
pip install numpy==1.23.0 transformers==4.57.1 -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir

vlm_test:
if: ${{!cancelled()}}
runs-on: [yidian_cu12_mllm]
needs: [prepare_env]
strategy:
fail-fast: false
matrix:
@@ -61,43 +45,47 @@
- dataset: OCRBench_MINI
dataset_name: ocrbench
steps:
- name: Clean workdir
run: sudo git clean -ffdx
- name: clone_repo
uses: actions/checkout@v3
- name: reinstall vlmeval
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall vlmeval -y
pip install .
pip install numpy==1.23.0 transformers==4.57.1
- name: evaluation_model
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip list

rjob submit --name=vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/auto-eval-pipeline/vlmeval/LMUData --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{env.WORK_PATH}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse 2>&1'
rjob submit --metadata-name=vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/auto-eval-pipeline/vlmeval/LMUData --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{github.workspace}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse --judge exact_matching 2>&1'

for i in {1..300}; do
for i in {1..1200}; do
current_status=$(rjob get vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} | grep -oP 'rjob [^:]+: \K[^ ]+')
echo "Current status: $current_status, stop checking"
if [[ $current_status == "Succeeded" ]]; then
echo "任务成功完成"
break
echo "Task succeeded"
exit 0
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
echo "任务失败"
echo "Task failed or stopped, fetching logs"
rjob logs job vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
exit 1
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
echo "任务被停止"
echo "Task failed or stopped"
exit 1
fi
sleep 6
done
rjob stop vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
echo "Task timeout"
exit 1
- name: assert_result
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
cp -r /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} outputs
python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score $BASE_SCORE --model-name ${{matrix.model}}
- name: Change code permission
if: always()
run: |
sudo chmod -R 777 .
- name: Stop job
if: cancelled()
run: |
rjob stop vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
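For reference, the updated polling loop gives the submitted job a 2-hour budget (1200 polls at a 6-second interval, up from 300 polls, i.e. 30 minutes), exits 0 on Succeeded, dumps logs and exits 1 on Failed or Stopped, and explicitly stops the job on timeout. A minimal Python sketch of the same pattern, with a hypothetical `rjob` wrapper standing in for the real CLI:

```python
import subprocess
import sys
import time


def rjob(*args: str) -> str:
    """Hypothetical thin wrapper around the `rjob` CLI; returns its stdout."""
    result = subprocess.run(['rjob', *args], capture_output=True, text=True)
    return result.stdout


def wait_for_job(name: str, attempts: int = 1200, interval: float = 6.0) -> int:
    """Poll the job every `interval` seconds for up to `attempts` tries."""
    for _ in range(attempts):
        status = rjob('get', name)  # the workflow greps the status token out of this output
        if 'Succeeded' in status:
            print('Task succeeded')
            return 0
        if 'Failed' in status or 'Stopped' in status:
            print('Task failed or stopped, fetching logs')
            print(rjob('logs', 'job', name))
            return 1
        time.sleep(interval)
    rjob('stop', name)  # stop the job so it does not outlive the CI run
    print('Task timeout')
    return 1


if __name__ == '__main__':
    sys.exit(wait_for_job(sys.argv[1]))
```

Exiting nonzero on timeout, instead of silently breaking out of the loop, is what lets a hung job surface as a CI failure (the fix_timeout change merged in #1376).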
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -35,7 +35,6 @@ repos:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
26 changes: 25 additions & 1 deletion requirements.txt
@@ -13,7 +13,7 @@ math-verify
matplotlib
nltk
numpy
omegaconf
omegaconf>=2.4.0.dev4
openai
opencv-python>=4.7.0.72
openpyxl
@@ -42,3 +42,27 @@ transformers
typing_extensions
validators
xlsxwriter
scikit-learn
datasets
apted>=1.0.3
colormath>=3.0.0
decord>=0.6.0
distance>=0.1.3
lxml>=6.0.2
pdf2image>=1.17.0
zss>=1.2.0
polygon3>=3.0.9.1
levenshtein>=0.27.1
jieba>=0.42.1
editdistance>=0.8.1
anls>=0.0.2
antlr4-python3-runtime==4.11.1
# For SArena
torchmetrics
scikit-image
lpips
openai-clip
cairosvg
# For UniSVG
sentence_transformers
bert_score
2 changes: 2 additions & 0 deletions run.py
@@ -402,6 +402,8 @@ def main():
judge_kwargs['model'] = 'gpt-4.1'
elif listinstr(['MathCanvas'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1-2025-04-14'
elif listinstr(['MMReason'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1'

if args.use_verifier:
judge_kwargs['use_verifier'] = True
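The run.py hunk above extends the per-dataset judge-model dispatch so that MMReason datasets also default to the gpt-4.1 judge. A minimal sketch of the dispatch pattern, assuming `listinstr` behaves like the substring helper in vlmeval.smp:

```python
def listinstr(keywords, s):
    # Mirrors vlmeval's helper: True if any keyword occurs in s.
    return any(k in s for k in keywords)


def pick_judge_model(dataset_name: str, judge_kwargs: dict) -> dict:
    """Sketch of the per-dataset judge defaults touched by this PR."""
    if listinstr(['MathCanvas'], dataset_name):
        judge_kwargs['model'] = 'gpt-4.1-2025-04-14'
    elif listinstr(['MMReason'], dataset_name):
        judge_kwargs['model'] = 'gpt-4.1'
    return judge_kwargs


assert pick_judge_model('MMReason_TEST', {})['model'] == 'gpt-4.1'
```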
4 changes: 3 additions & 1 deletion vlmeval/api/__init__.py
@@ -9,6 +9,8 @@
from .cloudwalk import CWWrapper
from .sensechat_vision import SenseChatVisionAPI
from .siliconflow import SiliconFlowAPI, TeleMMAPI
from .telemm import TeleMM2_API
from .telemm_thinking import TeleMM2Thinking_API
from .hunyuan import HunyuanVision
from .bailingmm import bailingMMAPI
from .bluelm_api import BlueLMWrapper, BlueLM_API
@@ -33,5 +35,5 @@
'bailingMMAPI', 'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI', 'ARM_thinker',
'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI", 'KimiVLAPIWrapper', 'KimiVLAPI',
'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API',
'VideoChatOnlineV2API'
'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API'
]
17 changes: 12 additions & 5 deletions vlmeval/api/gpt.py
@@ -105,10 +105,6 @@ def __init__(self,
env_key = os.environ.get('OPENAI_API_KEY', '')
if key is None:
key = env_key
assert isinstance(key, str) and key.startswith('sk-'), (
f'Illegal openai_key {key}. '
'Please set the environment variable OPENAI_API_KEY to your openai key. '
)

self.key = key
assert img_size > 0 or img_size == -1
@@ -228,9 +224,20 @@ def generate_inner(self, inputs, **kwargs) -> str:
payload.pop('n')
payload['reasoning_effort'] = 'high'

proxies = {}
if os.getenv('http_proxy'):
proxies['http'] = os.getenv('http_proxy')
if os.getenv('https_proxy'):
proxies['https'] = os.getenv('https_proxy')
proxies = proxies or None

response = requests.post(
self.api_base,
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
headers=headers,
data=json.dumps(payload),
proxies=proxies,
timeout=self.timeout * 1.1,
)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
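Taken together, the gpt.py changes drop the requirement that the API key start with 'sk-' and forward the standard http_proxy/https_proxy environment variables to requests.post. A hedged usage sketch (OpenAIWrapper is the class defined in vlmeval/api/gpt.py; the proxy URL and key below are placeholders):

```python
import os

# Placeholder proxy endpoint and key; the wrapper now passes these proxies
# through to requests.post, and the key no longer has to start with 'sk-'.
os.environ['http_proxy'] = 'http://proxy.example.com:8080'
os.environ['https_proxy'] = 'http://proxy.example.com:8080'
os.environ['OPENAI_API_KEY'] = 'my-gateway-key'

from vlmeval.api import OpenAIWrapper

judge = OpenAIWrapper('gpt-4.1', timeout=60)
print(judge.generate('Reply with the single word: ready'))
```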