
Commit

Merge branch 'main' into zhihao
LibertFan authored Sep 6, 2024
2 parents 51cad77 + 169f76d commit ea0fe03
Showing 6 changed files with 18 additions and 6 deletions.
14 changes: 10 additions & 4 deletions README.md
@@ -57,6 +57,8 @@ We opensourced Qwen2-VL-2B and Qwen2-VL-7B with Apache 2.0 license, and we relea
| TextVQA<sub>val</sub> | 84.4 | - | - | **85.5** |84.3|79.7
| OCRBench | 852 | 788 | 736 | **855** |845| 794
| MTVQA | 17.3 | 25.7 | 27.8 | **30.9** |25.6| 18.1
+| VCR<sub>en easy</sub> | 84.67 | 63.85 | 91.55 | **91.93** | 89.70| 81.45
+| VCR<sub>zh easy</sub> | 22.09 | 1.0| 14.87 | **65.37** | 59.94| 46.16
| RealWorldQA | 72.2 | 60.1 | 75.4 | **77.8** | 70.1| 62.9
| MME<sub>sum</sub> | 2414.7 | 1920.0 | 2328.7 | **2482.7** | 2326.8 | 1872.0
| MMBench-EN<sub>test</sub> | **86.5** | 79.7 | 83.4 | **86.5** | 83.0 | 74.9
@@ -69,6 +71,7 @@ We opensourced Qwen2-VL-2B and Qwen2-VL-7B with Apache 2.0 license, and we relea
| MathVista<sub>testmini</sub> | 67.5 | 67.7 | 63.8 | **70.5** |58.2| 43.0
| MathVision | 16.97 | - | **30.4** | 25.9 | 16.3| 12.4

+
### Video Benchmarks

| Benchmark | Previous SoTA<br><sup>(Open-source LVLM)<sup> | Gemini 1.5-Pro | GPT-4o | **Qwen2-VL-72B**<br><sup>(Coming soon) |**Qwen2-VL-7B**<br><sup>([🤗](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) [🤖](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct)) |**Qwen2-VL-2B**<br><sup>([🤗](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)[🤖](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct))
@@ -389,7 +392,7 @@ messages2 = [
{"role": "user", "content": "Who are you?"},
]
# Combine messages for batch processing
-messages = [messages1, messages1]
+messages = [messages1, messages2]

# Preparation for batch inference
texts = [
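The one-character fix above is the point of this hunk: with `[messages1, messages1]` the batch silently ran the same conversation twice and `messages2` was never used. For reference, a sketch of how the corrected list feeds batched preparation, assuming the pattern this README uses elsewhere (`processor` is the `AutoProcessor` from the surrounding example):

```python
from qwen_vl_utils import process_vision_info

# Render each conversation in the batch to a chat-formatted prompt string.
texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    for msg in messages
]
# Gather the images/videos referenced by every conversation in the batch.
image_inputs, video_inputs = process_vision_info(messages)
# Tokenize the whole batch together; padding aligns prompts of different lengths.
inputs = processor(
    text=texts,
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
```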
@@ -804,22 +807,22 @@ from qwen_vl_utils import process_vision_info

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
# "Qwen2-VL-7B-Instruct-GPTQ-Int4",
# "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
# device_map="auto",
# )

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen2-VL-7B-Instruct-GPTQ-Int4", torch_dtype="auto", device_map="auto"
"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4", torch_dtype="auto", device_map="auto"
)

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
"Qwen2-VL-7B-Instruct-GPTQ-Int4", min_pixels=min_pixels, max_pixels=max_pixels
"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4", min_pixels=min_pixels, max_pixels=max_pixels
)

messages = [
@@ -1173,6 +1176,9 @@ messages = [
],
},
]
+# For video input, you can pass the following values instead:
+# "type": "video",
+# "video": "<video URL>",

processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
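The three comment lines added above only name the keys that change for video input. For illustration, a complete video message in the same schema might look like the sketch below (the URL is a placeholder, not from this commit):

```python
# A hypothetical video message; the URL is a placeholder.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "https://example.com/demo.mp4"},
            {"type": "text", "text": "Describe this video."},
        ],
    },
]
```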
4 changes: 2 additions & 2 deletions docker/Dockerfile-cu121
@@ -41,7 +41,7 @@ FROM dev as bundle_req
RUN pip3 install --no-cache-dir networkx==3.1
RUN pip3 install --no-cache-dir torch==2.4.0 torchvision==0.19 torchaudio==2.4.0 xformers==0.0.27.post2 --index-url https://download.pytorch.org/whl/cu121

-RUN pip3 install --no-cache-dir git+https://github.com/huggingface/transformers@19e6e80e10118f855137b90740936c0b11ac397f \
+RUN pip3 install --no-cache-dir git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 \
&& pip3 install --no-cache-dir accelerate \
&& pip3 install --no-cache-dir qwen-vl-utils

@@ -67,4 +67,4 @@ RUN pip3 install --no-cache-dir \
COPY ../utils.py ./
COPY ../web_demo_mm.py ./

-EXPOSE 80
\ No newline at end of file
+EXPOSE 80
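Because the image installs `transformers` from a pinned source commit rather than a PyPI release, it can be useful to confirm at build or run time what actually got installed. A generic sanity check (not part of this repo):

```python
import importlib.metadata

# Source installs of transformers report a .dev-style version string.
print(importlib.metadata.version("transformers"))
```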
1 change: 1 addition & 0 deletions qwen-vl-utils/pyproject.toml
@@ -8,6 +8,7 @@ authors = [
dependencies = [
"requests",
"pillow",
"av",
]
readme = "README.md"
requires-python = ">= 3.8"
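The new `av` dependency is PyAV, Python bindings for FFmpeg; the lock files below confirm it is pulled in via qwen-vl-utils, which uses it to decode video frames. A minimal sketch of that kind of decoding (illustrative only, not the library's internal code; the filename is a placeholder):

```python
import av

# Open a media container and decode every frame of its first video stream.
container = av.open("demo.mp4")
frames = [frame.to_image() for frame in container.decode(video=0)]  # PIL images
container.close()
print(f"decoded {len(frames)} frames")
```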
2 changes: 2 additions & 0 deletions qwen-vl-utils/requirements-dev.lock
@@ -9,6 +9,8 @@
# generate-hashes: false

-e file:.
+av==12.3.0
+# via qwen-vl-utils
certifi==2022.12.7
# via requests
charset-normalizer==2.1.1
2 changes: 2 additions & 0 deletions qwen-vl-utils/requirements.lock
@@ -9,6 +9,8 @@
# generate-hashes: false

-e file:.
+av==12.3.0
+# via qwen-vl-utils
certifi==2022.12.7
# via requests
charset-normalizer==2.1.1
1 change: 1 addition & 0 deletions requirements_web_demo.txt
@@ -7,6 +7,7 @@ torch==2.4.0
torchvision==0.19.0
git+https://github.com/huggingface/transformers.git
accelerate
+av

# Optional dependency
# Uncomment the following line if you need flash-attn
