
Commit 83debd5

nie3e (Christos Malliopoulos) authored and committed
[Model] GPT2ForSequenceClassification model (vllm-project#19663)
Signed-off-by: nie3e <adrcwiek@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

Included commit messages:

- added notebooks to playground
- updates
- removed verbatim HF secrets from all files
- updates
- [custom_op][vllm-plugin] update custom_op class to use op_registry (vllm-project#19164), Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
- Export NaNs in logits to scheduler_stats if output is corrupted (vllm-project#18777), Signed-off-by: Vlad Mihailescu <vtmihailescu@gmail.com>
- [CPU][CI] Fallback sliding window to v0 and fix CPU pooling model tests (vllm-project#19901), Signed-off-by: jiang1.li <jiang1.li@intel.com>
- [Kernel] mark TorchSDPABackend swap_blocks NotImplementedError (vllm-project#19749)
1 parent 7771d1d commit 83debd5

File tree: 24 files changed (+1364, -11 lines)


NOTES.md

Lines changed: 41 additions & 0 deletions
# TL;DR

I log here my reading comprehension notes and tasks regarding the repository.
The `var` directory is listed in `.gitignore` so it does not mix with the repo code.
# `docker/Dockerfile.arm`

The command `ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"`
sets the container environment variable `LD_PRELOAD` to the referenced shared library
(note this is an Ubuntu image created on an arm64 host).

(ChatGPT): `LD_PRELOAD` instructs the dynamic linker to load a shared library before any other library when running executables.
It allows you to override functions in system libraries or inject extra functionality without changing the application binary.

`tcmalloc_minimal` is the minimal version of Google's TCMalloc library (Thread-Caching Malloc),
an optimized memory allocator from the Google Performance Tools suite. It provides faster malloc/free
than the default system allocator (glibc malloc) and helps improve the performance of memory-intensive applications.
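A quick sanity check, as a sketch (it assumes the `vllm-openai:arm` image tag built later in these notes; any tag works), that the variable is set and the library actually exists inside the container:

```bash
# Print LD_PRELOAD and confirm the tcmalloc shared object is present.
docker run --rm vllm-openai:arm /bin/sh -c \
    'echo "LD_PRELOAD=$LD_PRELOAD" && ls -l "$LD_PRELOAD"'
```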
The vLLM documentation on docker installation contains a build command for x86 CPUs:

```bash
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
```

If we replace (as requested) `Dockerfile.cpu` by `Dockerfile.arm`, the build fails. This is because `--target vllm-openai`
refers to a stage in `Dockerfile.cpu` that is not contained in `Dockerfile.arm`.
See the [docker documentation](https://docs.docker.com/build/building/multi-stage/) on multi-stage builds (there
it is also explained what happens with dockerfiles that have multiple `FROM` commands).
Here we use the command `$ docker build -f docker/Dockerfile.arm --tag vllm-openai:arm .`.
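To see which stage names (i.e. valid `--target` values) each Dockerfile actually defines, you can list their `FROM ... AS <stage>` lines:

```bash
# Stage names in multi-stage Dockerfiles follow the AS keyword.
grep -inE '^FROM .+ AS ' docker/Dockerfile.cpu docker/Dockerfile.arm
```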
# Additional installation requirements

1. For the chat REPL of `transformers` you need to `pip install accelerate`.

2. To run the `generate` SDK it is recommended to `pip install bitsandbytes` (by Hugging Face).
   `bitsandbytes` has methods for quantizing LLMs when loading them to memory, which greatly improves performance.
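A minimal sketch of that quantized-loading path (the model id is only an example; `BitsAndBytesConfig` is the standard transformers entry point for bitsandbytes quantization, `device_map="auto"` is what pulls in `accelerate`, and 4-bit loading generally requires a CUDA-capable GPU):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "gpt2"  # example model; any causal LM on the Hub works the same way

# Quantize weights to 4-bit while loading; needs bitsandbytes + accelerate.
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.float16)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto")

inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0]))
```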

tests/models/language/pooling/test_embedding.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
 import pytest
 
 from vllm.config import PoolerConfig
@@ -33,7 +35,7 @@ def v1(run_with_both_engines):
     # To avoid this problem, for now we skip v0 since it will be
     # deprecated anyway.
     pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
-                 marks=[pytest.mark.skip_v0]),
+                 marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
     pytest.param("BAAI/bge-base-en-v1.5",
                  marks=[
@@ -58,6 +60,9 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
+    if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
+    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
+        pytest.skip("CPU V1 doesn't support sliding window")
 
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
```

tests/models/registry.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -267,6 +267,7 @@ def check_available_online(
     # [Text-only]
     "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True),
     "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True),  # noqa: E501
+    "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"),  # noqa: E501
     "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
     "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                                 trust_remote_code=True),
```
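With the model registered, a sketch of how the new classification head could be exercised end to end (the `task="classify"` argument and `LLM.classify()` exist in recent vLLM releases; treat the exact output field names as an assumption):

```python
from vllm import LLM

# Load the newly registered GPT2ForSequenceClassification example model.
llm = LLM(model="nie3e/sentiment-polish-gpt2-small", task="classify")

# classify() runs the pooling/classification path instead of generation.
(result,) = llm.classify(["Ten film był świetny!"])  # Polish: "This movie was great!"
print(result.outputs.probs)  # per-class probabilities (assumed field name)
```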

tests/plugins/vllm_add_dummy_platform/setup.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -10,5 +10,7 @@
     entry_points={
         'vllm.platform_plugins': [
             "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin"  # noqa
-        ]
+        ],
+        "vllm.general_plugins":
+        ["dummy_custom_ops = vllm_add_dummy_platform:register_ops"],
     })
```
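For context, a rough sketch of how a `vllm.general_plugins` entry point gets picked up (this approximates what vLLM's `load_general_plugins()` does; the actual implementation may differ):

```python
from importlib.metadata import entry_points

# Find every installed package advertising a "vllm.general_plugins" entry
# point and call the function it names, e.g.
# "dummy_custom_ops = vllm_add_dummy_platform:register_ops".
for ep in entry_points(group="vllm.general_plugins"):
    register_fn = ep.load()
    register_fn()
```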

tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -6,3 +6,7 @@
 
 def dummy_platform_plugin() -> Optional[str]:
     return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
+
+
+def register_ops():
+    import vllm_add_dummy_platform.dummy_custom_ops  # noqa
```

tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.attention.backends.flash_attn import FlashAttentionBackend
+from vllm.attention.backends.placeholder_attn import (
+    PlaceholderAttentionBackend)
 
 
-class DummyAttentionBackend(FlashAttentionBackend):
+class DummyAttentionBackend(PlaceholderAttentionBackend):
 
     @staticmethod
     def get_name() -> str:
```
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py

Lines changed: 20 additions & 0 deletions

```diff
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+
+
+# Register CustomRotaryEmbedding to CustomOP.
+@RotaryEmbedding.register_oot
+class DummyRotaryEmbedding(RotaryEmbedding):
+    """Original rotary positional embedding."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.addition_config = True
+
+    def forward_oot(self, *args,
+                    **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
+        return super().forward_oot(*args, **kwargs)
```
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py

Lines changed: 20 additions & 3 deletions

```diff
@@ -1,12 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING
 
-from vllm.platforms.cuda import CudaPlatform
+from vllm.platforms.interface import Platform, PlatformEnum
 
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+from vllm import envs
 
-class DummyPlatform(CudaPlatform):
+
+class DummyPlatform(Platform):
+    _enum = PlatformEnum.OOT
     device_name = "DummyDevice"
+    device_type: str = "privateuseone"
+    dispatch_key: str = "PrivateUse1"
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        if envs.VLLM_USE_V1:
+            compilation_config = vllm_config.compilation_config
+            # Activate custom ops for v1.
+            compilation_config.custom_ops = ["all"]
 
     def get_attn_backend_cls(self, backend_name, head_size, dtype,
                              kv_cache_dtype, block_size, use_v1, use_mla):
-        return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
+        return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
```

tests/plugins_tests/test_platform_plugins.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@
 import torch
 
 from vllm.attention.selector import get_attn_backend
+from vllm.plugins import load_general_plugins
 from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
 
 
@@ -32,3 +33,16 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
         m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
         backend = get_attn_backend(16, torch.float16, "auto", 16, False)
     assert backend.get_name() == "Dummy_Backend"
+
+
+def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
+    # simulate workload by running an example
+    load_general_plugins()
+    from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+    layer = RotaryEmbedding(16, 16, 16, 16, True, torch.float16)
+    assert layer.__class__.__name__ == "DummyRotaryEmbedding", (
+        f"Expected DummyRotaryEmbedding, got {layer.__class__.__name__}, "
+        "possibly because the custom op is not registered correctly.")
+    assert hasattr(layer, "addition_config"), (
+        "Expected DummyRotaryEmbedding to have an 'addition_config' attribute, "
+        "which is set by the custom op.")
```

tests/v1/worker/test_gpu_model_runner.py

Lines changed: 49 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@
 import random
 
 import pytest
+import torch
 
 from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
@@ -277,6 +278,54 @@ def test_update_states_request_resumed(model_runner):
     assert _is_req_state_block_table_match(model_runner, req_id)
 
 
+def test_get_nans_in_logits(model_runner):
+    req_ids = ("req_0", "req_1")
+
+    scheduler_output = _schedule_new_request(*req_ids)
+    model_runner._update_states(scheduler_output)
+
+    logits = torch.tensor([
+        [1.0, 2.0, 3.0],
+        [3.0, 2.0, 1.0],
+    ], device=DEVICE)
+    result = model_runner._get_nans_in_logits(logits)
+    assert result == {"req_0": 0, "req_1": 0}
+
+    logits = torch.tensor([
+        [1.0, float('nan'), 3.0],
+        [4.0, float('nan'), float('nan')],
+    ],
+                          device=DEVICE)
+    result = model_runner._get_nans_in_logits(logits)
+    assert result == {"req_0": 1, "req_1": 2}
+
+    logits = torch.tensor([
+        [1.0, 2.0, 3.0],
+        [4.0, float('nan'), float('nan')],
+    ],
+                          device=DEVICE)
+    result = model_runner._get_nans_in_logits(logits)
+    assert result == {"req_0": 0, "req_1": 2}
+
+    result = model_runner._get_nans_in_logits(logits=None)
+    assert result == {"req_0": 0, "req_1": 0}
+
+    logits = torch.tensor([
+        [1.0, float('nan'), 3.0],
+    ], device=DEVICE)
+    result = model_runner._get_nans_in_logits(logits)
+    assert result == {'req_0': 1, 'req_1': 0}
+
+    logits = torch.tensor([
+        [float('nan'), float('nan'), 2.0],
+        [1.0, 2.0, 3.0],
+        [float('nan'), 2.0, 3.0],
+    ],
+                          device=DEVICE)
+    result = model_runner._get_nans_in_logits(logits)
+    assert result == {'req_0': 2, 'req_1': 0}
+
+
 def test_update_states_no_changes(model_runner):
     req_id = "req_0"
```
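For intuition, a minimal sketch of the per-request NaN counting this test exercises (illustrative only; the name, signature, and shapes here are assumptions, not vLLM's actual `_get_nans_in_logits`):

```python
import torch


def count_nans_per_request(req_ids: tuple[str, ...],
                           logits: torch.Tensor | None) -> dict[str, int]:
    """Map each request id to the number of NaNs in its logits row."""
    if logits is None:
        return {req_id: 0 for req_id in req_ids}
    nan_counts = torch.isnan(logits).sum(dim=-1)  # one count per row
    return {
        # Requests without a logits row (fewer rows than requests) count as 0,
        # matching the single-row case asserted in the test above.
        req_id: int(nan_counts[i]) if i < logits.shape[0] else 0
        for i, req_id in enumerate(req_ids)
    }
```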
