.github/workflows/format_pr_body.yaml (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@ jobs:

- name: Get vLLM version
run: |
-          VLLM_COMMIT=v0.11.0
+          VLLM_COMMIT=releases/v0.11.1
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
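Note the substitution swaps a fixed tag for a branch ref: VLLM_COMMIT now carries whatever releases/v0.11.1 currently resolves to, so the link embedded in PR bodies tracks the moving tip of the release branch rather than a pinned commit.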
.github/workflows/vllm_ascend_test.yaml (7 changes: 4 additions & 3 deletions)
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/pre-commit.yml
with:
-      vllm: v0.11.0
+      vllm: releases/v0.11.1

changes:
runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
-        vllm_version: [v0.11.0]
+        vllm_version: [releases/v0.11.1, v0.11.0]
steps:
- name: Install packages
run: |
@@ -128,6 +128,7 @@ jobs:
--ignore=tests/ut/kv_connector/test_remote_decode_lifecycle.py \
--ignore=tests/ut/kv_connector/test_remote_prefill_lifecycle.py \
--ignore=tests/ut/torchair/models/test_torchair_deepseek_v2.py \
+            --ignore=tests/ut/models/test_deepseek_v2.py \

- name: Upload coverage to Codecov
# only upload coverage when commits merged
@@ -144,7 +145,7 @@ jobs:
name: e2e-light
strategy:
matrix:
-        vllm_version: [v0.11.0]
+        vllm_version: [releases/v0.11.1, v0.11.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
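With this matrix each job now runs twice, once against the vLLM releases/v0.11.1 branch and once against the v0.11.0 tag, so the plugin is exercised against both targets during the transition; the same dual-entry matrix is applied to e2e-light here and to e2e-full in the next workflow. The new --ignore for tests/ut/models/test_deepseek_v2.py presumably skips a unit test that cannot pass against both matrix entries at once.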
.github/workflows/vllm_ascend_test_full.yaml (2 changes: 1 addition & 1 deletion)
@@ -69,7 +69,7 @@ jobs:
name: e2e-full
strategy:
matrix:
-        vllm_version: [v0.11.0]
+        vllm_version: [releases/v0.11.1, v0.11.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
vllm_ascend/core/schedule_config.py (2 changes: 1 addition & 1 deletion)
@@ -59,7 +59,7 @@ def initialize_from_config(
scheduler_config[k] = getattr(ascend_scheduler_config, k)
return cls(**scheduler_config)

-    def __post_init__(self) -> None:
+    def __post_init__(self, *args) -> None:
self.max_num_encoder_input_tokens = self.max_num_batched_tokens
self.encoder_cache_size = self.max_num_batched_tokens
self.chunked_prefill_enabled = self.enable_chunked_prefill
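The widened __post_init__(self, *args) signature is almost certainly about dataclasses.InitVar: Python forwards every InitVar declared on a dataclass (including inherited ones) positionally into __post_init__, so an override with a fixed arity breaks the moment the parent SchedulerConfig gains or changes an InitVar across vLLM versions. A self-contained sketch of the mechanism (class and field names here are illustrative, not vLLM's):

from dataclasses import InitVar, dataclass


@dataclass
class BaseConfig:  # stand-in for vLLM's SchedulerConfig (illustrative)
    max_num_batched_tokens: int = 2048
    # InitVar fields are not stored as attributes; the generated __init__
    # forwards them positionally to __post_init__.
    is_encoder_decoder: InitVar[bool] = False

    def __post_init__(self, is_encoder_decoder: bool) -> None:
        self.chunked_prefill_enabled = False


@dataclass
class AscendConfig(BaseConfig):  # stand-in for AscendSchedulerConfig
    # *args absorbs however many InitVars the parent declares, so this
    # override keeps working when the parent's signature changes.
    def __post_init__(self, *args) -> None:
        self.chunked_prefill_enabled = True


cfg = AscendConfig()  # the dataclass machinery calls __post_init__(False)
assert cfg.chunked_prefill_enabled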
vllm_ascend/core/scheduler.py (54 changes: 37 additions & 17 deletions)
@@ -16,7 +16,7 @@
#
import time
from collections import deque
-from typing import Iterable, Union
+from typing import Iterable, Optional, Union

from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVEventBatch
@@ -32,27 +32,19 @@
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager

+from vllm_ascend.utils import vllm_version_is


class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
with prefill-first scheduling strategy."""

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        kv_cache_config: KVCacheConfig,
-        structured_output_manager: StructuredOutputManager,
-        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
-        include_finished_set: bool = False,
-        log_stats: bool = False,
-    ) -> None:
-        super().__init__(vllm_config, kv_cache_config,
-                         structured_output_manager, mm_registry,
-                         include_finished_set, log_stats)
+    def _initialize_common(self) -> None:
+        """Initialize common attributes shared across all versions."""
        self.scheduled_req_ids: set[str] = set()
        self.running: list[Request] = []

        self.finished_prefill_reqs: deque[Request] = deque()

        enable_pd_transfer = getattr(self.scheduler_config,
                                     'enable_pd_transfer', False)
        decode_max_num_seqs = getattr(self.scheduler_config,
@@ -61,6 +53,29 @@ def __init__(
        self.decode_max_num_running_reqs = max(self.max_num_running_reqs,
                                               decode_max_num_seqs)

+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        kv_cache_config: KVCacheConfig,
+        structured_output_manager: StructuredOutputManager,
+        block_size: Optional[int] = None,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        include_finished_set: bool = False,
+        log_stats: bool = False,
+    ) -> None:
+        # Call the parent class's __init__ method
+        if vllm_version_is("0.11.0"):
+            super().__init__(vllm_config, kv_cache_config,
+                             structured_output_manager, mm_registry,
+                             include_finished_set, log_stats)
+        else:
+            super().__init__(vllm_config, kv_cache_config,
+                             structured_output_manager, block_size,
+                             mm_registry, include_finished_set, log_stats)
+
+        # Initialize common attributes
+        self._initialize_common()

    def schedule(self) -> SchedulerOutput:
        if self.scheduler_config.chunked_prefill_enabled:
            return super().schedule()
@@ -440,9 +455,14 @@ def skip_cur_request():
            self.kv_cache_config.kv_cache_groups)
        if self.running:
            any_request = self.running[0]
-            num_common_prefix_blocks = (
-                self.kv_cache_manager.get_num_common_prefix_blocks(
-                    any_request, len(self.running)))
+            if vllm_version_is("0.11.0"):
+                num_common_prefix_blocks = (
+                    self.kv_cache_manager.get_num_common_prefix_blocks(
+                        any_request, len(self.running)))
+            else:
+                num_common_prefix_blocks = (
+                    self.kv_cache_manager.get_num_common_prefix_blocks(
+                        any_request.request_id))

        # Construct the scheduler output.
        new_reqs_data = [
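The gate above is needed because KVCacheManager.get_num_common_prefix_blocks changed shape between the two supported versions: under 0.11.0 it takes the request object plus the number of running requests, while under releases/v0.11.1 it takes only the request id. As for vllm_version_is (imported from vllm_ascend.utils), it only needs to compare the installed vLLM version against a literal; a minimal sketch, assuming the real helper may add extras such as an environment-variable override for source builds:

# Minimal sketch of a vllm_version_is-style helper. Assumption: the real
# implementation in vllm_ascend/utils.py may also honor an env override;
# this sketch checks the installed package only.
import vllm


def vllm_version_is(target: str) -> bool:
    """Return True iff the installed vLLM reports exactly `target`, e.g. "0.11.0"."""
    return vllm.__version__ == target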
vllm_ascend/models/deepseek_v2.py (8 changes: 7 additions & 1 deletion)
@@ -45,7 +45,6 @@
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mla import MultiHeadLatentAttention
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -70,6 +69,13 @@
AscendSparseFlashAttention, Indexer)
from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
from vllm_ascend.ops.linear import AscendLinearBase
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.layers.mla import MultiHeadLatentAttention
+else:
+    from vllm.model_executor.layers.mla import \
+        MultiHeadLatentAttentionWrapper as MultiHeadLatentAttention


@support_torch_compile
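Aliasing MultiHeadLatentAttentionWrapper to the old MultiHeadLatentAttention name at import time is what keeps the rest of this file untouched: every later reference resolves to whichever class the installed vLLM actually ships. The identical gated import recurs in layers/mla.py, layers/sfa.py, and (for SharedFusedMoE) in ops/common_fused_moe.py below.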
vllm_ascend/models/layers/mla.py (9 changes: 8 additions & 1 deletion)
@@ -27,10 +27,17 @@
from vllm.attention import Attention, AttentionMetadata
from vllm.config import CacheConfig, get_current_vllm_config
from vllm.forward_context import ForwardContext, get_forward_context
-from vllm.model_executor.layers.mla import MultiHeadLatentAttention
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.utils import direct_register_custom_op

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.layers.mla import MultiHeadLatentAttention
+else:
+    from vllm.model_executor.layers.mla import \
+        MultiHeadLatentAttentionWrapper as MultiHeadLatentAttention


@dataclass
class AscendMLAModules:
vllm_ascend/models/layers/sfa.py (9 changes: 8 additions & 1 deletion)
@@ -28,10 +28,17 @@
from vllm.config import CacheConfig, get_current_vllm_config
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.model_executor.layers.linear import ReplicatedLinear
-from vllm.model_executor.layers.mla import MultiHeadLatentAttention
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.utils import direct_register_custom_op

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.layers.mla import MultiHeadLatentAttention
+else:
+    from vllm.model_executor.layers.mla import \
+        MultiHeadLatentAttentionWrapper as MultiHeadLatentAttention


@dataclass
class AscendSFAModules:
vllm_ascend/ops/common_fused_moe.py (9 changes: 7 additions & 2 deletions)
@@ -28,7 +28,6 @@
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
get_compressed_expert_map)
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import MoECommType
@@ -40,7 +39,13 @@
from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
is_enable_nz, npu_stream_switch,
-                              shared_expert_dp_enabled)
+                              shared_expert_dp_enabled, vllm_version_is)

+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE  # type: ignore # isort:skip
+else:
+    from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
+        SharedFusedMoE


class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
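For comparison, the same compatibility could be expressed as a fallback import; a sketch of that alternative (not what the PR does: the explicit vllm_version_is gate keeps the version dependency visible instead of inferring it from module paths):

# Fallback-style variant (sketch). Try the new module path first and fall
# back to the old one; this works on either version, but unlike the explicit
# vllm_version_is() gate it does not document which vLLM release is assumed.
try:
    # vLLM releases/v0.11.1 location
    from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
        SharedFusedMoE
except ImportError:
    # vLLM v0.11.0 location
    from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE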