|
34 | 34 | from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
35 | 35 | RowParallelLinear)
|
36 | 36 | from vllm.model_executor.layers.quantization import QuantizationConfig
|
37 |
| -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler |
| 37 | +from vllm.model_executor.layers.sampler import get_sampler |
38 | 38 | from vllm.model_executor.models.module_mapping import MultiModelKeys
|
39 | 39 | from vllm.model_executor.sampling_metadata import SamplingMetadata
|
40 | 40 | from vllm.multimodal import MULTIMODAL_REGISTRY
|
@@ -73,13 +73,6 @@ class GraniteSpeechMultiModalProcessingInfo(BaseProcessingInfo):
|
73 | 73 | def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
74 | 74 | return {"audio": 1}
|
75 | 75 |
|
76 |
| - def get_mm_max_tokens_per_item( |
77 |
| - self, |
78 |
| - seq_len: int, |
79 |
| - mm_counts: Mapping[str, int], |
80 |
| - ) -> Mapping[str, int]: |
81 |
| - return {"audio": self.get_max_audio_tokens()} |
82 |
| - |
83 | 76 | # There is no limit to the maximum number of audio tokens that can be
|
84 | 77 | # encoded as features; we pick ~5000 as a number that is probably higher
|
85 | 78 | # than we would expect to encounter. The sequence of length
|
@@ -768,13 +761,6 @@ def compute_logits(
|
768 | 761 | sampling_metadata,
|
769 | 762 | )
|
770 | 763 |
|
771 |
| - def sample( |
772 |
| - self, |
773 |
| - logits: torch.Tensor, |
774 |
| - sampling_metadata: SamplingMetadata, |
775 |
| - ) -> Optional[SamplerOutput]: |
776 |
| - return self.language_model.sample(logits, sampling_metadata) |
777 |
| - |
778 | 764 | def load_weights(
|
779 | 765 | self,
|
780 | 766 | weights: Iterable[Tuple[str, torch.Tensor]],
|
|
0 commit comments