@@ -10,40 +10,43 @@
                     Union, cast, final)
 
 import numpy as np
-import torch
-import torch.types
-from PIL.Image import Image
-from transformers import BatchFeature
 from typing_extensions import NotRequired, TypeAlias
 
 from vllm.jsontree import JSONTree, json_map_leaves
-from vllm.utils import full_groupby, is_list_of
+from vllm.utils import LazyLoader, full_groupby, is_list_of
 
 if TYPE_CHECKING:
+    import torch
+    import torch.types
+    from PIL.Image import Image
+    from transformers.feature_extraction_utils import BatchFeature
+
     from .hasher import MultiModalHashDict
+else:
+    torch = LazyLoader("torch", globals(), "torch")
 
 _T = TypeVar("_T")
 
-HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
+HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"]
 """
 A {class}`transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
 """
 
-HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
-                               list[np.ndarray], list[torch.Tensor]]
+HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor",
+                               list[np.ndarray], list["torch.Tensor"]]
 """
 A {class}`transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace `VideoProcessor`.
 """
 
-HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
+HfAudioItem: TypeAlias = Union[list[float], np.ndarray, "torch.Tensor"]
 """
 Represents a single audio
 item, which can be passed to a HuggingFace `AudioProcessor`.
 """
 
-ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
+ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
 """
 A {class}`transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
@@ -53,7 +56,7 @@
 these are directly passed to the model without HF processing.
 """
 
-VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
+VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"]
 """
 A {class}`transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace `VideoProcessor`.
@@ -64,7 +67,7 @@
 """
 
 AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
-                             torch.Tensor]
+                             "torch.Tensor"]
 """
 Represents a single audio
 item, which can be passed to a HuggingFace `AudioProcessor`.
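
The hunk above defers the `torch` import: under `TYPE_CHECKING` the real modules are visible to type checkers only, the type aliases switch to string annotations, and at runtime `torch` resolves through `vllm.utils.LazyLoader` on first attribute access. Below is a minimal sketch of how such a loader can work; the class internals are an illustrative assumption, not vLLM's actual implementation (only the `LazyLoader("torch", globals(), "torch")` call appears in the diff):

    import importlib
    import types

    class LazyLoader(types.ModuleType):
        # Stands in for a module and performs the real import only on
        # first attribute access (illustrative sketch).

        def __init__(self, local_name: str, parent_module_globals: dict,
                     name: str):
            self._local_name = local_name
            self._parent_module_globals = parent_module_globals
            super().__init__(name)

        def _load(self) -> types.ModuleType:
            # Import for real, then replace this placeholder in the
            # caller's globals so later lookups bypass __getattr__.
            module = importlib.import_module(self.__name__)
            self._parent_module_globals[self._local_name] = module
            self.__dict__.update(module.__dict__)
            return module

        def __getattr__(self, item: str):
            return getattr(self._load(), item)

    torch = LazyLoader("torch", globals(), "torch")  # cheap at import time
    x = torch.zeros(3)  # first use triggers the actual import
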
@@ -132,7 +135,7 @@ class PlaceholderRange:
     length: int
     """The length of the placeholder."""
 
-    is_embed: Optional[torch.Tensor] = None
+    is_embed: Optional["torch.Tensor"] = None
     """
     A boolean mask of shape `(length,)` indicating which positions
     between `offset` and `offset + length` to assign embeddings to.
@@ -158,8 +161,8 @@ def __eq__(self, other: object) -> bool:
         return nested_tensors_equal(self.is_embed, other.is_embed)
 
 
-NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor,
-                      tuple[torch.Tensor, ...]]
+NestedTensors: TypeAlias = Union[list["NestedTensors"], list["torch.Tensor"],
+                                 "torch.Tensor", tuple["torch.Tensor", ...]]
 """
 Uses a list instead of a tensor if the dimensions of each element do not match.
 """
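
A note on the `NestedTensors` change above: once the alias value contains quoted forward references like `"torch.Tensor"`, the explicit `TypeAlias` annotation (PEP 613) tells type checkers that the assignment defines an alias whose quoted names are resolved lazily, rather than an ordinary runtime value. A tiny illustration with an invented name:

    from typing import Optional

    from typing_extensions import TypeAlias

    # Without the annotation, a checker could read this as a plain
    # assignment; with it, the quoted name is resolved as a type.
    MaybeTensor: TypeAlias = Optional["torch.Tensor"]
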
@@ -261,7 +264,7 @@ def build_elems(
         """
         Construct {class}`MultiModalFieldElem` instances to represent
         the provided data.
-        
+
         This is the inverse of {meth}`reduce_data`.
         """
         raise NotImplementedError
@@ -422,7 +425,7 @@ def flat(modality: str,
             modality: The modality of the multi-modal item that uses this
                 keyword argument.
             slices: For each multi-modal item, a slice (dim=0) or a tuple of
-                slices (dim>0) that is used to extract the data corresponding 
+                slices (dim>0) that is used to extract the data corresponding
                 to it.
             dim: The dimension to extract data, default to 0.
 
@@ -465,7 +468,7 @@ def flat(modality: str,
 
     @staticmethod
     def flat_from_sizes(modality: str,
-                        size_per_item: torch.Tensor,
+                        size_per_item: "torch.Tensor",
                         dim: int = 0):
         """
         Defines a field where an element in the batch is obtained by
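
For context on the signature changed above: `flat_from_sizes` describes a batched field whose per-item data is recovered by slicing out `size_per_item[i]` entries for item `i` along the given dimension. A rough sketch of that slicing semantics, assuming offsets are running totals of the sizes; this is an illustration, not the actual vLLM code:

    import torch

    def slices_from_sizes(size_per_item: torch.Tensor) -> list[slice]:
        # Item i spans [offsets[i], offsets[i] + size_per_item[i])
        # along the sliced dimension.
        offsets = torch.cumsum(size_per_item, dim=0) - size_per_item
        return [
            slice(start, start + size)
            for start, size in zip(offsets.tolist(), size_per_item.tolist())
        ]

    # slices_from_sizes(torch.tensor([3, 4, 2]))
    # -> [slice(0, 3), slice(3, 7), slice(7, 9)]
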
@@ -602,7 +605,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
 
     @staticmethod
     def from_hf_inputs(
-        hf_inputs: BatchFeature,
+        hf_inputs: "BatchFeature",
         config_by_key: Mapping[str, MultiModalFieldConfig],
     ):
         # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key`
@@ -792,7 +795,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
         return self._items_by_modality[modality]
 
 
-MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]]
+MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]]
 """
 A dictionary containing placeholder ranges for each modality.
 """
@@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict):
     mm_hashes: Optional["MultiModalHashDict"]
     """The hashes of the multi-modal data."""
 
-    mm_placeholders: MultiModalPlaceholderDict
+    mm_placeholders: "MultiModalPlaceholderDict"
     """
     For each modality, information about the placeholder tokens in
     `prompt_token_ids`.