From a283ec2eece57454ec9301e5542cffa1201e175f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 23 May 2023 17:58:51 -0700 Subject: [PATCH] Add contributing guideline and mypy config (#122) --- CONTRIBUTING.md | 74 ++++++++++++++++++++ cacheflow/core/scheduler.py | 2 +- cacheflow/model_executor/layers/attention.py | 4 +- cacheflow/model_executor/layers/sampler.py | 2 +- cacheflow/model_executor/model_loader.py | 4 +- cacheflow/model_executor/models/gpt2.py | 8 +-- cacheflow/model_executor/models/gpt_neox.py | 12 ++-- cacheflow/model_executor/models/llama.py | 12 ++-- cacheflow/model_executor/models/opt.py | 14 ++-- cacheflow/outputs.py | 2 +- cacheflow/sequence.py | 7 +- cacheflow/server/ray_utils.py | 2 +- cacheflow/worker/worker.py | 17 ++--- mypy.ini | 8 +++ requirements-dev.txt | 2 + tests/kernels/test_pos_encoding.py | 2 +- 16 files changed, 128 insertions(+), 44 deletions(-) create mode 100644 CONTRIBUTING.md create mode 100644 mypy.ini create mode 100644 requirements-dev.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000..227c73b38b614 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,74 @@ +# Contributing to CacheFlow + +Thank you for your interest in contributing to CacheFlow! +Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. +There are several ways you can contribute to the project: + +- Identify and report any issues or bugs. +- Request or add a new model. +- Suggest or implement new features. + +However, remember that contributions aren't just about code. +We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. + +Finally, one of the most impactful ways to support us is by raising awareness about CacheFlow. +Talk about it in your blog posts, highlighting how it's driving your incredible projects. +Express your support on Twitter if CacheFlow aids you, or simply offer your appreciation by starring our repository. + + +## Setup for development + +### Build from source + +```bash +pip install -r requirements.txt +pip install -e . # This may take several minutes. +``` + +### Testing + +```bash +pip install -r requirements-dev.txt + +# Static type checking +mypy +# Unit tests +pytest tests/ +``` +**Note:** Currently, the repository does not pass the mypy tests. + + +## Contributing Guidelines + +### Issue Reporting + +If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. +If not, please file a new issue, providing as much relevant information as possible. + +### Coding Style Guide + +In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). + +### Pull Requests + +When submitting a pull request: + +1. Make sure your code has been rebased on top of the latest commit on the main branch. +2. Include a detailed description of the changes in the pull request. +Explain why you made the changes you did. +If your pull request fixes an open issue, please include a reference to it in the description. + +### Code Reviews + +All submissions, including submissions by project members, require a code review. +To make the review process as smooth as possible, please: + +1. Keep your changes as concise as possible. 
+If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests. +2. Respond to all comments within a reasonable time frame. +If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. + +### Thank You + +Finally, thank you for taking the time to read these guidelines and for your interest in contributing to CacheFlow. +Your contributions make CacheFlow a great tool for everyone! diff --git a/cacheflow/core/scheduler.py b/cacheflow/core/scheduler.py index 08f855730dd61..795e576edf893 100644 --- a/cacheflow/core/scheduler.py +++ b/cacheflow/core/scheduler.py @@ -87,7 +87,7 @@ def add_seq_group(self, seq_group: SequenceGroup) -> None: def has_unfinished_seqs(self) -> bool: return self.waiting or self.running or self.swapped - def _schedule(self) -> Tuple[SchedulerOutputs, List[int]]: + def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]: # Blocks that need to be swaped or copied before model execution. blocks_to_swap_in: Dict[int, int] = {} blocks_to_swap_out: Dict[int, int] = {} diff --git a/cacheflow/model_executor/layers/attention.py b/cacheflow/model_executor/layers/attention.py index 84bebb41892ff..0231ee9ae84b9 100644 --- a/cacheflow/model_executor/layers/attention.py +++ b/cacheflow/model_executor/layers/attention.py @@ -61,7 +61,7 @@ def multi_query_kv_attention( key: torch.Tensor, # [num_prompt_tokens, num_heads, head_size] value: torch.Tensor, # [num_prompt_tokens, num_heads, head_size] attn_bias: xops.AttentionBias, - ) -> None: + ) -> torch.Tensor: # TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize. out = xops.memory_efficient_attention_forward( query.unsqueeze(0), @@ -197,7 +197,7 @@ def __init__( def forward( self, - positions: torch.LongTensor, # [num_tokens] + positions: torch.Tensor, # [num_tokens] query: torch.Tensor, # [num_tokens, num_heads * head_size] key: torch.Tensor, # [num_tokens, num_heads * head_size] value: torch.Tensor, # [num_tokens, num_heads * head_size] diff --git a/cacheflow/model_executor/layers/sampler.py b/cacheflow/model_executor/layers/sampler.py index 1c3187c054107..425d538575d8c 100644 --- a/cacheflow/model_executor/layers/sampler.py +++ b/cacheflow/model_executor/layers/sampler.py @@ -347,7 +347,7 @@ def _sample_from_generation_tokens( # Greedy sampling. assert len(seq_ids) == 1 next_token_id = torch.argmax(probs, dim=-1) - next_token_ids = [next_token_id.item()] + next_token_ids = [int(next_token_id.item())] parent_seq_ids = seq_ids else: # Random sampling. 
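The hunks above, and most of the hunks that follow, tighten type annotations so that the new `mypy` check can reason about them: parameters and return values previously annotated as `torch.LongTensor` become `torch.Tensor`, and the sampler wraps `next_token_id.item()` in `int(...)`. A minimal, self-contained sketch of why these two changes satisfy mypy, assuming only that `torch` and its bundled type stubs are installed (the function names here are illustrative, not taken from the patch):

```python
from typing import List

import torch


def greedy_token_ids(probs: torch.Tensor) -> List[int]:
    # Mirrors the sampler fix: torch.argmax returns a Tensor, and Tensor.item()
    # is annotated as returning a generic Python number, so the explicit int(...)
    # is what lets the List[int] return annotation type-check.
    next_token_id = torch.argmax(probs, dim=-1)
    return [int(next_token_id.item())]


def shift_positions(positions: torch.Tensor) -> torch.Tensor:
    # Mirrors the annotation widening: tensor-producing ops in the stubs are
    # annotated as returning torch.Tensor, so annotating parameters as
    # torch.Tensor (rather than torch.LongTensor) lets callers pass their
    # results without mypy errors.
    return positions + 1


if __name__ == "__main__":
    print(greedy_token_ids(torch.rand(32)))
    print(shift_positions(torch.arange(8)))
```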
diff --git a/cacheflow/model_executor/model_loader.py b/cacheflow/model_executor/model_loader.py index 2d743d927d0a5..d33e8857b1767 100644 --- a/cacheflow/model_executor/model_loader.py +++ b/cacheflow/model_executor/model_loader.py @@ -1,4 +1,6 @@ """Utilities for selecting and loading models.""" +from typing import Type + import torch import torch.nn as nn from transformers import PretrainedConfig @@ -17,7 +19,7 @@ } -def _get_model_architecture(config: PretrainedConfig) -> nn.Module: +def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: architectures = getattr(config, "architectures", []) for arch in architectures: if arch in _MODEL_REGISTRY: diff --git a/cacheflow/model_executor/models/gpt2.py b/cacheflow/model_executor/models/gpt2.py index 16a16d32b4d6e..690bd7803c74c 100644 --- a/cacheflow/model_executor/models/gpt2.py +++ b/cacheflow/model_executor/models/gpt2.py @@ -168,8 +168,8 @@ def __init__(self, config: GPT2Config): def forward( self, - input_ids: torch.LongTensor, - position_ids: torch.LongTensor, + input_ids: torch.Tensor, + position_ids: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], @@ -204,8 +204,8 @@ def __init__(self, config: GPT2Config): def forward( self, - input_ids: torch.LongTensor, - positions: torch.LongTensor, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], diff --git a/cacheflow/model_executor/models/gpt_neox.py b/cacheflow/model_executor/models/gpt_neox.py index 101256371306b..c98514fd49ed9 100644 --- a/cacheflow/model_executor/models/gpt_neox.py +++ b/cacheflow/model_executor/models/gpt_neox.py @@ -67,7 +67,7 @@ def __init__(self, config: GPTNeoXConfig): def forward( self, - position_ids: torch.LongTensor, + position_ids: torch.Tensor, hidden_states: torch.Tensor, kv_cache: KVCache, input_metadata: InputMetadata, @@ -118,7 +118,7 @@ def __init__(self, config: GPTNeoXConfig): def forward( self, - position_ids: torch.LongTensor, + position_ids: torch.Tensor, hidden_states: torch.Tensor, kv_cache: KVCache, input_metadata: InputMetadata, @@ -162,8 +162,8 @@ def __init__(self, config: GPTNeoXConfig): def forward( self, - input_ids: torch.LongTensor, - position_ids: torch.LongTensor, + input_ids: torch.Tensor, + position_ids: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], @@ -199,8 +199,8 @@ def __init__(self, config): def forward( self, - input_ids: torch.LongTensor, - positions: torch.LongTensor, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], diff --git a/cacheflow/model_executor/models/llama.py b/cacheflow/model_executor/models/llama.py index 9a55ef0695688..b4ee6537d0d86 100644 --- a/cacheflow/model_executor/models/llama.py +++ b/cacheflow/model_executor/models/llama.py @@ -109,7 +109,7 @@ def __init__( def forward( self, - positions: torch.LongTensor, + positions: torch.Tensor, hidden_states: torch.Tensor, kv_cache: KVCache, input_metadata: InputMetadata, @@ -143,7 +143,7 @@ def __init__(self, config: LlamaConfig): def forward( self, - positions: torch.LongTensor, + positions: torch.Tensor, hidden_states: torch.Tensor, kv_cache: KVCache, input_metadata: InputMetadata, @@ -184,8 +184,8 @@ def __init__(self, config: LlamaConfig): def forward( self, - input_ids: torch.LongTensor, - 
positions: torch.LongTensor, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], @@ -222,8 +222,8 @@ def __init__(self, config): def forward( self, - input_ids: torch.LongTensor, - positions: torch.LongTensor, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], diff --git a/cacheflow/model_executor/models/opt.py b/cacheflow/model_executor/models/opt.py index eeaa77a624652..e340f68370be9 100644 --- a/cacheflow/model_executor/models/opt.py +++ b/cacheflow/model_executor/models/opt.py @@ -47,7 +47,7 @@ def __init__(self, num_embeddings: int, embedding_dim: int): self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, positions: torch.LongTensor): + def forward(self, positions: torch.Tensor): return super().forward(positions + self.offset) @@ -199,8 +199,8 @@ def __init__(self, config: OPTConfig): def forward( self, - input_ids: torch.LongTensor, - positions: torch.LongTensor, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], @@ -235,8 +235,8 @@ def __init__(self, config: OPTConfig): def forward( self, - input_ids: torch.LongTensor, - positions: torch.LongTensor, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], @@ -258,8 +258,8 @@ def __init__(self, config): def forward( self, - input_ids: torch.LongTensor, - positions: torch.LongTensor, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[KVCache], input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], diff --git a/cacheflow/outputs.py b/cacheflow/outputs.py index 84fd71976a8cd..18b9a7cdc8ce8 100644 --- a/cacheflow/outputs.py +++ b/cacheflow/outputs.py @@ -31,7 +31,7 @@ class RequestOutput: def __init__( self, - request_id: int, + request_id: str, prompt: str, prompt_token_ids: List[int], outputs: List[CompletionOutput], diff --git a/cacheflow/sequence.py b/cacheflow/sequence.py index b2c19fae9c960..61c5091bfb16d 100644 --- a/cacheflow/sequence.py +++ b/cacheflow/sequence.py @@ -116,10 +116,11 @@ def get_output_token_ids(self) -> List[int]: def get_cumulative_logprob(self) -> float: return self.data.cumulative_logprob - def fork(self, child_seq: 'Sequence') -> 'Sequence': + def fork(self, child_seq: 'Sequence') -> None: child_seq.logical_token_blocks = copy.deepcopy(self.logical_token_blocks) child_seq.output_logprobs = copy.deepcopy(self.output_logprobs) child_seq.data = copy.deepcopy(self.data) + return None def __repr__(self) -> str: return (f'Sequence(seq_id={self.seq_id}, ' @@ -205,7 +206,9 @@ def __repr__(self) -> str: f'output_token={self.output_token}), ' f'logprobs={self.logprobs}') - def __eq__(self, other: 'SequenceOutputs') -> bool: + def __eq__(self, other: object) -> bool: + if not isinstance(other, SequenceOutputs): + return NotImplemented return (self.seq_id == other.seq_id and self.parent_seq_id == other.parent_seq_id and self.output_token == other.output_token and diff --git a/cacheflow/server/ray_utils.py b/cacheflow/server/ray_utils.py index 65569d9f3e638..4577fc8dc70ac 100644 --- a/cacheflow/server/ray_utils.py +++ b/cacheflow/server/ray_utils.py @@ -8,7 +8,7 @@ from cacheflow.config import ParallelConfig -DeviceID 
= Tuple[int, str, int] # rank, node resource (node IP), device id +DeviceID = Tuple[int, Optional[str], int] # rank, node resource (node IP), device id def initialize_cluster( diff --git a/cacheflow/worker/worker.py b/cacheflow/worker/worker.py index 12d977afa81cb..f202f2241617c 100644 --- a/cacheflow/worker/worker.py +++ b/cacheflow/worker/worker.py @@ -132,7 +132,7 @@ def init_cache_engine(self, cache_config: CacheConfig) -> None: def _prepare_inputs( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.LongTensor, torch.LongTensor, InputMetadata]: + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata]: seq_groups: List[Tuple[List[int], SamplingParams]] = [] input_tokens: List[int] = [] input_positions: List[int] = [] @@ -216,19 +216,14 @@ def _prepare_inputs( input_positions = _pad_to_alignment(input_positions, multiple_of=8) # Convert to tensors. - tokens_tensor = torch.tensor( - input_tokens, dtype=torch.long, device='cuda') - positions_tensor = torch.tensor( - input_positions, dtype=torch.long, device='cuda') - slot_mapping_tensor = torch.tensor( - slot_mapping, dtype=torch.int, device='cuda') - context_lens_tensor = torch.tensor( - context_lens, dtype=torch.int, device='cuda') + tokens_tensor = torch.cuda.LongTensor(input_tokens) + positions_tensor = torch.cuda.LongTensor(input_positions) + slot_mapping_tensor = torch.cuda.IntTensor(slot_mapping) + context_lens_tensor = torch.cuda.IntTensor(context_lens) padded_block_tables = [ _pad_to_max(block_table, max_num_blocks_per_seq) for block_table in generation_block_tables] - block_tables_tensor = torch.tensor( - padded_block_tables, dtype=torch.int, device='cuda') + block_tables_tensor = torch.cuda.IntTensor(padded_block_tables) seq_data: Dict[int, SequenceData] = {} for seq_group_metadata in seq_group_metadata_list: diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000000000..f8d254f879b7a --- /dev/null +++ b/mypy.ini @@ -0,0 +1,8 @@ +[mypy] +python_version = 3.8 + +ignore_missing_imports = True + +files = cacheflow +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = cacheflow/model_executor/parallel_utils/|cacheflow/model_executor/models/ diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000000..d76706eae71e3 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +mypy +pytest diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 8299cd0e608ac..0c3d46d9b687d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -49,7 +49,7 @@ def __init__( def forward( self, - positions: torch.LongTensor, # [num_tokens] + positions: torch.Tensor, # [num_tokens] query: torch.Tensor, # [num_tokens, num_heads, head_size] key: torch.Tensor, # [num_tokens, num_heads, head_size] ) -> Tuple[torch.Tensor, torch.Tensor]:
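A note on the new `mypy.ini`: mypy treats the `exclude` value as a single regular expression matched against file paths, so the pipe in the value above skips both the `parallel_utils/` and `models/` packages called out in the TODO comment. A small standalone sketch of how that pattern is applied, using paths that appear in this patch plus one illustrative filename (not necessarily a real file):

```python
import re

# The exclude pattern from the new mypy.ini: one regex with two alternatives.
EXCLUDE = re.compile(
    r"cacheflow/model_executor/parallel_utils/|cacheflow/model_executor/models/"
)

paths = [
    "cacheflow/core/scheduler.py",                        # checked
    "cacheflow/model_executor/model_loader.py",           # checked
    "cacheflow/model_executor/models/llama.py",           # skipped (models/ TODO)
    "cacheflow/model_executor/parallel_utils/layers.py",  # skipped (parallel_utils/ TODO)
]

for path in paths:
    status = "skipped" if EXCLUDE.search(path) else "checked"
    print(f"{status}: {path}")
```

Running `mypy` from the repository root picks this configuration up automatically (and `files = cacheflow` limits the check to the package), which is what the bare `mypy` step in the new CONTRIBUTING.md relies on.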