
Commit cbe65a2

rickyyx and rkooo567 authored
[integration] Make llama3 8B work with integration (vllm-project#535)
Co-authored-by: sang <rkooo567@gmail.com>
1 parent 15a712d commit cbe65a2

File tree

7 files changed: +90 additions, -80 deletions


.buildkite/ci/build_scratch.sh

Lines changed: 8 additions & 25 deletions
@@ -17,8 +17,7 @@ rm -rf ${SCRATCH_DIR}
 git clone git@github.com:anyscale/scratchllm.git ${SCRATCH_DIR}
 pushd ${SCRATCH_DIR}
 
-# TEMPORARY.
-git checkout -b ricky/pr-pybind origin/ricky/pr-pybind
+git checkout a10-deployment
 
 echo "Build glog"
 git clone https://github.com/google/glog.git
@@ -28,30 +27,14 @@ cmake --build build
 sudo cmake --build build --target install
 popd
 
-echo "Build sentencepiece"
-git clone https://github.com/google/sentencepiece.git
-pushd sentencepiece
-mkdir build
-cd build
-cmake ..
-make -j $(nproc)
-sudo make install
-sudo ldconfig -v
-popd
-
-echo "Build tiktokencpp"
-git clone git@github.com:anyscale/tiktokencpp.git
-pushd tiktokencpp
-mkdir build
-cd build
-cmake ..
-make
-sudo make install
-popd
-
 echo "Build scratchllm"
 # used for pybind.
-git submodule update --init --recursive
+chmod 700 setup_pybind.sh
+bash setup_pybind.sh
+
 # TODO(sang): Support custom flags.
-make h=cuda t=f16 b=fullopt scratch_runner
+# SANG-TODO H100
+# make m=ll38b h=cuda t=f16 b=fullopt s=4 scratch_runner
+# SANG-TODO A10
+make m=ll38b h=cuda t=f16 b=fullopt s=1 scratch_runner
 popd
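The build script now checks out the `a10-deployment` branch, delegates pybind setup to `setup_pybind.sh`, and builds a model- and shard-specific runner: `m=ll38b` with `s=1` for a single A10 GPU, with the commented `s=4` line as the H100 variant. The `.so` that setup.py later copies follows the same `<model>-s<shards>-<hw>-<dtype>-<build>` convention. Below is a hedged sketch of that convention; the helper name is hypothetical and not part of the commit.

```python
# Illustrative sketch only: the commit hard-codes m=ll38b / s=1. This mirrors the
# naming convention that the make target and setup.py's copy step both assume.
def scratch_build_artifacts(model: str = "ll38b", shards: int = 1,
                            hw: str = "cuda", dtype: str = "f16",
                            build: str = "fullopt") -> tuple:
    make_cmd = f"make m={model} h={hw} t={dtype} b={build} s={shards} scratch_runner"
    so_name = f"scratch-{model}-s{shards}-{hw}-{dtype}-{build}.cpython-39-x86_64-linux-gnu.so"
    return make_cmd, so_name

print(scratch_build_artifacts())           # A10 build used in this commit
print(scratch_build_artifacts(shards=4))   # H100 variant from the commented-out line
```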

.buildkite/ci/build_wheel.sh

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ sudo apt install -y cmake
 
 
 echo "~~~ :python: Building wheel for ${VLLM_PROJECT}@${GIT_COMMIT}"
-BUILD_BAZEL=1 python setup.py bdist_wheel
+# Build scratch together.
+ANYSCALE_USE_SCRATCH_LLM=1 BUILD_BAZEL=1 python setup.py bdist_wheel
 
 VLLM_WHEEL=$(basename $(ls dist/*.whl))
 COMMIT_PATH="${S3_WHEEL_CACHE}/${VLLM_PROJECT}/${GIT_COMMIT}/${VLLM_WHEEL}"
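Setting `ANYSCALE_USE_SCRATCH_LLM=1` makes the wheel build run the ScratchLLM build path in setup.py in addition to the Bazel build. The flag is parsed the same way `vllm/scratch_env.py` parses it; a minimal sketch of that gating pattern, with a placeholder build directory:

```python
import os
import subprocess

# Same parsing as vllm/scratch_env.py: an int-like string, "1" enables the Scratch path.
USE_SCRATCH = bool(int(os.getenv("ANYSCALE_USE_SCRATCH_LLM", "0")))

if USE_SCRATCH:
    # Simplified stand-in for the Scratch branch of build_extensions() in setup.py.
    subprocess.check_call(
        ["bash", ".buildkite/ci/build_scratch.sh", "/tmp/scratch-build"])
```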

setup.py

Lines changed: 14 additions & 7 deletions
@@ -213,18 +213,25 @@ def build_extensions(self) -> None:
 
         temp_dir_path = os.path.join(ROOT_DIR, self.build_temp)
         print("Build and install ScratchLLM.")
-        print("Make sure to run ")
         subprocess.check_call(["chmod", "700", ".buildkite/ci/build_scratch.sh",])
         subprocess.check_call(["bash", ".buildkite/ci/build_scratch.sh", temp_dir_path])
         print("Copy .so file to vllm folder.")
         # TODO(sang): Support flexible .so file names.
         subprocess.check_call(["ls", f"{temp_dir_path}/scratchllm"])
-        subprocess.check_call([
-            "cp",
-            "-f",
-            f"{temp_dir_path}/scratchllm/scratch.cpython-39-x86_64-linux-gnu.so",
-            os.path.join(ROOT_DIR, "vllm"),
-        ])
+        # SANG-TODO: Support flexible models and shard size.
+        scratch_so_files = [
+            # SANG-TODO H100
+            # "scratch-ll38b-s4-cuda-f16-fullopt.cpython-39-x86_64-linux-gnu.so",
+            # SANG-TODO A10
+            "scratch-ll38b-s1-cuda-f16-fullopt.cpython-39-x86_64-linux-gnu.so",
+        ]
+        for shared_object_file in scratch_so_files:
+            subprocess.check_call([
+                "cp",
+                "-f",
+                f"{temp_dir_path}/scratchllm/{shared_object_file}",
+                os.path.join(ROOT_DIR, "vllm"),
+            ])
         # Anyscale end
 
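setup.py no longer copies a generic `scratch.*.so`; it copies the model-specific A10 artifact, with the H100 `s4` name left commented out. A hedged sketch of how the hard-coded file list could instead be derived from the same prefix and build-type constants introduced in `vllm/scratch_env.py`, per the "Support flexible models and shard size" TODO; the helper name is hypothetical:

```python
import sys

def expected_scratch_so(prefix: str = "ll38b-s1-cuda-f16",
                        build_type: str = "fullopt") -> str:
    """Hypothetical helper: derive the shared-object name setup.py expects to copy."""
    abi = f"cpython-{sys.version_info.major}{sys.version_info.minor}-x86_64-linux-gnu"
    return f"scratch-{prefix}-{build_type}.{abi}.so"

# On Python 3.9 the defaults reproduce the name hard-coded in this commit:
# scratch-ll38b-s1-cuda-f16-fullopt.cpython-39-x86_64-linux-gnu.so
print(expected_scratch_so())
```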

tests/basic_correctness/test_scratch_correctness.py

Lines changed: 4 additions & 3 deletions
@@ -9,11 +9,12 @@
 MODELS = [
     # "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
+    # "meta-llama/Meta-Llama-3-8B",
 ]
 
-# assert USE_SCRATCH, ("ScratchLLM should be enabled to run a test. "
-#                      "Use ANYSCALE_USE_SCRATCH_LLM=1 pytest -vs "
-#                      "tests/basic_correctness/test_scratch_correctness.py")
+assert USE_SCRATCH, ("ScratchLLM should be enabled to run a test. "
+                     "Use ANYSCALE_USE_SCRATCH_LLM=1 pytest -vs "
+                     "tests/basic_correctness/test_scratch_correctness.py")
 
 
 @pytest.mark.parametrize("model", MODELS)
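The module-level assert is re-enabled, so this file now fails at import time unless `ANYSCALE_USE_SCRATCH_LLM=1` is set; Meta-Llama-3-8B is added to `MODELS` but left commented out. If collection without the flag should still succeed, a skip marker is a gentler alternative. A sketch, assuming `USE_SCRATCH` is importable from `vllm.scratch_env`:

```python
import pytest

from vllm.scratch_env import USE_SCRATCH

# Alternative to the module-level assert: skip the whole module instead of
# failing at import/collection time when the env var is not set.
pytestmark = pytest.mark.skipif(
    not USE_SCRATCH,
    reason="Run with ANYSCALE_USE_SCRATCH_LLM=1 to enable ScratchLLM tests.")
```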

vllm/scratch_env.py

Lines changed: 8 additions & 10 deletions
@@ -2,17 +2,15 @@
 
 SCRATCH_ENV_VAR = "ANYSCALE_USE_SCRATCH_LLM"
 USE_SCRATCH = bool(int(os.getenv(SCRATCH_ENV_VAR, False)))
+SCRATCH_EXECUTABLE_PATH_ENV_VAR = "SCRATCH_EXECUTABLE_PATH"
+# SANG-TODO H100
+# SCRATCH_BUILD_PREFIX = "ll38b-s4-cuda-f16"  # CHANGE THIS FOR DIFFERNT MODELS
+# SANG-TODO A10
+SCRATCH_BUILD_PREFIX = "ll38b-s1-cuda-f16"  # CHANGE THIS FOR DIFFERNT MODELS
+SCRATCH_BUILD_TYPE = "fullopt"  # We should remove this, this is needed because weights are the same for all builds types.
+SCRATCH_EXECUTABLE_PATH = os.getenv(SCRATCH_EXECUTABLE_PATH_ENV_VAR, f"./vllm/scratch-{SCRATCH_BUILD_PREFIX}-{SCRATCH_BUILD_TYPE}.cpython-39-x86_64-linux-gnu.so")
 SCRATCH_WEIGHTS_BUCKET_NAME = "scratch-working-dirs"
-SCRATCH_WEIGHTS_PREFIX = "weights/llama-7b/ll27b-cuda-f16/"
+SCRATCH_WEIGHTS_PREFIX = f"staging_weights/{SCRATCH_BUILD_PREFIX}/"
 SCRATCH_WEIGHTS_URI = f"s3://{SCRATCH_WEIGHTS_BUCKET_NAME}/{SCRATCH_WEIGHTS_PREFIX}"
 SCRATCH_TMP_DIR = "/tmp/scratch/"
 SCRATCH_WEIGHTS_PATH = "/tmp/scratch/"
-
-if USE_SCRATCH:
-    try:
-        from vllm.scratch import ScratchAPI
-    except ImportError:
-        raise AssertionError(
-            "Scratch API hasn't been built with vLLM properly. "
-            "See https://docs.google.com/document/d/1O9VIfnhYai-gJ1TLlP-3SQ4wH5LqxafxYeEHmEIPD7Q/edit#heading=h.1j3ik15fr6mh"
-        ) # noqa
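scratch_env.py now derives both the default executable path and the S3 weights prefix from a single `SCRATCH_BUILD_PREFIX`, and the import-time `ScratchAPI` check is gone (the module is now loaded from the `.so` at runtime in `scratch_model_runner.py`). Replaying the string formatting with this commit's values gives:

```python
SCRATCH_BUILD_PREFIX = "ll38b-s1-cuda-f16"
SCRATCH_BUILD_TYPE = "fullopt"
SCRATCH_WEIGHTS_BUCKET_NAME = "scratch-working-dirs"

# Default path of the pybind extension, unless SCRATCH_EXECUTABLE_PATH overrides it.
executable = (f"./vllm/scratch-{SCRATCH_BUILD_PREFIX}-{SCRATCH_BUILD_TYPE}"
              ".cpython-39-x86_64-linux-gnu.so")
# Where the matching weights now live (previously weights/llama-7b/ll27b-cuda-f16/).
weights_uri = f"s3://{SCRATCH_WEIGHTS_BUCKET_NAME}/staging_weights/{SCRATCH_BUILD_PREFIX}/"

print(executable)   # ./vllm/scratch-ll38b-s1-cuda-f16-fullopt.cpython-39-x86_64-linux-gnu.so
print(weights_uri)  # s3://scratch-working-dirs/staging_weights/ll38b-s1-cuda-f16/
```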

vllm/worker/scratch_model_runner.py

Lines changed: 52 additions & 34 deletions
@@ -1,5 +1,7 @@
 from typing import List, Optional, Set, Hashable
 import time
+import importlib.util
+import sys
 
 import torch
 import torch.nn as nn
@@ -38,14 +40,23 @@
 
 LLAMA_7B_VOCAB_SIZE = 32000
 
-from vllm.scratch import ScratchAPI
-from vllm.scratch_env import (SCRATCH_TMP_DIR, SCRATCH_WEIGHTS_PREFIX,
+from vllm.scratch_env import (SCRATCH_EXECUTABLE_PATH, SCRATCH_TMP_DIR, SCRATCH_WEIGHTS_PREFIX,
                               SCRATCH_WEIGHTS_BUCKET_NAME)
 
 # SANG-TODO WORKS?
 MODEL_PARAMS_PATH = "/home/ray/default/weights"
 
 
+def import_scratch(path: Path):
+    SCRATCH_MODULE_NAME = "scratch"
+    logger.info(f"Importing scratch module from {path}")
+    spec = importlib.util.spec_from_file_location(SCRATCH_MODULE_NAME, path.resolve())
+    scratch = importlib.util.module_from_spec(spec)
+    sys.modules[SCRATCH_MODULE_NAME] = scratch
+    spec.loader.exec_module(scratch)
+    return scratch
+
+
 class ScratchSession:
 
     def __init__(self, scratch_session_id: int):
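Instead of a hard compile-time dependency (`from vllm.scratch import ScratchAPI`), the runner now loads whichever extension module `SCRATCH_EXECUTABLE_PATH` points at through `importlib`. A self-contained sketch of the same pattern, with error handling added for illustration; the `ScratchAPI` attribute and the example path come from this diff:

```python
import importlib.util
import sys
from pathlib import Path


def load_extension(path: Path, module_name: str = "scratch"):
    """Load a compiled extension module from an explicit file path."""
    spec = importlib.util.spec_from_file_location(module_name, path.resolve())
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load an extension module from {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module  # register before exec, as in the diff
    spec.loader.exec_module(module)
    return module


# Usage mirroring load_model(); the path is the default from vllm/scratch_env.py.
# scratch = load_extension(
#     Path("./vllm/scratch-ll38b-s1-cuda-f16-fullopt.cpython-39-x86_64-linux-gnu.so"))
# api = scratch.ScratchAPI("/tmp/scratch/parameters")
```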
@@ -54,7 +65,7 @@ def __init__(self, scratch_session_id: int):
 
 class ScratchLRUCache(LRUCache[ScratchSession]):
 
-    def __init__(self, capacity: int, scratch_api: ScratchAPI):
+    def __init__(self, capacity: int, scratch_api):
         self._scratch_api = scratch_api
         super().__init__(capacity)
 
@@ -77,7 +88,7 @@ class ScratchSessionManager:
     information to model runner in a few weeks.
     """
 
-    def __init__(self, scratch_api: ScratchAPI, max_num_seqs: int):
+    def __init__(self, scratch_api, max_num_seqs: int):
         # ScratchAPI used to create/delete sessions.
         self._scratch_api = scratch_api
         # Set capacity to max_num_seqs * 2 so that old sequences are
@@ -134,7 +145,7 @@ def __init__(
         self.pin_memory = is_pin_memory_available()
 
         # Lazily initialized.
-        self.scratch: ScratchAPI
+        self.scratch: "ScratchAPI"  # type: ignore
         # Scratch only returns embedding. We need to multiply it to lm_head
         # to get the final logits, and that happens in vLLM. In order to
         # do that, we create a torch module with lm_head weights loaded.
@@ -155,8 +166,10 @@ def _verify_scratch_config(self):
             "Vision model not supported")
         assert self.kv_cache_dtype == "auto", (
             "Currently, Scratch doesn't use kv cache.")
-        assert "llama-2" in self.model_config.model.lower(), (
-            "Only Llama 7B is supported.")
+        # SANG-TODO Support only llama 2 and 3.
+        assert ("llama-2" in self.model_config.model.lower()
+                or "llama-3" in self.model_config.model.lower()), (
+                    "Only Llama 2 7B or llama 3 8B is supported.")
         assert self.lora_manager is None, ("lora is not supported.")
         assert self.model_config.enforce_eager is True, (
             "cuda graph is not needed for Scratch.")
@@ -171,7 +184,12 @@ def load_model(self) -> None:
         weights_dir = tmp_dir / "parameters"
         weights_dir.mkdir(exist_ok=True)
         # TODO(sang): Need to obtain this programmatically.
-        download_dir = weights_dir / "ll27b-s1-cuda-f16-fullopt"
+        # download_dir = weights_dir / "ll27b-s1-cuda-f16-fullopt"
+        scratch_mod = import_scratch(Path(SCRATCH_EXECUTABLE_PATH))
+        base_dir = str(weights_dir.resolve())
+        self.scratch = scratch_mod.ScratchAPI(base_dir)
+        scratch_subdir = self.scratch.get_param_subdir()
+        download_dir = weights_dir / scratch_subdir
         download_dir.mkdir(exist_ok=True)
         download_dir_path = str(download_dir.absolute())
         self.load_config.download_dir = str(weights_dir.absolute())
@@ -190,7 +208,6 @@ def load_model(self) -> None:
             scheduler_config=self.scheduler_config,
             cache_config=self.cache_config,
         )
-        self.scratch = ScratchAPI(str(weights_dir.absolute()))
         self.scratch.start()
         self._scratch_session_manager = ScratchSessionManager(
             self.scratch, self.scheduler_config.max_num_seqs)
@@ -223,7 +240,8 @@ def _download_scratch_weights(self, prefix: str, target_dir: str,
                 dirs.append(k)
             next_token = results.get('NextContinuationToken')
         # Assume there's no subdirectories.
-        assert len(dirs) == 1
+        dirs = {p.rsplit("/", 1)[0] for p in files}
+        assert len(dirs) == 1, dirs
 
         # NOTE(sang): Versioning is not supported now. We assume the
         # weights are always the same.
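Two notable changes here besides dropping the type annotations: `load_model()` now instantiates `ScratchAPI` from the dynamically imported module and asks it for its own parameter subdirectory via `get_param_subdir()`, and `_download_scratch_weights()` derives the set of S3 "directories" from the listed file keys instead of asserting on an accumulated list. A small standalone illustration of that grouping expression on made-up keys:

```python
# Made-up object keys under the staging_weights prefix, for illustration only.
files = [
    "staging_weights/ll38b-s1-cuda-f16/model.bin",
    "staging_weights/ll38b-s1-cuda-f16/tokenizer.model",
    "staging_weights/ll38b-s1-cuda-f16/config.json",
]

# Same expression as the diff: strip the object name, keep its parent prefix.
dirs = {p.rsplit("/", 1)[0] for p in files}
assert len(dirs) == 1, dirs  # all weights must sit in a single flat "directory"
print(dirs)  # {'staging_weights/ll38b-s1-cuda-f16'}
```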
@@ -285,8 +303,8 @@ def execute_model(
             self.device,
             self.pin_memory)
         return self._execute_and_vllm_sample(prefill_groups, decode_groups,
-                                            input_tokens, session_ids,
-                                            parent_ids, sampling_metadata)
+                                             input_tokens, session_ids,
+                                             parent_ids, sampling_metadata)
         # return self._execute_and_scratch_sample(
         #     prefill_groups, decode_groups, input_tokens, session_ids, parent_ids)
 
@@ -327,7 +345,7 @@ def _execute_and_vllm_sample(
             input_tokens_tensor = torch.tensor(input_tokens[i],
                                                device="cuda",
                                                dtype=torch.int)
-            print(f"SANG-TODO {input_tokens_tensor=}")
+            # print(f"SANG-TODO {input_tokens_tensor=}")
             assert input_tokens_tensor.is_contiguous()
             # print(f"SANG-TODO {input_tokens_tensor.shape=}")
 
@@ -338,7 +356,7 @@ def _execute_and_vllm_sample(
             hidden_states_end_index = (len_prefix_before_this + len(input_tokens[i])) * self.model_config.get_hidden_size()
             # print(f"SANG-TODO {hidden_states_start_index=} {hidden_states_end_index=}")
             # print(f"SANG-TODO {hidden_states.shape=}")
-            print(f"SANG-TODO {hidden_states[hidden_states_start_index: hidden_states_end_index].shape=}")
+            # print(f"SANG-TODO {hidden_states[hidden_states_start_index: hidden_states_end_index].shape=}")
             assert hidden_states[hidden_states_start_index: hidden_states_end_index].is_contiguous()
             self.scratch.prefill(
                 session_id,
@@ -363,9 +381,9 @@ def _execute_and_vllm_sample(
                 hidden_states.data_ptr(),
             )
 
-        print(
-            f"SANG-TODO forward takes {(time.time() - s)* 1000} ms. Batch size: {len(session_ids)=} is_prefill: {len(prefill_groups) > 0}"
-        )
+        # print(
+        #     f"SANG-TODO forward takes {(time.time() - s)* 1000} ms. Batch size: {len(session_ids)=} is_prefill: {len(prefill_groups) > 0}"
+        # )
         # print(hidden_states)
         # print(f"SANG-TODO {hidden_states.shape=}")
         # Post process Scratch embeddings.
@@ -375,16 +393,16 @@ def _execute_and_vllm_sample(
         # is this expected?
         hidden_states = hidden_states.view(-1,
                                            self.model_config.get_hidden_size())
-        if len(prefill_groups) > 0:
-            print(f"SANG-TODO before norm {hidden_states=}")
-            print(f"SANG-TODO {hidden_states.shape=}")
+        # if len(prefill_groups) > 0:
+        #     print(f"SANG-TODO before norm {hidden_states=}")
+        #     print(f"SANG-TODO {hidden_states.shape=}")
         # Scratch doesn't apply rms norm in its output, so we should do it ourselves.
         # Residual is set to None because it is already added from Scratch output.
         hidden_states = self.model.norm(hidden_states, None)
-        if len(prefill_groups) > 0:
-            print(f"SANG-TODO norm weights: {self.model.norm.weight=}")
-            print(f"SANG-TODO {hidden_states.shape=}")
-            print(f"SANG-TODO after norm {hidden_states=}")
+        # if len(prefill_groups) > 0:
+        #     print(f"SANG-TODO norm weights: {self.model.norm.weight=}")
+        #     print(f"SANG-TODO {hidden_states.shape=}")
+        #     print(f"SANG-TODO after norm {hidden_states=}")
         # print(f"{hidden_states.shape=}")
 
         # SANG-TODO remove it. Hack. It will work once scrath returns embedding of all tokens correctly.
@@ -401,14 +419,14 @@ def _execute_and_vllm_sample(
             logits=logits,
             sampling_metadata=sampling_metadata,
         )
-        if len(prefill_groups) > 0:
-            print(
-                f"SANG-TODO prefill takes {(time.time() - s)* 1000} ms. Batch size: {len(session_ids)=}"
-            )
-        else:
-            print(
-                f"SANG-TODO decode takes {(time.time() - s)* 1000} ms. Batch size: {len(session_ids)=}"
-            )
+        # if len(prefill_groups) > 0:
+        #     print(
+        #         f"SANG-TODO prefill takes {(time.time() - s)* 1000} ms. Batch size: {len(session_ids)=}"
+        #     )
+        # else:
+        #     print(
+        #         f"SANG-TODO decode takes {(time.time() - s)* 1000} ms. Batch size: {len(session_ids)=}"
+        #     )
         # print(output)
         return output
 
@@ -443,7 +461,7 @@ def _execute_and_scratch_sample(
             batch_size,
             tokens_out.data_ptr(),
         )
-        print(f"SANG-TODO token: {tokens_out}")
+        # print(f"SANG-TODO token: {tokens_out}")
 
         result_tokens = tokens_out.tolist()
         outputs = []
@@ -462,7 +480,7 @@ def _execute_and_scratch_sample(
                 )
             )
         output = SamplerOutput(outputs=outputs)
-        print(output)
+        # print(output)
 
         return output
 
     @torch.inference_mode()
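The remaining hunks mostly comment out `SANG-TODO` debug prints, but together they show the sampling path: Scratch returns raw final-layer embeddings, so the runner reshapes them, applies the model's final RMSNorm itself (the residual is already folded in), projects through `lm_head` to logits, and lets vLLM sample. A compressed, hedged sketch of that order; `logits_from_lm_head` and `sampler` stand in for pieces the diff only alludes to:

```python
def postprocess_and_sample(hidden_states, model, hidden_size,
                           logits_from_lm_head, sampler, sampling_metadata):
    """Sketch of the post-processing order in _execute_and_vllm_sample."""
    # Scratch hands back a flat buffer; reshape to (num_tokens, hidden_size).
    hidden_states = hidden_states.view(-1, hidden_size)
    # Scratch does not apply the final RMSNorm; residual is already added.
    hidden_states = model.norm(hidden_states, None)
    # Project to vocabulary logits and sample with vLLM's sampler.
    logits = logits_from_lm_head(hidden_states)
    return sampler(logits=logits, sampling_metadata=sampling_metadata)
```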

vllm/worker/worker.py

Lines changed: 2 additions & 0 deletions
@@ -371,6 +371,8 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
 
 def raise_if_cache_size_invalid(num_gpu_blocks, block_size,
                                 max_model_len) -> None:
+    if USE_SCRATCH:
+        return
     if num_gpu_blocks <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
