(Note the script should be run from the executorch/examples/apple/coreml/llama directory.)

The runner is written in Python and is only intended to serve as an example of how the model inputs should be processed; it is not performant.
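
Because the exported model runs on fixed-size inputs, the prompt has to be padded or chunked into seq_length-token pieces before each call (see the Export args section below). Here is a minimal, purely illustrative sketch of that preprocessing; the function name and pad_id are assumptions, not the actual run.py code:

```python
def chunk_and_pad(tokens, seq_length, pad_id=0):
    # Break the prompt into seq_length-sized pieces and pad the last piece,
    # so every forward call sees exactly seq_length tokens.
    chunks = []
    for start in range(0, len(tokens), seq_length):
        chunk = tokens[start:start + seq_length]
        chunk = chunk + [pad_id] * (seq_length - len(chunk))
        chunks.append(chunk)
    return chunks

print(chunk_and_pad(list(range(10)), seq_length=4))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 0, 0]]
```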

Run the model with:

```
python run.py -m /path/to/model.pte -p /path/to/params.json -t /path/to/tokenizer.model --seq_length 64 --max_seq_length 1024 --prompt "Once upon a time," --n_steps 512
```
(Note the script should be run from the executorch/examples/apple/coreml/llama directory.)

The model here is based on a "sliding" cache, where old tokens are evicted from the cache. There is no actual sliding in the implementation, though.
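
To make the eviction idea concrete, here is a minimal sketch of a fixed-size key cache that keeps only the most recent cache_size entries. It is purely illustrative (the exported model does not actually shift data like this), and the tensor shapes and names are assumptions:

```python
import torch

def update_k_cache(k_cache, new_k):
    # k_cache: (batch, n_heads, cache_size, head_dim)
    # new_k:   (batch, n_heads, seq_length, head_dim)
    # Append the new keys and keep only the last cache_size positions,
    # so the oldest tokens are evicted and stop participating in attention.
    cache_size = k_cache.shape[2]
    return torch.cat([k_cache, new_k], dim=2)[:, :, -cache_size:, :]

k_cache = torch.zeros(1, 8, 512, 64)
new_k = torch.randn(1, 8, 64, 64)
k_cache = update_k_cache(k_cache, new_k)  # shape stays (1, 8, 512, 64)
```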
## Export args
* seq_length: the number of tokens processed by the model. Sequences shorter than seq_length must be padded, and sequences longer than it must be chunked.
* max_seq_length: the maximum number of context tokens that can be processed.
* cache_size: the size of the KV cache sequences. This parameter is optional, and defaults to max_seq_length - seq_length. If a smaller cache_size is used, older tokens are evicted from the cache and no longer play a role in attention. For example, if max_seq_length=1024, but cache_size is 512, the model can generate up to 1024 tokens, but only the current tokens and the previous 512 will participate in attention. In terms of computation, cache_size plays a similar role to max_seq_length in models without cache eviction.
* use_cache_list: boolean option that controls whether the KV caches are passed as a list of 4D tensors, one per layer, or as a single 5D tensor. (Note that use_cache_list does not work with ExecuTorch pybindings.)
* target_split_size: this option splits linear layers into chunks of the target size. For example, if target_split_size is 1024, a linear layer with (in_features=512, out_features=8192) will be split into 8 linear layers with (in_features=512, out_features=1024) and the results concatenated (see the sketch after this list). If not specified, the default is no splitting.
* max_splits: this controls the maximum number of splits for linear layers. It is only relevant if target_split_size is passed and defaults to 8.
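
As a rough illustration of what target_split_size and max_splits do, the sketch below splits one large nn.Linear along its output dimension into several smaller layers whose outputs are concatenated. It is an assumption-laden sketch, not the code used by the exporter:

```python
import torch
import torch.nn as nn

def split_linear(linear, target_split_size, max_splits=8):
    # Choose the number of chunks from target_split_size, capped at max_splits.
    n_splits = min(max(linear.out_features // target_split_size, 1), max_splits)
    weight_chunks = linear.weight.chunk(n_splits, dim=0)  # rows = output features
    bias_chunks = (linear.bias.chunk(n_splits) if linear.bias is not None
                   else [None] * n_splits)
    pieces = nn.ModuleList()
    for w, b in zip(weight_chunks, bias_chunks):
        piece = nn.Linear(linear.in_features, w.shape[0], bias=b is not None)
        piece.weight = nn.Parameter(w.clone())
        if b is not None:
            piece.bias = nn.Parameter(b.clone())
        pieces.append(piece)
    return pieces

layer = nn.Linear(512, 8192)
pieces = split_linear(layer, target_split_size=1024)  # 8 layers with out_features=1024
x = torch.randn(2, 512)
# Concatenating the split outputs reproduces the original layer's output.
assert torch.allclose(torch.cat([m(x) for m in pieces], dim=-1), layer(x), atol=1e-6)
```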
## Llama1B on iPhone 15
We are actively experimenting with different settings, but here are ones we've found work well for Llama1B on iPhone 15 Pro:
* Set use_cache_list
* Split linear layers with target_split_size=1024, max_splits=8
* Use seq_length=32 or seq_length=64, both of which offer reasonable tradeoffs for prefill and decode performance. seq_length=32 is better at decode and seq_length=64 is better at prefill.

In our tests, we set max_seq_length=1024, but if your application allows for it, performance can improve with max_seq_length=512 or by keeping max_seq_length=1024 and setting cache_size=512-seq_length.