For better performance, use the "--use_cache_list" export arg (note that it does not work with pybindings). You can also set "--target_size", which splits linear layers into smaller sizes for the ANE (the default is no splitting). This can have a substantial impact on performance. For example, setting "--target_size" to 1024 for Llama1B, we see a 1.34x increase in inference speed on an M1 Pro (although loading time increases). We need further experiments to tune this.
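As a rough illustration of what "--use_cache_list" changes, here is a minimal sketch of the two KV-cache layouts. The tensor shapes and dimension names below are assumptions for illustration only, not the exact ones produced by the export script.

```python
import torch

# Illustrative dimensions only (assumed, not the exporter's exact values).
# cache_len follows the default cache_size = max_seq_length - seq_length (1024 - 64).
n_layers, batch, n_kv_heads, cache_len, head_dim = 16, 1, 8, 960, 64

# Default: the K (or V) caches for all layers are packed into one 5D tensor.
k_cache_packed = torch.zeros(n_layers, batch, n_kv_heads, cache_len, head_dim)

# With --use_cache_list: one 4D tensor per layer, passed as a list.
k_cache_list = [
    torch.zeros(batch, n_kv_heads, cache_len, head_dim) for _ in range(n_layers)
]
```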
The runner is written in Python and is only intended to serve as an example of how the model inputs should be processed; it is not performant.
Run model with:

```
python run.py -m /path/to/model.pte -p /path/to/params.json -t /path/to/tokenizer.model --seq_length 64 --max_seq_length 1024 --prompt "Once upon a time," --n_steps 512
```
The model here is based on a "sliding" cache, where old tokens are evicted from the cache. There is no actual sliding in the implementation, though.
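To make the eviction behavior concrete, here is a minimal, self-contained sketch of a fixed-size cache, assuming eviction simply keeps the most recent entries. It is an illustration of the behavior described above, not the model's actual implementation.

```python
import torch

def append_to_cache(cache: torch.Tensor, new_kv: torch.Tensor) -> torch.Tensor:
    """Append new_kv along the sequence dimension, evicting the oldest entries
    so the cache never grows beyond its fixed length. Illustrative shapes:
    cache is (batch, heads, cache_size, head_dim),
    new_kv is (batch, heads, seq_length, head_dim)."""
    cache_size = cache.shape[2]
    combined = torch.cat([cache, new_kv], dim=2)
    # Keep only the most recent cache_size positions; evicted tokens
    # no longer participate in attention.
    return combined[:, :, -cache_size:, :]

# Example: a cache of 8 positions, appending 3 new tokens per step.
cache = torch.zeros(1, 2, 8, 4)
new_kv = torch.randn(1, 2, 3, 4)
cache = append_to_cache(cache, new_kv)  # the 3 oldest positions are dropped
```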
## Export args
* seq_length: the number of tokens processed by the model. Sequences shorter than seq_length must be padded, and longer sequences must be chunked.
* max_seq_length: the maximum number of context tokens that can be processed.
* cache_size: the size of the KV cache sequences. This parameter is optional, and defaults to max_seq_length - seq_length. If a smaller cache_size is used, older tokens are evicted from the cache and no longer play a role in attention. For example, if max_seq_length=1024, but cache_size is 512, the model can generate up to 1024 tokens, but only the current tokens and the previous 512 will participate in attention. In terms of computation, cache_size plays a similar role to max_seq_length in models without cache eviction.
* use_cache_list: boolean option that controls whether KV caches are passed as a list of 4D tensors, one per layer, or as a single 5D tensor. (Note that use_cache_list does not work with ExecuTorch pybindings.)
* target_size: this option splits linear layers into chunks of target_size. For example, if target_size is 1024, a linear layer with (in_features=512, out_features=8192) will be split into 8 linear layers with (in_features=512, out_features=1024) and the results concatenated (see the sketch after this list). If not specified, the default is no splitting.
* max_splits: this controls the maximum number of splits for linear layers. It is only relevant if target_size is passed and defaults to 8.
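The splitting controlled by target_size and max_splits can be pictured with the sketch below. It is a simplified stand-in for the exporter's actual transformation, assuming the split happens along out_features and the per-chunk outputs are concatenated back together; the helper name split_linear is made up for this example.

```python
import torch
import torch.nn as nn

def split_linear(linear: nn.Linear, target_size: int, max_splits: int = 8) -> nn.ModuleList:
    """Split a Linear along out_features into chunks of roughly target_size,
    capped at max_splits pieces. Simplified illustration only."""
    out_features = linear.out_features
    n_splits = min(max(1, -(-out_features // target_size)), max_splits)  # ceil, then cap
    chunk = -(-out_features // n_splits)  # ceil division
    pieces = nn.ModuleList()
    start = 0
    while start < out_features:
        end = min(start + chunk, out_features)
        piece = nn.Linear(linear.in_features, end - start, bias=linear.bias is not None)
        piece.weight.data.copy_(linear.weight.data[start:end])
        if linear.bias is not None:
            piece.bias.data.copy_(linear.bias.data[start:end])
        pieces.append(piece)
        start = end
    return pieces

# Example: (in_features=512, out_features=8192) with target_size=1024 -> 8 pieces.
big = nn.Linear(512, 8192)
pieces = split_linear(big, target_size=1024)
x = torch.randn(1, 512)
y = torch.cat([p(x) for p in pieces], dim=-1)
assert torch.allclose(y, big(x), atol=1e-5)
```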
## Llama1B on iPhone 15
We are actively experimenting with different settings, but here are ones we've found that work well on iPhone 15 Pro for Llama1B: