
Commit cffa508

committed: up
1 parent 92528e9 commit cffa508

File tree

3 files changed: +58 -2 lines changed

examples/apple/coreml/llama/export.py

Lines changed: 56 additions & 0 deletions
@@ -26,6 +26,50 @@
 from llama.llama_transformer import InputManager, ModelArgs, Transformer
 
 
+class SplitLinearModule(torch.nn.Module):
+    def __init__(self, in_features, out_features, target_size):
+        super(SplitLinearModule, self).__init__()
+        self.num_splits = max(out_features // target_size, 1)
+        self.common_size = out_features // self.num_splits
+        self.remainder = out_features % self.num_splits
+        self.splits = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(in_features, self.common_size)
+                for _ in range(self.num_splits)
+            ]
+        )
+        if self.remainder > 0:
+            self.splits.append(torch.nn.Linear(in_features, self.remainder))
+
+    def split_sizes(self):
+        return [split.out_features for split in self.splits]
+
+    def forward(self, x):
+        return torch.cat([split(x) for split in self.splits], dim=-1)
+
+
+def replace_linear_with_split_linear(model, target_size):
+    for name, module in model.named_children():
+        if isinstance(module, torch.nn.Linear):
+            new_module = SplitLinearModule(
+                module.in_features, module.out_features, target_size
+            )
+            split_sizes = new_module.split_sizes()
+            if module.bias is not None:
+                split_bias = module.bias.split(split_sizes)
+            split_weights = module.weight.split(split_sizes, dim=0)
+            for i, split in enumerate(new_module.splits):
+                split.weight = torch.nn.Parameter(split_weights[i])
+                if module.bias is not None:
+                    split.bias = torch.nn.Parameter(split_bias[i])
+                else:
+                    split.bias = None
+            setattr(model, name, new_module)
+        else:
+            replace_linear_with_split_linear(module, target_size)
+
+
 def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -80,6 +124,12 @@ def main() -> None:
         action="store_true",
         help="Use cache list to speed up model computation (does not work in pybindings)",
     )
+    parser.add_argument(
+        "--target_size",
+        type=int,
+        default=None,
+        help="Split linear layers into smaller chunks of target_size",
+    )
 
     export_args = parser.parse_args()
     params_path = export_args.params
@@ -129,6 +179,9 @@ def main() -> None:
             packed=(bitwidth in [2, 4]),
         ).quantized_model()
 
+    if export_args.target_size is not None:
+        replace_linear_with_split_linear(model, export_args.target_size)
+
     model = model.to(float_dtype)
 
     op_linear_quantizer_config = None
@@ -184,6 +237,9 @@ def main() -> None:
     print("Edge program")
     print(edge_manager.exported_program())
 
+    for node in edge_manager.exported_program().graph_module.graph.nodes:
+        print(node.name, node.target, node.args, node.kwargs)
+
     edge_manager = edge_manager.to_backend(partitioner)
 
     print("Delegated program")

examples/apple/coreml/llama/llama_transformer.py

Lines changed: 1 addition & 2 deletions
@@ -120,7 +120,6 @@ def __post_init__(self):
         if self.head_dim is None:
             self.head_dim = self.dim // self.n_heads
 
-
 class Rope(torch.nn.Module):
     def __init__(self, params: ModelArgs):
         super().__init__()
@@ -401,7 +400,7 @@ def forward(
 
         if not self.generate_full_logits:
             # Only the last logit is used for the new generated token
-            h = h[:, input_length - 1, :]
+            h = h[:, input_length - 1, :].squeeze(1)
 
         h = self.norm(h)
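A quick shape check of the .squeeze(1) change, under the assumption (implied by the added .squeeze(1)) that input_length reaches forward() as a 1-element tensor, so indexing with it keeps a size-1 dim:

```python
import torch

h = torch.randn(1, 64, 2048)      # (batch, seq_length, dim); values are illustrative
input_length = torch.tensor([3])  # number of valid tokens, as a 1-element tensor

last = h[:, input_length - 1, :]  # tensor indexing keeps the indexed dim: (1, 1, 2048)
print(last.shape, last.squeeze(1).shape)  # torch.Size([1, 1, 2048]) torch.Size([1, 2048])
```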

examples/apple/coreml/llama/readme.md

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ Export model with:
 python export.py -n /path/to/output/model.pte -p /path/to/params.json -c /path/to/model.pth --seq_length 64 --max_seq_length 1024 --coreml-quantize c4w
 ```
 
+For better performance, use the "--use_cache_list" export arg (it does not work with pybindings). You can also set "--target_size", which splits linear layers into smaller chunks for the ANE (the default is no splitting). This can have a substantial impact on performance: for example, on Llama1B, setting "--target_size" to 1024 gives a 1.34x increase in inference speed on an M1 Pro, although load time increases. Further experiments are needed to tune this value.
 
 The runner is written in python and is only intended to serve as an example for how the model inputs should be processed; it is not performant.
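For instance, an export invocation using both flags might look like the following (the paths are the same placeholders as in the readme command above; 1024 is the target size from the example cited in the added readme text):

```
python export.py -n /path/to/output/model.pte -p /path/to/params.json -c /path/to/model.pth --seq_length 64 --max_seq_length 1024 --coreml-quantize c4w --use_cache_list --target_size 1024
```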
