Commit 230e03f

mark_lora_as_trainable (PaddlePaddle#5241)
1 parent 586755b commit 230e03f

3 files changed (+38 −6 lines)

paddlenlp/layers/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -18,6 +18,6 @@
     GPLinkerForEventExtraction,
     GPLinkerForRelationExtraction,
 )
-from .lora import LoRAConfig, LoRALinear, get_lora_model
+from .lora import *
 from .sequence import sequence_mask
 from .tcn import TCN, TemporalBlock

paddlenlp/layers/lora.py

Lines changed: 19 additions & 0 deletions

@@ -23,6 +23,14 @@
 import paddle.nn.functional as F

 from ..utils.env import LORA_CONFIG_NAME
+from ..utils.log import logger
+
+__all__ = [
+    "LoRAConfig",
+    "LoRALinear",
+    "get_lora_model",
+    "mark_only_lora_as_trainable",
+]


 class LoRALinear(nn.Linear):
@@ -116,6 +124,17 @@ def _find_and_replace_module(model, module_name, lora_config):
     setattr(parent_module, attribute_chain[-1], lora_module)


+def mark_only_lora_as_trainable(model: nn.Layer) -> None:
+    freeze_numel, trainable_numel = 0, 0
+    for name, weight in model.state_dict().items():
+        if "lora" not in name:
+            weight.stop_gradient = True
+            freeze_numel += weight.numel().numpy()[0]
+        else:
+            trainable_numel += weight.numel().numpy()[0]
+    logger.info(f"{freeze_numel:.2e} parameters are frozen, {trainable_numel:.2e} LoRA parameters are trainable")
+
+
 @dataclass
 class LoRAConfig:
     """

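For reference, a minimal usage sketch of the new helper together with the existing LoRA API. This is an illustrative sketch, not part of the commit: the LoRAConfig arguments (r, lora_alpha) and the target_modules regexes are assumed placeholders; only get_lora_model(model, lora_config) and mark_only_lora_as_trainable(lora_model) are taken from this diff.

# Hypothetical usage sketch (not from this commit).
# Assumptions: LoRAConfig accepts r, lora_alpha, and target_modules; the
# regex patterns below are illustrative placeholders.
from paddlenlp.layers import LoRAConfig, get_lora_model, mark_only_lora_as_trainable
from paddlenlp.transformers import AutoModel

model = AutoModel.from_pretrained("__internal_testing__/tiny-random-bert")
lora_config = LoRAConfig(r=4, lora_alpha=8, target_modules=[".*q_proj.*", ".*v_proj.*"])
lora_model = get_lora_model(model, lora_config)

# Freezes every weight whose name does not contain "lora" (stop_gradient=True)
# and logs the frozen vs. trainable parameter counts.
mark_only_lora_as_trainable(lora_model)

# Sanity check: only LoRA tensors should remain trainable after the call.
trainable = [name for name, w in lora_model.state_dict().items() if not w.stop_gradient]
assert all("lora" in name for name in trainable)
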
tests/layers/test_lora.py

Lines changed: 18 additions & 5 deletions

@@ -21,7 +21,12 @@
 import numpy as np
 import paddle

-from paddlenlp.layers import LoRAConfig, LoRALinear, get_lora_model
+from paddlenlp.layers import (
+    LoRAConfig,
+    LoRALinear,
+    get_lora_model,
+    mark_only_lora_as_trainable,
+)
 from paddlenlp.transformers import AutoModel


@@ -88,14 +93,22 @@ def test_get_lora_model(self):
             "__internal_testing__/tiny-random-bert", hidden_dropout_prob=0, attention_probs_dropout_prob=0
         )
         lora_model = get_lora_model(model, lora_config)
+        mark_only_lora_as_trainable(lora_model)
         state_dict = lora_model.state_dict()
         for weight_name in state_dict:
+            is_target_module = False
             for target_module in lora_config.target_modules:
                 if re.fullmatch(target_module, weight_name):
-                    if "lora" in weight_name:
-                        self.assertFalse(state_dict[weight_name].stop_gradient)
-                    else:
-                        self.assertTrue(state_dict[weight_name].stop_gradient)
+                    is_target_module = True
+            # if this is a target module, lora weights are trainable, non-lora weights are not
+            if is_target_module:
+                if "lora" in weight_name:
+                    self.assertFalse(state_dict[weight_name].stop_gradient)
+                else:
+                    self.assertTrue(state_dict[weight_name].stop_gradient)
+            # if this is not a target module, all weights are not trainable
+            else:
+                self.assertTrue(state_dict[weight_name].stop_gradient)
         input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20]))
         model.train()
         train_forward_results = model(input_ids)
