File tree — 2 files changed: +9 −5 lines changed

Original file line number  Diff line number  Diff line change @@ -205,9 +205,10 @@ def collect_best_params(block, low_gpu_mem_usage: bool = False):
205 205      if hasattr(m, "orig_layer"):
206 206          params[n] = {}
207 207          for key in m.params.keys():
208     -            params[n][key] = copy.deepcopy(m.params[key].data)
209 208              if low_gpu_mem_usage:
210     -                params[n][key] = params[n][key].cpu()
    209 +                params[n][key] = m.params[key].data.cpu()
    210 +            else:
    211 +                params[n][key] = copy.deepcopy(m.params[key].data)
211 212      return params
212 213
213 214
Original file line number  Diff line number  Diff line change @@ -855,10 +855,13 @@ def estimate_tuning_block_mem(
855 855      else:
856 856          output_memory_gb = 0.0
857 857
    858 +    if has_moe:
    859 +        pparent_module = get_module(block, layer_name.rsplit(".", 2)[0]) if "." in layer_name else block
    860 +        is_moe_expert = "expert" in layer_name.lower() and isinstance(pparent_module, torch.nn.ModuleList)
    861 +    else:
    862 +        is_moe_expert = False
    863 +
858 864      # memory * 2, because it contains grad tensor.
859     -    # Check if this is a MoE expert layer by layer name (e.g., "mlp.experts.0.gate_proj")
860     -    parent_module = get_module(block, layer_name.rsplit(".", 1)[0]) if "." in layer_name else block
861     -    is_moe_expert = "expert" in layer_name.lower() and isinstance(parent_module, torch.nn.ModuleList)
862 865      layer_memory_dict[layer_name] = {
863 866          "param_memory": param_memory_gb * 2,
864 867          "output_memory": output_memory_gb * 2,
You can’t perform that action at this time.
0 commit comments