@@ -302,275 +302,6 @@ def test_static_afp8_export(self, static_kv_dtype):
             self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn)
         shutil.rmtree(quantized_model_path, ignore_errors=True)

-    def test_mxfp4_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "MXFP4"
-        layer_config = {}
-        fp_layers_str = "k_proj"
-        from auto_round.utils import get_fp_layer_names
-
-        not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str)
-        for name in not_quantize_layer_names:
-            layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"}
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=2,
-            seqlen=2,
-            layer_config=layer_config,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        autoround.quantize()
-        compressed_model = autoround.save_quantized(
-            output_dir=quantized_model_path, inplace=True, format="llm_compressor"
-        )
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_packed")
-            and tmp_layer.weight_scale.dtype is torch.uint8
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal MXFP4 packing name or data_type or shape"
-        assert not hasattr(skip_layer, "weight_scale") and not hasattr(  ## check skipped layers
-            skip_layer, "weight_packed"
-        ), "Illegal MXFP4 quantization for fp_layers"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "float-quantized"
-            and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
-            and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4
-        ), f"Invalid MXFP4 quantization configuration: {quantization_config}"
-
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_rtn_mxfp4_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "MXFP4"
-        layer_config = {}
-        fp_layers_str = "k_proj"
-        from auto_round.utils import get_fp_layer_names
-
-        not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str)
-        for name in not_quantize_layer_names:
-            layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"}
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=0,
-            seqlen=2,
-            layer_config=layer_config,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        autoround.quantize()
-        compressed_model = autoround.save_quantized(
-            output_dir=quantized_model_path, inplace=True, format="llm_compressor"
-        )
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_packed")
-            and tmp_layer.weight_scale.dtype is torch.uint8
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal MXFP4 packing name or data_type or shape"
-        assert not hasattr(skip_layer, "weight_scale") and not hasattr(  ## check skipped layers
-            skip_layer, "weight_packed"
-        ), "Illegal MXFP4 quantization for fp_layers"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "float-quantized"
-            and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
-            and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4
-        ), f"Invalid MXFP4 quantization configuration: {quantization_config}"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_mxfp8_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "MXFP8"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight")
-            and tmp_layer.weight.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.dtype is torch.uint8
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal MXFP8 packing name or data_type or shape"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "float-quantized"
-            and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
-            and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8
-        ), f"Invalid MXFP8 quantization configuration: {quantization_config}"
-        folder_size_gb = _get_folder_size(quantized_model_path)
-        # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty
-        assert (
-            0.15 < folder_size_gb < 0.2
-        ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.15~0.2 GB)"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "NVFP4"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_global_scale")
-            and hasattr(tmp_layer, "input_global_scale")
-            and tmp_layer.weight_packed.dtype is torch.uint8
-            and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal NVFP4 packing name or data_type or shape"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "nvfp4-pack-quantized"
-            and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4
-        ), f"Invalid NVFP4 quantization configuration: {quantization_config}"
-        folder_size_gb = _get_folder_size(quantized_model_path)
-        # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty
-        assert (
-            0.1 < folder_size_gb < 0.15
-        ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_autoround_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "NVFP4"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme="NVFP4",
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_global_scale")
-            and hasattr(tmp_layer, "input_global_scale")
-            and tmp_layer.weight_packed.dtype is torch.uint8
-            and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal NVFP4 packing name or data_type or shape"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_autoround_save_quantized(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "NVFP4"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme="NVFP4",
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        autoround.quantize()
-        compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_global_scale")
-            and hasattr(tmp_layer, "input_global_scale")
-            and tmp_layer.weight_packed.dtype is torch.uint8
-            and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal NVFP4 packing name or data_type or shape"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_moe_actmax_rtn(self):
-        model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
-        layer_config = {
-            "self_attn": {"bits": 16, "act_bits": 16},
-            "mlp.shared_experts": {"bits": 16, "act_bits": 16},
-        }
-        scheme = "nvfp4"
-        autoround = AutoRound(
-            model_name,
-            scheme=scheme,
-            iters=0,
-            seqlen=2,
-            nsamples=2,
-            dataset=self.llm_dataloader,
-            layer_config=layer_config,
-        )
-        compressed_model, _ = autoround.quantize()
-        assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max")
-
-    def test_nvfp4_moe_actmax_ar(self):
-        model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
-        layer_config = {
-            "q_proj": {"bits": 16, "act_bits": 16},
-            "mlp.shared_experts": {"bits": 16, "act_bits": 16},
-            "experts.*2": {"bits": 16, "act_bits": 16},
-            "experts.*5": {"bits": 16, "act_bits": 16},
-        }
-        scheme = "nvfp4"
-        autoround = AutoRound(
-            model_name,
-            scheme=scheme,
-            iters=1,
-            seqlen=2,
-            nsamples=2,
-            dataset=self.llm_dataloader,
-            layer_config=layer_config,
-        )
-        autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round")
-

 if __name__ == "__main__":
     unittest.main()