@@ -53,6 +53,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         intermediate_size=roberta.args.encoder_ffn_embed_dim,
         max_position_embeddings=514,
         type_vocab_size=1,
+        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
     )
     if classification_head:
         config.num_labels = roberta.args.num_classes
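Note: fairseq's LayerNorm is torch.nn.LayerNorm with the PyTorch default eps=1e-5, whereas BertConfig defaults to 1e-12; carrying the epsilon in the config is what lets the per-module `variance_epsilon` assignments be dropped in the hunks below. A minimal standalone sketch, not part of this commit, showing how the two defaults alone shift the output:

    import torch

    hidden = torch.randn(1, 4, 768)
    ln_fairseq = torch.nn.LayerNorm(768)                  # PyTorch default eps=1e-5, as in fairseq
    ln_bert_default = torch.nn.LayerNorm(768, eps=1e-12)  # BertConfig's historical default

    # Both modules start from the same affine parameters (weight=1, bias=0),
    # so any difference in the outputs comes from the epsilon alone.
    diff = (ln_fairseq(hidden) - ln_bert_default(hidden)).abs().max().item()
    print(f"max |diff| from eps alone: {diff:.2e}")       # small but nonzero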
@@ -69,7 +70,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
     model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
     model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
-    model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps

     for i in range(config.num_hidden_layers):
         # Encoder: start of layer
@@ -98,7 +98,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
         self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
         self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
-        self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps

         ### intermediate
         intermediate: BertIntermediate = layer.intermediate
@@ -117,7 +116,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         bert_output.dense.bias = roberta_layer.fc2.bias
         bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
         bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
-        bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps
         #### end of layer

     if classification_head:
@@ -131,7 +129,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
         model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
         model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
-        model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
         model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
         model.lm_head.bias = roberta.model.decoder.lm_head.bias

@@ -144,6 +141,8 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     else:
         their_output = roberta.model(input_ids)[0]
     print(our_output.shape, their_output.shape)
+    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
+    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
     success = torch.allclose(our_output, their_output, atol=1e-3)
     print(
         "Do both models output the same tensors?",