fix(nlp/bert): update bert README and add a script

wangkang-mt · wangkang-mt · commit 7a2c396c6857 · 2025-07-14T09:06:27.000-04:00
diff --git a/pytorch/nlp/bert/README.md b/pytorch/nlp/bert/README.md
@@ -3,6 +3,8 @@
 
 1. Prepare model
 ```
+apt-get update
+apt-get install git-lfs
 git lfs install
 git clone https://huggingface.co/google-bert/bert-base-chinese
 
@@ -32,8 +34,9 @@ bash run_train.sh
 bash run_dist_train.sh
 ```
 
-5. Inference
+5. Model Consistency Check
 ```shell
+# ⚠️  Before running, make sure the model directory passed to test_bert() in the __main__ block of test_bert.py points to your downloaded model
 cp -r test_bert.py bert4torch/test/models/
 python bert4torch/test/models/test_bert.py
 ```
diff --git a/pytorch/nlp/bert/test_bert.py b/pytorch/nlp/bert/test_bert.py
@@ -0,0 +1,48 @@
+import os
+
+import pytest
+import torch
+import torch_musa
+from bert4torch.models import build_transformer_model
+from bert4torch.tokenizers import Tokenizer
+from transformers import BertConfig, BertTokenizer, BertModel
+
+
+# Pick the MUSA backend through torch_musa's own availability check.
+# torch.cuda.is_available() reports CUDA availability, so on a MUSA-only host it
+# would presumably be False and the check would silently run on CPU.
+device = 'musa' if torch.musa.is_available() else 'cpu'
+
+def get_bert4torch_model(model_dir):
+    """Build the bert4torch model from ``model_dir`` and move it to ``device``.
+
+    Args:
+        model_dir: Directory containing the config JSON and ``pytorch_model.bin``.
+
+    Returns:
+        The model returned by ``build_transformer_model``, moved to ``device``.
+    """
+    # Prefer the bert4torch-specific config; fall back to the generic config.json.
+    # os.path.join avoids doubled separators when model_dir ends with a slash.
+    config_path = os.path.join(model_dir, "bert4torch_config.json")
+    if not os.path.exists(config_path):
+        config_path = os.path.join(model_dir, "config.json")
+    checkpoint_path = os.path.join(model_dir, 'pytorch_model.bin')
+
+    # Build the model and load the checkpoint weights.
+    model = build_transformer_model(config_path, checkpoint_path)
+    return model.to(device)
+
+
+def get_hf_model(model_dir):
+    """Load the Hugging Face BERT model and its tokenizer from ``model_dir``.
+
+    Args:
+        model_dir: Directory (or identifier) handed to ``from_pretrained``.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple; the model has been moved to ``device``.
+    """
+    hf_tokenizer = BertTokenizer.from_pretrained(model_dir)
+    hf_model = BertModel.from_pretrained(model_dir)
+    hf_model = hf_model.to(device)
+    return hf_model, hf_tokenizer
+
+
+@pytest.mark.parametrize(
+    "model_dir",
+    [
+        "E:/data/pretrain_ckpt/bert/google@bert-base-chinese",
+        "E:/data/pretrain_ckpt/bert/bert-base-multilingual-cased",
+        "E:/data/pretrain_ckpt/bert/hfl@macbert-base",
+        "E:/data/pretrain_ckpt/bert/hfl@chinese-bert-wwm-ext",
+    ],
+)
+@torch.inference_mode()
+def test_bert(model_dir):
+    """Compare bert4torch and Hugging Face BERT outputs for the same checkpoint.
+
+    Both models are loaded from ``model_dir``, fed the same tokenized sentence,
+    and the test asserts that the maximum absolute difference between the
+    bert4torch output and the Hugging Face ``last_hidden_state`` is below 1e-4.
+
+    Args:
+        model_dir: Directory holding the checkpoint shared by both models.
+    """
+    bert4torch_model = get_bert4torch_model(model_dir)
+    hf_model, hf_tokenizer = get_hf_model(model_dir)
+
+    # Put both models in eval mode before comparing their outputs.
+    bert4torch_model.eval()
+    hf_model.eval()
+
+    encoded = hf_tokenizer('语言模型', padding=True, return_tensors='pt').to(device)
+    bert4torch_output = bert4torch_model(**encoded)
+    hf_output = hf_model(**encoded).last_hidden_state
+
+    # Element-wise absolute difference, reused for the report and the assertion.
+    abs_diff = (bert4torch_output - hf_output).abs()
+    print(f"Output mean diff: {abs_diff.mean().item()}")
+
+    assert abs_diff.max().item() < 1e-4
+
+
+# Script entry point: run the consistency check once against a local model directory.
+if __name__ == '__main__':
+    test_bert("/data/bert-base-chinese/")