
Commit c862e9c

JunnYu and yingyibiao authored
PaddlePaddle Hackathon 52 submission (#1085)

* update
* update
* add community/junnyu
* update bert docs
* rm tokenizer links
* fix typo
* update TestBertForMaskedLM
* update
* remove the redundant attention mask check
* update readme
* update docs
* add
* Revert "update docs". This reverts commit 79169c2.
* update docs/model_zoo
* update modelzoo rst
* replace tab with space
* fix typo

Co-authored-by: yingyibiao <yyb0576@163.com>
1 parent 7d0d89d commit c862e9c

File tree

15 files changed: +2064 −454 lines


community/junnyu/bert_compare.py

Lines changed: 663 additions & 0 deletions
Large diffs are not rendered by default.

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
```python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict
import argparse

# Mapping from Hugging Face BERT parameter names to their PaddleNLP equivalents.
huggingface_to_paddle = {
    "embeddings.LayerNorm": "embeddings.layer_norm",
    "encoder.layer": "encoder.layers",
    "attention.self.query": "self_attn.q_proj",
    "attention.self.key": "self_attn.k_proj",
    "attention.self.value": "self_attn.v_proj",
    "attention.output.dense": "self_attn.out_proj",
    "intermediate.dense": "linear1",
    "output.dense": "linear2",
    "attention.output.LayerNorm": "norm1",
    "output.LayerNorm": "norm2",
    "predictions.decoder.": "predictions.decoder_",
    "predictions.transform.dense": "predictions.transform",
    "predictions.transform.LayerNorm": "predictions.layer_norm",
}


def convert_pytorch_checkpoint_to_paddle(pytorch_checkpoint_path,
                                         paddle_dump_path):
    import torch
    import paddle
    pytorch_state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
    paddle_state_dict = OrderedDict()
    for k, v in pytorch_state_dict.items():
        is_transpose = False
        # Linear weights are stored as (out, in) in PyTorch but (in, out) in
        # Paddle, so transpose every 2-D ".weight" tensor that is not an
        # embedding or LayerNorm parameter.
        if k[-7:] == ".weight":
            if ".embeddings." not in k and ".LayerNorm." not in k:
                if v.ndim == 2:
                    v = v.transpose(0, 1)
                    is_transpose = True
        oldk = k
        for huggingface_name, paddle_name in huggingface_to_paddle.items():
            k = k.replace(huggingface_name, paddle_name)

        # Parameters outside the bert./cls./classifier namespaces get the
        # "bert." prefix expected by PaddleNLP's BertModel.
        if "bert." not in k and "cls." not in k and "classifier" not in k:
            k = "bert." + k

        print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}")
        paddle_state_dict[k] = v.data.numpy()

    paddle.save(paddle_state_dict, paddle_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pytorch_checkpoint_path",
        default="MODEL/ckiplab-bert-base-chinese-ws/pytorch_model.bin",
        type=str,
        required=False,
        help="Path to the PyTorch checkpoint.")
    parser.add_argument(
        "--paddle_dump_path",
        default="MODEL/ckiplab-bert-base-chinese-ws/model_state.pdparams",
        type=str,
        required=False,
        help="Path to the output Paddle model.")
    args = parser.parse_args()
    convert_pytorch_checkpoint_to_paddle(args.pytorch_checkpoint_path,
                                         args.paddle_dump_path)
```
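As a sanity check on the transpose step above, the weight-layout difference between the two frameworks can be observed directly. The following is a minimal sketch, assuming both `torch` and `paddle` are installed; it is not part of the converter itself:

```python
# torch.nn.Linear stores its weight with shape (out_features, in_features),
# while paddle.nn.Linear stores (in_features, out_features).
import torch
import paddle

t_linear = torch.nn.Linear(4, 8)
p_linear = paddle.nn.Linear(4, 8)
print(t_linear.weight.shape)  # torch.Size([8, 4])
print(p_linear.weight.shape)  # [4, 8]

# This is why the converter transposes every 2-D ".weight" tensor that is
# not an embedding or LayerNorm weight before saving it for Paddle.
```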
Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
# Detailed description

**Description**: ckiplab-bert-base-chinese-ner is a BERT model with a token-classification head that has been fine-tuned on a **named entity recognition task**.

For full usage instructions and other details, see https://github.com/ckiplab/ckip-transformers

**Model architecture**: **`BertForTokenClassification`**, a BERT model with a token-classification head.
**Supported downstream task**: **named entity recognition**; the weights have already been fine-tuned on the downstream `NER` task and can be used directly.

# Usage example

```python
import paddle
import paddle.nn.functional as F
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer

path = "junnyu/ckiplab-bert-base-chinese-ner"
model = BertForTokenClassification.from_pretrained(path)
model.eval()
tokenizer = BertTokenizer.from_pretrained(path)
text = "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。"
tokenized_text = tokenizer.tokenize(text)
inputs = {
    k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
    for k, v in tokenizer(text).items()
}
with paddle.no_grad():
    score = F.softmax(model(**inputs), axis=-1)
id2label = {
    "0": "O",
    "1": "B-CARDINAL",
    "2": "B-DATE",
    "3": "B-EVENT",
    "4": "B-FAC",
    "5": "B-GPE",
    "6": "B-LANGUAGE",
    "7": "B-LAW",
    "8": "B-LOC",
    "9": "B-MONEY",
    "10": "B-NORP",
    "11": "B-ORDINAL",
    "12": "B-ORG",
    "13": "B-PERCENT",
    "14": "B-PERSON",
    "15": "B-PRODUCT",
    "16": "B-QUANTITY",
    "17": "B-TIME",
    "18": "B-WORK_OF_ART",
    "19": "I-CARDINAL",
    "20": "I-DATE",
    "21": "I-EVENT",
    "22": "I-FAC",
    "23": "I-GPE",
    "24": "I-LANGUAGE",
    "25": "I-LAW",
    "26": "I-LOC",
    "27": "I-MONEY",
    "28": "I-NORP",
    "29": "I-ORDINAL",
    "30": "I-ORG",
    "31": "I-PERCENT",
    "32": "I-PERSON",
    "33": "I-PRODUCT",
    "34": "I-QUANTITY",
    "35": "I-TIME",
    "36": "I-WORK_OF_ART",
    "37": "E-CARDINAL",
    "38": "E-DATE",
    "39": "E-EVENT",
    "40": "E-FAC",
    "41": "E-GPE",
    "42": "E-LANGUAGE",
    "43": "E-LAW",
    "44": "E-LOC",
    "45": "E-MONEY",
    "46": "E-NORP",
    "47": "E-ORDINAL",
    "48": "E-ORG",
    "49": "E-PERCENT",
    "50": "E-PERSON",
    "51": "E-PRODUCT",
    "52": "E-QUANTITY",
    "53": "E-TIME",
    "54": "E-WORK_OF_ART",
    "55": "S-CARDINAL",
    "56": "S-DATE",
    "57": "S-EVENT",
    "58": "S-FAC",
    "59": "S-GPE",
    "60": "S-LANGUAGE",
    "61": "S-LAW",
    "62": "S-LOC",
    "63": "S-MONEY",
    "64": "S-NORP",
    "65": "S-ORDINAL",
    "66": "S-ORG",
    "67": "S-PERCENT",
    "68": "S-PERSON",
    "69": "S-PRODUCT",
    "70": "S-QUANTITY",
    "71": "S-TIME",
    "72": "S-WORK_OF_ART"
}
for t, s in zip(tokenized_text, score[0][1:-1]):
    index = paddle.argmax(s).item()
    label = id2label[str(index)]
    print(f"{label} {t} score {s[index].item()}")

# B-PERSON 傅 score 0.9999995231628418
# I-PERSON 達 score 0.9999994039535522
# E-PERSON 仁 score 0.9999995231628418
# B-DATE 今 score 0.9991734623908997
# O 將 score 0.9852147698402405
# O 執 score 1.0
# O 行 score 0.9999998807907104
# O 安 score 0.9999996423721313
# O 樂 score 0.9999997615814209
# O 死 score 0.9999997615814209
# O , score 1.0
# O 卻 score 1.0
# O 突 score 1.0
# O 然 score 1.0
# O 爆 score 1.0
# O 出 score 1.0
# O 自 score 1.0
# O 己 score 1.0
# B-DATE 20 score 0.9999992847442627
# E-DATE 年 score 0.9999892711639404
# O 前 score 0.9999995231628418
# O 遭 score 1.0
# B-ORG 緯 score 0.9999990463256836
# I-ORG 來 score 0.9999986886978149
# I-ORG 體 score 0.999998927116394
# I-ORG 育 score 0.9999985694885254
# E-ORG 台 score 0.999998927116394
# O 封 score 1.0
# O 殺 score 1.0
# O , score 1.0
# O 他 score 1.0
# O 不 score 1.0
# O 懂 score 1.0
# O 自 score 1.0
# O 己 score 1.0
# O 哪 score 1.0
# O 裡 score 1.0
# O 得 score 1.0
# O 罪 score 1.0
# O 到 score 1.0
# O 電 score 1.0
# O 視 score 1.0
# O 台 score 1.0
# O 。 score 0.9999960660934448
```
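The per-token `B-`/`I-`/`E-`/`S-` labels can be folded into entity spans with a few lines of plain Python. The `decode_entities` helper below is an illustrative sketch, not part of PaddleNLP; it reuses `tokenized_text`, `score`, and `id2label` from the example above:

```python
def decode_entities(tokens, labels):
    """Fold B-/I-/E-/S- token labels into (entity_text, entity_type) spans."""
    entities, buf, ent_type = [], [], None

    def flush():
        # Emit any open span, then reset the buffer.
        nonlocal buf, ent_type
        if buf:
            entities.append(("".join(buf), ent_type))
        buf, ent_type = [], None

    for tok, lab in zip(tokens, labels):
        if lab.startswith("S-"):            # single-token entity
            flush()
            entities.append((tok, lab[2:]))
        elif lab.startswith("B-"):          # a new entity opens
            flush()
            buf, ent_type = [tok], lab[2:]
        elif lab.startswith(("I-", "E-")) and buf:
            buf.append(tok)                 # entity continues
            if lab.startswith("E-"):        # entity closes
                flush()
        else:                               # "O" ends any open span
            flush()
    return entities

labels = [id2label[str(paddle.argmax(s).item())] for s in score[0][1:-1]]
print(decode_entities(tokenized_text, labels))
# [('傅達仁', 'PERSON'), ('今', 'DATE'), ('20年', 'DATE'), ('緯來體育台', 'ORG')]
```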
# Weight source

https://huggingface.co/ckiplab/bert-base-chinese-ner

This project provides Traditional Chinese transformer models (including ALBERT, BERT, and GPT2) and natural language processing tools (including word segmentation, part-of-speech tagging, and named entity recognition).
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
```json
{
    "model_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-ner/model_config.json",
    "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-ner/model_state.pdparams",
    "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-ner/tokenizer_config.json",
    "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-ner/vocab.txt"
}
```
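This JSON appears to be the community-model file index that maps the `junnyu/ckiplab-bert-base-chinese-ner` name to its hosted config, weights, tokenizer config, and vocab, so the model can be loaded by name alone. A minimal usage sketch (same API as in the README above; the resolution behavior is my reading of PaddleNLP's community-models layout):

```python
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer

# With the URL mapping above in place, PaddleNLP resolves all model files
# for this community model from the model name alone.
name = "junnyu/ckiplab-bert-base-chinese-ner"
model = BertForTokenClassification.from_pretrained(name)
tokenizer = BertTokenizer.from_pretrained(name)
```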
Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
# Detailed description

**Description**: ckiplab-bert-base-chinese-pos is a BERT model with a token-classification head that has been fine-tuned on a **part-of-speech tagging task**.

For full usage instructions and other details, see https://github.com/ckiplab/ckip-transformers

**Model architecture**: **`BertForTokenClassification`**, a BERT model with a token-classification head.
**Supported downstream task**: **part-of-speech tagging**; the weights have already been fine-tuned on the downstream `POS` task and can be used directly.

# Usage example

```python
import paddle
import paddle.nn.functional as F
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer

path = "junnyu/ckiplab-bert-base-chinese-pos"
model = BertForTokenClassification.from_pretrained(path)
model.eval()
tokenizer = BertTokenizer.from_pretrained(path)
text = "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。"
tokenized_text = tokenizer.tokenize(text)
inputs = {
    k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
    for k, v in tokenizer(text).items()
}
with paddle.no_grad():
    score = F.softmax(model(**inputs), axis=-1)
id2label = {
    "0": "A",
    "1": "Caa",
    "2": "Cab",
    "3": "Cba",
    "4": "Cbb",
    "5": "D",
    "6": "Da",
    "7": "Dfa",
    "8": "Dfb",
    "9": "Di",
    "10": "Dk",
    "11": "DM",
    "12": "I",
    "13": "Na",
    "14": "Nb",
    "15": "Nc",
    "16": "Ncd",
    "17": "Nd",
    "18": "Nep",
    "19": "Neqa",
    "20": "Neqb",
    "21": "Nes",
    "22": "Neu",
    "23": "Nf",
    "24": "Ng",
    "25": "Nh",
    "26": "Nv",
    "27": "P",
    "28": "T",
    "29": "VA",
    "30": "VAC",
    "31": "VB",
    "32": "VC",
    "33": "VCL",
    "34": "VD",
    "35": "VF",
    "36": "VE",
    "37": "VG",
    "38": "VH",
    "39": "VHC",
    "40": "VI",
    "41": "VJ",
    "42": "VK",
    "43": "VL",
    "44": "V_2",
    "45": "DE",
    "46": "SHI",
    "47": "FW",
    "48": "COLONCATEGORY",
    "49": "COMMACATEGORY",
    "50": "DASHCATEGORY",
    "51": "DOTCATEGORY",
    "52": "ETCCATEGORY",
    "53": "EXCLAMATIONCATEGORY",
    "54": "PARENTHESISCATEGORY",
    "55": "PAUSECATEGORY",
    "56": "PERIODCATEGORY",
    "57": "QUESTIONCATEGORY",
    "58": "SEMICOLONCATEGORY",
    "59": "SPCHANGECATEGORY"
}
for t, s in zip(tokenized_text, score[0][1:-1]):
    index = paddle.argmax(s).item()
    label = id2label[str(index)]
    print(f"{label} {t} score {s[index].item()}")

# Nb 傅 score 0.9999998807907104
# Nb 達 score 0.9700667858123779
# Na 仁 score 0.9985846281051636
# Nd 今 score 0.9999947547912598
# D 將 score 0.9999957084655762
# VC 執 score 0.9999998807907104
# VC 行 score 0.9951109290122986
# Na 安 score 0.9999996423721313
# Na 樂 score 0.9999638795852661
# VH 死 score 0.9813857674598694
# COMMACATEGORY , score 1.0
# D 卻 score 1.0
# D 突 score 1.0
# Cbb 然 score 0.9989008903503418
# VJ 爆 score 0.9999979734420776
# VC 出 score 0.9965670108795166
# Nh 自 score 1.0
# Nh 己 score 1.0
# Neu 20 score 0.9999995231628418
# Nf 年 score 0.9125530123710632
# Ng 前 score 0.9999992847442627
# P 遭 score 1.0
# Nb 緯 score 0.9999996423721313
# VA 來 score 0.9322434663772583
# Na 體 score 0.9846553802490234
# Nc 育 score 0.729569137096405
# Nc 台 score 0.9999841451644897
# VC 封 score 0.9999997615814209
# VC 殺 score 0.9999991655349731
# COMMACATEGORY , score 1.0
# Nh 他 score 0.9999996423721313
# D 不 score 1.0
# VK 懂 score 1.0
# Nh 自 score 1.0
# Nh 己 score 0.9999978542327881
# Ncd 哪 score 0.9856181740760803
# Ncd 裡 score 0.9999995231628418
# VC 得 score 0.9999988079071045
# Na 罪 score 0.9994786381721497
# VCL 到 score 0.8332439661026001
# Nc 電 score 1.0
# Nc 視 score 0.9999986886978149
# Nc 台 score 0.9973978996276855
# PERIODCATEGORY 。 score 1.0
```
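For repeated use, the steps above can be wrapped into a single function. The `tag` helper below is a convenience sketch, not part of PaddleNLP; it reuses the `model`, `tokenizer`, and `id2label` objects defined in the example:

```python
def tag(text):
    """Return (token, label, confidence) triples for a piece of text."""
    tokens = tokenizer.tokenize(text)
    inputs = {
        k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
        for k, v in tokenizer(text).items()
    }
    with paddle.no_grad():
        probs = F.softmax(model(**inputs), axis=-1)
    results = []
    for tok, s in zip(tokens, probs[0][1:-1]):  # drop [CLS]/[SEP] positions
        idx = paddle.argmax(s).item()
        results.append((tok, id2label[str(idx)], s[idx].item()))
    return results

print(tag("傅達仁今將執行安樂死。")[:3])
```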
# Weight source

https://huggingface.co/ckiplab/bert-base-chinese-pos

This project provides Traditional Chinese transformer models (including ALBERT, BERT, and GPT2) and natural language processing tools (including word segmentation, part-of-speech tagging, and named entity recognition).
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
```json
{
    "model_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-pos/model_config.json",
    "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-pos/model_state.pdparams",
    "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-pos/tokenizer_config.json",
    "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/ckiplab-bert-base-chinese-pos/vocab.txt"
}
```
