PaddlePaddle Hackathon 56 Submission #1088

Merged · 25 commits · Nov 9, 2021
33 changes: 33 additions & 0 deletions community/junnyu/distilgpt2/README.md
# Detailed Introduction
# DistilGPT2
DistilGPT2 is an English language model pretrained on OpenWebTextCorpus (a reproduction of OpenAI's WebText dataset) under the supervision of the smallest version of GPT-2. The model has 6 layers, 768 hidden dimensions, and 12 attention heads, for a total of 82M parameters (compared with 124M for GPT-2). On average, DistilGPT2 is twice as fast as GPT-2.

On the WikiText-103 benchmark, GPT-2 reaches a perplexity of 16.3 on the test set, while DistilGPT2 reaches 21.1 (after fine-tuning on the training set).
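As a quick sanity check of the architecture figures above, the parameter count can be tallied directly from the loaded weights. This is a minimal sketch, assuming the checkpoint resolves under the same `"distilgpt2"` name used in the usage example below:

```python
import numpy as np
from paddlenlp.transformers import GPTLMHeadModel

model = GPTLMHeadModel.from_pretrained("distilgpt2")
# Sum the element counts over all parameter tensors; expect roughly 82M.
total = sum(np.prod(p.shape) for p in model.parameters())
print(f"{total / 1e6:.0f}M parameters")
```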


# Usage Example

```python
import paddle
from paddlenlp.transformers import GPTLMHeadModel, GPTTokenizer

path = "distilgpt2"
model = GPTLMHeadModel.from_pretrained(path)
model.eval()
tokenizer = GPTTokenizer.from_pretrained(path)
text = "Welcome to paddlenlp!"
# Convert the tokenizer output to int64 tensors and add a batch dimension.
inputs = {
    k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
    for k, v in tokenizer(text, return_token_type_ids=False).items()
}
with paddle.no_grad():
    logits = model(**inputs)

print(logits.shape)
```
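The printed `logits` have shape `[1, seq_len, vocab_size]`. As a minimal follow-up sketch (reusing `model` and `tokenizer` from above), the most likely next token can be read off the last position:

```python
import paddle

# Greedy next-token prediction: take the argmax over the vocabulary
# at the final position of the sequence.
next_id = int(paddle.argmax(logits[0, -1], axis=-1))
print(tokenizer.convert_ids_to_tokens([next_id]))
```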

# Weight Source

https://huggingface.co/distilgpt2
7 changes: 7 additions & 0 deletions community/junnyu/distilgpt2/files.json
{
  "model_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/distilgpt2/model_config.json",
  "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/distilgpt2/model_state.pdparams",
  "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/distilgpt2/tokenizer_config.json",
  "merges_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/distilgpt2/merges.txt",
  "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/distilgpt2/vocab.json"
}
116 changes: 116 additions & 0 deletions community/junnyu/gpt_compare.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import torch

from paddlenlp.transformers import (BertTokenizer, GPTForSequenceClassification,
                                    GPTForTokenClassification, GPTTokenizer)
from paddlenlp.transformers import GPTLMHeadModel as PDGPT2LMHeadModel
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel as PTGPT2LMHeadModel

paddle.set_grad_enabled(False)
torch.set_grad_enabled(False)


def compare(a, b):
    """Print the mean and max absolute difference between two tensors."""
    a = a.cpu().numpy()
    b = b.cpu().numpy()
    meandif = np.abs(a - b).mean()
    maxdif = np.abs(a - b).max()
    print("mean dif:", meandif)
    print("max dif:", maxdif)


def compare_lm(path="community/junnyu/microsoft-DialoGPT-small"):
    """Compare the LM logits of the Paddle and PyTorch checkpoints at `path`."""
    pdmodel = PDGPT2LMHeadModel.from_pretrained(path)
    ptmodel = PTGPT2LMHeadModel.from_pretrained(path).cuda()
    # Chinese checkpoints (e.g. uer-gpt2-chinese-poem) use a BERT-style vocab.
    if "chinese" in path:
        text = "欢迎使用paddlenlp!"
        tokenizer = BertTokenizer.from_pretrained(path)
    else:
        text = "Welcome to paddlenlp!"
        tokenizer = GPTTokenizer.from_pretrained(path)
    pdmodel.eval()
    ptmodel.eval()
    pdinputs = {
        k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
        for k, v in tokenizer(text, return_token_type_ids=False).items()
    }
    ptinputs = {
        k: torch.tensor(v, dtype=torch.long).unsqueeze(0).cuda()
        for k, v in tokenizer(text, return_token_type_ids=False).items()
    }

    pd_logits = pdmodel(**pdinputs)
    pt_logits = ptmodel(**ptinputs).logits

    compare(pd_logits, pt_logits)


def test_GPTForTokenClassification():
    tokenizer = GPTTokenizer.from_pretrained("community/junnyu/distilgpt2")
    m = GPTForTokenClassification.from_pretrained("community/junnyu/distilgpt2")
    inputs = tokenizer(
        "Welcome to use PaddlePaddle and PaddleNLP!",
        return_token_type_ids=False)
    inputs = {k: paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()}
    # Expected shape: [batch_size, seq_len, num_classes].
    logits = m(**inputs)
    print(logits.shape)


def test_GPTForSequenceClassification():
    paddle.set_grad_enabled(False)
    tokenizer = GPTTokenizer.from_pretrained("community/junnyu/distilgpt2")
    m = GPTForSequenceClassification.from_pretrained(
        "community/junnyu/distilgpt2")
    inputs = tokenizer(
        "Welcome to use PaddlePaddle and PaddleNLP!",
        return_token_type_ids=False)
    inputs = {k: paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()}
    # Expected shape: [batch_size, num_classes].
    logits = m(**inputs)
    print(logits.shape)


if __name__ == "__main__":
    # Recorded logit differences between the Paddle and PyTorch checkpoints:
    # compare_lm(path="community/junnyu/microsoft-DialoGPT-small")
    #   mean dif: 7.501994e-05, max dif: 0.00036621094
    # compare_lm(path="community/junnyu/distilgpt2")
    #   mean dif: 7.249901e-06, max dif: 5.340576e-05
    # compare_lm(path="community/junnyu/uer-gpt2-chinese-poem")
    #   mean dif: 1.0497178e-06, max dif: 1.335144e-05

    # test_GPTForTokenClassification()  # [1, 13, 2]
    test_GPTForSequenceClassification()  # [1, 2]
99 changes: 99 additions & 0 deletions community/junnyu/gpt_convert_huggingface2paddle.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict
import argparse

huggingface_to_paddle = {
    "transformer.wte.weight": "gpt.embeddings.word_embeddings.weight",
    "transformer.wpe.weight": "gpt.embeddings.position_embeddings.weight",
    "transformer.h.": "gpt.decoder.layers.",
    ".attn.c_proj.": ".self_attn.out_proj.",
    ".ln_1.": ".norm1.",
    ".mlp.c_fc.": ".linear1.",
    ".mlp.c_proj.": ".linear2.",
    ".ln_2.": ".norm2.",
    "transformer.ln_f.": "gpt.decoder.norm.",
    "lm_head.weight": "lm_head.decoder_weight"
}

skip_weights = [".attn.bias", "lm_head.weight"]
dont_transpose = [
    ".wte.weight", ".wpe.weight", ".ln_", ".mlp.c_proj.", ".mlp.c_fc.",
    ".attn.c_proj.", "lm_head.weight"
]


# Note: the weight of huggingface's Conv1D layer has the same shape as the weight
# of paddle.nn.Linear, so it does not need to be transposed.
# Weights coming from torch.nn.Linear, however, do need to be transposed!
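# Illustration (hypothetical shapes for hidden size 768):
#   HF Conv1D weight of .mlp.c_fc.  -> [768, 3072], already [in, out] like paddle.nn.Linear
#   torch.nn.Linear weight          -> [out, in],   so it needs .transpose(0, 1)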
def convert_pytorch_checkpoint_to_paddle(pytorch_checkpoint_path,
                                         paddle_dump_path):
    import torch
    import paddle
    pytorch_state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
    paddle_state_dict = OrderedDict()
    for k, v in pytorch_state_dict.items():
        is_transpose = False
        if k in skip_weights:
            continue
        # The fused c_attn weight stores query, key and value together;
        # split it into the three projection weights Paddle expects.
        if ".attn.c_attn." in k:
            query_value_key = v.chunk(chunks=3, dim=-1)
            for cross_value, new_name in zip(query_value_key, [
                    ".self_attn.q_proj.", ".self_attn.k_proj.",
                    ".self_attn.v_proj."
            ]):
                oldk = k
                newk = k.replace("transformer.h.", "gpt.decoder.layers.").replace(
                    ".attn.c_attn.", new_name)
                paddle_state_dict[newk] = cross_value.data.numpy().astype("float32")
                print(f"Converting: {oldk} => {newk} | is_transpose {is_transpose}")
            continue

        # 2-D weights that do not come from HF Conv1D layers must be transposed.
        if k[-7:] == ".weight":
            if not any([w in k for w in dont_transpose]):
                if v.ndim == 2:
                    v = v.transpose(0, 1)
                    is_transpose = True
        oldk = k
        for huggingface_name, paddle_name in huggingface_to_paddle.items():
            k = k.replace(huggingface_name, paddle_name)

        print(f"Converting: {oldk} => {k} | is_transpose {is_transpose}")
        paddle_state_dict[k] = v.data.numpy().astype("float32")

    paddle.save(paddle_state_dict, paddle_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pytorch_checkpoint_path",
        default=r"community\junnyu\microsoft-DialoGPT-small\pytorch_model.bin",
        type=str,
        required=False,
        help="Path to the PyTorch checkpoint.")
    parser.add_argument(
        "--paddle_dump_path",
        default=r"community\junnyu\microsoft-DialoGPT-small\model_state.pdparams",
        type=str,
        required=False,
        help="Path to the output Paddle model.")
    args = parser.parse_args()
    convert_pytorch_checkpoint_to_paddle(args.pytorch_checkpoint_path,
                                         args.paddle_dump_path)
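For reference, a minimal Python driver for the converter might look like this (a sketch assuming the function above is in scope; the paths are illustrative and must point at an actually downloaded checkpoint):

```python
import paddle

# Hypothetical paths: a downloaded Hugging Face checkpoint and the Paddle output.
src = "community/junnyu/distilgpt2/pytorch_model.bin"
dst = "community/junnyu/distilgpt2/model_state.pdparams"
convert_pytorch_checkpoint_to_paddle(src, dst)

# Reload to verify that the converted state dict is readable.
state = paddle.load(dst)
print(len(state), "tensors converted")
```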
32 changes: 32 additions & 0 deletions community/junnyu/microsoft-DialoGPT-small/README.md
# Detailed Introduction
# microsoft-DialoGPT-small
A state-of-the-art large-scale pretrained response generation model (DialoGPT).
DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multi-turn conversations. Human evaluation results show that responses generated by DialoGPT are comparable in quality to human responses under a single-turn conversational Turing test. The model was trained on 147M multi-turn dialogues from Reddit discussion threads.


# Usage Example

```python
import paddle
from paddlenlp.transformers import GPTLMHeadModel, GPTTokenizer

path = "microsoft-DialoGPT-small"
model = GPTLMHeadModel.from_pretrained(path)
model.eval()
tokenizer = GPTTokenizer.from_pretrained(path)
text = "Welcome to paddlenlp!"
inputs = {
    k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
    for k, v in tokenizer(text, return_token_type_ids=False).items()
}
with paddle.no_grad():
    logits = model(**inputs)

print(logits.shape)
```
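Because DialoGPT models multi-turn dialogue, a more realistic prompt joins the turns with GPT-2's end-of-text marker, following upstream DialoGPT usage. A sketch reusing `model` and `tokenizer` from above (the dialogue turns are made up):

```python
# "<|endoftext|>" separates dialogue turns in DialoGPT's training format.
EOS = "<|endoftext|>"
history = ["Does money buy happiness?", "Depends how much money you spend."]
prompt = EOS.join(history) + EOS
input_ids = paddle.to_tensor(
    tokenizer(prompt, return_token_type_ids=False)["input_ids"],
    dtype="int64").unsqueeze(0)
with paddle.no_grad():
    logits = model(input_ids)
print(logits.shape)
```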

# Weight Source

https://huggingface.co/microsoft/DialoGPT-small
7 changes: 7 additions & 0 deletions community/junnyu/microsoft-DialoGPT-small/files.json
{
  "model_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/microsoft-DialoGPT-small/model_config.json",
  "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/microsoft-DialoGPT-small/model_state.pdparams",
  "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/microsoft-DialoGPT-small/tokenizer_config.json",
  "merges_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/microsoft-DialoGPT-small/merges.txt",
  "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/microsoft-DialoGPT-small/vocab.json"
}
33 changes: 33 additions & 0 deletions community/junnyu/uer-gpt2-chinese-poem/README.md
# Detailed Introduction
# Chinese Poem GPT2 Model
This model is used to generate classical Chinese poems.

# Training Data
The training data contains 800,000 classical Chinese poems collected by the chinese-poetry and Poetry projects.

# Usage Example

```python
import paddle
from paddlenlp.transformers import GPTLMHeadModel, BertTokenizer

path = "uer-gpt2-chinese-poem"
model = GPTLMHeadModel.from_pretrained(path)
model.eval()
tokenizer = BertTokenizer.from_pretrained(path)
text = "欢迎使用paddlenlp!"
inputs = {
    k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
    for k, v in tokenizer(text, return_token_type_ids=False).items()
}
with paddle.no_grad():
    logits = model(**inputs)

print(logits.shape)
```
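To generate poem text rather than just inspect the logits, a simple greedy decoding loop over the forward pass above suffices. This is a minimal sketch reusing `model` and `tokenizer`; the opening line is an arbitrary example prompt, and `model.generate` may also be available depending on the PaddleNLP version:

```python
# Tokenize a short prompt; drop the trailing [SEP] the BERT tokenizer appends,
# so that generation continues the prompt itself.
ids = tokenizer("梅山如积翠，", return_token_type_ids=False)["input_ids"][:-1]
ids = paddle.to_tensor(ids, dtype="int64").unsqueeze(0)
# Greedily append the most likely next token, 32 times.
for _ in range(32):
    with paddle.no_grad():
        logits = model(ids)
    next_id = paddle.argmax(logits[:, -1, :], axis=-1, keepdim=True)
    ids = paddle.concat([ids, next_id], axis=-1)
print("".join(tokenizer.convert_ids_to_tokens(ids[0].numpy().tolist())))
```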

# Weight Source

https://huggingface.co/uer/gpt2-chinese-poem
6 changes: 6 additions & 0 deletions community/junnyu/uer-gpt2-chinese-poem/files.json
{
  "model_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/uer-gpt2-chinese-poem/model_config.json",
  "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/uer-gpt2-chinese-poem/model_state.pdparams",
  "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/uer-gpt2-chinese-poem/tokenizer_config.json",
  "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/junnyu/uer-gpt2-chinese-poem/vocab.txt"
}