
Commit

update chatgpt
jianzhnie committed May 22, 2024
1 parent 4edadc3 commit 8d19642
Showing 63 changed files with 236 additions and 3,321 deletions.
30 changes: 22 additions & 8 deletions .pre-commit-config.yaml
@@ -1,18 +1,18 @@
 repos:
-  - repo: https://github.com/PyCQA/flake8
-    rev: 3.8.3
+  - repo: https://gitee.com/openmmlab/mirrors-flake8
+    rev: 5.0.4
     hooks:
       - id: flake8
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.10.1
+  - repo: https://gitee.com/openmmlab/mirrors-isort
+    rev: 5.11.5
     hooks:
       - id: isort
-  - repo: https://github.com/pre-commit/mirrors-yapf
-    rev: v0.30.0
+  - repo: https://gitee.com/openmmlab/mirrors-yapf
+    rev: v0.32.0
     hooks:
       - id: yapf
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.1.0
+  - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
+    rev: v4.3.0
     hooks:
       - id: trailing-whitespace
       - id: check-yaml
@@ -24,3 +24,17 @@ repos:
         args: ["--remove"]
       - id: mixed-line-ending
         args: ["--fix=lf"]
+  - repo: https://gitee.com/openmmlab/mirrors-mdformat
+    rev: 0.7.9
+    hooks:
+      - id: mdformat
+        args: ["--number"]
+        additional_dependencies:
+          - mdformat-openmmlab
+          - mdformat_frontmatter
+          - linkify-it-py
+  - repo: https://gitee.com/openmmlab/mirrors-docformatter
+    rev: v1.3.1
+    hooks:
+      - id: docformatter
+        args: ["--in-place", "--wrap-descriptions", "79"]
2 changes: 2 additions & 0 deletions chatgpt/buffer/prompt_pipeline.py
@@ -9,6 +9,7 @@


 class BasePipeline(Dataset):
+
     def __init__(self, path: str = 'dataset'):
         super().__init__()

@@ -38,6 +39,7 @@ def create_loader(
 class PromptPipeline(BasePipeline):
     """Tokenizes prompts, unless they are already tokenized, and truncates them
     to `max_prompt_length` from the right."""
+
     def __init__(self, prompts: List[str], max_prompt_length: int,
                  tokenizer: PreTrainedTokenizer):
         super().__init__()
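
As a minimal sketch of the behaviour that docstring describes (the `prepare_prompt` helper is hypothetical, not part of the repository):

```python
from typing import List, Union

from transformers import PreTrainedTokenizer


def prepare_prompt(prompt: Union[str, List[int]], max_prompt_length: int,
                   tokenizer: PreTrainedTokenizer) -> List[int]:
    """Tokenize `prompt` unless it is already a list of token ids, then keep
    only the first `max_prompt_length` tokens, i.e. truncate from the right."""
    ids = tokenizer.encode(prompt) if isinstance(prompt, str) else prompt
    return ids[:max_prompt_length]
```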
4 changes: 4 additions & 0 deletions chatgpt/buffer/replay_buffer.py
@@ -47,6 +47,7 @@

 class DsExperienceDataset(Dataset):
     """Dataset to train the actor-critic models."""
+
     def __init__(self, memories: Deque[Memory]) -> None:
         super().__init__()
         self.data = list(memories)
@@ -70,6 +71,7 @@ def __getitem__(self, idx) -> Tuple:

 class ExperienceDataset(Dataset):
     """Dataset to train the actor-critic models."""
+
     def __init__(self, memories: Deque[Memory]) -> None:
         super().__init__()
         self.data = list(memories)
@@ -96,6 +98,7 @@ def __getitem__(self, idx) -> Tuple:


 class ExperienceMaker(ABC):
+
     def __init__(self,
                  actor: ActorModel,
                  critic: nn.Module,
@@ -148,6 +151,7 @@ def make_experience(self, input_ids: torch.Tensor,


 class ReplayBuffer(ABC):
+
     def __init__(self,
                  max_len: int = 10000,
                  sample_batch_size: int = 8,
3 changes: 3 additions & 0 deletions chatgpt/buffer/rollout.py
@@ -11,6 +11,7 @@


 class BaseRolloutStore(Dataset):
+
     def __init__(self, capacity=-1):
         self.history: Iterable[Any] = None
         self.capacity = capacity
@@ -42,6 +43,7 @@ def create_loader(self,

 class PPORolloutStorage(BaseRolloutStore):
     """Rollout storage for training PPO."""
+
     def __init__(self, pad_token_id):
         super().__init__()

@@ -73,6 +75,7 @@ def __len__(self) -> int:
         return len(self.history)

     def create_loader(self, batch_size: int, shuffle: bool) -> DataLoader:
+
         def collate_fn(elems: Iterable[PPORLElement]):
             return PPORLBatch(
                 # Left padding of already left-padded queries
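
The `collate_fn` above left-pads already left-padded queries. A minimal sketch of that left-padding trick, assuming variable-length 1-D query tensors (an illustration, not the repository's exact code):

```python
import torch
from torch.nn.utils.rnn import pad_sequence


def left_pad(seqs, pad_token_id):
    """Right-pad the reversed sequences, then flip back, so the padding
    lands on the left."""
    flipped = [s.flip(0) for s in seqs]
    padded = pad_sequence(flipped, batch_first=True, padding_value=pad_token_id)
    return padded.flip(1)


queries = [torch.tensor([5, 6]), torch.tensor([7, 8, 9])]
print(left_pad(queries, 0))  # tensor([[0, 5, 6], [7, 8, 9]])
```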
57 changes: 24 additions & 33 deletions chatgpt/dataset/README.md
## Collection of prompt datasets

I have collected a few datasets of prompts for training chat LLMs. The following are the datasets:

| dataset name | Dataset Class | Links | Size |
| :---: | :---: | :---: | :---: |
| lvwerra/stack-exchange-paired | StackExchangeParied | https://huggingface.co/datasets/lvwerra/stack-exchange-paired | |
| mosaicml/dolly_hhrlhf | MosaicMLDollyHhrlhf | https://huggingface.co/datasets/mosaicml/dolly_hhrlhf | |
| JosephusCheung/GuanacoDataset | GuanacoDataset | https://huggingface.co/datasets/josephuscheung/guanacodataset | |
| YeungNLP/firefly-train-1.1M | YeungNLP_Firefly | https://huggingface.co/datasets/yeungnlp/firefly-train-1.1M | |
| instinwild_ch | InstructWildDataset | https://github.com/XueFuzhao/InstructionWild/tree/main/data | |
| instinwild_en | InstructWildDataset | https://github.com/XueFuzhao/InstructionWild/tree/main/data | |
| llama_data | HuatuoMedDataset | https://github.com/SCIR-HI/Huatuo-Llama-Med-Chinese/tree/main/data | |
| live_cancer | HuatuoMedDataset | | |
| laion/OIG | LaionOIG | https://huggingface.co/datasets/laion/oig | |
| OpenAssistant/oasst1 | OpenAssistantOasst1 | https://huggingface.co/datasets/openassistant/oasst1 | 80k |
| BelleGroup/train_1M_CN | BelleGroupTrain1MCN | https://huggingface.co/datasets/bellegroup/train_1M_CN | 1M |
| BelleGroup/train_0.5M_CN | BelleGroupTrain05MCN | https://huggingface.co/datasets/bellegroup/train_0.5M_CN | 500k |
| tatsu-lab/alpaca | AlpacaDataset | https://huggingface.co/datasets/tatsu-lab/alpaca | 52k |
| yahma/alpaca-cleaned | AlpacaCleaned | https://huggingface.co/datasets/yahma/alpaca-cleaned | 52k |
| QingyiSi/Alpaca-CoT | AlpacaCoT | https://huggingface.co/datasets/qingyisi/alpaca-cot | |
| trans_chinese_alpaca_data | AlpacaChinese | https://github.com/LC1332/Luotuo-Chinese-LLM/tree/main/data | |
| trans_chinese_alpaca_data | AlpacaChinese | https://github.com/ymcui/Chinese-LLaMA-Alpaca/tree/main/data | |
| fnlp/moss-002-sft-data | FudanMossDataset | https://huggingface.co/datasets/fnlp/moss-002-sft-data | |
| nomic-ai/gpt4all-j-prompt-generations | Gpt4allPromptGeneration | https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations | |
| Dahoas/rm-static | DahoasRmstaticDataset | https://huggingface.co/datasets/dahoas/rm-static | 80k |
| Dahoas/full-hh-rlhf | DahoasFullhhrlhfDataset | https://huggingface.co/datasets/dahoas/full-hh-rlhf | 120k |
| Dahoas/synthetic-instruct-gptj-pairwise | DahoasSyntheticinstructgptjpairwiseDataset | https://huggingface.co/datasets/dahoas/synthetic-instruct-gptj-pairwise | 30k |
| yitingxie/rlhf-reward-datasets | YitingxieRlhfrewarddatasetsDataset | https://huggingface.co/datasets/yitingxie/rlhf-reward-datasets | 80k |
| openai/webgpt_comparisons | OpenaiWebgptcomparisonsDataset | https://huggingface.co/datasets/openai/webgpt_comparisons | 20k |
| stanfordnlp/SHP | StanfordnlpSHPDataset | https://huggingface.co/datasets/stanfordnlp/SHP | 50k |
| wangrui6/Zhihu-KOL | Wangrui6ZhihuKOLDataset | https://huggingface.co/datasets/wangrui6/Zhihu-KOL | 1M |
| Cohere/miracl-zh-queries-22-12 | CohereMiraclzhqueries2212Dataset | https://huggingface.co/datasets/cohere/miracl-zh-queries-22-12 | 10k |
| Hello-SimpleAI/HC3-Chinese | HelloSimpleAIHC3ChineseDataset | https://huggingface.co/datasets/hello-simpleai/HC3-Chinese | |
| mkqa-Chinese | MkqaChineseDataset | https://huggingface.co/datasets/mkqa/Chinese | |
| mkqa-Japanese | MkqaJapaneseDataset | https://huggingface.co/datasets/mkqa/Japanese | |
| Cohere/miracl-ja-queries-22-12 | CohereMiracljaqueries2212Dataset | https://huggingface.co/datasets/cohere/miracl-ja-queries-22-12 | 10k |
| lmqg/qg_jaquad | LmqgQgJaquadDataset | https://huggingface.co/datasets/lmqg/qg_jaquad | 30k |
| lmqg/qag_jaquad | LmqgQagJaquadDataset | https://huggingface.co/datasets/lmqg/qag_jaquad | 10k |

## Using the Datasets to Train an SFT Model


```python
HuggingFaceDataClass: Dict[str, Type] = {
    'Dahoas/rm-static': DahoasRmstaticDataset,
    # ... (remaining entries truncated in the diff view)
}

LocalDataClass: Dict[str, Type] = {
    # ... (entries truncated in the diff view)
}
```
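
A hypothetical lookup helper built on the two mappings above (the helper itself is illustrative, not part of the repository):

```python
from typing import Type


def get_dataset_class(name: str) -> Type:
    """Resolve a dataset name to its Dataset class, trying the Hugging Face
    mapping first and the local mapping second."""
    if name in HuggingFaceDataClass:
        return HuggingFaceDataClass[name]
    if name in LocalDataClass:
        return LocalDataClass[name]
    raise KeyError(f'Unknown dataset: {name}')


dataset_cls = get_dataset_class('Dahoas/rm-static')  # DahoasRmstaticDataset
```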



## Reference

- https://github.com/yaodongC/awesome-instruction-dataset
14 changes: 7 additions & 7 deletions chatgpt/dataset/data_utils.py
@@ -71,9 +71,8 @@ def get_raw_dataset(dataset_name: Optional[str] = None,
                     data_dir: Optional[str] = None,
                     test_data_ratio: float = 0.1,
                     seed: Optional[int] = None):
-    """
-    Given a dataset_name, returns an instance of the corresponding Dataset class,
-    initialized with the given test_data_ratio and seed arguments.
+    """Given a dataset_name, returns an instance of the corresponding Dataset
+    class, initialized with the given test_data_ratio and seed arguments.

     Args:
         dataset_name (str, optional): Name of the dataset to return.
@@ -110,8 +109,7 @@ def data_preprocess(
         tokenizer: Optional[PreTrainedTokenizer] = None,
         max_seq_len: int = 512,
         end_of_conversation_token: Optional[str] = None) -> PromptDataset:
-    """
-    Create different splits of a dataset based on the training phase.
+    """Create different splits of a dataset based on the training phase.

     Args:
         current_dataset (Dataset): The current state of the dataset.
@@ -175,8 +173,8 @@ def create_dataset(
         end_of_conversation_token: Optional[str] = None,
         seed: Optional[int] = None,
 ) -> Tuple:
-    """
-    A function that creates a training and evaluation dataset by splitting a raw dataset.
+    """A function that creates a training and evaluation dataset by splitting a
+    raw dataset.

     Args:
     - dataset_name (str): The name of the dataset to load.
@@ -285,6 +283,7 @@ def create_prompt_dataset(


 class DataCollatorReward:
+
     def __call__(self, data):
         batch = {}
         batch['input_ids'] = torch.cat([f[0]
@@ -297,6 +296,7 @@ def __call__(self, data):


 class DataCollatorRLHF:
+
     def __init__(self, max_token_len, inference_tp_size):
         self.max_token_len = max_token_len
         self.inference_tp_size = inference_tp_size
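
The truncated `DataCollatorReward.__call__` above appears to follow the common pattern for pairwise reward-model batches. A hedged sketch, assuming each element `f` is a `(chosen_ids, chosen_mask, rejected_ids, rejected_mask)` tuple:

```python
import torch


def collate_reward_batch(data):
    """Stack all chosen examples first, then all rejected ones, so the reward
    model can score both halves of each pair in one forward pass."""
    batch = {}
    batch['input_ids'] = torch.cat([f[0] for f in data] +
                                   [f[2] for f in data], dim=0)
    batch['attention_mask'] = torch.cat([f[1] for f in data] +
                                        [f[3] for f in data], dim=0)
    return batch
```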
1 change: 1 addition & 0 deletions chatgpt/dataset/multi_round_dialogue.py
@@ -5,6 +5,7 @@


 class UltraChatProcessor(object):
+
     def __init__(self):
         super().__init__()
2 changes: 2 additions & 0 deletions chatgpt/dataset/prompt_dataset.py
@@ -17,6 +17,7 @@ class TokenizedPromptDataset(Dataset):
         split (str): The split to use from the training data.
         max_length (int): The maximum length of the input sequences (default: 550).
     """
+
     def __init__(self,
                  data_path: str,
                  tokenizer: PreTrainedTokenizer,
@@ -68,6 +69,7 @@ class PromptDataset(Dataset):
         split (str): The split to use from the training data.
         max_length (int): The maximum length of the input sequences (default: 550).
     """
+
     def __init__(self,
                  data_path: str,
                  split: str,
5 changes: 5 additions & 0 deletions chatgpt/dataset/rank_dataset.py
@@ -89,6 +89,7 @@ def __call__(self, features):


 class WebGPT(Dataset):
+
     def __init__(self) -> None:
         super().__init__()

@@ -129,6 +130,7 @@ class HFSummary(Dataset):
     labeling method : pair comparison, 0 or 1
     """
+
     def __init__(self,
                  split='train',
                  conf_threshold=-1,
@@ -196,6 +198,7 @@ class HFDataset(Dataset):
     we should do something like this for supervised datasets
     """
+
     def __init__(self,
                  dataset_name,
                  question_field,
@@ -232,6 +235,7 @@ def __getitem__(self, index):


 class GPTJSynthetic(HFDataset):
+
     def __init__(self) -> None:
         super().__init__('Dahoas/synthetic-instruct-gptj-pairwise', 'prompt',
                          'chosen', 'rejected', None, 'train')
@@ -246,6 +250,7 @@ class AnthropicRLHF(Dataset):
     one "chosen" and one "rejected".
     valid train size : 160780
     """
+
     def preprocess_dialogue(self, text):
         """trim prefix text to last two pairs.
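
A hedged sketch of what "trim prefix text to last two pairs" suggests for Anthropic HH-RLHF dialogues, whose turns are delimited by `\n\nHuman:` and `\n\nAssistant:` (the exact rule in the repository may differ):

```python
def trim_to_last_two_pairs(text: str) -> str:
    """Keep only the last two Human/Assistant exchanges of a dialogue."""
    starts = [i for i in range(len(text)) if text.startswith('\n\nHuman:', i)]
    return text[starts[-2]:] if len(starts) >= 2 else text
```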
