
Commit

update chatgpt
jianzhnie committed May 22, 2024
1 parent 4edadc3 commit 8d19642
Showing 63 changed files with 236 additions and 3,321 deletions.
30 changes: 22 additions & 8 deletions .pre-commit-config.yaml
@@ -1,18 +1,18 @@
 repos:
-  - repo: https://github.com/PyCQA/flake8
-    rev: 3.8.3
+  - repo: https://gitee.com/openmmlab/mirrors-flake8
+    rev: 5.0.4
     hooks:
       - id: flake8
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.10.1
+  - repo: https://gitee.com/openmmlab/mirrors-isort
+    rev: 5.11.5
     hooks:
       - id: isort
-  - repo: https://github.com/pre-commit/mirrors-yapf
-    rev: v0.30.0
+  - repo: https://gitee.com/openmmlab/mirrors-yapf
+    rev: v0.32.0
     hooks:
       - id: yapf
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.1.0
+  - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
+    rev: v4.3.0
     hooks:
       - id: trailing-whitespace
       - id: check-yaml
@@ -24,3 +24,17 @@ repos:
         args: ["--remove"]
       - id: mixed-line-ending
         args: ["--fix=lf"]
+  - repo: https://gitee.com/openmmlab/mirrors-mdformat
+    rev: 0.7.9
+    hooks:
+      - id: mdformat
+        args: ["--number"]
+        additional_dependencies:
+          - mdformat-openmmlab
+          - mdformat_frontmatter
+          - linkify-it-py
+  - repo: https://gitee.com/openmmlab/mirrors-docformatter
+    rev: v1.3.1
+    hooks:
+      - id: docformatter
+        args: ["--in-place", "--wrap-descriptions", "79"]
2 changes: 2 additions & 0 deletions chatgpt/buffer/prompt_pipeline.py
@@ -9,6 +9,7 @@


 class BasePipeline(Dataset):
+
     def __init__(self, path: str = 'dataset'):
         super().__init__()

@@ -38,6 +39,7 @@ def create_loader(
 class PromptPipeline(BasePipeline):
     """Tokenizes prompts, unless they are already tokenized, and truncates them
     to `max_prompt_length` from the right."""
+
     def __init__(self, prompts: List[str], max_prompt_length: int,
                  tokenizer: PreTrainedTokenizer):
         super().__init__()
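
As a minimal sketch of the behaviour that docstring describes (the `prepare_prompt` helper is hypothetical, not part of the repository):

```python
from typing import List, Union

from transformers import PreTrainedTokenizer


def prepare_prompt(prompt: Union[str, List[int]], max_prompt_length: int,
                   tokenizer: PreTrainedTokenizer) -> List[int]:
    """Tokenize `prompt` unless it is already a list of token ids, then keep
    only the first `max_prompt_length` tokens, i.e. truncate from the right."""
    ids = tokenizer.encode(prompt) if isinstance(prompt, str) else prompt
    return ids[:max_prompt_length]
```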
4 changes: 4 additions & 0 deletions chatgpt/buffer/replay_buffer.py
@@ -47,6 +47,7 @@

 class DsExperienceDataset(Dataset):
     """Dataset to train the actor-critic models."""
+
     def __init__(self, memories: Deque[Memory]) -> None:
         super().__init__()
         self.data = list(memories)
@@ -70,6 +71,7 @@ def __getitem__(self, idx) -> Tuple:

 class ExperienceDataset(Dataset):
     """Dataset to train the actor-critic models."""
+
     def __init__(self, memories: Deque[Memory]) -> None:
         super().__init__()
         self.data = list(memories)
@@ -96,6 +98,7 @@ def __getitem__(self, idx) -> Tuple:


 class ExperienceMaker(ABC):
+
     def __init__(self,
                  actor: ActorModel,
                  critic: nn.Module,
@@ -148,6 +151,7 @@ def make_experience(self, input_ids: torch.Tensor,


 class ReplayBuffer(ABC):
+
     def __init__(self,
                  max_len: int = 10000,
                  sample_batch_size: int = 8,
3 changes: 3 additions & 0 deletions chatgpt/buffer/rollout.py
@@ -11,6 +11,7 @@


 class BaseRolloutStore(Dataset):
+
     def __init__(self, capacity=-1):
         self.history: Iterable[Any] = None
         self.capacity = capacity
@@ -42,6 +43,7 @@ def create_loader(self,

 class PPORolloutStorage(BaseRolloutStore):
     """Rollout storage for training PPO."""
+
     def __init__(self, pad_token_id):
         super().__init__()

@@ -73,6 +75,7 @@ def __len__(self) -> int:
         return len(self.history)

     def create_loader(self, batch_size: int, shuffle: bool) -> DataLoader:
+
         def collate_fn(elems: Iterable[PPORLElement]):
             return PPORLBatch(
                 # Left padding of already left-padded queries
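
The `collate_fn` above left-pads already left-padded queries. A minimal sketch of that left-padding trick, assuming variable-length 1-D query tensors (an illustration, not the repository's exact code):

```python
import torch
from torch.nn.utils.rnn import pad_sequence


def left_pad(seqs, pad_token_id):
    """Right-pad the reversed sequences, then flip back, so the padding
    lands on the left."""
    flipped = [s.flip(0) for s in seqs]
    padded = pad_sequence(flipped, batch_first=True, padding_value=pad_token_id)
    return padded.flip(1)


queries = [torch.tensor([5, 6]), torch.tensor([7, 8, 9])]
print(left_pad(queries, 0))  # tensor([[0, 5, 6], [7, 8, 9]])
```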
57 changes: 24 additions & 33 deletions chatgpt/dataset/README.md
## Collection of prompt datasets

I have collected a few datasets of prompts for training chat LLMs. The following are the datasets:

| dataset name | Dataset Class | Links | Size |
| :---: | :---: | :---: | :---: |
| lvwerra/stack-exchange-paired | StackExchangeParied | https://huggingface.co/datasets/lvwerra/stack-exchange-paired | |
| mosaicml/dolly_hhrlhf | MosaicMLDollyHhrlhf | https://huggingface.co/datasets/mosaicml/dolly_hhrlhf | |
| JosephusCheung/GuanacoDataset | GuanacoDataset | https://huggingface.co/datasets/josephuscheung/guanacodataset | |
| YeungNLP/firefly-train-1.1M | YeungNLP_Firefly | https://huggingface.co/datasets/yeungnlp/firefly-train-1.1M | |
| instinwild_ch | InstructWildDataset | https://github.com/XueFuzhao/InstructionWild/tree/main/data | |
| instinwild_en | InstructWildDataset | https://github.com/XueFuzhao/InstructionWild/tree/main/data | |
| llama_data | HuatuoMedDataset | https://github.com/SCIR-HI/Huatuo-Llama-Med-Chinese/tree/main/data | |
| live_cancer | HuatuoMedDataset | | |
| laion/OIG | LaionOIG | https://huggingface.co/datasets/laion/oig | |
| OpenAssistant/oasst1 | OpenAssistantOasst1 | https://huggingface.co/datasets/openassistant/oasst1 | 80k |
| BelleGroup/train_1M_CN | BelleGroupTrain1MCN | https://huggingface.co/datasets/bellegroup/train_1M_CN | 1M |
| BelleGroup/train_0.5M_CN | BelleGroupTrain05MCN | https://huggingface.co/datasets/bellegroup/train_0.5M_CN | 500k |
| tatsu-lab/alpaca | AlpacaDataset | https://huggingface.co/datasets/tatsu-lab/alpaca | 52k |
| yahma/alpaca-cleaned | AlpacaCleaned | https://huggingface.co/datasets/yahma/alpaca-cleaned | 52k |
| QingyiSi/Alpaca-CoT | AlpacaCoT | https://huggingface.co/datasets/qingyisi/alpaca-cot | |
| trans_chinese_alpaca_data | AlpacaChinese | https://github.com/LC1332/Luotuo-Chinese-LLM/tree/main/data | |
| trans_chinese_alpaca_data | AlpacaChinese | https://github.com/ymcui/Chinese-LLaMA-Alpaca/tree/main/data | |
| fnlp/moss-002-sft-data | FudanMossDataset | https://huggingface.co/datasets/fnlp/moss-002-sft-data | |
| nomic-ai/gpt4all-j-prompt-generations | Gpt4allPromptGeneration | https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations | |
| Dahoas/rm-static | DahoasRmstaticDataset | https://huggingface.co/datasets/dahoas/rm-static | 80k |
| Dahoas/full-hh-rlhf | DahoasFullhhrlhfDataset | https://huggingface.co/datasets/dahoas/full-hh-rlhf | 120k |
| Dahoas/synthetic-instruct-gptj-pairwise | DahoasSyntheticinstructgptjpairwiseDataset | https://huggingface.co/datasets/dahoas/synthetic-instruct-gptj-pairwise | 30k |
| yitingxie/rlhf-reward-datasets | YitingxieRlhfrewarddatasetsDataset | https://huggingface.co/datasets/yitingxie/rlhf-reward-datasets | 80k |
| openai/webgpt_comparisons | OpenaiWebgptcomparisonsDataset | https://huggingface.co/datasets/openai/webgpt_comparisons | 20k |
| stanfordnlp/SHP | StanfordnlpSHPDataset | https://huggingface.co/datasets/stanfordnlp/SHP | 50k |
| wangrui6/Zhihu-KOL | Wangrui6ZhihuKOLDataset | https://huggingface.co/datasets/wangrui6/Zhihu-KOL | 1M |
| Cohere/miracl-zh-queries-22-12 | CohereMiraclzhqueries2212Dataset | https://huggingface.co/datasets/cohere/miracl-zh-queries-22-12 | 10k |
| Hello-SimpleAI/HC3-Chinese | HelloSimpleAIHC3ChineseDataset | https://huggingface.co/datasets/hello-simpleai/HC3-Chinese | |
| mkqa-Chinese | MkqaChineseDataset | https://huggingface.co/datasets/mkqa/Chinese | |
| mkqa-Japanese | MkqaJapaneseDataset | https://huggingface.co/datasets/mkqa/Japanese | |
| Cohere/miracl-ja-queries-22-12 | CohereMiracljaqueries2212Dataset | https://huggingface.co/datasets/cohere/miracl-ja-queries-22-12 | 10k |
| lmqg/qg_jaquad | LmqgQgJaquadDataset | https://huggingface.co/datasets/lmqg/qg_jaquad | 30k |
| lmqg/qag_jaquad | LmqgQagJaquadDataset | https://huggingface.co/datasets/lmqg/qag_jaquad | 10k |

## Using the Datasets to Train an SFT Model


```python
HuggingFaceDataClass: Dict[str, Type] = {
    'Dahoas/rm-static': DahoasRmstaticDataset,
    # ... (remaining entries truncated in the diff view)
}

LocalDataClass: Dict[str, Type] = {
    # ... (entries truncated in the diff view)
}
```
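
A hypothetical lookup helper built on the two mappings above (the helper itself is illustrative, not part of the repository):

```python
from typing import Type


def get_dataset_class(name: str) -> Type:
    """Resolve a dataset name to its Dataset class, trying the Hugging Face
    mapping first and the local mapping second."""
    if name in HuggingFaceDataClass:
        return HuggingFaceDataClass[name]
    if name in LocalDataClass:
        return LocalDataClass[name]
    raise KeyError(f'Unknown dataset: {name}')


dataset_cls = get_dataset_class('Dahoas/rm-static')  # DahoasRmstaticDataset
```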



## Reference

- https://github.com/yaodongC/awesome-instruction-dataset
14 changes: 7 additions & 7 deletions chatgpt/dataset/data_utils.py
@@ -71,9 +71,8 @@ def get_raw_dataset(dataset_name: Optional[str] = None,
                     data_dir: Optional[str] = None,
                     test_data_ratio: float = 0.1,
                     seed: Optional[int] = None):
-    """
-    Given a dataset_name, returns an instance of the corresponding Dataset class,
-    initialized with the given test_data_ratio and seed arguments.
+    """Given a dataset_name, returns an instance of the corresponding Dataset
+    class, initialized with the given test_data_ratio and seed arguments.

     Args:
         dataset_name (str, optional): Name of the dataset to return.
@@ -110,8 +109,7 @@ def data_preprocess(
         tokenizer: Optional[PreTrainedTokenizer] = None,
         max_seq_len: int = 512,
         end_of_conversation_token: Optional[str] = None) -> PromptDataset:
-    """
-    Create different splits of a dataset based on the training phase.
+    """Create different splits of a dataset based on the training phase.

     Args:
         current_dataset (Dataset): The current state of the dataset.
@@ -175,8 +173,8 @@ def create_dataset(
         end_of_conversation_token: Optional[str] = None,
         seed: Optional[int] = None,
 ) -> Tuple:
-    """
-    A function that creates a training and evaluation dataset by splitting a raw dataset.
+    """A function that creates a training and evaluation dataset by splitting a
+    raw dataset.

     Args:
     - dataset_name (str): The name of the dataset to load.
@@ -285,6 +283,7 @@ def create_prompt_dataset(


 class DataCollatorReward:
+
     def __call__(self, data):
         batch = {}
         batch['input_ids'] = torch.cat([f[0]
@@ -297,6 +296,7 @@ def __call__(self, data):


 class DataCollatorRLHF:
+
     def __init__(self, max_token_len, inference_tp_size):
         self.max_token_len = max_token_len
         self.inference_tp_size = inference_tp_size
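
The truncated `DataCollatorReward.__call__` above appears to follow the common pattern for pairwise reward-model batches. A hedged sketch, assuming each element `f` is a `(chosen_ids, chosen_mask, rejected_ids, rejected_mask)` tuple:

```python
import torch


def collate_reward_batch(data):
    """Stack all chosen examples first, then all rejected ones, so the reward
    model can score both halves of each pair in one forward pass."""
    batch = {}
    batch['input_ids'] = torch.cat([f[0] for f in data] +
                                   [f[2] for f in data], dim=0)
    batch['attention_mask'] = torch.cat([f[1] for f in data] +
                                        [f[3] for f in data], dim=0)
    return batch
```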
1 change: 1 addition & 0 deletions chatgpt/dataset/multi_round_dialogue.py
@@ -5,6 +5,7 @@


 class UltraChatProcessor(object):
+
     def __init__(self):
         super().__init__()
2 changes: 2 additions & 0 deletions chatgpt/dataset/prompt_dataset.py
@@ -17,6 +17,7 @@ class TokenizedPromptDataset(Dataset):
         split (str): The split to use from the training data.
         max_length (int): The maximum length of the input sequences (default: 550).
     """
+
     def __init__(self,
                  data_path: str,
                  tokenizer: PreTrainedTokenizer,
@@ -68,6 +69,7 @@ class PromptDataset(Dataset):
         split (str): The split to use from the training data.
         max_length (int): The maximum length of the input sequences (default: 550).
     """
+
     def __init__(self,
                  data_path: str,
                  split: str,
5 changes: 5 additions & 0 deletions chatgpt/dataset/rank_dataset.py
@@ -89,6 +89,7 @@ def __call__(self, features):


 class WebGPT(Dataset):
+
     def __init__(self) -> None:
         super().__init__()

@@ -129,6 +130,7 @@ class HFSummary(Dataset):
     labeling method : pair comparison, 0 or 1
     """
+
     def __init__(self,
                  split='train',
                  conf_threshold=-1,
@@ -196,6 +198,7 @@ class HFDataset(Dataset):
     we should do something like this for supervised datasets
     """
+
     def __init__(self,
                  dataset_name,
                  question_field,
@@ -232,6 +235,7 @@ def __getitem__(self, index):


 class GPTJSynthetic(HFDataset):
+
     def __init__(self) -> None:
         super().__init__('Dahoas/synthetic-instruct-gptj-pairwise', 'prompt',
                          'chosen', 'rejected', None, 'train')
@@ -246,6 +250,7 @@ class AnthropicRLHF(Dataset):
     one "chosen" and one "rejected".
     valid train size : 160780
     """
+
     def preprocess_dialogue(self, text):
         """trim prefix text to last two pairs.
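
A hedged sketch of what "trim prefix text to last two pairs" suggests for Anthropic HH-RLHF dialogues, whose turns are delimited by `\n\nHuman:` and `\n\nAssistant:` (the exact rule in the repository may differ):

```python
def trim_to_last_two_pairs(text: str) -> str:
    """Keep only the last two Human/Assistant exchanges of a dialogue."""
    starts = [i for i in range(len(text)) if text.startswith('\n\nHuman:', i)]
    return text[starts[-2]:] if len(starts) >= 2 else text
```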
