add a demo (scifi-train)
waylandzhang committed Apr 29, 2024
1 parent e7cbdca commit def9688
Showing 9 changed files with 459 additions and 0 deletions.
2 changes: 2 additions & 0 deletions llama-3/README.md
@@ -1,5 +1,7 @@
# Fine-tuning llama-3 in Chinese

This directory holds the code for fine-tuning llama-3 on Chinese data. It is unrelated to the other directories.

Currently the fastest and most GPU-efficient way to fine-tune llama-3 is the unsloth approach: they pre-quantize llama-3 to 4-bit, which reduces the memory required during fine-tuning.

The advantage of this approach is that you do not need to retrain the model from scratch; you only need to download the pre-trained model and then fine-tune it.
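
For reference, below is a minimal sketch of loading one of unsloth's pre-quantized 4-bit llama-3 checkpoints and attaching LoRA adapters. The model name, sequence length, and LoRA settings are illustrative assumptions, not the exact configuration used in this directory.

```python
# Minimal sketch (assumed settings): load a 4-bit pre-quantized llama-3 checkpoint
# with unsloth and attach LoRA adapters so only a small set of weights is trained.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",  # pre-quantized 4-bit checkpoint (assumed)
    max_seq_length=2048,                       # illustrative value
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                                   # LoRA rank (illustrative)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0.0,
)
```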
13 changes: 13 additions & 0 deletions scifi-demo/README.md
@@ -0,0 +1,13 @@
## About this repo

This directory contains the demo shown in the videos I published on Bilibili and Douyin.

It uses the same hand-written model as the root directory to demonstrate how fine-tuning on sci-fi novels works. It is for teaching purposes only; the code is not suitable for production.

### Preparing the datasets

The datasets need some preprocessing and are stored in the `/data` subfolder.

- Training dataset: lines 18~35 of the `tools_download` utility script. Unzip the archive first, then merge all of the novel txt files into a single txt file (a sketch of this step follows below).

- Fine-tuning dataset: lines 6~14 of the `tools_download` utility script. It downloads an instruction fine-tuning dataset from huggingface.
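
A minimal sketch of the preprocessing described above, assuming the novel archive from the Baidu pan link has been saved as `data/novels.zip` (the archive filename is an assumption); the merge logic mirrors the helpers in `tools_download.py`:

```python
import glob
import os
import zipfile

# Assumed filename for the archive downloaded from the Baidu pan link.
archive_path = "data/novels.zip"

# 1. Unzip the novel archive into the data/ folder.
with zipfile.ZipFile(archive_path) as zf:
    zf.extractall("data")

# 2. Merge every extracted .txt file into the single training file data/scifi.txt.
txt_files = glob.glob(os.path.join("data", "**", "*.txt"), recursive=True)
with open("data/scifi.txt", "w", encoding="utf-8") as outfile:
    for path in txt_files:
        with open(path, "r", encoding="utf-8", errors="ignore") as infile:
            outfile.write(infile.read() + "\n")  # newline between novels
```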
Empty file added scifi-demo/data/scifi.txt
Empty file.
101 changes: 101 additions & 0 deletions scifi-demo/finetune-scifi.py
@@ -0,0 +1,101 @@
"""
Fine-tune a model
"""
import sys
import json
import torch
import tiktoken
from model import Model


# Hyperparameters
batch_size = 8          # How many batches per training step
context_length = 128    # Length of the token chunk in each batch
max_iters = 1000        # Total number of training iterations <- Change this to a smaller number for testing
learning_rate = 1e-4    # 0.0001
eval_interval = 10      # How often to evaluate
eval_iters = 10         # Number of iterations to average for evaluation
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use the GPU if it is available
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)


# Prepare the fine-tuning data
with open('data/scifi-finetune.json', 'r', encoding='utf-8') as file:
    alpaca = json.load(file)
text = alpaca[1000:5001]  # Use a slice of the instruction records for this demo

# print(text)
# sys.exit(0)

# Use tiktoken's cl100k_base encoding (the tokenizer used by GPT-3.5/GPT-4) to tokenize the source text
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = encoding.encode(str(text))
tokenized_text = torch.tensor(tokenized_text, dtype=torch.long, device=device)  # Convert the 77,919 tokens into a PyTorch tensor

total_tokens = encoding.encode_ordinary(str(text))
print(f"The dataset contains {len(total_tokens):,} tokens in total")


# Split train and validation
train_size = int(len(tokenized_text) * 0.9)
train_data = tokenized_text[:train_size]
val_data = tokenized_text[train_size:]


# Initialize the model from the pre-trained sci-fi base checkpoint
model = Model()
model.load_state_dict(torch.load('model/model-scifi.pt', map_location=device))
model.to(device)

# Sample a random batch of (input, target) token chunks from the chosen split
def get_batch(split: str):
    data = train_data if split == 'train' else val_data
    idxs = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
    x = torch.stack([data[idx:idx + context_length] for idx in idxs]).to(device)
    y = torch.stack([data[idx + 1:idx + context_length + 1] for idx in idxs]).to(device)  # targets are the inputs shifted by one token
    return x, y


# Estimate the average loss over a few batches on the train and validation splits
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'valid']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x_batch, y_batch = get_batch(split)
            logits, loss = model(x_batch, y_batch)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


# Create the optimizer and run the training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
tracked_losses = list()
for step in range(max_iters):
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        tracked_losses.append(losses)
        print('Step:', step, 'Training Loss:', round(losses['train'].item(), 3),
              'Validation Loss:', round(losses['valid'].item(), 3))

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Save the model
torch.save(model.state_dict(), 'model/model-scifi-finetune.pt')





42 changes: 42 additions & 0 deletions scifi-demo/inference-scifi.py
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
Sample from a trained model
"""
import torch
import tiktoken
from model import Model


# Hyperparameters
device = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)
torch.cuda.manual_seed(TORCH_SEED)


encoding = tiktoken.get_encoding("cl100k_base")


# Initialize from the trained model checkpoint
model = Model()
model.load_state_dict(torch.load('model/model-scifi.pt', map_location=device))
model.eval()
model.to(device)

# start = 'Write a short story about Sam Altman.'
start = 'Sam Altman was born in'
start_ids = encoding.encode(start)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# Run generation
with torch.no_grad():
    y = model.generate(x, max_new_tokens=500)
print('---------------')
print(encoding.decode(y[0].tolist()))
print('---------------')



138 changes: 138 additions & 0 deletions scifi-demo/model.py
@@ -0,0 +1,138 @@
import math
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
context_length = 128 # Length of the token chunk each batch
d_model = 512 # The size of our model token embeddings
num_blocks = 12 # Number of transformer blocks
num_heads = 8 # Number of heads in Multi-head attention
dropout = 0.1 # Dropout rate
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Use GPU if it's available.
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

# Define the feed-forward network
class FeedForwardNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.ffn(x)


# Define scaled dot-product attention (a single head)
class Attention(nn.Module):
    def __init__(self):
        super().__init__()
        self.Wq = nn.Linear(d_model, d_model // num_heads, bias=False)
        self.Wk = nn.Linear(d_model, d_model // num_heads, bias=False)
        self.Wv = nn.Linear(d_model, d_model // num_heads, bias=False)
        self.register_buffer('mask', torch.tril(torch.ones(context_length, context_length)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        q = self.Wq(x)
        k = self.Wk(x)
        v = self.Wv(x)

        # Causal attention: scale the scores, mask out future positions, softmax, then weight the values
        weights = (q @ k.transpose(-2, -1)) / math.sqrt(d_model // num_heads)
        weights = weights.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)

        output = weights @ v

        return output


# Define Multi-head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.heads = nn.ModuleList([Attention() for _ in range(num_heads)])
        self.projection_layer = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        head_outputs = [head(x) for head in self.heads]
        head_outputs = torch.cat(head_outputs, dim=-1)
        out = self.dropout(self.projection_layer(head_outputs))
        return out


# Define Transformer Block
class TransformerBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention()
        self.ffn = FeedForwardNetwork()

    def forward(self, x):
        x = x + self.mha(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x


# Define the model
class Model(nn.Module):
    def __init__(self, max_token_value=100256):  # defaults to the tiktoken cl100k_base vocabulary size
        super().__init__()
        self.token_embedding_lookup_table = nn.Embedding(max_token_value, d_model)
        self.transformer_blocks = nn.Sequential(*(
                [TransformerBlock() for _ in range(num_blocks)] +
                [nn.LayerNorm(d_model)]
        ))
        self.model_out_linear_layer = nn.Linear(d_model, max_token_value)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # Sinusoidal positional encoding, computed on the fly
        position_encoding_lookup_table = torch.zeros(context_length, d_model, device=device)
        position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)
        position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)
        # Crop position_encoding_lookup_table from (context_length, d_model) to (T, d_model)
        position_embedding = position_encoding_lookup_table[:T, :].to(device)
        x = self.token_embedding_lookup_table(idx) + position_embedding
        x = self.transformer_blocks(x)
        # Get the final logits
        logits = self.model_out_linear_layer(x)

        if targets is not None:
            B, T, C = logits.shape
            logits_reshaped = logits.view(B * T, C)
            targets_reshaped = targets.view(B * T)
            loss = F.cross_entropy(input=logits_reshaped, target=targets_reshaped)
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens=100):
        # idx is a (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the max size of our positional embeddings table
            idx_crop = idx[:, -context_length:]
            # Get predictions
            logits, loss = self.forward(idx_crop)
            # Take the last time step from logits, whose dimensions are (B, T, C)
            logits_last_timestep = logits[:, -1, :]
            # Apply softmax to get probabilities
            probs = F.softmax(input=logits_last_timestep, dim=-1)
            # Sample from the probability distribution
            idx_next = torch.multinomial(input=probs, num_samples=1)
            # Append the sampled index idx_next to idx
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

11 changes: 11 additions & 0 deletions scifi-demo/show-parameters.py
@@ -0,0 +1,11 @@
import torch
from model import Model

model = Model()
state_dict = torch.load('model/model-scifi.pt', map_location='cpu')
model.load_state_dict(state_dict)

# Calculate the number of trainable parameters
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total number of model parameters: {total_params:,}")
35 changes: 35 additions & 0 deletions scifi-demo/tools_download.py
@@ -0,0 +1,35 @@
import os
import sys
import requests
import glob

def download_file(url, save_path):
    response = requests.get(url, stream=True)
    with open(save_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

url = "https://huggingface.co/datasets/zxbsmk/webnovel_cn/resolve/main/novel_cn_token512_50k.json?download=true"
save_path = "data/scifi-finetune.json"
download_file(url, save_path)

sys.exit(0)  # Stop here: run the merge step below separately, after the novel archive has been extracted

# Download link for the novel training dataset: https://pan.baidu.com/s/1bC8fH8hyt28L9pV3fjOHIQ  extraction code: 9i9g
def find_txt_files(directory):
    return glob.glob(os.path.join(directory, '**', '*.txt'), recursive=True)

def concatenate_txt_files(files, output_file):
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in files:
            with open(file, 'r', encoding='utf-8', errors='ignore') as infile:
                outfile.write(infile.read() + '\n')  # Adds a newline between files

directory = 'data'
output_file = 'data/scifi.txt'

# Find all .txt files
txt_files = find_txt_files(directory)

# Concatenate all found .txt files into one
concatenate_txt_files(txt_files, output_file)