Description
When training Llama3 I want to pad my unstructured text samples to the same length. This was addressed in #1394.
However, this means that the padded token sequence length ends up being the length of the longest sequence encountered (in my case effectively the longest sample in that specific dataset), because that is how the `padded_collate` function works: it pads to the longest sequence, not to a fixed length.
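As far as I can tell, its behaviour is roughly equivalent to this simplified sketch (pad every sample to the longest sequence in the batch; this is just an illustration, not torchtune's actual implementation):

```python
import torch
from torch.nn.utils.rnn import pad_sequence


def pad_to_longest(batch, pad_id=0):
    # Pads every sample in the batch to the length of the longest sample,
    # which is the behaviour I observe from padded_collate.
    tokens = [torch.tensor(sample["tokens"]) for sample in batch]
    return {"tokens": pad_sequence(tokens, batch_first=True, padding_value=pad_id)}
```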
For my Llama3 setup, however, I want my custom torch `Dataset` to produce sequences of a specific, externally defined length, like:
```python
import torch
from torch.utils.data import Dataset
from torchtune.datasets import text_completion_dataset
from torchtune.utils import padded_collate


def load_dataset(seq_length=2048):
    # tokenizer is the Llama3 tokenizer, built elsewhere
    dataset = text_completion_dataset(
        tokenizer,
        source="text",
        column="text",
        data_files="t8.shakespeare.txt",
        split="train",
        max_seq_len=seq_length,
        packed=False,
    )
    return dataset


def get_text_completion_dataset_tokens(seq_length, batch_size):
    dataset = load_dataset(seq_length=seq_length)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=0,
        shuffle=False,
        collate_fn=padded_collate,
    )
    tokens = []
    for batch in dataloader:
        for sample in batch["tokens"].tolist():
            tokens.append(sample)
    return tokens


class RandomTokenDataset(Dataset):
    def __init__(self, vocab_size: int, seq_length: int, batch_size: int):
        self.vocab_size = vocab_size
        self.seq_length = seq_length  # 8
        self.batch_size = batch_size  # 128
        self.tokens = get_text_completion_dataset_tokens(seq_length, batch_size)

    def __len__(self) -> int:
        return self.seq_length

    def __getitem__(self, item: int):
        return self.tokens[item]
```
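For reference, this is roughly how I check what actually comes out of the padded dataloader (a debugging snippet; the shapes I see are `[2, 35]`, not `[2, 128]`):

```python
import torch
from torchtune.utils import padded_collate

# Debugging snippet: inspect the padded batch length produced by padded_collate.
dataset = load_dataset(seq_length=128)
dataloader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=False, collate_fn=padded_collate
)
batch = next(iter(dataloader))
print(batch["tokens"].shape)  # torch.Size([2, 35]) in my case, not [2, 128]
```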
and on the Llama3 side:
```python
import lightning as L
import torch
import torch.nn.functional as F
from torch.distributed.tensor.parallel import loss_parallel
from torch.utils.data import DataLoader


class Llama3(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.model_args = ModelArgs(vocab_size=32000)
        self.model = Transformer(self.model_args)
        self.save_hyperparameters()  # save to logging

    def on_train_start(self) -> None:
        self.model.init_weights()

    def training_step(self, batch):
        inputs = batch[:, :-1] if torch.is_tensor(batch) else batch["tokens"][:, :-1]
        labels = batch[:, 1:] if torch.is_tensor(batch) else batch["tokens"][:, 1:]
        output = self.model(inputs)
        with loss_parallel():
            loss = F.cross_entropy(output.reshape(-1, output.size(-1)), labels.reshape(-1))
        return loss

    def on_train_batch_end(self, outputs, batch, batch_idx):
        loss = outputs["loss"]
        self.log("train_loss", loss, sync_dist=True, on_step=True, on_epoch=True, prog_bar=True, logger=self.logger)

    def backward(self, *args, **kwargs):
        with loss_parallel():
            super().backward(*args, **kwargs)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters(), lr=3e-3, foreach=True)

    def train_dataloader(self):
        seq_length = 128
        batch_size = 2
        num_workers = 4
        dataset = RandomTokenDataset(vocab_size=self.model_args.vocab_size, seq_length=seq_length, batch_size=batch_size)
        return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
```
So, how can I use the padding collate in torchtune to pad to a given sequence length rather than to the max tensor length?
The code above breaks because the tensor that reaches `output = self.model(inputs)` has shape `[2, 35]` (the longest sequence found in that dataset is 35 tokens), while the intended sequence length is 128; it would only work if the tensor shape were `[2, 128]`. After that I get a CUDA assertion failure, `srcIndex < srcSelectDimSize` failed, because the tokenizer's token ids do not match the model's vocabulary size.
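What I am effectively after is a collate that pads (or truncates) every batch to a fixed `seq_length`, something along the lines of this untested sketch (`fixed_length_collate` is just a name I made up, not an existing torchtune API):

```python
import torch
import torch.nn.functional as F


def fixed_length_collate(batch, seq_length=128, pad_id=0):
    # Pad (or truncate) every sample to exactly seq_length, so the model
    # always receives tensors of shape [batch_size, seq_length].
    padded = []
    for sample in batch:
        tokens = torch.tensor(sample["tokens"])[:seq_length]
        tokens = F.pad(tokens, (0, seq_length - tokens.size(0)), value=pad_id)
        padded.append(tokens)
    return {"tokens": torch.stack(padded)}
```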
To be more specific, the dimensionality issue happens in the forward pass of the `Transformer` module used by the `Llama3` class, where the embeddings are looked up from the tokens (`h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens`):
```python
import torch
import torch.nn as nn

# ModelArgs, TransformerBlock and RMSNorm are defined elsewhere (omitted here).


class Transformer(nn.Module):
    def __init__(self, model_args: ModelArgs):
        super().__init__()
        self.model_args = model_args
        self.vocab_size = model_args.vocab_size
        self.n_layers = model_args.n_layers

        # vocab_size=32000, dim=3200
        self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)

        self.layers = torch.nn.ModuleDict()
        for layer_id in range(model_args.n_layers):
            self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)

        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
        self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
        self.init_weights()

    # ....

    def forward(self, tokens: torch.Tensor):
        """Perform a forward pass through the Transformer model.

        Args:
            tokens (torch.Tensor): Input token indices.

        Returns:
            torch.Tensor: Output logits after applying the Transformer model.
        """
        # error here: the tokenizer output did not match the model vocabulary size
        # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages
        h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens

        for layer in self.layers.values():
            h = layer(h, self.freqs_cis)

        h = self.norm(h) if self.norm else h
        return self.output(h).float() if self.output else h
```
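For what it's worth, the assertion itself seems to come from the embedding lookup receiving token ids that are out of range for the configured vocab size; a minimal reproduction with a plain `nn.Embedding` (on CPU this shows up as an `IndexError` instead of the CUDA assert):

```python
import torch
import torch.nn as nn

# Minimal reproduction (assumption: the Llama3 tokenizer can emit token ids
# larger than the vocab_size=32000 configured in ModelArgs above).
emb = nn.Embedding(32000, 64)
tokens = torch.tensor([[1, 5, 127999]])  # contains an id >= 32000
h = emb(tokens)  # CPU: IndexError; CUDA: "srcIndex < srcSelectDimSize" assert
```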