
[Feature Request] Padding Dataset to max_seq_length #1416

Closed
@loretoparisi

Description

When training Llama3 I wish to pad my unstructured text to the same length. This has been addressed by #1394.
However, this means that the token sequence length of each batch will be the length of the longest sequence found in it, because this is how the padded_collate function works.
For Llama3, instead, I want my custom torch Dataset to use a specific length defined externally, like:

def load_dataset(seq_length=2048):
    # `tokenizer` is the Llama3 tokenizer instance created elsewhere
    dataset = text_completion_dataset(
        tokenizer,
        source="text",
        column="text",
        data_files="t8.shakespeare.txt",
        split="train",
        max_seq_len=seq_length,
        packed=False
    )
    return dataset

def get_text_completion_dataset_tokens(seq_length, batch_size):
    from torchtune.utils import padded_collate
    dataset = load_dataset(seq_length=seq_length)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, num_workers=0, shuffle=False, collate_fn=padded_collate
    )
    tokens = []
    for batch in dataloader:
        # padded_collate pads every sample to the longest sequence in the batch
        for sample in batch['tokens'].tolist():
            tokens.append(sample)
    return tokens

class RandomTokenDataset(Dataset):
    def __init__(self, vocab_size: int, seq_length: int, batch_size: int):
        self.vocab_size = vocab_size
        self.seq_length = seq_length  # e.g. 8
        self.batch_size = batch_size  # e.g. 128

        self.tokens = get_text_completion_dataset_tokens(seq_length, batch_size)

    def __len__(self) -> int:
        # number of tokenized samples, not the sequence length
        return len(self.tokens)

    def __getitem__(self, item: int):
        return self.tokens[item]

and on the Llama3 side:

class Llama3(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.save_hyperparameters()  # save to logging
        self.model_args = ModelArgs(vocab_size=32000)
        self.model = Transformer(self.model_args)
        
    def on_train_start(self) -> None:
        self.model.init_weights()

    def training_step(self, batch):
        inputs = batch[:, :-1] if torch.is_tensor(batch) else batch['tokens'][:, :-1]
        labels = batch[:, 1:] if torch.is_tensor(batch) else batch['tokens'][:, 1:]
        
        output = self.model(inputs)
        
        with loss_parallel():
            loss = F.cross_entropy(output.reshape(-1, output.size(-1)), labels.reshape(-1))
            return loss
            
    def on_train_batch_end(self, outputs, batch, batch_idx):
        loss = outputs['loss']
        self.log('train_loss', loss, sync_dist=True, on_step=True, on_epoch=True, prog_bar=True, logger=self.logger)
        
    def backward(self, *args, **kwargs):
        with loss_parallel():
            super().backward(*args, **kwargs)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters(), lr=3e-3, foreach=True)

    def train_dataloader(self):
        
        seq_length = 128
        batch_size = 2
        num_workers = 4
        
        dataset = RandomTokenDataset(vocab_size=self.model_args.vocab_size, seq_length=seq_length, batch_size=batch_size)
        return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)

So, how can I use the padding collate in torchtune to pad to a given sequence length rather than to the max tensor length?
The code above breaks because the tensor reaching output = self.model(inputs) has size [2, 35] (the longest sequence found in that dataset is 35 tokens) while the sequence length I want is 128; it only works if the tensor size is [2, 128]. After that I get the CUDA error Assertion srcIndex < srcSelectDimSize failed, because the tokenizer's vocabulary does not match the model's vocab_size.
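What I am asking for is roughly the collate sketched below. To be clear, padded_collate_fixed, the padding_idx=0 default, and the assumption that each sample is a dict carrying a 'tokens' list are all mine, not a torchtune API (if the samples also carry 'labels', they would need the same treatment):

import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence


def padded_collate_fixed(batch, max_seq_len=128, padding_idx=0):
    # truncate each sample to max_seq_len, then pad the batch
    tokens = [torch.as_tensor(s["tokens"][:max_seq_len], dtype=torch.long) for s in batch]
    padded = pad_sequence(tokens, batch_first=True, padding_value=padding_idx)
    # pad_sequence only pads to the longest sequence in this batch,
    # so pad the time dimension out to max_seq_len if it is still shorter
    if padded.size(1) < max_seq_len:
        padded = F.pad(padded, (0, max_seq_len - padded.size(1)), value=padding_idx)
    return {"tokens": padded}


# usage sketch: every batch then has shape [batch_size, max_seq_len]
# dataloader = DataLoader(dataset, batch_size=2,
#                         collate_fn=functools.partial(padded_collate_fixed, max_seq_len=128))

With the collate_fn replaced this way, every batch comes out as [batch_size, 128] regardless of the longest sequence in the dataset.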

To be more specific, that dimensionality issue happens in the forward pass of the Transformer block of the Llama3 class, where the embeddings are looked up from the tokens via h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens (a small sanity check for this is sketched after the snippet):

class Transformer(nn.Module):
    def __init__(self, model_args: ModelArgs):
        super().__init__()
        self.model_args = model_args
        self.vocab_size = model_args.vocab_size
        self.n_layers = model_args.n_layers

        # vocab_size=32000, dim=3200
        self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
    
        self.layers = torch.nn.ModuleDict()
        for layer_id in range(model_args.n_layers):
            self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)

        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)

        self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
        self.init_weights()

# ....

    def forward(self, tokens: torch.Tensor):
        """Perform a forward pass through the Transformer model.

        Args:
            tokens (torch.Tensor): Input token indices.

        Returns:
            torch.Tensor: Output logits after applying the Transformer model.

        """
        
        # error here: my tokenizer output did not match the model vocabulary size
        # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages
        h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens

        for layer in self.layers.values():
            h = layer(h, self.freqs_cis)

        h = self.norm(h) if self.norm else h
        return self.output(h).float() if self.output else h
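
To debug this, a small check before the embedding lookup turns the opaque device-side assert into a readable error. check_token_range is just a helper I sketched for this issue, not part of torchtune or the code above:

import torch
import torch.nn as nn


def check_token_range(tokens: torch.Tensor, embedding: nn.Embedding) -> None:
    # the CUDA assert "srcIndex < srcSelectDimSize" in an embedding lookup means
    # some token id is >= num_embeddings; checking on CPU gives a readable error
    max_id = int(tokens.max())
    if max_id >= embedding.num_embeddings:
        raise ValueError(
            f"token id {max_id} is out of range for an embedding table of size "
            f"{embedding.num_embeddings}; ModelArgs.vocab_size (32000 here) must "
            f"match the tokenizer's vocabulary, and the Llama3 tokenizer produces "
            f"ids well above 32000"
        )

Calling check_token_range(tokens, self.tok_embeddings) at the top of forward confirms whether the failure comes from the vocabulary mismatch rather than from the padding itself.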
