Using HybridParallelPlugin can I do [2D, 2.5D, 3D] tensor parallelism #6194
giriprasad51
started this conversation in
Community | General
Replies: 0 comments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
I am able to do 1D tensor parallelism with the code below. If possible, could you show how to do the remaining [2D, 2.5D, 3D] tensor parallelism using HybridParallelPlugin? Please help me.
%%writefile /kaggle/working/Training-RESNET-with-ColossalAI/vit_1D.py
from tqdm import tqdm
# For the network
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.optim.lr_scheduler import MultiStepLR
from torch.optim import Optimizer
# For datasets
import torchvision.transforms as transforms
import torchvision.datasets as datasets
# For dataloader
from torch.utils.data import DataLoader
# For distributed training
import colossalai
from colossalai.cluster import DistCoordinator
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.nn.optimizer import HybridAdam
from colossalai.accelerator import get_accelerator
# For Vision Transformer
from transformers import ViTConfig, ViTForImageClassification
# Prepare hyperparameters
# Number of full passes over the CIFAR-10 training set.
NUM_EPOCHS = 10
# Base learning rate (presumably for HybridAdam — see imports; confirm against main()).
LEARNING_RATE = 0.001
def get_train_transform_augmentation():
    """Build the augmentation pipeline applied to CIFAR-10 training images.

    Pads by 4 pixels, applies a random horizontal flip, takes a random
    32x32 crop, and converts the PIL image to a tensor.
    """
    augmentations = [
        transforms.Pad(4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32),
        transforms.ToTensor(),
    ]
    return transforms.Compose(augmentations)
def get_test_transform_augmentation():
    """Return the evaluation-time transform: tensor conversion only, no augmentation."""
    to_tensor = transforms.ToTensor()
    return to_tensor
def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
    """Build distributed train/test dataloaders for CIFAR-10.

    Args:
        batch_size: per-process batch size handed to the plugin's dataloader.
        coordinator: used to serialize dataset access across ranks.
        plugin: booster plugin that knows how to shard/sample the dataset.

    Returns:
        (train_dataloader, test_dataloader) tuple.

    NOTE(review): the original version created the datasets but never built
    or returned any DataLoader, leaving ``batch_size`` and ``plugin`` unused.
    """
    data_path = '/scratch/pusunuru/data'
    # priority_execution lets the master rank touch the dataset files first
    # so other ranks don't race it.
    with coordinator.priority_execution():
        train_dataset = datasets.CIFAR10(
            root=data_path, train=True, download=False, transform=get_train_transform_augmentation()
        )
        test_dataset = datasets.CIFAR10(
            root=data_path, train=False, download=False, transform=get_test_transform_augmentation()
        )
    # Let the plugin construct dataloaders with the correct distributed sampler.
    train_dataloader = plugin.prepare_dataloader(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True
    )
    test_dataloader = plugin.prepare_dataloader(
        test_dataset, batch_size=batch_size, shuffle=False
    )
    return train_dataloader, test_dataloader
def train(model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader, booster: Booster, coordinator: DistCoordinator):
    """Run one training epoch.

    Args:
        model: boosted model; ``model(images).logits`` assumes a HuggingFace
            ViTForImageClassification-style output — TODO confirm for other models.
        optimizer: boosted optimizer.
        criterion: loss function applied to (logits, labels).
        train_dataloader: iterable of (images, labels) batches.
        booster: used for ``booster.backward`` so plugin-managed mixed
            precision / parallelism is respected.
        coordinator: progress bar is shown on the master rank only.

    NOTE(review): the original version computed the loss but never called
    backward/step/zero_grad, so no parameter was ever updated.
    """
    model.train()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    with tqdm(train_dataloader, disable=not coordinator.is_master()) as data:
        for images, labels in data:
            images = images.to(device=device)
            labels = labels.to(device=device)
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            # Backward through the booster so the plugin (DDP / hybrid
            # parallel / AMP) can intercept gradient handling.
            optimizer.zero_grad()
            booster.backward(loss, optimizer)
            optimizer.step()
            data.set_postfix(loss=loss.item())
@torch.no_grad()
def test(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator):
    """Evaluate classification accuracy over all ranks and print it on master.

    Per-rank correct/total counts are kept as 1-element tensors so they can
    be summed across processes with ``dist.all_reduce`` before computing the
    global accuracy.
    """
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    accel_device = get_accelerator().get_current_device()
    correct = torch.zeros(1, device=accel_device)
    total = torch.zeros(1, device=accel_device)
    for images, labels in test_dataloader:
        images = images.to(device=device)
        labels = labels.to(device=device)
        logits = model(images).logits
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    # Aggregate counts from every process before computing the ratio.
    dist.all_reduce(correct)
    dist.all_reduce(total)
    accuracy = correct.item() / total.item()
    if coordinator.is_master():
        print(f"Accuracy of the model on the test: {accuracy * 100:.2f} %")
def main():
    """Initialize the ColossalAI distributed environment.

    NOTE(review): this is only a stub — it never builds the plugin, model,
    optimizer, dataloaders, or booster, so train()/test() above are never
    invoked. The full training pipeline still needs to be wired up here.
    """
    colossalai.launch_from_torch()
    coordinator = DistCoordinator()
    coordinator.print_on_master('hello world')
if name == "main":
main()
Beta Was this translation helpful? Give feedback.
All reactions