Update references to torchvision
loadams committed Jan 8, 2025
1 parent 1842b4f commit f14c4b0
Showing 8 changed files with 47 additions and 47 deletions.
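Every file gets the same treatment: the "import torchvision.X as X" alias form is replaced with a namespace import, which leaves call sites untouched because both forms bind the same torchvision submodule. A minimal before/after sketch of the pattern (the Compose call and its values are illustrative only, not taken from any one file in this commit):

# Before (removed in this commit):
#   import torchvision.transforms as transforms
# After (added in this commit):
from torchvision import transforms

# Call sites are identical under either import style, e.g.:
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])
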
2 changes: 1 addition & 1 deletion training/cifar/cifar10_deepspeed.py
@@ -6,7 +6,7 @@
import torch.nn as nn
import torch.nn.functional as F
import torchvision
-import torchvision.transforms as transforms
+from torchvision import transforms
from deepspeed.accelerator import get_accelerator
from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer

2 changes: 1 addition & 1 deletion training/cifar/cifar10_tutorial.py
@@ -57,7 +57,7 @@
"""
import torch
import torchvision
-import torchvision.transforms as transforms
+from torchvision import transforms

########################################################################
# The outputs of torchvision datasets are PILImage images of range [0, 1].
46 changes: 23 additions & 23 deletions training/data_efficiency/vit_finetuning/main_imagenet.py
@@ -19,8 +19,8 @@
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
-import torchvision.transforms as transforms
-import torchvision.datasets as datasets
+from torchvision import transforms
+from torchvision import datasets
import torchvision.models as models
from torch.utils.data import Subset
import models
@@ -105,7 +105,7 @@ def _get_model(args):
nchannels = 3
model = models.__dict__[args.arch](num_classes=nclasses, nchannels=nchannels)
return model

def _get_dist_model(gpu, args):
ngpus_per_node = torch.cuda.device_count()
if args.distributed:
@@ -149,9 +149,9 @@ def _get_dist_model(gpu, args):
else:
model = torch.nn.DataParallel(model).cuda()
return model

def main():

args = parser.parse_args()

if args.seed is not None:
@@ -190,7 +190,7 @@ def main():
def main_worker(gpu, ngpus_per_node, args):
global best_acc1
global history

if args.deepspeed:
gpu = args.local_rank
args.gpu = gpu
@@ -205,7 +205,7 @@ def main_worker(gpu, ngpus_per_node, args):
deepspeed.init_distributed()
print(f'created model on gpu {gpu}')
# exit ()

# define loss function (criterion), optimizer, and learning rate scheduler
criterion = nn.CrossEntropyLoss().cuda(args.gpu)

@@ -284,14 +284,14 @@ def main_worker(gpu, ngpus_per_node, args):
validate(val_loader, model, criterion, args)
# return
args.completed_step = 0

optimizer = torch.optim.SGD(model.parameters(), args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)

"""Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
scheduler = StepLR(optimizer, step_size=int(len(train_loader)*args.epochs//3), gamma=0.1)# None #


model, optimizer, _, scheduler = deepspeed.initialize(
model=model,
@@ -311,17 +311,17 @@ def main_worker(gpu, ngpus_per_node, args):
time_epoch = time.time() - start_time
# evaluate on validation set
top5_val, top1_val, losses_val = validate(val_loader, model, criterion, args)
if args.gpu==0:
history["epoch"].append(epoch)
history["val_loss"].append(losses_val)
history["val_acc1"].append(top1_val)
history["val_acc5"].append(top5_val)
history["val_acc1"].append(top1_val)
history["val_acc5"].append(top5_val)
history["train_loss"].append(losses_train)
history["train_acc1"].append(top1_train)
history["train_acc1"].append(top1_train)
history["train_acc5"].append(top5_train)
torch.save(history,f"{args.out_dir}/stat.pt")
try:
print (f'{epoch} epoch at time {time_epoch}s and learning rate {scheduler.get_last_lr()}')
except:
print (f'{epoch} epoch at time {time_epoch}s and learning rate {args.lr}')
print (f"finish epoch {epoch} or iteration {args.completed_step}, train_accuracy is {top1_train}, val_accuracy {top1_val}")
@@ -393,14 +393,14 @@ def train(scheduler, train_loader, model, criterion, optimizer, epoch, args):
loss.backward()
optimizer.step()
scheduler.step()

# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()

if i % args.print_freq == 0 and args.gpu==0:
progress.display(i + 1)

if args.distributed:
losses.all_reduce()
top1.all_reduce()
@@ -432,7 +432,7 @@ def run_validate(loader, base_progress=0):
batch_time.update(time.time() - end)
end = time.time()

if i % args.print_freq == 0 and args.gpu==0:
progress.display(i + 1)

batch_time = AverageMeter('Time', ':6.3f', Summary.NONE)
@@ -509,7 +509,7 @@ def all_reduce(self):
def __str__(self):
fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
return fmtstr.format(**self.__dict__)

def summary(self):
fmtstr = ''
if self.summary_type is Summary.NONE:
@@ -522,7 +522,7 @@ def summary(self):
fmtstr = '{name} {count:.3f}'
else:
raise ValueError('invalid summary type %r' % self.summary_type)

return fmtstr.format(**self.__dict__)


@@ -536,7 +536,7 @@ def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print ('\t'.join(entries))

def display_summary(self):
entries = [" *"]
entries += [meter.summary() for meter in self.meters]
22 changes: 11 additions & 11 deletions training/data_efficiency/vit_finetuning/utils/get_data.py
@@ -13,18 +13,18 @@
# limitations under the License.
import torch
import os
-import torchvision.transforms as transforms
-import torchvision.datasets as datasets
+from torchvision import transforms
+from torchvision import datasets

def get_dataset(dataset_name, data_dir, split, rand_fraction=None,clean=False, transform=None, imsize=None, bucket='pytorch-data', **kwargs):

if dataset_name in [ 'cifar10', 'cifar100']:
dataset = globals()[f'get_{dataset_name}'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs)
elif dataset_name in [ 'cifar10vit224', 'cifar100vit224','cifar10vit384', 'cifar100vit384',]:
imsize = int(dataset_name.split('vit')[-1])
dataset_name = dataset_name.split('vit')[0]
#print ('here')
dataset = globals()['get_cifar_vit'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs)
else:
assert 'cifar' in dataset_name
print (dataset_name)
@@ -59,10 +59,10 @@ def get_transform(split, normalize=None, transform=None, imsize=None, aug='large
if transform is None:
if normalize is None:
if aug == 'large':

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
else:
normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010])
transform = transforms.Compose(get_aug(split, imsize=imsize, aug=aug)
+ [transforms.ToTensor(), normalize])
return transform
@@ -71,7 +71,7 @@
def get_cifar10(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs):
if imsize==224:
transform = get_transform(split, transform=transform, imsize=imsize, aug='large')
else:
transform = get_transform(split, transform=transform, imsize=imsize, aug='small')
return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs)

@@ -88,7 +88,7 @@ def get_cifar100N(dataset_name, data_dir, split, rand_fraction=None,transform=No
if split=='train':
return CIFAR100N(root=data_dir, train=(split=='train'), transform=transform, download=True, rand_fraction=rand_fraction)
else:
return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs)

def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs):
if imsize==224:
@@ -111,12 +111,12 @@ def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bu
if dataset_name =='cifar10':
return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs)
elif dataset_name =='cifar100':

return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs)
else:
assert dataset_name in ['cifar10', 'cifar100']
else:

if split=='train':
transform_data = transforms.Compose([# transforms.ColorJitter(brightness= 0.4, contrast= 0.4, saturation= 0.4, hue= 0.1),
transforms.Resize(imsize),
@@ -164,4 +164,4 @@ def get_imagenet_vit(dataset_name, data_dir, split, transform=None, imsize=None,
#return torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
return datasets.ImageFolder(valdir, transform_data)
#Ereturn torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True)
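
For reference, a short self-contained sketch of how the CIFAR-10 path in get_data.py composes under the new namespace imports. The normalization constants are the small-image values shown above; the augmentation list and the ./data directory are assumptions, since get_aug() is not part of this diff:

from torchvision import datasets, transforms

# Small-image normalization constants, as in get_transform() above.
normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                 std=[0.2023, 0.1994, 0.2010])

# Assumed training-time augmentations; get_aug() itself is not shown in this commit.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

train_set = datasets.CIFAR10('./data', train=True, transform=train_transform, download=True)
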
2 changes: 1 addition & 1 deletion training/gan/gan_baseline_train.py
@@ -3,7 +3,7 @@
import torch.nn as nn
import torch.utils.data
import torchvision.datasets as dset
-import torchvision.transforms as transforms
+from torchvision import transforms
import torchvision.utils as vutils
from torch.utils.tensorboard import SummaryWriter
from time import time
2 changes: 1 addition & 1 deletion training/gan/gan_deepspeed_train.py
@@ -3,7 +3,7 @@
import torch.nn as nn
import torch.utils.data
import torchvision.datasets as dset
-import torchvision.transforms as transforms
+from torchvision import transforms
import torchvision.utils as vutils
from torch.utils.tensorboard import SummaryWriter
from time import time
16 changes: 8 additions & 8 deletions training/imagenet/main.py
@@ -18,9 +18,9 @@
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
-import torchvision.datasets as datasets
import torchvision.models as models
-import torchvision.transforms as transforms
+from torchvision import transforms
+from torchvision import datasets
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Subset

@@ -94,7 +94,7 @@ def main():
'which can slow down your training considerably! '
'You may see unexpected behavior when restarting '
'from checkpoints.')

if args.gpu is not None:
warnings.warn('You have chosen a specific GPU. This will completely '
'disable data parallelism.')
@@ -112,7 +112,7 @@
args.world_size = ngpus_per_node * args.world_size
t_losses, t_acc1s = main_worker(args.gpu, ngpus_per_node, args)
#dist.barrier()

# Write the losses to an excel file
if dist.get_rank() ==0:
all_losses = [torch.empty_like(t_losses) for _ in range(ngpus_per_node)]
@@ -278,7 +278,7 @@ def print_rank_0(msg):
acc1s[epoch] = acc1

scheduler.step()

# remember best acc@1 and save checkpoint
is_best = acc1 > best_acc1
best_acc1 = max(acc1, best_acc1)
@@ -449,7 +449,7 @@ def all_reduce(self):
def __str__(self):
fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
return fmtstr.format(**self.__dict__)

def summary(self):
fmtstr = ''
if self.summary_type is Summary.NONE:
@@ -462,7 +462,7 @@ def summary(self):
fmtstr = '{name} {count:.3f}'
else:
raise ValueError('invalid summary type %r' % self.summary_type)

return fmtstr.format(**self.__dict__)


@@ -476,7 +476,7 @@ def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print('\t'.join(entries))

def display_summary(self):
entries = [" *"]
entries += [meter.summary() for meter in self.meters]
2 changes: 1 addition & 1 deletion training/pipeline_parallelism/train.py
@@ -7,7 +7,7 @@
import torch.distributed as dist

import torchvision
-import torchvision.transforms as transforms
+from torchvision import transforms
from torchvision.models import AlexNet
from torchvision.models import vgg19

