
Commit 4bf5670

Add
1 parent fc71c9e commit 4bf5670

File tree

3 files changed: +128 -39 lines changed

components/distributed_training/single_gpu.py

Lines changed: 0 additions & 39 deletions
This file was deleted.
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from datautils import MyTrainDataset


class Trainer:
    def __init__(self, gpu_id, data, model, optimizer, save_every):
        self.gpu_id = gpu_id
        self.data = data
        self.model = model.to(gpu_id)  # move the model to the target device
        self.optimizer = optimizer
        self.save_every = save_every  # snapshot interval, reserved for checkpointing

    def _run_batch(self, source, target):
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, target)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch_id):
        print(f"gpu_id = {self.gpu_id}, epoch = {epoch_id}, batch_size = {len(next(iter(self.data))[0])}, batch_num = {len(self.data)}")
        for source, target in self.data:
            # Move each batch to the same device as the model.
            source = source.to(self.gpu_id)
            target = target.to(self.gpu_id)
            self._run_batch(source, target)

    def train(self, total_epochs):
        for epoch_id in range(total_epochs):
            self._run_epoch(epoch_id)


def load_train_objs():
    dataset = MyTrainDataset(2048)  # 2048 training samples
    model = nn.Linear(20, 1)
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    return dataset, model, optimizer


def prepare_dataloader(dataset, batch_size):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,  # page-locked host memory speeds up transfers to the GPU
        shuffle=True,
    )


def main(device, batch_size, total_epochs, save_every):
    dataset, model, optimizer = load_train_objs()
    train_data = prepare_dataloader(dataset, batch_size)
    trainer = Trainer(device, train_data, model, optimizer, save_every)
    trainer.train(total_epochs)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='simple distributed training job')
    parser.add_argument('-t', '--total_epochs', default=2, type=int, help='Total epochs to train the model (default: 2)')
    parser.add_argument('-s', '--save_every', default=2, type=int, help='How often to save a snapshot (default: 2)')
    parser.add_argument('-b', '--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
    args = parser.parse_args()

    device = 0  # shorthand for cuda:0
    main(device, args.batch_size, args.total_epochs, args.save_every)
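
Note: both scripts import MyTrainDataset from a local datautils module that is not included in this commit. The sketch below is a hypothetical stand-in, assuming random 20-feature inputs paired with 1-element targets to match the Linear(20, 1) model; the repository's actual datautils may differ.

import torch
from torch.utils.data import Dataset


# Hypothetical stand-in for datautils.MyTrainDataset (not part of this commit).
class MyTrainDataset(Dataset):
    def __init__(self, size):
        self.size = size
        # One (features, target) pair per sample, sized for Linear(20, 1).
        self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        return self.data[index]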
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
import argparse
import torch.nn.functional as F
from datautils import MyTrainDataset
from torch.nn import Linear
from torch.optim import SGD
from torch.utils.data import DataLoader


def get_train_objs(batch_size, learning_rate):
    dataset = MyTrainDataset(2048)  # 2048 training samples
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    model = Linear(20, 1)
    optimizer = SGD(model.parameters(), lr=learning_rate)
    return dataloader, model, optimizer


class Trainer:
    def __init__(self, gpu_id, dataloader, model, optimizer, total_epochs, save_every):
        self.gpu_id = gpu_id
        self.dataloader = dataloader
        self.model = model.to(gpu_id)  # move the model to the target device
        self.optimizer = optimizer
        self.total_epochs = total_epochs
        self.save_every = save_every  # snapshot interval, reserved for checkpointing

    def _run_batch(self, source, target):
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, target)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch_id):
        for batch_id, (source, target) in enumerate(self.dataloader):
            # Move each batch to the same device as the model.
            source = source.to(self.gpu_id)
            target = target.to(self.gpu_id)
            self._run_batch(source, target)

        # batch_id still holds the index of the last batch processed above.
        print(f"device {self.gpu_id}, epoch {epoch_id}/{self.total_epochs}, batch {batch_id}/{len(self.dataloader)}")

    def train(self):
        for epoch_id in range(self.total_epochs):
            self._run_epoch(epoch_id)


def main(gpu_id, batch_size, learning_rate, total_epochs, save_every):
    dataloader, model, optimizer = get_train_objs(batch_size, learning_rate)
    trainer = Trainer(gpu_id, dataloader, model, optimizer, total_epochs, save_every)
    trainer.train()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='simple distributed training job')
    parser.add_argument('-b', '--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
    parser.add_argument('-l', '--learning_rate', default=1e-3, type=float, help='Learning rate of the optimizer (default: 1e-3)')
    parser.add_argument('-t', '--total_epochs', default=2, type=int, help='Total epochs to train the model (default: 2)')
    parser.add_argument('-s', '--save_every', default=2, type=int, help='How often to save a snapshot (default: 2)')
    args = parser.parse_args()

    gpu_id = 0  # shorthand for cuda:0
    main(gpu_id, args.batch_size, args.learning_rate, args.total_epochs, args.save_every)
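
Note: both scripts hard-code device 0, so model.to(gpu_id) raises on a machine without CUDA. A minimal sketch of an optional CPU fallback, assuming the same script should also run on CPU-only hosts; this guard is not part of the commit.

import torch

# Fall back to CPU when no CUDA device is available (assumption, not in the commit).
gpu_id = 0 if torch.cuda.is_available() else "cpu"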
