
Commit e58f0cf
oops i should not be needing or multiplying by world_size to calculate mfu
karpathy committed Feb 7, 2023
1 parent 8b1e432 commit e58f0cf
Showing 1 changed file with 1 addition and 3 deletions.
train.py: 1 addition & 3 deletions
@@ -84,14 +84,12 @@
     init_process_group(backend=backend)
     ddp_rank = int(os.environ['RANK'])
     ddp_local_rank = int(os.environ['LOCAL_RANK'])
-    world_size = int(os.environ['WORLD_SIZE']) # total number of training processes
     device = f'cuda:{ddp_local_rank}'
     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
 else:
     # if not ddp, we are running on a single gpu, and one process
-    world_size = 1
     master_process = True
     seed_offset = 0

@@ -309,7 +307,7 @@ def get_lr(it):
     if iter_num % log_interval == 0 and master_process:
         lossf = loss.item() # loss as float. note: this is a CPU-GPU sync point
         if local_iter_num >= 5: # let the training loop settle a bit
-            mfu = raw_model.estimate_mfu(batch_size * world_size * gradient_accumulation_steps, dt)
+            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
             running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
         print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
     iter_num += 1
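Why dropping world_size is the right call: dt is timed by a single process and estimate_mfu compares against a single GPU's peak flops, so the numerator should only count that process's own batch_size * gradient_accumulation_steps forward/backward passes; multiplying by world_size inflates MFU by the number of GPUs. Below is a minimal sketch of the arithmetic, assuming the PaLM-style ~6*N flops-per-token approximation and the A100 bfloat16 peak of 312 TFLOPS; the function name and example numbers are illustrative, not from the repo, and the repo's estimate_mfu also adds an attention-flops term that is omitted here.

# Hypothetical sketch of the per-process MFU calculation (not the repo's exact code).
def estimate_mfu_sketch(n_params, batch_size, gradient_accumulation_steps,
                        block_size, dt, peak_flops=312e12):
    flops_per_token = 6 * n_params                   # rough fwd+bwd flops per token (PaLM-style)
    flops_per_fwdbwd = flops_per_token * block_size  # one sequence, forward + backward
    # work done by THIS process in one optimizer step -- no world_size factor,
    # because dt and peak_flops are also per-process / per-GPU quantities
    fwdbwd_per_iter = batch_size * gradient_accumulation_steps
    flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
    return (flops_per_iter / dt) / peak_flops        # achieved flops/sec over peak flops/sec

# e.g. a ~124M-parameter model, batch 12, 5 grad-accum steps, block 1024, 0.5 s per iteration
print(f"mfu {estimate_mfu_sketch(124e6, 12, 5, 1024, 0.5)*100:.2f}%")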

