import numpy as np
+
import torch
import torch.nn.utils as torch_utils
# from torch.cuda.amp import autocast
# from torch.cuda.amp import GradScaler
+
from ignite.engine import Engine
from ignite.engine import Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers.tqdm_logger import ProgressBar
+
from modules.utils import get_grad_norm, get_parameter_norm

+
VERBOSE_SILENT = 0
VERBOSE_EPOCH_WISE = 1
VERBOSE_BATCH_WISE = 2
@@ -22,116 +26,66 @@ def __init__(self, func, model, crit, optimizer, lr_scheduler, config):
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.config = config
+
        super().__init__(func)

        self.best_loss = np.inf
        # self.scaler = GradScaler()

    @staticmethod
+    # @profile
    def train(engine, mini_batch):
+        # You have to reset the gradients of all model parameters
+        # before taking another step of gradient descent.
        engine.model.train()
-
-        '''
-        Gradient Accumulation
-        - For machine translation, a batch size of around 256 is appropriate;
-          the batch size itself affects performance,
-          but the GPU may not be able to handle it.
-
-        Speed aside, to preserve performance we deliberately skip the optimizer
-        step for N iterations at a time, so the desired performance is maintained.
-        (A standalone sketch of this pattern follows this hunk.)
-
-        1. engine.state.iteration % engine.config.iteration_per_update == 1
-           - call zero_grad whenever the current iteration modulo iteration_per_update is 1
-        2. engine.config.iteration_per_update == 1
-           - the usual case: call zero_grad on every iteration
-        '''
        if engine.state.iteration % engine.config.iteration_per_update == 1 or \
-            engine.config.iteration_per_update == 1:
-            if engine.state.iteration > 1:
+            engine.config.iteration_per_update == 1:
+            if engine.state.iteration > 1:
                engine.optimizer.zero_grad()

-        # The device is taken from the model's first parameter.
        device = next(engine.model.parameters()).device
-        '''
-        src and tgt are each a tuple of (sentence tensor, per-sentence lengths);
-        this is how torchtext provides them in the first place.
-        Only the sentence tensor is moved to GPU memory.
-        '''
        mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
-        mini_batch.tgt = (mini_batch.src[0].to(device), mini_batch.tgt[1])
-
-        '''
-        x goes in as the very first input;
-        y goes in so it can be checked against the final output.
-        x can be fed as-is (it does not matter that BOS/EOS are included).
-        For y,
-        - the per-sentence length information is dropped,
-        - and the leading BOS token is removed from each sentence
-          (because prediction starts from the word right after BOS).
-
-        x = (batch_size, length_n)
-        y = (batch_size, length_m)
-        '''
+        mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])
+
+        # The raw target variable has both BOS and EOS tokens.
+        # The output of the sequence-to-sequence model does not have a BOS token.
+        # Thus, remove the BOS token from the reference.
        x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
+        # |x| = (batch_size, length)
+        # |y| = (batch_size, length)

-        #-------------------------#
-        # Run training memory-efficiently with autocast.
-        # with autocast(not engine.config.off_autocast):
-        # y_hat = (batch_size, length_m, output_size)
-        # For the input tgt, remove the trailing EOS token.
+        # with autocast(not engine.config.off_autocast):
+        # Take the feed-forward pass.
+        # As before, the decoder input does not have an EOS token.
+        # Thus, remove the EOS token from the decoder input.
        y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
+        # |y_hat| = (batch_size, length, output_size)

-        '''
-        Reshape the tensors as follows to compute the loss;
-        think of it as laying every word of every sentence out in order.
-        Before (3D):
-            y_hat = (batch_size, length_m, output_size)
-            y = (batch_size, length_m)
-        After (2D):
-            y_hat = (batch_size * length_m, output_size)
-            y = (batch_size * length_m)
-        '''
        loss = engine.crit(
            y_hat.contiguous().view(-1, y_hat.size(-1)),
            y.contiguous().view(-1)
        )
-        '''
-        div(y.size(0)): after computing the loss, divide it by the batch size.
-        div(engine.config.iteration_per_update): divide in advance for
-            gradient accumulation.
-        In other words, backward_target is the loss value that actually gets applied.
-        '''
        backward_target = loss.div(y.size(0)).div(engine.config.iteration_per_update)
-        #-------------------------#

-        # If autocast is enabled, scale the loss before calling backward.
        # if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
        #     engine.scaler.scale(backward_target).backward()
        # else:
        backward_target.backward()

-        # Total number of tokens in the current batch.
        word_count = int(mini_batch.tgt[1].sum())
        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

-        # Gradient accumulation: if the iteration count divides evenly, take a step; otherwise skip it.
        if engine.state.iteration % engine.config.iteration_per_update == 0 and \
-            engine.state.iteration > 0:
-            '''
-            Gradient Clipping
-            The longer a sequence's time steps, the larger the gradients can become.
-            Used to keep an overly large g_norm from moving the parameters too far.
-            - Although this is said to be less necessary when using Adam.
-            '''
+            engine.state.iteration > 0:
+            # In order to avoid exploding gradients, we apply gradient clipping.
            torch_utils.clip_grad_norm_(
                engine.model.parameters(),
                engine.config.max_grad_norm,
            )
-
+            # Take a step of gradient descent.
            # if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
-            #     # When using a GPU, step via the scaler instead of the usual optimizer.step().
+            #     # Use the scaler instead of engine.optimizer.step() when using a GPU.
            #     engine.scaler.step(engine.optimizer)
            #     engine.scaler.update()
            # else:
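# A minimal, self-contained sketch of the gradient-accumulation pattern used in
# train() above, combined with the autocast/GradScaler path that is commented out
# there, plus gradient clipping. It is an illustration only: names such as
# `train_step`, `iteration_per_update`, `max_grad_norm`, and `use_amp` are
# assumed stand-ins, not this project's API.
import torch
import torch.nn.utils as torch_utils
from torch.cuda.amp import GradScaler, autocast


def train_step(model, crit, optimizer, loader,
               iteration_per_update=32, max_grad_norm=5., use_amp=True):
    scaler = GradScaler(enabled=use_amp)
    model.train()

    for i, (x, y) in enumerate(loader, start=1):
        # Reset gradients only once every `iteration_per_update` iterations.
        if i % iteration_per_update == 1 or iteration_per_update == 1:
            optimizer.zero_grad()

        with autocast(enabled=use_amp):
            y_hat = model(x)
            # Divide by the number of accumulation steps so the summed gradient
            # matches what a single large batch would have produced.
            loss = crit(y_hat, y) / iteration_per_update

        # Gradients accumulate across iterations until the next zero_grad().
        scaler.scale(loss).backward()

        if i % iteration_per_update == 0:
            scaler.unscale_(optimizer)  # unscale before clipping gradient norms
            torch_utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)      # optimizer.step() only every N-th iteration
            scaler.update()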
@@ -156,17 +110,18 @@ def validate(engine, mini_batch):
            mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
            mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

-            # x = (batch_size, length_n)
-            # y = (batch_size, length_m)
            x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
+            # |x| = (batch_size, length)
+            # |y| = (batch_size, length)

            # with autocast(not engine.config.off_autocast):
            y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
+            # |y_hat| = (batch_size, length, output_size)
            loss = engine.crit(
                y_hat.contiguous().view(-1, y_hat.size(-1)),
                y.contiguous().view(-1),
            )
-
+
        word_count = int(mini_batch.tgt[1].sum())
        loss = float(loss / word_count)
        ppl = np.exp(loss)
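# A quick numeric illustration of the two lines above: the summed cross-entropy
# is averaged per target token, and perplexity is its exponential. The numbers
# below are made up purely for illustration.
import numpy as np

total_nll = 249.3   # summed (not averaged) cross-entropy over the batch
word_count = 83     # number of target tokens in the batch (sum of tgt lengths)

loss_per_token = total_nll / word_count
ppl = np.exp(loss_per_token)  # perplexity = exp(average NLL per token)
print('loss=%.4f ppl=%.2f' % (loss_per_token, ppl))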
@@ -179,18 +134,18 @@ def validate(engine, mini_batch):
    @staticmethod
    def attach(
        train_engine, validation_engine,
-        training_metric_names=['loss', 'ppl', '|param|', '|g_param|'],
-        validation_metric_names=['loss', 'ppl'],
+        training_metric_names=['loss', 'ppl', '|param|', '|g_param|'],
+        validation_metric_names=['loss', 'ppl'],
        verbose=VERBOSE_BATCH_WISE,
    ):
-        # Function for reporting and printing the current status.
+        # Attaching would be repeated for several metrics.
+        # Thus, we can reduce repeated code by using this helper function.
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

-        '''Train Attach Process'''
        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

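# A standalone sketch of what attach_running_average() does: RunningAverage keeps
# an exponential moving average of one entry of the engine's output dict and
# exposes it under engine.state.metrics. The toy process function and data below
# are assumptions for illustration only.
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage


def step(engine, batch):
    # A process function may return a dict; output_transform picks one entry.
    return {'loss': float(batch)}


toy_engine = Engine(step)
RunningAverage(output_transform=lambda x: x['loss']).attach(toy_engine, 'loss')


@toy_engine.on(Events.ITERATION_COMPLETED)
def log_running_loss(engine):
    print(engine.state.metrics['loss'])


toy_engine.run([1.0, 2.0, 3.0], max_epochs=1)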
@@ -213,7 +168,6 @@ def print_train_logs(engine):
                    np.exp(avg_loss),
                ))

-        '''Validation Attach Process'''
        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

@@ -249,7 +203,6 @@ def save_model(engine, train_engine, config, src_vocab, tgt_vocab):
        avg_train_loss = train_engine.state.metrics['loss']
        avg_valid_loss = engine.state.metrics['loss']

-        # Note: the model from every epoch is saved, not only the best model.
        # Set a filename for the model of the last epoch.
        # We need to put as much information as possible into the filename.
        model_fn = config.model_fn.split('.')
@@ -275,7 +228,7 @@ def save_model(engine, train_engine, config, src_vocab, tgt_vocab):
                'tgt_vocab': tgt_vocab,
            }, model_fn
        )
-
+


class Trainer():

@@ -291,6 +244,7 @@ def train(
        n_epochs,
        lr_scheduler=None
    ):
+        # Declare the train and validation engines with the necessary objects.
        train_engine = self.target_engine_class(
            self.target_engine_class.train,
            model,
@@ -308,31 +262,41 @@ def train(
            config=self.config
        )

+        # Run the necessary attach procedure on the train & validation engines.
+        # A progress bar and metrics are attached.
        self.target_engine_class.attach(
            train_engine,
            validation_engine,
            verbose=self.config.verbose
        )

+        # After every training epoch, run one validation epoch.
+        # Also, apply the LR scheduler if necessary.
        def run_validation(engine, validation_engine, valid_loader):
            validation_engine.run(valid_loader, max_epochs=1)
+
            if engine.lr_scheduler is not None:
                engine.lr_scheduler.step()

+        # Attach the callback function above.
        train_engine.add_event_handler(
            Events.EPOCH_COMPLETED,
            run_validation,
            validation_engine,
            valid_loader
        )
+        # Attach another callback function for the start of training.
        train_engine.add_event_handler(
            Events.STARTED,
            self.target_engine_class.resume_training,
            self.config.init_epoch,
        )
+
+        # Attach a validation-loss check procedure at the end of every validation epoch.
        validation_engine.add_event_handler(
            Events.EPOCH_COMPLETED, self.target_engine_class.check_best
        )
+        # Attach a model-saving procedure at the end of every validation epoch.
        validation_engine.add_event_handler(
            Events.EPOCH_COMPLETED,
            self.target_engine_class.save_model,
@@ -342,7 +306,7 @@ def run_validation(engine, validation_engine, valid_loader):
            tgt_vocab,
        )

-        # Start training
+        # Start training.
        train_engine.run(train_loader, max_epochs=n_epochs)

-        return model
+        return model
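# A minimal, runnable sketch of the ignite callback wiring used in Trainer.train()
# above: a handler registered for EPOCH_COMPLETED receives the engine plus the
# extra positional arguments given to add_event_handler, which is exactly how
# run_validation gets its validation engine and loader. The toy process functions
# and data are assumptions for illustration only.
from ignite.engine import Engine, Events

toy_train_engine = Engine(lambda e, batch: float(batch))
toy_validation_engine = Engine(lambda e, batch: float(batch))


def run_toy_validation(engine, validation_engine, valid_data):
    # Run one full validation pass at the end of every training epoch.
    validation_engine.run(valid_data, max_epochs=1)


toy_train_engine.add_event_handler(
    Events.EPOCH_COMPLETED,
    run_toy_validation,      # called as handler(engine, *extra_args)
    toy_validation_engine,
    [10.0, 20.0],            # toy validation data
)

toy_train_engine.run([1.0, 2.0, 3.0], max_epochs=2)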