Commit 9064c93

Transformer implementation
1 parent 1d8f862 commit 9064c93

File tree

9 files changed: +1359, -158 lines

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
import sys
import os.path
import torch

from train import define_argparser
from train import main


def overwrite_config(config, prev_config):
    # This method provides compatibility for new or missing arguments.
    for prev_key in vars(prev_config).keys():
        if not prev_key in vars(config).keys():
            # No such argument in the current config. Ignore the saved value.
            print('WARNING!!! Argument "--%s" is not found in current argument parser.\tIgnore saved value:' % prev_key,
                  vars(prev_config)[prev_key])

    for key in vars(config).keys():
        if not key in vars(prev_config).keys():
            # No such argument in the saved file. Use the current value.
            print('WARNING!!! Argument "--%s" is not found in saved model.\tUse current value:' % key,
                  vars(config)[key])
        elif vars(config)[key] != vars(prev_config)[key]:
            if '--%s' % key in sys.argv:
                # The user changed this argument's value for this execution.
                print('WARNING!!! You changed value for argument "--%s".\tUse current value:' % key,
                      vars(config)[key])
            else:
                # The user didn't change it for this execution, but the current and saved configs differ.
                # This may have been caused by the user's intention at the last execution.
                # Load the old value and replace the current value with it.
                vars(config)[key] = vars(prev_config)[key]

    return config


def continue_main(config, main):
    # If the model file exists, load the model and configuration to continue training.
    if os.path.isfile(config.load_fn):
        saved_data = torch.load(config.load_fn, map_location='cpu')

        prev_config = saved_data['config']
        config = overwrite_config(config, prev_config)

        model_weight = saved_data['model']
        opt_weight = saved_data['opt']

        main(config, model_weight=model_weight, opt_weight=opt_weight)
    else:
        print('Cannot find file %s' % config.load_fn)


if __name__ == '__main__':
    config = define_argparser(is_continue=True)
    continue_main(config, main)
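For context, a minimal sketch of how a checkpoint compatible with continue_main() above could be written from the training side. The save_checkpoint helper and the file name are hypothetical; the only assumptions taken from this file are that the saved dict carries the keys 'config', 'model', and 'opt', and that define_argparser(is_continue=True) in train.py registers a --load_fn argument.

    import torch

    def save_checkpoint(config, model, optimizer, path='model.pth'):
        # continue_main() expects exactly these keys when it reloads the file.
        torch.save({
            'config': config,                 # argparse.Namespace from define_argparser()
            'model': model.state_dict(),      # model weights
            'opt': optimizer.state_dict(),    # optimizer state
        }, path)

Resuming is then a matter of pointing --load_fn at that file when running this script.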

src/12_transformer/detokenizer.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
#-*- coding:utf-8 -*-
import sys
sys.stdin.reconfigure(encoding='utf-8')


if __name__ == "__main__":
    for line in sys.stdin:
        if line.strip() != "":
            if '▁▁' in line:
                line = line.strip().replace(' ', '').replace('▁▁', ' ').replace('▁', '').strip()
            else:
                line = line.strip().replace(' ', '').replace('▁', ' ').strip()

            sys.stdout.write(line + '\n')
        else:
            sys.stdout.write('\n')
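The filter above undoes subword tokenization that marks word boundaries with '▁' ('▁▁' appears when a word-level tokenizer ran before subword segmentation). A small illustration of the two branches, with made-up sample strings:

    line = '▁▁나는 ▁▁학교 에 ▁▁간다'
    # Token-separating spaces are removed, '▁▁' becomes a real space, leftover '▁' is dropped.
    print(line.strip().replace(' ', '').replace('▁▁', ' ').replace('▁', '').strip())
    # -> 나는 학교에 간다

    line = '▁I ▁go ▁to ▁school'
    # Without '▁▁', every '▁' is treated as a word boundary.
    print(line.strip().replace(' ', '').replace('▁', ' ').strip())
    # -> I go to school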
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
import os
from torchtext import data

PAD, BOS, EOS = 1, 2, 3


class DataLoader:

    def __init__(
        self,
        train_fn=None,
        valid_fn=None,
        exts=None,
        batch_size=64,
        device=-1,              # -1 for CPU, otherwise the CUDA device index
        max_vocab=9999999,
        max_length=255,
        fix_length=None,
        use_bos=True,
        use_eos=True,
        shuffle=True,
    ):

        self.src = data.Field(
            sequential=True,
            use_vocab=True,
            batch_first=True,
            include_lengths=True,
            fix_length=fix_length,
            init_token=None,
            eos_token=None,
        )
        self.tgt = data.Field(
            sequential=True,
            use_vocab=True,
            batch_first=True,
            include_lengths=True,
            fix_length=fix_length,
            init_token='<BOS>' if use_bos else None,
            eos_token='<EOS>' if use_eos else None,
        )

        if train_fn is not None and valid_fn is not None and exts is not None:
            train = TranslationDataset(
                path=train_fn,
                exts=exts,
                fields=[('src', self.src), ('tgt', self.tgt)],
                max_length=max_length,
            )
            valid = TranslationDataset(
                path=valid_fn,
                exts=exts,
                fields=[('src', self.src), ('tgt', self.tgt)],
                max_length=max_length,
            )

            self.train_iter = data.BucketIterator(
                train,
                batch_size=batch_size,
                device='cuda:%d' % device if device >= 0 else 'cpu',
                shuffle=shuffle,
                # Sort so that sequences of similar length end up in the same mini-batch.
                sort_key=lambda x: len(x.tgt) + (max_length * len(x.src)),
                sort_within_batch=True,
            )
            self.valid_iter = data.BucketIterator(
                valid,
                batch_size=batch_size,
                device='cuda:%d' % device if device >= 0 else 'cpu',
                shuffle=False,
                # Sort so that sequences of similar length end up in the same mini-batch.
                sort_key=lambda x: len(x.tgt) + (max_length * len(x.src)),
                sort_within_batch=True,
            )

            self.src.build_vocab(train, max_size=max_vocab)
            self.tgt.build_vocab(train, max_size=max_vocab)

    def load_vocab(self, src_vocab, tgt_vocab):
        self.src.vocab = src_vocab
        self.tgt.vocab = tgt_vocab


class TranslationDataset(data.Dataset):

    def __init__(self, path, exts, fields, max_length=None, **kwargs):
        """Create a TranslationDataset given paths and fields.

        Overridden so that examples longer than max_length are dropped.

        Arguments:
            path: Common prefix of paths to the data files for both languages.
            exts: A tuple containing the extension to path for each language.
            fields: A tuple containing the fields that will be used for data
                in each language.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        if not isinstance(fields[0], (tuple, list)):
            fields = [('src', fields[0]), ('trg', fields[1])]

        if not path.endswith('.'):
            path += '.'

        src_path, trg_path = tuple(os.path.expanduser(path + x) for x in exts)

        examples = []
        with open(src_path, encoding='utf-8') as src_file, open(trg_path, encoding='utf-8') as trg_file:
            for src_line, trg_line in zip(src_file, trg_file):
                src_line, trg_line = src_line.strip(), trg_line.strip()
                if max_length and max_length < max(len(src_line.split()), len(trg_line.split())):
                    continue
                if src_line != '' and trg_line != '':
                    examples += [data.Example.fromlist([src_line, trg_line], fields)]

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return data.interleave_keys(len(ex.src), len(ex.trg))
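A minimal usage sketch for the DataLoader above, assuming the legacy torchtext Field/BucketIterator API it relies on is installed and that a parallel corpus exists as corpus.train.en / corpus.train.ko and corpus.valid.en / corpus.valid.ko; the file names and hyperparameters here are illustrative only, not taken from the commit:

    loader = DataLoader(
        train_fn='corpus.train',      # common path prefix; extensions from 'exts' are appended
        valid_fn='corpus.valid',
        exts=('en', 'ko'),            # (source extension, target extension)
        batch_size=128,
        device=-1,                    # -1 keeps batches on the CPU
        max_length=100,               # pairs whose longer side exceeds 100 tokens are dropped
    )

    print(len(loader.src.vocab), len(loader.tgt.vocab))

    for batch in loader.train_iter:
        src, src_length = batch.src   # include_lengths=True yields (token tensor, lengths)
        tgt, tgt_length = batch.tgt
        # |src| = (batch_size, src_len), |tgt| = (batch_size, tgt_len)
        break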
Lines changed: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
from operator import itemgetter

import torch
import torch.nn as nn

import modules.data_loader as data_loader

LENGTH_PENALTY = .2
MIN_LENGTH = 5


class SingleBeamSearchBoard():

    def __init__(
        self,
        device,
        prev_status_config,
        beam_size=5,
        max_length=255,
    ):
        self.beam_size = beam_size
        self.max_length = max_length

        # To put data on the same device.
        self.device = device
        # Inferred word index for each time-step. For now, initialized with the initial time-step.
        self.word_indice = [torch.LongTensor(beam_size).zero_().to(self.device) + data_loader.BOS]
        # Beam index for the selected word index, at each time-step.
        self.beam_indice = [torch.LongTensor(beam_size).zero_().to(self.device) - 1]
        # Cumulative log-probability for each beam.
        self.cumulative_probs = [torch.FloatTensor([.0] + [-float('inf')] * (beam_size - 1)).to(self.device)]
        # 1 if the beam is finished, else 0.
        self.masks = [torch.BoolTensor(beam_size).zero_().to(self.device)]

        # We don't need to remember every time-step of hidden states:
        #   prev_hidden, prev_cell, prev_h_t_tilde
        # We only need to remember the last one.
        self.prev_status = {}
        self.batch_dims = {}
        for prev_status_name, each_config in prev_status_config.items():
            init_status = each_config['init_status']
            batch_dim_index = each_config['batch_dim_index']
            if init_status is not None:
                self.prev_status[prev_status_name] = torch.cat([init_status] * beam_size,
                                                               dim=batch_dim_index)
            else:
                self.prev_status[prev_status_name] = None
            self.batch_dims[prev_status_name] = batch_dim_index

        self.current_time_step = 0
        self.done_cnt = 0

    def get_length_penalty(
        self,
        length,
        alpha=LENGTH_PENALTY,
        min_length=MIN_LENGTH,
    ):
        # Calculate the length penalty,
        # because shorter sentences usually have higher probability.
        # In fact, we represent this as a log-probability, which is a negative value.
        # Thus, we need to apply a bigger penalty to shorter sentences.
        p = ((min_length + 1) / (min_length + length))**alpha

        return p

    def is_done(self):
        # Return 1 if we have seen EOS at least 'beam_size' times.
        if self.done_cnt >= self.beam_size:
            return 1
        return 0

    def get_batch(self):
        y_hat = self.word_indice[-1].unsqueeze(-1)
        # |y_hat| = (beam_size, 1)
        # if model != transformer:
        #     |hidden| = |cell| = (n_layers, beam_size, hidden_size)
        #     |h_t_tilde| = (beam_size, 1, hidden_size) or None
        # else:
        #     |prev_state_i| = (beam_size, length, hidden_size),
        #     where i is an index of a layer.
        return y_hat, self.prev_status

    #@profile
    def collect_result(self, y_hat, prev_status):
        # |y_hat| = (beam_size, 1, output_size)
        # prev_status is a dict with the following keys:
        # if model != transformer:
        #     |hidden| = |cell| = (n_layers, beam_size, hidden_size)
        #     |h_t_tilde| = (beam_size, 1, hidden_size)
        # else:
        #     |prev_state_i| = (beam_size, length, hidden_size),
        #     where i is an index of a layer.
        output_size = y_hat.size(-1)

        self.current_time_step += 1

        # Calculate the cumulative log-probability.
        # First, fill -inf into the last cumulative probability if the beam is already finished.
        # Second, expand the -inf-filled cumulative probability to fit 'y_hat':
        #   (beam_size) --> (beam_size, 1, 1) --> (beam_size, 1, output_size)
        # Third, add the expanded cumulative probability to 'y_hat'.
        cumulative_prob = self.cumulative_probs[-1].masked_fill_(self.masks[-1], -float('inf'))
        cumulative_prob = y_hat + cumulative_prob.view(-1, 1, 1).expand(self.beam_size, 1, output_size)
        # |cumulative_prob| = (beam_size, 1, output_size)

        # Now we have the new top log-probabilities and their indices.
        # We pick as many top indices as 'beam_size'.
        # Be aware that we pick the top-k from the whole batch through 'view(-1)'.

        # The following lines use torch.topk, which is slower than torch.sort.
        # top_log_prob, top_indice = torch.topk(
        #     cumulative_prob.view(-1), # (beam_size * output_size,)
        #     self.beam_size,
        #     dim=-1,
        # )

        # The following lines use torch.sort instead of torch.topk.
        top_log_prob, top_indice = cumulative_prob.view(-1).sort(descending=True)
        top_log_prob, top_indice = top_log_prob[:self.beam_size], top_indice[:self.beam_size]
        # |top_log_prob| = (beam_size,)
        # |top_indice| = (beam_size,)

        # Because we picked from the whole batch, the original word index must be recovered.
        self.word_indice += [top_indice.fmod(output_size)]
        # Also, we can get the index of the beam that produced each top-k log-probability.
        self.beam_indice += [top_indice.div(float(output_size)).long()]

        # Add results to the history boards.
        self.cumulative_probs += [top_log_prob]
        self.masks += [torch.eq(self.word_indice[-1], data_loader.EOS)]  # Set finish mask if we got EOS.
        # Count the number of finished beams.
        self.done_cnt += self.masks[-1].float().sum()

        # In the beam search procedure, we only need to memorize the latest status.
        # For seq2seq, that would be the latest hidden and cell state, and h_t_tilde.
        # The problem is that the hidden (or cell) state and h_t_tilde have different dimension orders.
        # In other words, the dimension holding the batch index differs.
        # Therefore self.batch_dims stores the dimension index of the batch dimension.
        # For the transformer, the latest status is each layer's decoder output from the beginning.
        # Unlike seq2seq, the transformer has to memorize every previous output for the attention operation.
        for prev_status_name, prev_status in prev_status.items():
            self.prev_status[prev_status_name] = torch.index_select(
                prev_status,
                dim=self.batch_dims[prev_status_name],
                index=self.beam_indice[-1]
            ).contiguous()

    def get_n_best(self, n=1, length_penalty=.2):
        sentences, probs, founds = [], [], []

        for t in range(len(self.word_indice)):  # for each time-step,
            for b in range(self.beam_size):  # for each beam,
                if self.masks[t][b] == 1:  # if we had EOS on this time-step and beam,
                    # Take a record of the penalized log-probability.
                    probs += [self.cumulative_probs[t][b] * self.get_length_penalty(t, alpha=length_penalty)]
                    founds += [(t, b)]

        # Also, collect log-probabilities from the last time-step, for the case where EOS never appeared.
        for b in range(self.beam_size):
            if self.cumulative_probs[-1][b] != -float('inf'):  # If this beam does not have EOS,
                if not (len(self.cumulative_probs) - 1, b) in founds:
                    probs += [self.cumulative_probs[-1][b] * self.get_length_penalty(len(self.cumulative_probs),
                                                                                     alpha=length_penalty)]
                    founds += [(len(self.cumulative_probs) - 1, b)]

        # Sort and take the n-best.
        sorted_founds_with_probs = sorted(
            zip(founds, probs),
            key=itemgetter(1),
            reverse=True,
        )[:n]
        probs = []

        for (end_index, b), prob in sorted_founds_with_probs:
            sentence = []

            # Trace back from the end.
            for t in range(end_index, 0, -1):
                sentence = [self.word_indice[t][b]] + sentence
                b = self.beam_indice[t][b]

            sentences += [sentence]
            probs += [prob]

        return sentences, probs
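A self-contained sketch of how a decoding loop might drive SingleBeamSearchBoard above, using a dummy decode_step that returns uniform log-probabilities in place of a real decoder. decode_step, prev_status_config, and all sizes below are made up for illustration, and the snippet assumes this module's own import of modules.data_loader resolves.

    import torch

    beam_size, output_size, hidden_size = 5, 10, 16

    # Hypothetical stand-in for one decoder step: uniform log-probabilities over a
    # tiny vocabulary, and the status dict passed through unchanged.
    def decode_step(y_hat_t, prev_status):
        log_prob = torch.ones(beam_size, 1, output_size).log_softmax(dim=-1)
        return log_prob, prev_status

    board = SingleBeamSearchBoard(
        'cpu',
        # One fake per-layer status tensor with its batch dimension at index 0.
        {'prev_state_0': {'init_status': torch.zeros(1, 1, hidden_size), 'batch_dim_index': 0}},
        beam_size=beam_size,
        max_length=20,
    )

    while not board.is_done() and board.current_time_step < board.max_length:
        y_hat_t, prev_status = board.get_batch()        # |y_hat_t| = (beam_size, 1)
        log_prob_t, new_status = decode_step(y_hat_t, prev_status)
        board.collect_result(log_prob_t, new_status)    # keeps only the top 'beam_size' paths

    sentences, probs = board.get_n_best(n=1)

With the defaults alpha=0.2 and min_length=5, get_length_penalty() returns 1.0 at length 1, roughly 0.90 at length 5, and roughly 0.64 at length 50, so the negative log-probability of a longer finished hypothesis is shrunk more and that hypothesis is penalized less.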
