Scoring #538

Merged
merged 60 commits into master from scoring
Sep 28, 2018

Changes from 1 commit

60 commits
4fc041f
made validation source optional, parameterized fill_up
mjpost Sep 17, 2018
af1e7bd
moved training-specific checks
mjpost Sep 17, 2018
1322b31
added scoring up to generating outputs, almost finished
mjpost Sep 17, 2018
27dd1cb
works but not polished
mjpost Sep 18, 2018
7e8767a
added length penalty as command-line arguments
mjpost Sep 18, 2018
29ac7a4
added 'repeat_last' fillup strategy (wrong approach)
mjpost Sep 18, 2018
d468770
added zero fill_up strategy, no_permute on batch iterator
mjpost Sep 18, 2018
359bc60
print source sentence with generalized ids2tokens()
mjpost Sep 18, 2018
ebd4e10
fixed test cases
mjpost Sep 18, 2018
8994f5c
Merge branch 'master' into scoring
mjpost Sep 18, 2018
0079c71
style checks
mjpost Sep 18, 2018
210a123
moved training-specific check to training
mjpost Sep 18, 2018
5015d54
context
mjpost Sep 18, 2018
48d00cc
turned on bucketing
mjpost Sep 18, 2018
927b92b
turned off more data permuting
mjpost Sep 18, 2018
581805e
set batch size to reasonable default for fully-unrolled graph
mjpost Sep 19, 2018
aa417dd
pulled out bucketing args, removed options from scoring
mjpost Sep 19, 2018
28a5e4b
added --score-type and --output
mjpost Sep 19, 2018
ddd5d88
Merge branch 'master' into scoring
mjpost Sep 19, 2018
307e56a
documentation
mjpost Sep 19, 2018
98bb1f1
style check
mjpost Sep 19, 2018
c1f4035
versioning
mjpost Sep 19, 2018
6118643
fixed easy issues from @fhieber's CR
mjpost Sep 20, 2018
16864dd
Merge remote-tracking branch 'github/master' into scoring
mjpost Sep 20, 2018
6a73883
removed batch preparation
mjpost Sep 20, 2018
bd2a526
moved summing to the computation graph
mjpost Sep 20, 2018
717f18b
now using output_handler
mjpost Sep 20, 2018
fa1d9b8
added warning if sentences get skipped
mjpost Sep 20, 2018
47ddceb
typo
mjpost Sep 20, 2018
6f1c4e8
cleanup and docs
mjpost Sep 20, 2018
ff324fb
documentation and cleanup
mjpost Sep 21, 2018
8805a62
Simplified fill_up, improved comments
mjpost Sep 22, 2018
aef30ae
bugfix with length penalty alpha = 0
mjpost Sep 22, 2018
0ff7feb
bugfixes in names
mjpost Sep 22, 2018
39294ac
combined run_forward() and get_outputs()
mjpost Sep 22, 2018
bbd8c13
uncontroversial reversions
mjpost Sep 24, 2018
9f0a71c
Merge remote-tracking branch 'github/master' into scoring
mjpost Sep 25, 2018
ef6b9f3
stuffing args for scoring, no_permute renamed permute
mjpost Sep 25, 2018
9e1736d
added --output and --softmax-temperature
mjpost Sep 25, 2018
5269a9c
changed float width to 3
mjpost Sep 25, 2018
09c46c2
removed prepare data option, reverted train.py
mjpost Sep 25, 2018
fb601a3
added test cases, bugfix with source factors
mjpost Sep 25, 2018
e049fea
bugfix — get first item from group
mjpost Sep 25, 2018
5f950be
style fixes
mjpost Sep 25, 2018
3c9401d
style changes
mjpost Sep 25, 2018
7c48db9
missed one
mjpost Sep 25, 2018
68f943c
don't score when --skip-topk
mjpost Sep 25, 2018
7db9bef
debugging travis
mjpost Sep 25, 2018
d67488b
seq max seq len very large to pass test
mjpost Sep 25, 2018
d834a7d
reading maxlen from config and skipping some test lines
mjpost Sep 25, 2018
b731159
debugging output since still failing
mjpost Sep 25, 2018
8abcf1b
skipping test outputs with vocab symbols
mjpost Sep 26, 2018
3ae7d7e
debugging travis
mjpost Sep 26, 2018
f39ba62
more systematic testing for when to try to score
mjpost Sep 26, 2018
776fcbb
don't score when translate beam == 1 or length close to max
mjpost Sep 26, 2018
e9808d2
restored skipping topk
mjpost Sep 26, 2018
065abc1
updated documentation
mjpost Sep 26, 2018
b3dd468
entry point for sockeye.score
mjpost Sep 27, 2018
b23bb01
Merge branch 'master' into scoring
fhieber Sep 27, 2018
2da12bc
proper spacing
fhieber Sep 28, 2018
documentation and cleanup
mjpost committed Sep 21, 2018
commit ff324fbdb324e4a1b8b931a9f19ee3b9b77b0b43
7 changes: 4 additions & 3 deletions sockeye/arguments.py
@@ -789,7 +789,8 @@ def add_training_args(params):
'Default: %(default)s.')
train_params.add_argument('--fill-up',
type=str,
default=C.DEFAULT_FILL_UP_STRATEGY,
default=C.FILL_UP_DEFAULT,
choices=C.FILL_UP_CHOICES,
help=argparse.SUPPRESS)

train_params.add_argument('--loss',
@@ -1100,14 +1101,14 @@ def add_score_cli_args(params):
params.add_argument('--length-penalty-alpha',
default=1.0,
type=float,
help='Alpha factor for the length penalty used in beam search: '
help='Alpha factor for the length penalty used in scoring: '
'(beta + len(Y))**alpha/(beta + 1)**alpha. A value of 0.0 will therefore turn off '
'length normalization. Default: %(default)s')

params.add_argument('--length-penalty-beta',
default=0.0,
type=float,
help='Beta factor for the length penalty used in beam search: '
help='Beta factor for the length penalty used in scoring: '
'(beta + len(Y))**alpha/(beta + 1)**alpha. Default: %(default)s')

params.add_argument('--output-type',
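Both help strings now say "scoring" rather than "beam search"; the formula itself is unchanged. As a quick reference, it reduces to a one-liner (a standalone sketch; the function name is illustrative, not part of this diff):

    def length_penalty(length: int, alpha: float = 1.0, beta: float = 0.0) -> float:
        """(beta + length)**alpha / (beta + 1)**alpha, as in the help text above."""
        return ((beta + length) ** alpha) / ((beta + 1) ** alpha)

    print(length_penalty(10))             # 10.0 with the defaults alpha=1.0, beta=0.0
    print(length_penalty(10, alpha=0.0))  # 1.0, i.e. length normalization is turned off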
6 changes: 5 additions & 1 deletion sockeye/constants.py
@@ -398,7 +398,11 @@
DATA_CONFIG = "data.config"
PREPARED_DATA_VERSION_FILE = "data.version"
PREPARED_DATA_VERSION = 2
DEFAULT_FILL_UP_STRATEGY = 'replicate'

FILL_UP_REPLICATE = 'replicate'
FILL_UP_ZEROS = 'zeros'
FILL_UP_DEFAULT = FILL_UP_REPLICATE
FILL_UP_CHOICES = [FILL_UP_REPLICATE, FILL_UP_ZEROS]

# reranking
RERANK_BLEU = "bleu"
40 changes: 22 additions & 18 deletions sockeye/data_io.py
@@ -769,7 +769,7 @@ def get_training_data_iters(sources: List[str],
bucketing: bool,
bucket_width: int,
no_permute: bool = False) -> Tuple['BaseParallelSampleIter',
'BaseParallelSampleIter',
Optional['BaseParallelSampleIter'],
'DataConfig', 'DataInfo']:
"""
Returns data iterators for training and validation data.
@@ -1027,18 +1027,18 @@ def ids2strids(ids: Iterable[int]) -> str:

def ids2tokens(token_ids: Iterable[int],
vocab_inv: Dict[int, str],
exclude_list: Set[int] = set()) -> List[str]:
exclude_set: Set[int] = set()) -> List[str]:
"""
Transforms a list of token IDs into a list of words, exluding any IDs in `exclude_list`.
Transforms a list of token IDs into a list of words, exluding any IDs in `exclude_set`.

:param token_ids: The list of token IDs.
:param vocab_inv: The inverse vocabulary.
:param exclude_list: The list of token IDs to exclude.
:param exclude_set: The list of token IDs to exclude.
:return: The list of words.
"""

tokens = [vocab_inv[token] for token in token_ids]
return [tok for token_id, tok in zip(token_ids, tokens) if token_id not in exclude_list]
return [tok for token_id, tok in zip(token_ids, tokens) if token_id not in exclude_set]


class SequenceReader(Iterable):
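A quick illustration of the renamed function as defined above (the toy vocabulary is made up for the example):

    vocab_inv = {0: '<pad>', 1: '<s>', 2: '</s>', 3: 'hello', 4: 'world'}

    # Pad and sentence-boundary IDs are filtered out; only real words remain.
    print(ids2tokens([1, 3, 4, 2, 0, 0], vocab_inv, exclude_set={0, 1, 2}))
    # ['hello', 'world']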
@@ -1283,16 +1283,19 @@ def fill_up(self,
# 'zeros' instead repeats the last element and then writes zeros over everything.
if num_samples % bucket_batch_size != 0:
rest = bucket_batch_size - num_samples % bucket_batch_size
if fill_up == 'replicate':
if fill_up == C.FILL_UP_REPLICATE:
logger.info("Filling bucket %s from size %d to %d by sampling with replacement",
bucket, num_samples, bucket_batch_size)
desired_indices_np = rs.randint(num_samples, size=rest)
desired_indices = mx.nd.array(desired_indices_np)

elif fill_up == 'zeros':
logger.info("Filling bucket %s from size %d to %d by repeating the last element %d %s",
bucket, num_samples, bucket_batch_size, rest, inflect('time', rest))
desired_indices_np = np.array([num_samples-1] * rest)
elif fill_up == C.FILL_UP_ZEROS:
logger.info("Filling bucket %s from size %d to %d with zeros",
bucket, num_samples, bucket_batch_size)
desired_indices_np = np.full((rest), num_samples - 1)
# data_source = [np.full((num_samples, source_len, num_factors), self.pad_id, dtype=self.dtype)
# for (source_len, target_len), num_samples in zip(self.buckets, num_samples_per_bucket)]

desired_indices = mx.nd.array(desired_indices_np)

else:
@@ -1306,9 +1309,9 @@ def fill_up(self,
label[bucket_idx] = mx.nd.concat(bucket_label, bucket_label.take(desired_indices), dim=0)

if fill_up == 'zeros':
source[bucket_idx][num_samples:,:,:] = C.PAD_ID
target[bucket_idx][num_samples:,:] = C.PAD_ID
label[bucket_idx][num_samples:,:] = C.PAD_ID
source[bucket_idx][num_samples:, :, :] = C.PAD_ID
target[bucket_idx][num_samples:, :] = C.PAD_ID
label[bucket_idx][num_samples:, :] = C.PAD_ID

return ParallelDataSet(source, target, label)
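The contrast between the two strategies in this hunk can be seen in a NumPy-only sketch (shapes and values are simplified stand-ins for the real buckets):

    import numpy as np

    PAD_ID = 0
    bucket_batch_size, num_samples = 8, 5      # bucket is three rows short
    data = np.arange(1, num_samples + 1)       # stand-in for five real sentences
    rest = bucket_batch_size - num_samples % bucket_batch_size

    # 'replicate': top up the bucket by sampling existing rows with replacement.
    rs = np.random.RandomState(1)
    replicated = np.concatenate([data, data[rs.randint(num_samples, size=rest)]])

    # 'zeros': repeat the last row, then overwrite the filler region with padding,
    # so downstream code can tell where the genuine data ends.
    zeros = np.concatenate([data, data[np.full(rest, num_samples - 1)]])
    zeros[num_samples:] = PAD_ID

    print(replicated)  # e.g. [1 2 3 4 5 4 4 1] (random rows repeated)
    print(zeros)       # [1 2 3 4 5 0 0 0]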

@@ -1391,6 +1394,8 @@ class MetaBaseParallelSampleIter(ABC):
class BaseParallelSampleIter(mx.io.DataIter):
"""
Base parallel sample iterator.

:param no_permute: Turn off random shuffling of parallel data.
"""
__metaclass__ = MetaBaseParallelSampleIter

@@ -1402,6 +1407,7 @@ def __init__(self,
target_data_name,
label_name,
num_factors: int = 1,
no_permute: bool = False,
dtype='float32') -> None:
super().__init__(batch_size=batch_size)

@@ -1412,6 +1418,7 @@ def __init__(self,
self.target_data_name = target_data_name
self.label_name = label_name
self.num_factors = num_factors
self.no_permute = no_permute
self.dtype = dtype

# "Staging area" that needs to fit any size batch we're using by total number of elements.
@@ -1478,12 +1485,11 @@ def __init__(self,
dtype='float32') -> None:
super().__init__(buckets=buckets, batch_size=batch_size, bucket_batch_sizes=bucket_batch_sizes,
source_data_name=source_data_name, target_data_name=target_data_name,
label_name=label_name, num_factors=num_factors, dtype=dtype)
label_name=label_name, num_factors=num_factors, no_permute=no_permute, dtype=dtype)
assert len(shards_fnames) > 0
self.shards_fnames = list(shards_fnames)
self.shard_index = -1
self.fill_up = fill_up
self.no_permute = no_permute

self.reset()

@@ -1572,7 +1578,7 @@ def __init__(self,
dtype='float32') -> None:
super().__init__(buckets=buckets, batch_size=batch_size, bucket_batch_sizes=bucket_batch_sizes,
source_data_name=source_data_name, target_data_name=target_data_name,
label_name=label_name, num_factors=num_factors, dtype=dtype)
label_name=label_name, num_factors=num_factors, no_permute=no_permute, dtype=dtype)

# create independent lists to be shuffled
self.data = ParallelDataSet(list(data.source), list(data.target), list(data.label))
@@ -1586,8 +1592,6 @@ def __init__(self,
self.data_permutations = [mx.nd.arange(0, max(1, self.data.source[i].shape[0]))
for i in range(len(self.data))]

self.no_permute = no_permute

self.reset()

def reset(self):
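The body of reset() is collapsed in this view; what the new flag arranges can be sketched as a free function (an assumption-level simplification of the per-bucket permutation logic, not the verbatim method):

    import numpy as np

    def epoch_order(num_samples: int, no_permute: bool) -> np.ndarray:
        """Visit order for one epoch."""
        if no_permute:
            # scoring: identity order keeps outputs aligned with the input file
            return np.arange(num_samples)
        # training: a fresh shuffle each epoch
        return np.random.permutation(num_samples)

    print(epoch_order(5, no_permute=True))  # [0 1 2 3 4]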
2 changes: 1 addition & 1 deletion sockeye/output_handler.py
@@ -147,7 +147,7 @@ def handle(self,

class PairWithScoreOutputHandler(OutputHandler):
"""
Output handler to write translation score along with sntence input and output (tab-delimited).
Output handler to write translation score along with sentence input and output (tab-delimited).

:param stream: Stream to write translations to (e.g., sys.stdout).
"""
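The format the docstring describes is simple; a minimal sketch of such a handler's write (the helper name and the three-decimal score width are assumptions, the latter following the "changed float width to 3" commit):

    import sys

    def write_pair_with_score(stream, score: float, source: str, translation: str) -> None:
        # one tab-delimited line per sentence: score, input, output
        stream.write("{:.3f}\t{}\t{}\n".format(score, source, translation))

    write_pair_with_score(sys.stdout, -4.712, "ein Test", "a test")
    # prints: "-4.712\tein Test\ta test"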
5 changes: 5 additions & 0 deletions sockeye/score.py
@@ -71,6 +71,11 @@ def score(args: argparse.Namespace):
else:
max_seq_len_source, max_seq_len_target = args.max_seq_len

# This call has a number of different parameters compared to training which reflect our need to get scores
# one-for-one and in order with the input data.
# Bucketing and permuting need to be turned off in order to preserve the ordering of sentences.
# The 'zeros' fill_up strategy fills underfilled buckets with zeros which can then be used to find the last item.
# Finally, 'resume_training' needs to be set to True because it causes the model to be loaded instead of initialized.
score_iter, _, config_data, source_vocabs, target_vocab, data_info = train.create_data_iters_and_vocabs(
args=args,
max_seq_len_source=max_seq_len_source,
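The remainder of the call is collapsed here. Going by the new comment and the signatures visible elsewhere in this PR, the flags it sets plausibly look like the following (a sketch, not the hidden code; argument order and any omitted parameters are guesses):

    score_iter, _, config_data, source_vocabs, target_vocab, data_info = train.create_data_iters_and_vocabs(
        args=args,
        max_seq_len_source=max_seq_len_source,
        max_seq_len_target=max_seq_len_target,
        resume_training=True,      # load the trained model rather than initializing a new one
        bucketing=False,           # a single bucket preserves sentence order
        fill_up=C.FILL_UP_ZEROS,   # zero rows mark where real data ends in the last batch
        no_permute=True)           # no shuffling, so scores stay aligned with inputs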
3 changes: 2 additions & 1 deletion sockeye/scoring.py
@@ -224,6 +224,7 @@ def score(self,
score_type: str,
output_handler: OutputHandler):

total_time = 0.
tic = time.time()
sentence_no = 0
for i, batch in enumerate(score_iter):
@@ -233,8 +234,8 @@
self.model.run_forward(batch)
scores, __ = self.model.get_outputs()

total_time = time.time() - tic
batch_time = time.time() - batch_tic
total_time += batch_time

for source, target, score in zip(batch.data[0], batch.data[1], scores):

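The fix replaces a total recomputed from one start timestamp with a running sum of per-batch times, so that only per-batch work is counted. The pattern in isolation (a generic sketch, not Sockeye code):

    import time

    def process(batch):           # stand-in for run_forward() + get_outputs()
        time.sleep(0.01)

    total_time = 0.0
    for batch in range(3):
        batch_tic = time.time()
        process(batch)
        batch_time = time.time() - batch_tic
        total_time += batch_time  # accumulate only time spent inside the model

    print("%.3f seconds in the model" % total_time)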
2 changes: 1 addition & 1 deletion sockeye/train.py
@@ -224,7 +224,7 @@ def create_data_iters_and_vocabs(args: argparse.Namespace,
output_folder: Optional[str] = None,
bucketing: bool = True,
bucket_width: int = 10,
fill_up: str = C.DEFAULT_FILL_UP_STRATEGY,
fill_up: str = C.FILL_UP_DEFAULT,
no_permute: bool = False) -> Tuple['data_io.BaseParallelSampleIter',
'data_io.BaseParallelSampleIter',
'data_io.DataConfig',
Expand Down