Pulled bucketing args out into explicit function parameters; removed bucketing options from scoring

awslabs · fhieber · Sep 28, 2018 · Sep 17, 2018 · Sep 17, 2018 · Sep 17, 2018
commit aa417ddd82a8e90d8525f4f723e1e23b6f51acb3
diff --git a/sockeye/arguments.py b/sockeye/arguments.py
@@ -1089,7 +1089,6 @@ def add_score_cli_args(params):
     add_training_data_args(params, required=False)
     add_prepared_data_args(params)
     add_vocab_args(params)
-    add_bucketing_args(params)
     add_scoring_args(params)
     add_device_args(params)
     add_logging_args(params)

diff --git a/sockeye/score.py b/sockeye/score.py
@@ -78,7 +78,7 @@ def create_scoring_model(config: model.ModelConfig,
                                          context=context,
                                          provide_data=score_iter.provide_data,
                                          default_bucket_key=score_iter.default_bucket_key,
-                                         bucketing=bucketing)
+                                         bucketing=False)
 
     return scoring_model
 
@@ -97,13 +97,6 @@ def score(args: argparse.Namespace):
 
     utils.log_basic_info(args)
 
-    max_seq_len_source, max_seq_len_target = args.max_seq_len
-    # The maximum length is the length before we add the BOS/EOS symbols
-    max_seq_len_source = max_seq_len_source + C.SPACE_FOR_XOS
-    max_seq_len_target = max_seq_len_target + C.SPACE_FOR_XOS
-    logger.info("Adjusting maximum length to reserve space for a BOS/EOS marker. New maximum length: (%d, %d)",
-                max_seq_len_source, max_seq_len_target)
-
     with ExitStack() as exit_stack:
         context = utils.determine_context(device_ids=args.device_ids,
                                           use_cpu=args.use_cpu,
@@ -116,25 +109,26 @@ def score(args: argparse.Namespace):
                                                                  "size that is a multiple of %d." % len(context))
         logger.info("Scoring Device(s): %s", ", ".join(str(c) for c in context))
 
+        model_config = model.SockeyeModel.load_config(os.path.join(args.model, C.CONFIG_NAME))
+        max_seq_len_source = model_config.config_data.max_seq_len_source
+        max_seq_len_target = model_config.config_data.max_seq_len_target
+
         score_iter, _, config_data, source_vocabs, target_vocab, data_info = train.create_data_iters_and_vocabs(
             args=args,
             max_seq_len_source=max_seq_len_source,
             max_seq_len_target=max_seq_len_target,
             shared_vocab=args.shared_vocab,
             resume_training=True,
             output_folder=args.model,
+            bucketing=False,
             fill_up='zeros',
             no_permute=True)
 
-        max_seq_len_source = config_data.max_seq_len_source
-        max_seq_len_target = config_data.max_seq_len_target
-
-        model_config = model.SockeyeModel.load_config(os.path.join(args.model, C.CONFIG_NAME))
 
         scoring_model = create_scoring_model(config=model_config,
                                              model_dir=args.model,
                                              context=context,
-                                             bucketing=not args.no_bucketing,
+                                             bucketing=False,
                                              score_iter=score_iter)
 
         scorer = scoring.Scorer(scoring_model, source_vocabs, target_vocab,

diff --git a/sockeye/train.py b/sockeye/train.py
@@ -222,6 +222,8 @@ def create_data_iters_and_vocabs(args: argparse.Namespace,
                                  validation_sources: Optional[List[str]] = None,
                                  validation_target: Optional[str] = None,
                                  output_folder: Optional[str] = None,
+                                 bucketing: bool = True,
+                                 bucket_width: int = 10,
                                  fill_up: str = C.DEFAULT_FILL_UP_STRATEGY,
                                  no_permute: bool = False) -> Tuple['data_io.BaseParallelSampleIter',
                                                                     'data_io.BaseParallelSampleIter',
@@ -349,8 +351,8 @@ def create_data_iters_and_vocabs(args: argparse.Namespace,
             no_permute=no_permute,
             max_seq_len_source=max_seq_len_source,
             max_seq_len_target=max_seq_len_target,
-            bucketing=not args.no_bucketing,
-            bucket_width=args.bucket_width)
+            bucketing=bucketing,
+            bucket_width=bucket_width)
 
         return train_iter, validation_iter, config_data, source_vocabs, target_vocab, data_info
 
@@ -812,6 +814,8 @@ def train(args: argparse.Namespace):
             validation_sources=[args.validation_source] + args.validation_source_factors,
             validation_target=args.validation_target,
             output_folder=output_folder,
+            bucketing=not args.no_bucketing,
+            bucket_width=args.bucket_width,
             fill_up=args.fill_up)
         max_seq_len_source = config_data.max_seq_len_source
         max_seq_len_target = config_data.max_seq_len_target