This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Registry refactor #1410

Merged: 6 commits, Jan 28, 2019
2 changes: 1 addition & 1 deletion tensor2tensor/bin/t2t_attack.py
@@ -73,7 +73,7 @@ def create_attack_params():


def create_attack(attack):
- return registry.attacks(attack)
+ return registry.attack(attack)


def create_surrogate_hparams():
5 changes: 3 additions & 2 deletions tensor2tensor/bin/t2t_datagen.py
@@ -147,7 +147,7 @@ def main(_):

# Calculate the list of problems to generate.
problems = sorted(
- list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems())
+ list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_base_problems())
for exclude in FLAGS.exclude_problems.split(","):
if exclude:
problems = [p for p in problems if exclude not in p]
@@ -169,7 +169,8 @@ def main(_):

if not problems:
problems_str = "\n * ".join(
- sorted(list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems()))
+ sorted(list(_SUPPORTED_PROBLEM_GENERATORS) +
+ registry.list_base_problems()))
error_msg = ("You must specify one of the supported problems to "
"generate data for:\n * " + problems_str + "\n")
error_msg += ("TIMIT and parsing need data_sets specified with "
2 changes: 1 addition & 1 deletion tensor2tensor/bin/t2t_prune.py
@@ -54,7 +54,7 @@ def create_pruning_params():


def create_pruning_strategy(name):
- return registry.pruning_strategies(name)
+ return registry.pruning_strategy(name)


def main(argv):
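Both changes above follow the same renaming pattern: the plural registry accessors (`registry.attacks`, `registry.pruning_strategies`) become singular lookup functions. A minimal usage sketch under that assumption; the names passed in below are placeholders, not necessarily registered components:

```python
from tensor2tensor.utils import registry

# Resolve registered components by name with the singular lookup functions.
# "madry" and "weight" are placeholder names for illustration only.
attack_fn = registry.attack("madry")              # was: registry.attacks("madry")
pruning_fn = registry.pruning_strategy("weight")  # was: registry.pruning_strategies("weight")
```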
4 changes: 2 additions & 2 deletions tensor2tensor/layers/common_hparams.py
@@ -55,7 +55,7 @@ def basic_params1():
initializer="orthogonal",
initializer_gain=1.5,
label_smoothing=0.1,
- optimizer="Adam",
+ optimizer="adam",
optimizer_adam_epsilon=1e-6,
optimizer_adam_beta1=0.85,
optimizer_adam_beta2=0.997,
@@ -466,7 +466,7 @@ def basic_range1(ranged_hparams):
rhp.set_float("optimizer_adam_beta2", 0.995, 0.999)
rhp.set_categorical(
"optimizer",
["Adam", "Adagrad", "Momentum", "RMSProp", "SGD", "YellowFin"])
["adam", "adagrad", "momentum", "rms_prop", "sgd", "yellow_fin"])


@registry.register_ranged_hparams
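Since the canonical optimizer names are now snake_case, hparams sets and command-line hparams overrides that spell them the old camel-case way need updating. A small illustrative sketch; the hparams-set name below is hypothetical:

```python
from tensor2tensor.layers import common_hparams
from tensor2tensor.utils import registry


@registry.register_hparams
def basic_params_momentum():  # hypothetical hparams set, for illustration only
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "momentum"  # formerly "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.optimizer_momentum_nesterov = True
  return hparams
```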
2 changes: 1 addition & 1 deletion tensor2tensor/models/mtf_transformer2.py
@@ -269,7 +269,7 @@ def sample(self, features, mesh):
return self.combine_batch_dims(ret)


- layers_registry = registry.create_registry("layers")
+ layers_registry = registry.Registries.layers


# The following functions construct layers based on hyperparmeters
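Instead of each module creating its own registry via `registry.create_registry("layers")`, the model now points at the shared `registry.Registries.layers` object. The snippet below is a toy stand-in for that pattern, not the t2t implementation, showing why a single shared namespace of registries avoids duplicate per-module copies; the layer name is made up:

```python
class Registry(object):
  """Toy stand-in for a named registry; not the t2t implementation."""

  def __init__(self, name):
    self._name = name
    self._store = {}

  def register(self, key):
    def decorator(fn):
      if key in self._store:
        raise KeyError("%s already registered in %s" % (key, self._name))
      self._store[key] = fn
      return fn
    return decorator

  def __getitem__(self, key):
    return self._store[key]


class Registries(object):
  """All registries live in one shared namespace instead of per-module copies."""
  layers = Registry("layers")


@Registries.layers.register("dense_relu")  # hypothetical layer name
def dense_relu_layer(hparams):
  del hparams  # unused in this sketch
  return "dense_relu"


layers_registry = Registries.layers  # mirrors the assignment in the diff above
assert layers_registry["dense_relu"] is dense_relu_layer
```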
10 changes: 5 additions & 5 deletions tensor2tensor/models/research/adafactor_experiments.py
@@ -30,16 +30,16 @@ def mimic_adam_with_adafactor(hparams):
Some minor things may be different, like epsilon and beta1 correction.

Args:
- hparams: model hyperparameters where "Adam" in hparams.optimizer
+ hparams: model hyperparameters where "adam" in hparams.optimizer
"""
assert "Adam" in hparams.optimizer
hparams.optimizer = "Adafactor"
assert "adam" in hparams.optimizer
hparams.optimizer = "adafactor"
hparams.optimizer_adafactor_beta1 = hparams.optimizer_adam_beta1
hparams.optimizer_adafactor_beta2 = hparams.optimizer_adam_beta2
hparams.optimizer_adafactor_multiply_by_parameter_scale = False
hparams.optimizer_adafactor_factored = False
hparams.optimizer_adafactor_clipping_threshold = None
- hparams.optimizer_adafactor_decay_type = "Adam"
+ hparams.optimizer_adafactor_decay_type = "adam"


@registry.register_hparams
@@ -50,7 +50,7 @@ def afx_adam():
hparams.optimizer_adam_beta2 = 0.999
hparams.symbol_modality_num_shards = 1
hparams.batch_size = 2048
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.learning_rate_schedule = (
"constant*rsqrt_decay*linear_warmup*rsqrt_hidden_size")
hparams.learning_rate_constant = 2.0
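With the rename, `mimic_adam_with_adafactor` keys off the lowercase string, so it only applies to hparams whose optimizer name contains "adam". A short, hedged usage sketch built on `basic_params1` (which now defaults to "adam"):

```python
from tensor2tensor.layers import common_hparams
from tensor2tensor.models.research import adafactor_experiments

hparams = common_hparams.basic_params1()  # optimizer defaults to "adam" after this PR
assert "adam" in hparams.optimizer
adafactor_experiments.mimic_adam_with_adafactor(hparams)
assert hparams.optimizer == "adafactor"
assert hparams.optimizer_adafactor_decay_type == "adam"
```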
2 changes: 1 addition & 1 deletion tensor2tensor/models/research/autoencoders.py
@@ -1020,7 +1020,7 @@ def body(self, features):
def autoencoder_basic():
"""Basic autoencoder model."""
hparams = common_hparams.basic_params1()
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.learning_rate_constant = 0.0002
hparams.learning_rate_warmup_steps = 500
hparams.learning_rate_schedule = "constant * linear_warmup"
2 changes: 1 addition & 1 deletion tensor2tensor/models/research/transformer_nat.py
@@ -392,7 +392,7 @@ def transformer_nat_small():
hparams.filter_size = 2048
hparams.label_smoothing = 0.0
hparams.force_full_predict = True
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.optimizer_adam_epsilon = 1e-9
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.997
4 changes: 2 additions & 2 deletions tensor2tensor/models/research/transformer_vae.py
@@ -767,7 +767,7 @@ def transformer_ae_small():
hparams.filter_size = 2048
hparams.add_hparam("compress_filter_size", 2048 * 2)
hparams.label_smoothing = 0.0
- hparams.optimizer = "Adam" # Can be unstable, maybe try Adam.
+ hparams.optimizer = "adam" # Can be unstable, maybe try Adam.
hparams.optimizer_adam_epsilon = 1e-9
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.997 # Needs tuning, try 0.98 to 0.999.
@@ -941,7 +941,7 @@ def transformer_ae_a3():
def transformer_ae_a6():
"""Best hparams for transformer with semhash."""
hparams = transformer_ae_a3()
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.noise_dev = 0.5
return hparams

2 changes: 1 addition & 1 deletion tensor2tensor/models/research/vqa_attention.py
@@ -335,7 +335,7 @@ def vqa_attention_base():
hparams = common_hparams.basic_params1()
hparams.batch_size = 128
hparams.use_fixed_batch_size = True,
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.999
hparams.optimizer_adam_epsilon = 1e-8
2 changes: 1 addition & 1 deletion tensor2tensor/models/research/vqa_self_attention.py
@@ -684,7 +684,7 @@ def vqa_self_attention_base():
hparams = common_hparams.basic_params1()
hparams.batch_size = 128
hparams.use_fixed_batch_size = True,
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.997
hparams.optimizer_adam_epsilon = 1e-9
2 changes: 1 addition & 1 deletion tensor2tensor/models/shake_shake.py
@@ -189,7 +189,7 @@ def shakeshake_small():
@registry.register_hparams
def shake_shake_quick():
hparams = shakeshake_small()
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.learning_rate_cosine_cycle_steps = 1000
hparams.learning_rate = 0.5
hparams.batch_size = 100
4 changes: 2 additions & 2 deletions tensor2tensor/models/transformer.py
@@ -1694,7 +1694,7 @@ def transformer_tall_pretrain_lm():
hparams.learning_rate_constant = 2e-4
hparams.learning_rate_schedule = (
"linear_warmup*constant*cosdecay")
hparams.optimizer = "AdamW"
hparams.optimizer = "adam_w"
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.999
hparams.optimizer_adam_epsilon = 1e-8
@@ -1739,7 +1739,7 @@ def transformer_tall_pretrain_lm_tpu():
# Optimizer gets reset in update_hparams_for_tpu so we set it again here.
hparams.learning_rate_constant = 2e-4
hparams.learning_rate_schedule = ("linear_warmup * constant * cosdecay")
hparams.optimizer = "AdamW"
hparams.optimizer = "adam_w"
return hparams


2 changes: 1 addition & 1 deletion tensor2tensor/models/vanilla_gan.py
@@ -199,7 +199,7 @@ def infer(self, *args, **kwargs): # pylint: disable=arguments-differ
def sliced_gan():
"""Basic parameters for a vanilla_gan."""
hparams = common_hparams.basic_params1()
- hparams.optimizer = "Adam"
+ hparams.optimizer = "adam"
hparams.learning_rate_constant = 0.0002
hparams.learning_rate_warmup_steps = 500
hparams.learning_rate_schedule = "constant * linear_warmup"
2 changes: 1 addition & 1 deletion tensor2tensor/problems.py
@@ -27,7 +27,7 @@ def problem(name):


def available():
- return sorted(registry.list_problems())
+ return registry.list_base_problems()


all_problems.import_modules(all_problems.ALL_MODULES)
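`problems.available()` now delegates to `registry.list_base_problems()` rather than sorting `registry.list_problems()` itself. A quick usage sketch; the problem name below is only an example of something that may be registered:

```python
from tensor2tensor import problems

names = problems.available()    # backed by registry.list_base_problems()
print("%d base problems registered" % len(names))

name = "translate_ende_wmt32k"  # example name, assumed to be registered
if name in names:
  ende = problems.problem(name)
```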
2 changes: 1 addition & 1 deletion tensor2tensor/rl/datagen_with_agent.py
@@ -45,7 +45,7 @@ def main(_):

# Create problem if not already defined
problem_name = "gym_discrete_problem_with_agent_on_%s" % FLAGS.game
- if problem_name not in registry.list_problems():
+ if problem_name not in registry.Registries.problems:
gym_env.register_game(FLAGS.game)

# Generate
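Existence checks now go through the registry object itself instead of scanning the full `list_problems()` list. A minimal sketch of the pattern above, assuming `registry.Registries.problems` supports `in` as this change implies; the game name and the `gym_env` import path are assumptions for illustration:

```python
from tensor2tensor.data_generators import gym_env
from tensor2tensor.utils import registry

game = "pong"  # illustrative; datagen_with_agent.py takes this from FLAGS.game
problem_name = "gym_discrete_problem_with_agent_on_%s" % game
if problem_name not in registry.Registries.problems:
  gym_env.register_game(game)
```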
2 changes: 1 addition & 1 deletion tensor2tensor/test_data/transformer_test_ckpt/hparams.json
@@ -1 +1 @@
{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "Adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06}
{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06}
2 changes: 1 addition & 1 deletion tensor2tensor/utils/adafactor.py
@@ -326,7 +326,7 @@ def adafactor_optimizer_from_hparams(hparams, lr):
Raises:
ValueError: on illegal values
"""
- if hparams.optimizer_adafactor_decay_type == "Adam":
+ if hparams.optimizer_adafactor_decay_type == "adam":
decay_rate = adafactor_decay_rate_adam(
hparams.optimizer_adafactor_beta2)
elif hparams.optimizer_adafactor_decay_type == "pow":
2 changes: 1 addition & 1 deletion tensor2tensor/utils/learning_rate.py
@@ -90,7 +90,7 @@ def legacy_learning_rate_schedule(hparams):
warmup = _learning_rate_warmup(warmup_steps, hparams=hparams)
decay = _learning_rate_decay(hparams, warmup_steps)
ret = tf.where(step_num < warmup_steps, warmup, decay)
- optimizer_correction = 0.002 if "Adam" in hparams.optimizer else 1.0
+ optimizer_correction = 0.002 if "adam" in hparams.optimizer else 1.0
tf.logging.info("Base learning rate: %f", hparams.learning_rate)
return ret * optimizer_correction * hparams.learning_rate

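The legacy schedule's 0.002 Adam correction now triggers on the lowercase name. As a worked example using the values from the test checkpoint's hparams.json above (`"optimizer": "adam"`, `"learning_rate": 0.2`), the schedule output gets scaled by 0.002 * 0.2:

```python
# Worked example of the correction in legacy_learning_rate_schedule.
optimizer = "adam"
learning_rate = 0.2
optimizer_correction = 0.002 if "adam" in optimizer else 1.0
effective_scale = optimizer_correction * learning_rate
assert abs(effective_scale - 0.0004) < 1e-12  # the schedule value is multiplied by this
```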
19 changes: 11 additions & 8 deletions tensor2tensor/utils/optimize.py
@@ -94,7 +94,7 @@ def optimize(loss, learning_rate, hparams, use_tpu=False, variables=None):
return train_op


@registry.register_optimizer("Adam")
@registry.register_optimizer
def adam(learning_rate, hparams):
# We change the default epsilon for Adam.
# Using LazyAdam as it's much faster for large vocabulary embeddings.
@@ -105,7 +105,7 @@ def adam(learning_rate, hparams):
epsilon=hparams.optimizer_adam_epsilon)


@registry.register_optimizer("MultistepAdam")
@registry.register_optimizer
def multistep_adam(learning_rate, hparams):
return multistep_optimizer.MultistepAdamOptimizer(
learning_rate,
@@ -115,22 +115,22 @@ def multistep_adam(learning_rate, hparams):
n=hparams.optimizer_multistep_accumulate_steps)


@registry.register_optimizer("Momentum")
@registry.register_optimizer
def momentum(learning_rate, hparams):
return tf.train.MomentumOptimizer(
learning_rate,
momentum=hparams.optimizer_momentum_momentum,
use_nesterov=hparams.optimizer_momentum_nesterov)


@registry.register_optimizer("YellowFin")
@registry.register_optimizer
def yellow_fin(learning_rate, hparams):
return yellowfin.YellowFinOptimizer(
learning_rate=learning_rate,
momentum=hparams.optimizer_momentum_momentum)


@registry.register_optimizer("TrueAdam")
@registry.register_optimizer
def true_adam(learning_rate, hparams):
return tf.train.AdamOptimizer(
learning_rate,
@@ -139,7 +139,7 @@ def true_adam(learning_rate, hparams):
epsilon=hparams.optimizer_adam_epsilon)


@registry.register_optimizer("AdamW")
@registry.register_optimizer
def adam_w(learning_rate, hparams):
# Openai gpt used weight decay.
# Given the internals of AdamW, weight decay dependent on the
@@ -156,7 +156,7 @@ def adam_w(learning_rate, hparams):
epsilon=hparams.optimizer_adam_epsilon)


@registry.register_optimizer("Adafactor")
@registry.register_optimizer("adafactor")
def register_adafactor(learning_rate, hparams):
return adafactor.adafactor_optimizer_from_hparams(hparams, learning_rate)

@@ -169,8 +169,11 @@ def _register_base_optimizer(key, fn):


for k in tf.contrib.layers.OPTIMIZER_CLS_NAMES:
- if k not in registry._OPTIMIZERS: # pylint: disable=protected-access
+ if k not in registry.Registries.optimizers and k not in ('SGD', 'RMSProp'):
_register_base_optimizer(k, tf.contrib.layers.OPTIMIZER_CLS_NAMES[k])
+ _register_base_optimizer('sgd', tf.contrib.layers.OPTIMIZER_CLS_NAMES['SGD'])
+ _register_base_optimizer(
+ 'rms_prop', tf.contrib.layers.OPTIMIZER_CLS_NAMES['RMSProp'])


class ConditionalOptimizer(tf.train.Optimizer):
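Most optimizers are now registered with the bare decorator, which implies the registry derives the key from the function name (hence "adam_w", "true_adam", and so on), while an explicit string is passed only where the function name differs from the desired key ("adafactor" on `register_adafactor`). The snippet below is a toy sketch of that default-key behaviour, not the actual t2t registry code:

```python
import re

_OPTIMIZERS = {}  # toy registry store


def default_name(obj):
  """Derive a snake_case key from a function or class name (toy version)."""
  return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", obj.__name__).lower()


def register_optimizer(name_or_fn=None):
  """Usable bare (@register_optimizer) or with a key (@register_optimizer("adafactor"))."""
  def do_register(key, fn):
    if key in _OPTIMIZERS:
      raise KeyError("Optimizer %s already registered" % key)
    _OPTIMIZERS[key] = fn
    return fn

  if callable(name_or_fn):  # bare form: @register_optimizer
    return do_register(default_name(name_or_fn), name_or_fn)
  return lambda fn: do_register(name_or_fn or default_name(fn), fn)


@register_optimizer
def true_adam(learning_rate, hparams):  # registered under "true_adam"
  del learning_rate, hparams


@register_optimizer("adafactor")
def register_adafactor(learning_rate, hparams):  # registered under "adafactor"
  del learning_rate, hparams


assert set(_OPTIMIZERS) == {"true_adam", "adafactor"}
```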