added file for building Resnet estimator

ysqyang · ysqyang · commit 836eb12b35d0 · 2018-05-07T14:24:29.000-04:00
diff --git a/ResNet_model.ipynb b/ResNet_model.ipynb
@@ -266,7 +266,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.4"
+   "version": "3.6.5"
   }
  },
  "nbformat": 4,
diff --git a/ResNet_run.ipynb b/ResNet_run.ipynb
@@ -0,0 +1,352 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer, parse_record_fn, num_epochs):\n",
+    "    \"\"\"Shuffle, repeat, parse and batch a dataset of raw records.\n",
+    "\n",
+    "    Args:\n",
+    "        dataset: dataset of raw records exposing prefetch/shuffle/repeat/map/batch.\n",
+    "        is_training: if true, the records are shuffled; also forwarded to\n",
+    "            parse_record_fn.\n",
+    "        batch_size: number of parsed records per batch (also used as the size\n",
+    "            of the initial record prefetch buffer).\n",
+    "        shuffle_buffer: buffer size used for shuffling when is_training is true.\n",
+    "        parse_record_fn: callable invoked as parse_record_fn(record, is_training)\n",
+    "            on every raw record.\n",
+    "        num_epochs: number of times the dataset is repeated.\n",
+    "\n",
+    "    Returns:\n",
+    "        The dataset after all transformations have been applied.\n",
+    "    \"\"\"\n",
+    "    # make the dataset prefetchable for parallelism\n",
+    "    dataset = dataset.prefetch(buffer_size=batch_size)\n",
+    "\n",
+    "    # shuffle only while training\n",
+    "    if is_training:\n",
+    "        dataset = dataset.shuffle(buffer_size=shuffle_buffer)\n",
+    "\n",
+    "    # repeat the (shuffled) dataset for multi-epoch training\n",
+    "    dataset = dataset.repeat(num_epochs)\n",
+    "\n",
+    "    # Parse the raw records and batch them\n",
+    "    dataset = dataset.map(lambda x: parse_record_fn(x, is_training), num_parallel_calls=1)\n",
+    "    dataset = dataset.batch(batch_size)\n",
+    "\n",
+    "    # Prefetch one batch at a time. Dataset transformations return a new\n",
+    "    # dataset instead of mutating in place, so the result must be kept.\n",
+    "    dataset = dataset.prefetch(1)\n",
+    "\n",
+    "    return dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def learning_schedule(batch_size, batch_denom, num_images, boundary_epochs, decay_rates):\n",
+    "    \"\"\"Build a piecewise-constant learning rate schedule.\n",
+    "\n",
+    "    The base rate is 0.1 * batch_size / batch_denom. Each entry of\n",
+    "    boundary_epochs is converted to a global-step boundary, and each entry of\n",
+    "    decay_rates scales the base rate to give one value of the schedule.\n",
+    "\n",
+    "    Returns:\n",
+    "        A function that takes global_step and returns the result of\n",
+    "        tf.train.piecewise_constant for that step.\n",
+    "    \"\"\"\n",
+    "    base_rate = 0.1 * batch_size / batch_denom\n",
+    "    steps_per_epoch = num_images / batch_size\n",
+    "\n",
+    "    # Epoch boundaries expressed in global steps (truncated to whole steps).\n",
+    "    step_boundaries = [int(steps_per_epoch * e) for e in boundary_epochs]\n",
+    "    # Learning rate used in each interval delimited by the boundaries.\n",
+    "    rate_values = [base_rate * factor for factor in decay_rates]\n",
+    "\n",
+    "    # a global step means running an optimization op on a batch\n",
+    "    def learning_rate_fn(global_step):\n",
+    "        step = tf.cast(global_step, tf.int32)\n",
+    "        return tf.train.piecewise_constant(step, step_boundaries, rate_values)\n",
+    "\n",
+    "    return learning_rate_fn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def resnet_model_fn(features, labels, mode, model_class,\n",
+    "                    resnet_size, weight_decay, learning_rate_fn, momentum,\n",
+    "                    data_format, resnet_version, loss_scale,\n",
+    "                    loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE):\n",
+    "    \"\"\"Build the EstimatorSpec shared by the ResNet model functions.\n",
+    "\n",
+    "    The model is instantiated from model_class and applied to the features.\n",
+    "    In PREDICT mode only the predictions (and export outputs) are returned.\n",
+    "    Otherwise the loss (softmax cross entropy plus L2 weight decay) and an\n",
+    "    accuracy metric are built; in TRAIN mode a momentum-optimizer train op is\n",
+    "    added as well.\n",
+    "\n",
+    "    Args:\n",
+    "    features: tensor representing input images\n",
+    "    labels: one-hot tensor of class labels for the input images\n",
+    "    mode: current estimator mode; one of the `tf.estimator.ModeKeys`\n",
+    "      values `TRAIN`, `EVAL`, `PREDICT`\n",
+    "    model_class: callable building the model; called as\n",
+    "      model_class(resnet_size, data_format, resnet_version=..., dtype=...)\n",
+    "      and the result is called as model(features, is_training).\n",
+    "    resnet_size: size of the ResNet model, forwarded to model_class.\n",
+    "    weight_decay: factor applied to the summed L2 loss of the variables.\n",
+    "    learning_rate_fn: function returning the learning rate for a global_step\n",
+    "    momentum: momentum passed to tf.train.MomentumOptimizer\n",
+    "    data_format: input data format, forwarded to model_class.\n",
+    "    resnet_version: ResNet version, forwarded to model_class.\n",
+    "    loss_scale: factor the loss is multiplied by before computing gradients;\n",
+    "      the gradients are divided by it again before being applied.\n",
+    "    loss_filter_fn: function that takes a variable name and returns True if\n",
+    "      the variable should be included in the L2 loss. If None, variables\n",
+    "      whose name contains 'batch_normalization' are excluded.\n",
+    "    dtype: the TensorFlow dtype the features are cast to.\n",
+    "    Returns:\n",
+    "    A tf.estimator.EstimatorSpec for the given mode.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Image summary of the raw inputs\n",
+    "    tf.summary.image('images', features, max_outputs=6)\n",
+    "\n",
+    "    features = tf.cast(features, dtype)\n",
+    "\n",
+    "    network = model_class(resnet_size, data_format, resnet_version=resnet_version, dtype=dtype)\n",
+    "\n",
+    "    is_training = mode == tf.estimator.ModeKeys.TRAIN\n",
+    "    logits = network(features, is_training)\n",
+    "\n",
+    "    # A no-op when the logits are already fp32; with a low precision dtype\n",
+    "    # the logits are cast to fp32 for numerical stability.\n",
+    "    logits = tf.cast(logits, tf.float32)\n",
+    "\n",
+    "    predicted_classes = tf.argmax(logits, axis=1)\n",
+    "    predicted_probabilities = tf.nn.softmax(logits, name='softmax_tensor')\n",
+    "    predictions = {\n",
+    "      'classes': predicted_classes,\n",
+    "      'probabilities': predicted_probabilities\n",
+    "    }\n",
+    "\n",
+    "    if mode == tf.estimator.ModeKeys.PREDICT:\n",
+    "        # Predictions plus the specification for serving a SavedModel\n",
+    "        serving_outputs = {\n",
+    "            'predict': tf.estimator.export.PredictOutput(predictions)\n",
+    "        }\n",
+    "        return tf.estimator.EstimatorSpec(\n",
+    "            mode=mode,\n",
+    "            predictions=predictions,\n",
+    "            export_outputs=serving_outputs)\n",
+    "\n",
+    "    # Loss = softmax cross entropy + L2 regularization.\n",
+    "    # cross entropy part\n",
+    "    cross_entropy = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels)\n",
+    "\n",
+    "    # Named tensor and summary for logging purposes.\n",
+    "    tf.identity(cross_entropy, name='cross_entropy')\n",
+    "    tf.summary.scalar('cross_entropy', cross_entropy)\n",
+    "\n",
+    "    # L2 regularization part\n",
+    "    def exclude_batch_norm(name):\n",
+    "        return 'batch_normalization' not in name\n",
+    "\n",
+    "    if not loss_filter_fn:\n",
+    "        loss_filter_fn = exclude_batch_norm\n",
+    "\n",
+    "    # One L2 term per trainable variable accepted by the filter.\n",
+    "    decay_terms = []\n",
+    "    for variable in tf.trainable_variables():\n",
+    "        if loss_filter_fn(variable.name):\n",
+    "            decay_terms.append(tf.nn.l2_loss(tf.cast(variable, tf.float32)))\n",
+    "    l2_loss = weight_decay * tf.add_n(decay_terms)\n",
+    "\n",
+    "    tf.summary.scalar('l2_loss', l2_loss)\n",
+    "    loss = cross_entropy + l2_loss\n",
+    "\n",
+    "    train_op = None\n",
+    "    if is_training:\n",
+    "        global_step = tf.train.get_or_create_global_step()\n",
+    "\n",
+    "        learning_rate = learning_rate_fn(global_step)\n",
+    "\n",
+    "        # Named tensor and summary for logging purposes\n",
+    "        tf.identity(learning_rate, name='learning_rate')\n",
+    "        tf.summary.scalar('learning_rate', learning_rate)\n",
+    "\n",
+    "        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum)\n",
+    "\n",
+    "        if loss_scale == 1:\n",
+    "            minimize_op = optimizer.minimize(loss, global_step)\n",
+    "        else:\n",
+    "            # With fp16 gradients, intermediate values are often so small that\n",
+    "            # they underflow to 0. Scaling the loss by loss_scale makes them\n",
+    "            # loss_scale times bigger.\n",
+    "            scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)\n",
+    "\n",
+    "            # Undo the scaling on the gradients before applying them.\n",
+    "            unscaled_grad_vars = [(grad / loss_scale, var) for grad, var in scaled_grad_vars]\n",
+    "            minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)\n",
+    "\n",
+    "        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n",
+    "        train_op = tf.group(minimize_op, update_ops)\n",
+    "\n",
+    "    if tf.contrib.distribute.has_distribution_strategy():\n",
+    "        # Metrics are currently not compatible with distribution strategies during\n",
+    "        # training. This does not affect the overall performance of the model.\n",
+    "        accuracy = (tf.no_op(), tf.constant(0))\n",
+    "    else:\n",
+    "        accuracy = tf.metrics.accuracy(tf.argmax(labels, axis=1), predictions['classes'])\n",
+    "\n",
+    "    metrics = {'accuracy': accuracy}\n",
+    "\n",
+    "    # Named tensor and summary for logging purposes\n",
+    "    tf.identity(accuracy[1], name='train_accuracy')\n",
+    "    tf.summary.scalar('train_accuracy', accuracy[1])\n",
+    "\n",
+    "    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss,\n",
+    "                                      train_op=train_op, eval_metric_ops=metrics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def resnet_main(flags_obj, model_function, input_function, dataset_name, shape=None):\n",
+    "    \"\"\"Run the shared train/evaluate loop for ResNet models.\n",
+    "\n",
+    "    Args:\n",
+    "    flags_obj: An object containing parsed flags. See define_resnet_flags()\n",
+    "    for details.\n",
+    "    model_function: model function handed directly to tf.estimator.Estimator.\n",
+    "    input_function: function called with is_training, data_dir, batch_size\n",
+    "    and num_epochs keyword arguments; its result is used as the estimator\n",
+    "    input for training and evaluation.\n",
+    "    dataset_name: name of the dataset, passed to the benchmark logger.\n",
+    "    shape: shape of the images, used to build the serving input receiver.\n",
+    "    Only used when flags_obj.export_dir is not None.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Using the Winograd non-fused algorithms provides a small performance boost.\n",
+    "    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'\n",
+    "\n",
+    "    # Session config built from the inter/intra op parallelism flags.\n",
+    "    # allow_soft_placement = True is required for multi-GPU and not harmful\n",
+    "    # for other modes.\n",
+    "    sess_config = tf.ConfigProto(\n",
+    "        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,\n",
+    "        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,\n",
+    "        allow_soft_placement=True)\n",
+    "\n",
+    "    # Pick the distribution strategy from the number of GPUs.\n",
+    "    if flags_core.get_num_gpus(flags_obj) == 0:\n",
+    "        strategy = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')\n",
+    "    elif flags_core.get_num_gpus(flags_obj) == 1:\n",
+    "        strategy = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')\n",
+    "    else:\n",
+    "        strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=flags_core.get_num_gpus(flags_obj))\n",
+    "\n",
+    "    run_config = tf.estimator.RunConfig(train_distribute=strategy, session_config=sess_config)\n",
+    "\n",
+    "    # Parameters forwarded to the model function by the estimator.\n",
+    "    model_dir = flags_obj.model_dir\n",
+    "    model_params = {\n",
+    "        'resnet_size': int(flags_obj.resnet_size),\n",
+    "        'data_format': flags_obj.data_format,\n",
+    "        'batch_size': flags_obj.batch_size,\n",
+    "        'resnet_version': int(flags_obj.resnet_version),\n",
+    "        'loss_scale': flags_core.get_loss_scale(flags_obj),\n",
+    "        'dtype': flags_core.get_tf_dtype(flags_obj)\n",
+    "    }\n",
+    "    classifier = tf.estimator.Estimator(model_fn=model_function, model_dir=model_dir,\n",
+    "                                        config=run_config, params=model_params)\n",
+    "\n",
+    "    # Run description recorded by the benchmark logger.\n",
+    "    run_params = {\n",
+    "        'batch_size': flags_obj.batch_size,\n",
+    "        'dtype': flags_core.get_tf_dtype(flags_obj),\n",
+    "        'resnet_size': flags_obj.resnet_size,\n",
+    "        'resnet_version': flags_obj.resnet_version,\n",
+    "        'synthetic_data': flags_obj.use_synthetic_data,\n",
+    "        'train_epochs': flags_obj.train_epochs,\n",
+    "    }\n",
+    "\n",
+    "    benchmark_logger = logger.config_benchmark_logger(flags_obj.benchmark_log_dir)\n",
+    "    benchmark_logger.log_run_info('resnet', dataset_name, run_params)\n",
+    "\n",
+    "    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, batch_size=flags_obj.batch_size,\n",
+    "                                               benchmark_log_dir=flags_obj.benchmark_log_dir)\n",
+    "\n",
+    "    def _build_input(is_training, num_epochs):\n",
+    "        # Shared by both input functions; only the mode and epoch count differ.\n",
+    "        device_batch = per_device_batch_size(flags_obj.batch_size,\n",
+    "                                             flags_core.get_num_gpus(flags_obj))\n",
+    "        return input_function(is_training=is_training, data_dir=flags_obj.data_dir,\n",
+    "                              batch_size=device_batch, num_epochs=num_epochs)\n",
+    "\n",
+    "    def input_fn_train():\n",
+    "        return _build_input(True, flags_obj.epochs_between_evals)\n",
+    "\n",
+    "    def input_fn_eval():\n",
+    "        return _build_input(False, 1)\n",
+    "\n",
+    "    total_training_cycle = flags_obj.train_epochs // flags_obj.epochs_between_evals\n",
+    "\n",
+    "    for cycle_index in range(total_training_cycle):\n",
+    "        tf.logging.info('Starting a training cycle: %d/%d', cycle_index, total_training_cycle)\n",
+    "\n",
+    "        classifier.train(input_fn=input_fn_train, hooks=train_hooks,\n",
+    "                         max_steps=flags_obj.max_train_steps)\n",
+    "\n",
+    "        tf.logging.info('Starting to evaluate.')\n",
+    "\n",
+    "        # flags_obj.max_train_steps is generally associated with testing and\n",
+    "        # profiling, frequently with synthetic data that iterates forever.\n",
+    "        # Passing steps=flags_obj.max_train_steps lets the eval terminate in\n",
+    "        # that case. Note that eval runs for max_train_steps each loop,\n",
+    "        # regardless of the global_step count.\n",
+    "        eval_results = classifier.evaluate(input_fn=input_fn_eval,\n",
+    "                                           steps=flags_obj.max_train_steps)\n",
+    "\n",
+    "        benchmark_logger.log_evaluation_result(eval_results)\n",
+    "\n",
+    "        # Stop early once the accuracy threshold has been passed.\n",
+    "        if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']):\n",
+    "            break\n",
+    "\n",
+    "    if flags_obj.export_dir is not None:\n",
+    "        # Export a saved model for the trained classifier.\n",
+    "        receiver_fn = export.build_tensor_serving_input_receiver_fn(shape, batch_size=flags_obj.batch_size)\n",
+    "        classifier.export_savedmodel(flags_obj.export_dir, receiver_fn)\n",
+    "\n",
+    "\n",
+    "def define_resnet_flags(resnet_size_choices=None):\n",
+    "    \"\"\"Define the command-line flags used by the ResNet run loop.\n",
+    "\n",
+    "    Args:\n",
+    "        resnet_size_choices: optional list of allowed values for the\n",
+    "            resnet_size flag. If None, resnet_size is a free-form string\n",
+    "            flag; otherwise it is an enum flag restricted to these values.\n",
+    "    \"\"\"\n",
+    "    flags_core.define_base()\n",
+    "    flags_core.define_performance(num_parallel_calls=False)\n",
+    "    flags_core.define_image()\n",
+    "    flags_core.define_benchmark()\n",
+    "    flags.adopt_module_key_flags(flags_core)\n",
+    "\n",
+    "    flags.DEFINE_enum(\n",
+    "        name='resnet_version', short_name='rv', default='2',\n",
+    "        enum_values=['1', '2'],\n",
+    "        help=flags_core.help_wrap(\n",
+    "            'Version of ResNet. (1 or 2) See README.md for details.'))\n",
+    "\n",
+    "    choice_kwargs = dict(\n",
+    "        name='resnet_size', short_name='rs', default='50',\n",
+    "        help=flags_core.help_wrap('The size of the ResNet model to use.'))\n",
+    "\n",
+    "    # The branch bodies must be indented under if/else; without that the\n",
+    "    # whole cell fails to compile with an IndentationError.\n",
+    "    if resnet_size_choices is None:\n",
+    "        flags.DEFINE_string(**choice_kwargs)\n",
+    "    else:\n",
+    "        flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/temp.py b/temp.py

Original file line number	Diff line number	Diff line change
`@@ -266,7 +266,7 @@`
`266`	`266`	`"name": "python",`
`267`	`267`	`"nbconvert_exporter": "python",`
`268`	`268`	`"pygments_lexer": "ipython3",`
`269`		`- "version": "3.6.4"`
	`269`	`+ "version": "3.6.5"`
`270`	`270`	`}`
`271`	`271`	`},`
`272`	`272`	`"nbformat": 4,`