# ----------------------------------------------------------------------------
# Copyright 2014 Nervana Systems Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------
# neon configuration files are written in YAML (a human readable data
# serialization language built around key/value pairs). Under the hood we use
# pyyaml, customized slightly to support extras such as arbitrary python
# objects as values.
#
# Basic parameters are written in the format:
# key: value
# Where value can be either:
# - a scalar string, boolean, or number ex: 'hello', False, 25.6
# - a list of values ex: [1, 2, 3]
# - a dictionary of key/value pairs ex: { a: 1, b: 2, c: 3 }
# - a python object instantiation ex: !obj:transforms.Logistic {}
# note that object constructor arguments are passed between the curly braces
# as key/value pairs
#
# Parameters can be split over multiple lines, and comments can be added by
# prepending a # character.
#
# A common source of error is forgetting the ',' between parameters belonging
# to the same object.
#
# To save on explicit repetition, you can reference parameters by assigning a
# variable alias, then dereference them where needed later. The syntax for
# doing so is:
# key: &var_ref value # creates a reference for key named var_ref
# new_key: *var_ref # dereferences var_ref to assign its val to new_key
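#
# For instance, later in this file batch_size: &bs 30 creates an alias named
# bs, after which *bs could be written anywhere the same value (30) is
# needed.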
# At the highest level, a neon YAML file consists of an experiment definition.
# Experiments are sub-classes of the python class
# neon.experiments.experiment.Experiment and carry out tasks like training a
# model, generating predictions, and so forth.
#
# Experiment types:
# -----------------
# FitExperiment - train a model to learn a set of parameters
# FitPredictErrorExperiment - as above, but also generate predictions and
# evaluate performance on those, potentially
# saving predictions to file.
# GradientChecker - Use finite differences to check the gradients of an
# experiment. Currently this can be carried out using
# the separately provided ``grad`` executable
# GenOutputExperiment - fit experiment where at the end we want to visualize
# the output compared to the input (useful for things
# like autoencoders or balance networks).
# WriteErrorToFile - fit experiment where at the end we save off the
# predictions to a file on disk
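#
# As a usage note: assuming the standard ``neon`` executable shipped with
# this release, a configuration file like this one is typically run by
# passing it on the command line, e.g.:
#   neon ANNOTATED_EXAMPLE.yaml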
!obj:experiments.FitPredictErrorExperiment {
# Fit*Experiment parameters:
# --------------------------
# dataset - The input data to train/test against. types:
# MNIST - handwritten digit image dataset
# CIFAR10 - 10 class object image dataset
# I1K - Imagenet 1000 class image dataset
# I1Knq - Imagenet 1000 class image dataset (non-queueing)
# SPARSENET - small natural image dataset (Olshausen and Field)
# Imageset - Generic macro batched image dataset support
# Iris - R.A. Fisher's 3 class flower dataset
# MOBYDICK - Content of the book Moby Dick
# UniformRandom - synthetic random data
# ToyImages - synthetic image classification dataset
# weight_init - controls initial value setup for the model weights. types:
# UniformValGen - Uniform distribution
# AutoUniformValGen - Uniform distribution where range is set
# automatically based on the shape being
# initialized
# SparseEigenValGen - Each weight has a configurable number of
# nonzero inputs and these weights are
# uniformly randomly initialized followed
# by a scaling based on the maximum
# eigenvalues found. See Sutskever2013 for
# details.
# GaussianValGen or NormalValGen - normal distribution
# NodeNormalizedValGen - initialization is as discussed in
# Glorot2010
# lrule - the learning rule used to update weight values. types:
# gradient_descent - vanilla gradient descent (supports weight decay)
# gradient_descent_pretrain - gradient descent that supports a
# separate pretrain learning rate
# gradient_descent_momentum - gradient descent that supports momentum
# based updates
# adadelta - Adadelta based updates. See Zeiler2012
# model - the type of machine learning algorithm to employ. types:
# MLP - feed-forward multi-layer perceptron neural network
# RBM - restricted Boltzmann machine
# RNN - Recurrent neural network
# Balance - Balance network
# Autoencoder - stacked autoencoder
# DBN - deep belief network
# logging - controls the amount and format of diagnostic information. Based
# on python's logging module.
#
# FitPredictErrorExperiment specific parameters:
# ----------------------------------------------
# predictions - which data types to compute and save predictions on.
# Should be a list of strings, where each string should be
# one of 'train', 'test', or 'validation'. Saved
# predictions will be written to disk in the same directory
# as the associated input dataset, and are written as python
# serialized object (*-inference.pkl) files.
# metrics - The performance measurements to compute for each specified data
# type. Should be a dict of dicts of Metric objects. Currently
# implemented metrics are defined below
# 'AUC' - Area under the ROC curve
# 'MisclassRate' - Proportion of instances where the output class
# predicted by the model does not match the
# expected value. Note that we can also consider
# matches as being those within the top x most
# probable by setting the error_rank parameter to
# x in the constructor. This rate produces values
# in the range [0, 1]
# 'MisclassPercentage' - Identical to MisclassRate but on the scale
# [0,100]
# 'MisclassSum' - Identical to MisclassRate but returns the integer
# number of misclassifications instead of the
# fraction.
# 'LogLossSum' - The negative log likelihood of the true labels
# given the predicted probabilities from the model.
# 'LogLossMean' - The negative log likelihood of the true labels
# given the predicted probabilities from the model.
# This value is normalized by the number of records
# examined.
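# For illustration (not enabled in this experiment), predictions for the
# training and test splits could be saved by adding:
#   predictions: ['train', 'test'],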
dataset: &ds !obj:datasets.Iris {
# Iris dataset parameters
# -----------------------
# repo_path - on-disk repository location where this dataset resides
# (specify repository root containing the Iris dir, and not
# the Iris dir directly)
# sample_pct - if 100, full dataset will be used, otherwise we uniformly
# downsample records to the specified percentage
repo_path: '~/data',
sample_pct: 100,
},
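# Any of the dataset types listed above can be swapped in here. As an
# illustrative sketch (assuming MNIST accepts the same two parameters),
# one could write:
#   dataset: !obj:datasets.MNIST {
#     repo_path: '~/data',
#     sample_pct: 100,
#   },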
metrics: {
# Metric objects
# --------------
# 'AUC' - Area under the ROC curve
# 'MisclassRate' - Proportion of instances where the output class predicted
# by the model does not match the expected value. Note
# that we can also consider matches as being those within
# the top x most probable by setting the error_rank
# parameter to x in the constructor. This rate produces
# values in the range [0, 1]
# 'MisclassPercentage' - Identical to MisclassRate but on the scale [0,100]
# 'MisclassSum' - Identical to MisclassRate but returns the integer number
# of misclassifications instead of the fraction.
# 'LogLossSum' - The negative log likelihood of the true labels given the
# predicted probabilities from the model.
# 'LogLossMean' - The negative log likelihood of the true labels given the
# predicted probabilities from the model. This value is
# normalized by the number of records examined.
train: [
!obj:metrics.AUC {},
!obj:metrics.MisclassPercentage {},
],
test: [
!obj:metrics.AUC {},
!obj:metrics.MisclassPercentage {},
],
},
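# A 'validation' entry could be added in the same fashion if the dataset
# provides a validation split, e.g. (illustrative, not used here):
#   validation: [
#     !obj:metrics.MisclassRate {},
#   ],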
weight_init: &wt_init !obj:params.UniformValGen {
# UniformValGen weight_init parameters
# ------------------------------------
# backend - backend to use to construct the values
# low - minimal uniform random value range endpoint
# high - maximal uniform random value range endpoint
# bias_init - initial values for biases (if used)
low: -1,
high: 1,
bias_init: 0.0,
},
lrule: &gdm {
# gradient_descent_momentum parameters
# ------------------------------------
# type - set this to gradient_descent_momentum
# lr_params - parameters associated with this type of learning rate. types:
# learning_rate - fraction of backpropagated gradient values to
# add to this layer's weights during each update
# (should be <= 1.0)
# backend - backend to use to store data constructed for these
# layer updates
# momentum_params - dictionary containing several parameters that
# can be used to adjust the learning rate over
# time based on the curvature of gradient
# directions. These parameters are:
# type - should be one of: constant, linear_monotone, nesterov
# initial_coef - initial momentum coefficient value. 0 disables
# momentum, but a positive value gives portion of
# previous iteration velocity to apply initially
# saturated_coef - final momentum coefficient value to apply
# once we reach the saturation epoch and
# beyond. Only applies to linear_monotone
# start_epoch - specify a number indicating at which epoch we
# begin applying momentum based updates
# saturate_epoch - specify a number (>= start_epoch) at which
# we will saturate momentum based updates.
# For linear_monotone momentum, we interpolate
# between initial and saturate momentum coef
# values until we reach the saturate_epoch.
type: gradient_descent_momentum,
lr_params: {
learning_rate: 0.5,
momentum_params: {
type: constant,
coef: 0,
},
},
},
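# The momentum schedule documented above need not be constant. An
# illustrative linear_monotone setup (values are hypothetical, built only
# from the parameters described above, and not used in this experiment):
#   lr_params: {
#     learning_rate: 0.1,
#     momentum_params: {
#       type: linear_monotone,
#       initial_coef: 0.5,
#       saturated_coef: 0.9,
#       start_epoch: 5,
#       saturate_epoch: 15,
#     },
#   },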
model: !obj:models.MLP {
# MLP model parameters
# --------------------
# batch_size - number of training instances in each mini-batch. Should
# be no larger than the size of the training dataset.
# num_epochs - the number of iterations through the entire training set
# to make in training the model
# serialized_path - the full location and name of the serialized .prm file
# used to cache this model. Speeds up subsequent use
# of this model for inference purposes.
# deserialize_path - the full location and name of a .prm file to
# deserialize and use to initialize this model.
# Defaults to serialized_path if deserialize_path is
# not given.
# layers - list of neural network layer objects. Models will typically
# have a single DataLayer, followed by 1 or more hidden layers,
# then a final CostLayer.
num_epochs: 25,
batch_size: &bs 30,
serialized_path: './iris-mlp-simple.prm',
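# To warm start from a previously saved model, deserialize_path could also
# be set (illustrative, not enabled here):
#   deserialize_path: './iris-mlp-simple.prm',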
layers: [
&datalayer !obj:layers.DataLayer {
# DataLayer parameters
# --------------------
# The data layer feeds data from the input dataset defined in the
# experiment.
# Parameters:
# name - string used to identify this layer in the logs.
# nout - number of connections output by this layer
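# For Iris, nout is 4: one output connection per input feature.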
name: d0,
nout: 4,
},
!obj:layers.FCLayer {
# FCLayer parameters
# ------------------
# Apart from Data and Cost layers, internal layers have parameters:
# name - string used to identify this layer in the logs.
# lrule_init - How updates get applied to the weights. See lrule
# defined at the Experiment level
# nout - number of connections output by this layer
# activation - (non-linear) transformation function to apply to the
# connections in this layer. types are:
# Logistic - sigmoidal squashing transform
# RectLin - ReLU (rectified linear unit) transform
# Tanh - hyperbolic tangent squashing transform
# SoftMax - generalized logistic squashing transform
# normalized over k dimensions
# weight_init - Controls how weights get initialized. See weight_init
# defined at the Experiment level
name: h0,
nout: 2,
lrule_init: *gdm,
weight_init: *wt_init,
activation: !obj:transforms.Logistic {},
},
&lastlayer !obj:layers.FCLayer {
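# Final layer: nout of 3 matches the three Iris flower classes.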
name: output,
nout: 3,
lrule_init: *gdm,
weight_init: *wt_init,
activation: !obj:transforms.Logistic {},
},
&costlayer !obj:layers.CostLayer {
# CostLayer parameters
# --------------------
# This layer defines the cost or objective function to be optimized.
# Parameters:
# name - string used to identify this layer in the logs.
# ref_layer - the input data layer reference
# cost - the actual transform to use for the cost. Types include:
# CrossEntropy - typically used to measure the difference
# between two probability distributions.
# The default is the binomial version.
# When the binomial version is paired with a
# Logistic activation in the previous layer,
# shortcut_deriv is set to True.
# SumSquaredDiffs - computes the sum of squared differences.
name: cost,
ref_layer: *datalayer,
cost: !obj:transforms.CrossEntropy {},
},
],
},
logging: {
# logging parameters:
# -------------------
# level - numeric threshold that controls the types of messages that get
# displayed. All logging types with values greater than or equal
# to this value will appear.
# types of logging are:
# 0 - NOTSET
# 10 - DEBUG
# 20 - INFO
# 30 - WARNING
# 40 - ERROR
# 50 - CRITICAL
# filename - write logging to the specified file (instead of stdout)
# format - string giving what fields to display in each log entry. See
# python's logging record attributes for full details.
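# As an illustration, DEBUG-level output could instead be captured in a
# file (hypothetical values, not used here):
#   level: 10,
#   filename: './neon-train.log',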
level: 20,
format: '%(asctime)-15s %(levelname)s:%(module)s - %(message)s'
},
}