From 0b5fb432ccac1da3fd470173f23a38f7efe11bed Mon Sep 17 00:00:00 2001
From: Andrew Hundt <ATHundt@gmail.com>
Date: Tue, 29 Aug 2017 18:12:14 -0400
Subject: [PATCH] cnn_finetune models added from
 https://github.com/flyyufelix/cnn_finetune

---
 keras_contrib/applications/densenet.py.orig | 1161 +++++++++++++++++++
 keras_contrib/applications/densenet121.py   |  236 ++++
 keras_contrib/applications/densenet161.py   |  236 ++++
 keras_contrib/applications/densenet169.py   |  236 ++++
 keras_contrib/applications/inception_v3.py  |  249 ++++
 keras_contrib/applications/inception_v4.py  |  298 +++++
 keras_contrib/applications/resnet_101.py    |  206 ++++
 keras_contrib/applications/resnet_152.py    |  206 ++++
 keras_contrib/applications/resnet_50.py     |  198 ++++
 9 files changed, 3026 insertions(+)
 create mode 100644 keras_contrib/applications/densenet.py.orig
 create mode 100644 keras_contrib/applications/densenet121.py
 create mode 100644 keras_contrib/applications/densenet161.py
 create mode 100644 keras_contrib/applications/densenet169.py
 create mode 100644 keras_contrib/applications/inception_v3.py
 create mode 100644 keras_contrib/applications/inception_v4.py
 create mode 100644 keras_contrib/applications/resnet_101.py
 create mode 100644 keras_contrib/applications/resnet_152.py
 create mode 100644 keras_contrib/applications/resnet_50.py

diff --git a/keras_contrib/applications/densenet.py.orig b/keras_contrib/applications/densenet.py.orig
new file mode 100644
index 000000000..4881a1caf
--- /dev/null
+++ b/keras_contrib/applications/densenet.py.orig
@@ -0,0 +1,1161 @@
+# -*- coding: utf-8 -*-
+"""DenseNet models for Keras.
+# Reference
+<<<<<<< HEAD
+
+- [Densely Connected Convolutional Networks]
+  (https://arxiv.org/pdf/1608.06993.pdf)
+- [The One Hundred Layers Tiramisu: Fully Convolutional
+  DenseNets for Semantic Segmentation](https://arxiv.org/pdf/1611.09326.pdf)
+"""
+=======
+- [Densely Connected Convolutional Networks](https://arxiv.org/pdf/1608.06993.pdf)
+- [The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for Semantic Segmentation](https://arxiv.org/pdf/1611.09326.pdf)
+"""
+>>>>>>> master
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import division
+
+import warnings
+
+from keras.models import Model
+from keras.layers.core import Dense, Dropout, Activation, Reshape
+from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D
+from keras.layers.pooling import AveragePooling2D
+from keras.layers.pooling import MaxPooling2D
+from keras.layers.pooling import GlobalAveragePooling2D
+from keras.layers import Input
+from keras.layers.merge import concatenate
+from keras.layers.normalization import BatchNormalization
+from keras.regularizers import l2
+from keras.utils.layer_utils import convert_all_kernels_in_model
+from keras.utils.data_utils import get_file
+from keras.engine.topology import get_source_inputs
+from keras.applications.imagenet_utils import _obtain_input_shape
+import keras.backend as K
+
+from keras_contrib.layers.convolutional import SubPixelUpscaling
+
+<<<<<<< HEAD
+TH_WEIGHTS_PATH = ('https://github.com/titu1994/DenseNet/releases/download'
+                   '/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering.h5')
+TF_WEIGHTS_PATH = ('https://github.com/titu1994/DenseNet/releases/download'
+                   '/v2.0/DenseNet-40-12-Tensorflow'
+                   '-Backend-TF-dim-ordering.h5')
+TH_WEIGHTS_PATH_NO_TOP = ('https://github.com/titu1994/DenseNet/releases'
+                          '/download/v2.0/DenseNet-40-12-Theano-Backend-TH'
+                          '-dim-ordering-no-top.h5')
+TF_WEIGHTS_PATH_NO_TOP = ('https://github.com/titu1994/DenseNet/releases/'
+                          'download/v2.0/DenseNet-40-12-Tensorflow-Backend-'
+                          'TF-dim-ordering-no-top.h5')
+
+
+def DenseNet(input_shape=None, depth=40, nb_dense_block=3,
+             growth_rate=12, nb_filter=16,
+             nb_layers_per_block=-1, bottleneck=False, reduction=0.0,
+             dropout_rate=0.0, weight_decay=1E-4, include_top=True,
+             top='classification',
+             weights='cifar10', input_tensor=None,
+             classes=10, transition_dilation_rate=1,
+             transition_pooling='avg',
+             transition_kernel_size=(1, 1),
+             activation='softmax'):
+    """Instantiate the DenseNet architecture,
+        optionally loading weights pre-trained
+        on CIFAR-10. Note that when using TensorFlow,
+        for best performance you should set
+        `image_dim_ordering='tf'` in your Keras config
+=======
+TH_WEIGHTS_PATH = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering.h5'
+TF_WEIGHTS_PATH = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering.h5'
+TH_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering-no-top.h5'
+TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering-no-top.h5'
+
+
+def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=16, nb_layers_per_block=-1,
+             bottleneck=False, reduction=0.0, dropout_rate=0.0, weight_decay=1E-4,
+             include_top=True, weights='cifar10', input_tensor=None,
+             classes=10, activation='softmax'):
+    """Instantiate the DenseNet architecture,
+        optionally loading weights pre-trained
+        on CIFAR-10. Note that when using TensorFlow,
+        for best performance you should set
+        `image_data_format='channels_last'` in your Keras config
+>>>>>>> master
+        at ~/.keras/keras.json.
+        The model and the weights are compatible with both
+        TensorFlow and Theano. The dimension ordering
+        convention used by the model is the one
+        specified in your Keras config file.
+<<<<<<< HEAD
+
+        For segmentation problems specify `transition_dilation_rate >= 2`,
+        `transition_pooling=None`, `weights=None`, `top='segmentation'`.
+        Good options also include `nb_dense_block=4`, `nb_layers_per_block=4`,
+        and `depth=None`, but this varies by application.
+
+        # Arguments
+
+            input_shape: optional shape tuple, only to be specified
+                if `include_top` is False (otherwise the input shape
+                has to be `(32, 32, 3)` (with `tf` dim ordering)
+                or `(3, 32, 32)` (with `th` dim ordering).
+                It should have exactly 3 input channels,
+                and width and height should be no smaller than 8.
+                E.g. `(200, 200, 3)` would be one valid value.
+            depth: Number of layers in the DenseNet. May be None if
+                nb_dense_block and nb_layers_per_block are set.
+            nb_dense_block: number of dense blocks to add to end
+                (generally = 3)
+=======
+        # Arguments
+            input_shape: optional shape tuple, only to be specified
+                if `include_top` is False (otherwise the input shape
+                has to be `(32, 32, 3)` (with `channels_last` dim ordering)
+                or `(3, 32, 32)` (with `channels_first` dim ordering).
+                It should have exactly 3 input channels,
+                and width and height should be no smaller than 8.
+                E.g. `(200, 200, 3)` would be one valid value.
+            depth: number of layers in the DenseNet
+            nb_dense_block: number of dense blocks to add to end (generally = 3)
+>>>>>>> master
+            growth_rate: number of filters to add per dense block
+            nb_filter: initial number of filters. -1 indicates initial
+                number of filters is 2 * growth_rate
+            nb_layers_per_block: number of layers in each dense block.
+                Can be a -1, positive integer or a list.
+                If -1, calculates nb_layer_per_block from the network depth.
+                If positive integer, a set number of layers per dense block.
+                If list, nb_layer is used as provided. Note that list size must
+                be (nb_dense_block + 1)
+            bottleneck: flag to add bottleneck blocks in between dense blocks
+            reduction: reduction factor of transition blocks.
+                Note : reduction value is inverted to compute compression.
+            dropout_rate: dropout rate
+            weight_decay: weight decay factor
+            include_top: whether to include the fully-connected
+                layer at the top of the network.
+            top: One of 'segmentation', 'classification', or None.
+                'classification' includes global average pooling and
+                a dense activation layer with a single output and multiple
+                classes. 'segmentation' includes a Conv2D and
+                a softmax activation. None is the same as `include_top=False`.
+            weights: one of `None` (random initialization) or
+                'cifar10' (pre-training on CIFAR-10).
+<<<<<<< HEAD
+            input_tensor: optional Keras tensor
+                (i.e. output of `layers.Input()`)
+=======
+            input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+>>>>>>> master
+                to use as image input for the model.
+            classes: optional number of classes to classify images
+                into, only to be specified if `include_top` is True, and
+                if no `weights` argument is specified.
+<<<<<<< HEAD
+            transition_dilation_rate: An integer or tuple/list of 2 integers,
+                specifying the dilation rate to use in transition blocks for
+                dilated convolution, increasing the receptive field of the
+                algorithm. Can be a single integer to specify the same value
+                for all spatial dimensions.
+            transition_pooling: Data pooling to reduce resolution in transition
+                blocks, one of 'avg', 'max', or None.
+            transition_kernel_size: Adjusts the filter size of the Conv2D in
+                each transition block, useful in segmentation for controlling
+                the receptive field, particularly when combined with
+                transition_dilation_rate.
+            activation: Type of activation at the top layer. Can be one of
+               'softmax' or 'sigmoid'. Note that if sigmoid is used,
+                classes must be 1.
+
+=======
+            activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
+                Note that if sigmoid is used, classes must be 1.
+>>>>>>> master
+        # Returns
+            A Keras model instance.
+        """
+
+    if weights not in {'cifar10', None}:
+        raise ValueError('The `weights` argument should be either '
+                         '`None` (random initialization) or `cifar10` '
+                         '(pre-training on CIFAR-10).')
+
+    if weights == 'cifar10' and include_top and classes != 10:
+        raise ValueError('If using `weights` as CIFAR 10 with `include_top`'
+                         ' as true, `classes` should be 10')
+
+    if activation not in ['softmax', 'sigmoid']:
+        raise ValueError('activation must be one of "softmax" or "sigmoid"')
+<<<<<<< HEAD
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError('sigmoid activation can ' +
+                         'only be used when classes = 1')
+=======
+
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError('sigmoid activation can only be used when classes = 1')
+
+>>>>>>> master
+    # Determine proper input shape
+    # If doing segmentation we still include
+    # top but _obtain_input_shape only
+    # supports labeling.
+    input_shape_include_top = (include_top and transition_dilation_rate == 1)
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=32,
+                                      min_size=8,
+                                      data_format=K.image_data_format(),
+<<<<<<< HEAD
+                                      include_top=input_shape_include_top)
+=======
+                                      include_top=include_top)
+>>>>>>> master
+
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    else:
+        if not K.is_keras_tensor(input_tensor):
+            img_input = Input(tensor=input_tensor, shape=input_shape)
+        else:
+            img_input = input_tensor
+
+<<<<<<< HEAD
+    x = __create_dense_net(classes, img_input, include_top, top, depth,
+                           nb_dense_block, growth_rate, nb_filter,
+                           nb_layers_per_block, bottleneck,
+                           reduction, dropout_rate, weight_decay,
+                           transition_dilation_rate,
+                           transition_pooling, transition_kernel_size,
+                           input_shape,
+                           activation)
+=======
+    x = __create_dense_net(classes, img_input, include_top, depth, nb_dense_block,
+                           growth_rate, nb_filter, nb_layers_per_block, bottleneck, reduction,
+                           dropout_rate, weight_decay, activation)
+>>>>>>> master
+
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+    # Create model.
+    model = Model(inputs, x, name='densenet')
+
+    # load weights
+    if weights == 'cifar10':
+        if (depth == 40) and (nb_dense_block == 3) and \
+           (growth_rate == 12) and (nb_filter == 16) and \
+           (bottleneck is False) and (reduction == 0.0) and \
+           (dropout_rate == 0.0) and (weight_decay == 1E-4):
+            # Default parameters match. Weights for this model exist:
+
+            if K.image_data_format() == 'channels_first':
+                if include_top:
+                    fname = 'densenet_40_12_th_dim_ordering_th_kernels.h5'
+                    weights_path = get_file(fname,
+                                            TH_WEIGHTS_PATH,
+                                            cache_subdir='models')
+                else:
+                    fname = 'densenet_40_12_th_dim_ordering' \
+                            '_th_kernels_no_top.h5'
+                    weights_path = get_file(fname,
+                                            TH_WEIGHTS_PATH_NO_TOP,
+                                            cache_subdir='models')
+
+                model.load_weights(weights_path)
+
+                if K.backend() == 'tensorflow':
+                    warnings.warn('You are using the TensorFlow backend, '
+                                  'yet you are using the Theano '
+                                  'image dimension ordering convention '
+                                  '(`image_data_format="channels_first"`). '
+                                  'For best performance, set '
+                                  '`image_data_format="channels_last"` in '
+                                  'your Keras config '
+                                  'at ~/.keras/keras.json.')
+                    convert_all_kernels_in_model(model)
+            else:
+                if include_top:
+
+                    weights_path = get_file('densenet_40_12_tf_dim_ordering'
+                                            '_tf_kernels.h5',
+                                            TF_WEIGHTS_PATH,
+                                            cache_subdir='models')
+                else:
+                    weights_path = get_file('densenet_40_12_tf_dim_ordering'
+                                            '_tf_kernels_no_top.h5',
+                                            TF_WEIGHTS_PATH_NO_TOP,
+                                            cache_subdir='models')
+
+                model.load_weights(weights_path)
+
+                if K.backend() == 'theano':
+                    convert_all_kernels_in_model(model)
+
+    return model
+
+
+<<<<<<< HEAD
+def DenseNetFCN(input_shape, nb_dense_block=5, growth_rate=16,
+                nb_layers_per_block=4, reduction=0.0, dropout_rate=0.0,
+                weight_decay=1E-4, init_conv_filters=48,
+                include_top=True, top='segmentation',
+                weights=None, input_tensor=None, classes=1,
+                activation='softmax',
+                upsampling_conv=128, upsampling_type='upsampling',
+                batchsize=None,
+                transition_dilation_rate=1,
+                transition_pooling='avg',
+                transition_kernel_size=(1, 1)):
+    """Instantiate the DenseNet FCN architecture.
+=======
+def DenseNetFCN(input_shape, nb_dense_block=5, growth_rate=16, nb_layers_per_block=4,
+                reduction=0.0, dropout_rate=0.0, weight_decay=1E-4, init_conv_filters=48,
+                include_top=True, weights=None, input_tensor=None, classes=1, activation='softmax',
+                upsampling_conv=128, upsampling_type='upsampling'):
+    """Instantiate the DenseNet FCN architecture.
+>>>>>>> master
+        Note that when using TensorFlow,
+        for best performance you should set
+        `image_data_format='channels_last'` in your Keras config
+        at ~/.keras/keras.json.
+        # Arguments
+
+            nb_dense_block: number of dense blocks to add to end
+                (generally = 5)
+            growth_rate: number of filters to add per dense block
+            nb_layers_per_block: number of layers in each dense block.
+                Can be a positive integer or a list.
+                If positive integer, a set number of layers per dense block.
+                If list, nb_layer is used as provided. Note that list size must
+                be (nb_dense_block + 1)
+            reduction: reduction factor of transition blocks with
+                0 <= reduction < 1.
+                Note : reduction value is inverted to compute compression.
+            dropout_rate: dropout rate
+            weight_decay: weight decay factor
+            init_conv_filters: number of layers in the initial
+                convolution layer
+            include_top: whether to include the fully-connected
+                layer at the top of the network.
+            weights: one of `None` (random initialization) or
+<<<<<<< HEAD
+                "cifar10" (pre-training on CIFAR-10).
+            input_tensor: optional Keras tensor
+                (i.e. output of `layers.Input()`)
+=======
+                'cifar10' (pre-training on CIFAR-10).
+            input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+>>>>>>> master
+                to use as image input for the model.
+            input_shape: optional shape tuple, only to be specified
+                if `include_top` is False (otherwise the input shape
+                has to be `(32, 32, 3)` (with `channels_last` dim ordering)
+                or `(3, 32, 32)` (with `channels_first` dim ordering).
+                It should have exactly 3 input channels,
+                and width and height should be no smaller than 8.
+                E.g. `(200, 200, 3)` would be one valid value.
+            classes: optional number of classes to classify images
+                into, only to be specified if `include_top` is True, and
+                if no `weights` argument is specified.
+<<<<<<< HEAD
+            activation: Type of activation at the top layer. Can be one of
+                'softmax' or 'sigmoid'. Note that if sigmoid is used,
+                classes must be 1.
+            upsampling_conv: number of convolutional layers in
+                upsampling via subpixel convolution
+            upsampling_type: Can be one of 'upsampling', 'deconv', and
+=======
+            activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
+                Note that if sigmoid is used, classes must be 1.
+            upsampling_conv: number of convolutional layers in upsampling via subpixel convolution
+            upsampling_type: Can be one of 'upsampling', 'deconv' and
+>>>>>>> master
+                'subpixel'. Defines type of upsampling algorithm used.
+            batchsize: Fixed batch size. This is a temporary
+                requirement for computation of output shape in the
+                case of Deconvolution2D layers. Parameter will be removed
+                in next iteration of Keras, which infers
+                output shape of deconvolution layers automatically.
+<<<<<<< HEAD
+            transition_dilation_rate: An integer or tuple/list of 2 integers,
+                specifying the dilation rate to use in transition blocks for
+                dilated convolution, increasing the receptive field of the
+                algorithm. Can be a single integer to specify the same value
+                for all spatial dimensions.
+            transition_pooling: Data pooling to reduce resolution in transition
+                blocks, one of "avg", "max", or None.
+            transition_kernel_size: Adjusts the filter size of the Conv2D in
+                each transition block, useful in segmentation for controlling
+                the receptive field, particularly when combined with
+                transition_dilation_rate.
+
+=======
+>>>>>>> master
+        # Returns
+
+            A Keras model instance.
+    """
+
+    if weights not in {None}:
+        raise ValueError('The `weights` argument should be '
+                         '`None` (random initialization) as no '
+                         'model weights are provided.')
+
+    upsampling_type = upsampling_type.lower()
+
+    if upsampling_type not in ['upsampling', 'deconv', 'subpixel']:
+<<<<<<< HEAD
+        raise ValueError('Parameter "upsampling_type" must be one of '
+                         '"upsampling", "deconv", or "subpixel".')
+
+    if upsampling_type == 'deconv' and batchsize is None:
+        raise ValueError('If "upsampling_type" is deconvolution, then a '
+                         'fixed batch size must be provided in '
+                         'batchsize parameter.')
+=======
+        raise ValueError('Parameter "upsampling_type" must be one of "upsampling", '
+                         '"deconv" or "subpixel".')
+>>>>>>> master
+
+    if input_shape is None:
+        raise ValueError(
+            'For fully convolutional models, input shape must be supplied.')
+
+    if type(nb_layers_per_block) is not list and nb_dense_block < 1:
+        raise ValueError('Number of dense layers per block must be greater '
+                         'than 1. Argument value was %d.' %
+                         (nb_layers_per_block))
+
+    if activation not in ['softmax', 'sigmoid']:
+        raise ValueError('activation must be one of "softmax" or "sigmoid"')
+
+<<<<<<< HEAD
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError(
+            'sigmoid activation can only be used when classes = 1')
+
+    # Determine proper input shape
+    # If doing segmentation we still include top
+    # but _obtain_input_shape only supports
+    # labeling, not segmentation networks.
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=32,
+                                      min_size=16,
+                                      data_format=K.image_data_format(),
+                                      include_top=False)
+=======
+    if activation not in ['softmax', 'sigmoid']:
+        raise ValueError('activation must be one of "softmax" or "sigmoid"')
+
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError('sigmoid activation can only be used when classes = 1')
+
+    # Determine proper input shape
+    min_size = 2 ** nb_dense_block
+
+    if K.image_data_format() == 'channels_first':
+        if input_shape is not None:
+            if ((input_shape[1] is not None and input_shape[1] < min_size) or
+                    (input_shape[2] is not None and input_shape[2] < min_size)):
+                raise ValueError('Input size must be at least ' +
+                                 str(min_size) + 'x' + str(min_size) + ', got '
+                                                                       '`input_shape=' + str(input_shape) + '`')
+        else:
+            input_shape = (classes, None, None)
+    else:
+        if input_shape is not None:
+            if ((input_shape[0] is not None and input_shape[0] < min_size) or
+                    (input_shape[1] is not None and input_shape[1] < min_size)):
+                raise ValueError('Input size must be at least ' +
+                                 str(min_size) + 'x' + str(min_size) + ', got '
+                                                                       '`input_shape=' + str(input_shape) + '`')
+        else:
+            input_shape = (None, None, classes)
+>>>>>>> master
+
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    else:
+        if not K.is_keras_tensor(input_tensor):
+            img_input = Input(tensor=input_tensor, shape=input_shape)
+        else:
+            img_input = input_tensor
+
+<<<<<<< HEAD
+    x = __create_fcn_dense_net(classes, img_input, include_top, top,
+                               nb_dense_block,
+                               growth_rate, reduction, dropout_rate,
+                               weight_decay, nb_layers_per_block,
+                               upsampling_conv, upsampling_type,
+                               batchsize, init_conv_filters,
+                               transition_dilation_rate,
+                               transition_pooling, transition_kernel_size,
+                               activation, input_shape)
+=======
+    x = __create_fcn_dense_net(classes, img_input, include_top, nb_dense_block,
+                               growth_rate, reduction, dropout_rate, weight_decay,
+                               nb_layers_per_block, upsampling_conv, upsampling_type,
+                               init_conv_filters, input_shape, activation)
+>>>>>>> master
+
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+    # Create model.
+    model = Model(inputs, x, name='fcn-densenet')
+
+    return model
+
+
+def __conv_block(ip, nb_filter, bottleneck=False,
+                 dropout_rate=None, weight_decay=1E-4):
+    """ Apply BatchNorm, Relu, 3x3 Conv2D, optional bottleneck block and dropout
+    Args:
+        ip: Input keras tensor
+        nb_filter: number of filters of the final 3x3 Conv2D
+        bottleneck: if True, prepend a 1x1 Conv2D with 4 * nb_filter filters
+        dropout_rate: dropout rate; None or 0 disables the Dropout layers
+        weight_decay: l2 factor for Conv2D kernels and BatchNorm gamma/beta
+<<<<<<< HEAD
+
+    Returns: keras tensor with batch_norm, relu and convolution2d added
+             (optional bottleneck)
+=======
+    Returns: keras tensor with batch_norm, relu and convolution2d added (optional bottleneck)
+>>>>>>> master
+    """
+
+    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+    x = BatchNormalization(axis=concat_axis,
+                           gamma_regularizer=l2(weight_decay),
+                           beta_regularizer=l2(weight_decay))(ip)
+    x = Activation('relu')(x)
+
+    if bottleneck:
+        # Obtained from
+        # https://github.com/liuzhuang13/DenseNet/blob/master/densenet.lua
+        inter_channel = nb_filter * 4
+
+<<<<<<< HEAD
+        x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_uniform',
+                   padding='same', use_bias=False,
+=======
+        x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False,
+>>>>>>> master
+                   kernel_regularizer=l2(weight_decay))(x)
+
+        if dropout_rate:
+            x = Dropout(dropout_rate)(x)
+
+<<<<<<< HEAD
+        x = BatchNormalization(axis=concat_axis,
+                               gamma_regularizer=l2(weight_decay),
+                               beta_regularizer=l2(weight_decay))(x)
+        x = Activation('relu')(x)
+
+    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform',
+               padding='same', use_bias=False,
+=======
+        x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
+                               beta_regularizer=l2(weight_decay))(x)
+        x = Activation('relu')(x)
+
+    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', use_bias=False,
+>>>>>>> master
+               kernel_regularizer=l2(weight_decay))(x)
+    if dropout_rate:
+        x = Dropout(dropout_rate)(x)
+
+    return x
+
+
+<<<<<<< HEAD
+def __transition_block(ip, nb_filter, compression=1.0, dropout_rate=None,
+                       weight_decay=1E-4, dilation_rate=1, pooling='avg',
+                       kernel_size=(1, 1)):
+    """ Apply BatchNorm, Relu, Conv2D with compression, dropout and optional pooling
+
+=======
+def __transition_block(ip, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4):
+    """ Apply BatchNorm, Relu 1x1, Conv2D, optional compression, dropout and Maxpooling2D
+>>>>>>> master
+    Args:
+        ip: keras tensor
+        nb_filter: number of filters
+        compression: calculated as 1 - reduction. Reduces the number of
+            feature maps in the transition block, is optional.
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor
+<<<<<<< HEAD
+        dilation_rate: an integer or tuple/list of 2 integers, specifying the
+          dilation rate of the dilated (atrous) Conv2D. Can be a single
+          integer to specify the same value for all spatial dimensions.
+        kernel_size: kernel size of the Conv2D.
+        pooling: Data pooling to reduce resolution,
+            one of "avg", "max", or None.
+
+    Returns:
+
+        keras tensor, after applying batch_norm, relu-conv, dropout, pooling
+=======
+    Returns: keras tensor, after applying batch_norm, relu-conv, dropout, maxpool
+>>>>>>> master
+    """
+
+    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+    x = BatchNormalization(axis=concat_axis,
+                           gamma_regularizer=l2(weight_decay),
+                           beta_regularizer=l2(weight_decay))(ip)
+    x = Activation('relu')(x)
+<<<<<<< HEAD
+    x = Conv2D(int(nb_filter * compression), kernel_size,
+               kernel_initializer='he_uniform', padding='same', use_bias=False,
+               kernel_regularizer=l2(weight_decay),
+               dilation_rate=dilation_rate)(x)
+=======
+    x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False,
+               kernel_regularizer=l2(weight_decay))(x)
+>>>>>>> master
+    if dropout_rate:
+        x = Dropout(dropout_rate)(x)
+
+    if pooling == 'avg':
+        x = AveragePooling2D((2, 2), strides=(2, 2))(x)
+    elif pooling == 'max':
+        x = MaxPooling2D((2, 2), strides=(2, 2))(x)
+
+    return x
+
+
+def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False,
+                  dropout_rate=None, weight_decay=1E-4,
+                  grow_nb_filters=True, return_concat_list=False):
+<<<<<<< HEAD
+    """ Build a dense_block where each conv_block is fed to subsequent ones
+
+=======
+    """ Build a dense_block where the output of each conv_block is fed to subsequent ones
+>>>>>>> master
+    Args:
+        x: keras tensor
+        nb_layers: the number of layers of conv_block to append to the model.
+        nb_filter: running filter count, returned after optional growth
+        growth_rate: number of filters of each conv_block
+        bottleneck: forwarded to each conv_block
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor
+        grow_nb_filters: if True, add growth_rate to nb_filter for every layer
+<<<<<<< HEAD
+        return_concat_list: return the list of feature maps along with the
+            actual output
+
+=======
+        return_concat_list: return the list of feature maps along with the actual output
+>>>>>>> master
+    Returns: (x, nb_filter), or (x, nb_filter, x_list) if return_concat_list
+    """
+
+    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+    x_list = [x]
+
+    for i in range(nb_layers):
+        x = __conv_block(x, growth_rate, bottleneck,
+                         dropout_rate, weight_decay)
+        x_list.append(x)
+
+<<<<<<< HEAD
+        x = concatenate(x_list, concat_axis)
+=======
+        x = concatenate(x_list, axis=concat_axis)
+>>>>>>> master
+
+        if grow_nb_filters:
+            nb_filter += growth_rate
+
+    return_concat_list and None
+    if return_concat_list:
+        return x, nb_filter, x_list
+    else:
+        return x, nb_filter
+
+
+<<<<<<< HEAD
+def __transition_up_block(ip, nb_filters, type='upsampling',
+                          output_shape=None, weight_decay=1E-4):
+=======
+def __transition_up_block(ip, nb_filters, type='upsampling', weight_decay=1E-4):
+>>>>>>> master
+    """ Upscaling (factor = 2) by upsampling, subpixel convolution or deconvolution
+    Args:
+        ip: keras tensor
+        nb_filters: number of filters
+<<<<<<< HEAD
+        type: can be 'upsampling', 'subpixel', or 'deconv'. Determines type of
+            upsampling performed
+        output_shape: required if type = 'deconv'. Output shape of tensor
+=======
+        type: can be 'upsampling', 'subpixel', 'deconv'. Determines type of upsampling performed
+>>>>>>> master
+        weight_decay: weight decay factor
+    Returns: keras tensor, after applying upsampling operation.
+    """
+
+    if type == 'upsampling':
+        x = UpSampling2D()(ip)
+    elif type == 'subpixel':
+<<<<<<< HEAD
+        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same',
+                   kernel_regularizer=l2(weight_decay),
+                   use_bias=False, kernel_initializer='he_uniform')(ip)
+        x = SubPixelUpscaling(scale_factor=2)(x)
+        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same',
+                   kernel_regularizer=l2(weight_decay),
+                   use_bias=False, kernel_initializer='he_uniform')(x)
+    else:
+        x = Conv2DTranspose(nb_filters, (3, 3), output_shape,
+                            activation='relu', padding='same',
+                            subsample=(2, 2),
+=======
+        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay),
+                   use_bias=False, kernel_initializer='he_uniform')(ip)
+        x = SubPixelUpscaling(scale_factor=2)(x)
+        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay),
+                   use_bias=False, kernel_initializer='he_uniform')(x)
+    else:
+        x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same', strides=(2, 2),
+>>>>>>> master
+                            kernel_initializer='he_uniform')(ip)
+
+    return x
+
+
+<<<<<<< HEAD
+def __create_dense_net(nb_classes, img_input, include_top=True,
+                       top='classification', depth=40,
+                       nb_dense_block=3, growth_rate=12, nb_filter=-1,
+                       nb_layers_per_block=-1, bottleneck=False, reduction=0.0,
+                       dropout_rate=None, weight_decay=1E-4,
+                       transition_dilation_rate=1, transition_pooling="avg",
+                       transition_kernel_size=(1, 1), input_shape=None,
+                       activation='softmax'):
+    """Build the DenseNet model
+
+=======
+def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=-1,
+                       nb_layers_per_block=-1, bottleneck=False, reduction=0.0, dropout_rate=None, weight_decay=1E-4,
+                       activation='softmax'):
+    """ Build the DenseNet model
+>>>>>>> master
+    Args:
+        nb_classes: number of classes
+        img_input: tuple of shape (channels, rows, columns) or
+            (rows, columns, channels)
+        include_top: flag to include the final Dense layer
+        depth: number of layers
+        nb_dense_block: number of dense blocks to add to end (generally = 3)
+        growth_rate: number of filters to add per dense block
+        nb_filter: initial number of filters. Default -1 indicates
+            initial number of filters is 2 * growth_rate.
+        nb_layers_per_block: number of layers in each dense block.
+            Can be a -1, positive integer or a list.
+            If -1, calculates nb_layer_per_block from the depth of the network.
+            If positive integer, a set number of layers per dense block.
+            If list, nb_layer is used as provided. Note that list size must
+            be (nb_dense_block + 1)
+        bottleneck: add bottleneck blocks
+        reduction: reduction factor of transition blocks.
+            Note : reduction value is inverted to compute compression
+        dropout_rate: dropout rate
+        weight_decay: weight decay
+<<<<<<< HEAD
+        transition_dilation_rate: An integer or tuple/list of 2 integers,
+            specifying the dilation rate to use in transition blocks for
+            dilated convolution, increasing the receptive field of the
+            algorithm. Can be a single integer to specify the same value
+            for all spatial dimensions.
+        transition_pooling: Data pooling to reduce resolution in transition
+            blocks, one of "avg", "max", or None.
+        transition_kernel_size: Adjusts the filter size of the Conv2D in
+            each transition block, useful in segmentation for controlling
+            the receptive field, particularly when combined with
+            transition_dilation_rate.
+        input_shape: Only used for shape inference in fully
+            convolutional networks.
+        activation: Type of activation at the top layer. Can be one of
+            'softmax' or 'sigmoid'. Note that if sigmoid is used,
+            classes must be 1.
+
+=======
+        activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
+                Note that if sigmoid is used, classes must be 1.
+>>>>>>> master
+    Returns: keras tensor with nb_layers of conv_block appended
+    """
+
+    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+<<<<<<< HEAD
+    if depth is not None:
+        assert (depth - 4) % 3 == 0, 'Depth must be nb_dense_block * N + 4'
+    else:
+        assert nb_layers_per_block is not - \
+            1, 'Depth cannot be None when nb_layers_per_block is -1.' \
+            ' Specify either parameter.'
+    if reduction != 0.0:
+        assert reduction <= 1.0 and reduction > 0.0, \
+            "reduction value must lie between 0.0 and 1.0"
+=======
+    assert (depth - 4) % 3 == 0, 'Depth must be 3 N + 4'
+    if reduction != 0.0:
+        assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0'
+>>>>>>> master
+
+    # layers in each dense block
+    if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
+        nb_layers = list(nb_layers_per_block)  # Convert tuple to list
+
+<<<<<<< HEAD
+        assert len(nb_layers) == (nb_dense_block + 1), \
+            "If list, nb_layer is used as provided. " \
+            "Note that list size must be (nb_dense_block + 1)"
+=======
+        assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. ' \
+                                                       'Note that list size must be (nb_dense_block + 1)'
+>>>>>>> master
+        final_nb_layer = nb_layers[-1]
+        nb_layers = nb_layers[:-1]
+    else:
+        if nb_layers_per_block == -1:
+            count = int((depth - 4) / 3)
+            nb_layers = [count for _ in range(nb_dense_block)]
+            final_nb_layer = count
+        else:
+            final_nb_layer = nb_layers_per_block
+            nb_layers = [nb_layers_per_block] * nb_dense_block
+
+    if bottleneck:
+        nb_layers = [int(layer // 2) for layer in nb_layers]
+
+    # compute initial nb_filter if -1, else accept users initial nb_filter
+    if nb_filter <= 0:
+        nb_filter = 2 * growth_rate
+
+    # compute compression factor
+    compression = 1.0 - reduction
+
+    # Initial convolution
+<<<<<<< HEAD
+    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform',
+               padding='same', name='initial_conv2D', use_bias=False,
+               kernel_regularizer=l2(weight_decay))(img_input)
+=======
+    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D',
+               use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+>>>>>>> master
+
+    # Add dense blocks
+    for block_idx in range(nb_dense_block - 1):
+        x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter,
+                                     growth_rate, bottleneck=bottleneck,
+                                     dropout_rate=dropout_rate,
+                                     weight_decay=weight_decay)
+        # add transition_block
+        x = __transition_block(x, nb_filter, compression=compression,
+                               dropout_rate=dropout_rate,
+                               weight_decay=weight_decay,
+                               dilation_rate=transition_dilation_rate,
+                               pooling=transition_pooling,
+                               kernel_size=transition_kernel_size)
+        nb_filter = int(nb_filter * compression)
+
+    # The last dense_block does not have a transition_block
+    x, nb_filter = __dense_block(x, final_nb_layer, nb_filter,
+                                 growth_rate, bottleneck=bottleneck,
+                                 dropout_rate=dropout_rate,
+                                 weight_decay=weight_decay)
+
+    x = BatchNormalization(axis=concat_axis,
+                           gamma_regularizer=l2(weight_decay),
+                           beta_regularizer=l2(weight_decay))(x)
+    x = Activation('relu')(x)
+
+<<<<<<< HEAD
+    if include_top and top is 'classification':
+        x = GlobalAveragePooling2D()(x)
+        x = Dense(nb_classes, activation=activation, kernel_regularizer=l2(
+            weight_decay), bias_regularizer=l2(weight_decay))(x)
+    elif include_top and top is 'segmentation':
+        x = Conv2D(nb_classes, (1, 1), activation='linear',
+                   padding='same', kernel_regularizer=l2(weight_decay),
+                   use_bias=False)(x)
+
+        if K.image_data_format() == 'channels_first':
+            channel, row, col = input_shape
+        else:
+            row, col, channel = input_shape
+
+        x = Reshape((row * col, nb_classes))(x)
+        x = Activation(activation)(x)
+        x = Reshape((row, col, nb_classes))(x)
+=======
+    if include_top:
+        x = Dense(nb_classes, activation=activation, W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay))(x)
+>>>>>>> master
+
+    return x
+
+
+def __create_fcn_dense_net(nb_classes, img_input, include_top,
+                           top='segmentation',
+                           nb_dense_block=5, growth_rate=12,
+                           reduction=0.0, dropout_rate=None, weight_decay=1E-4,
+<<<<<<< HEAD
+                           nb_layers_per_block=4, nb_upsampling_conv=128,
+                           upsampling_type='upsampling',
+                           batchsize=None, init_conv_filters=48,
+                           transition_dilation_rate=1,
+                           transition_pooling='avg',
+                           transition_kernel_size=(1, 1),
+                           activation='softmax',
+                           input_shape=None):
+=======
+                           nb_layers_per_block=4, nb_upsampling_conv=128, upsampling_type='upsampling',
+                           init_conv_filters=48, input_shape=None, activation='softmax'):
+>>>>>>> master
+    """ Build the DenseNet model
+    Args:
+        nb_classes: number of classes
+        img_input: tuple of shape (channels, rows, columns) or
+            (rows, columns, channels)
+        include_top: flag to include the final Dense layer
+        nb_dense_block: number of dense blocks to add to end (generally = 3)
+        growth_rate: number of filters to add per dense block
+        reduction: reduction factor of transition blocks.
+            Note: reduction value is inverted to compute compression
+        dropout_rate: dropout rate
+        weight_decay: weight decay
+        nb_layers_per_block: number of layers in each dense block.
+            Can be a positive integer or a list.
+            If positive integer, a set number of layers per dense block.
+            If list, nb_layer is used as provided. Note that list size must
+            be (nb_dense_block + 1)
+<<<<<<< HEAD
+        nb_upsampling_conv: number of convolutional layers in
+            upsampling via subpixel convolution
+        upsampling_type: Can be one of 'upsampling', 'deconv', and
+            'subpixel'. Defines type of upsampling algorithm used.
+        batchsize: Fixed batch size. This is a temporary requirement for
+            computation of output shape in the case of Deconvolution2D layers.
+            Parameter will be removed in next iteration of Keras, which infers
+            output shape of deconvolution layers automatically.
+        input_shape: Only used for shape inference in fully
+            convolutional networks.
+        transition_dilation_rate: An integer or tuple/list of 2 integers,
+            specifying the dilation rate to use in transition blocks for
+            dilated convolution, increasing the receptive field of the
+            algorithm. Can be a single integer to specify the same value
+            for all spatial dimensions.
+        transition_pooling: Data pooling to reduce resolution in transition
+            blocks, one of "avg", "max", or None.
+        transition_kernel_size: Adjusts the filter size of the Conv2D in
+            each transition block, useful in segmentation for controlling
+            the receptive field, particularly when combined with
+            transition_dilation_rate.
+
+=======
+        nb_upsampling_conv: number of convolutional layers in upsampling via subpixel convolution
+        upsampling_type: Can be one of 'upsampling', 'deconv' and 'subpixel'. Defines
+            type of upsampling algorithm used.
+        input_shape: Only used for shape inference in fully convolutional networks.
+        activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
+                    Note that if sigmoid is used, classes must be 1.
+>>>>>>> master
+    Returns: keras tensor with nb_layers of conv_block appended
+    """
+
+    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+    if concat_axis == 1:  # channels_first dim ordering
+        _, rows, cols = input_shape
+    else:
+        rows, cols, _ = input_shape
+
+    if reduction != 0.0:
+<<<<<<< HEAD
+        assert reduction <= 1.0 and reduction > 0.0, \
+            'reduction value must lie between 0.0 and 1.0'
+
+    # check if upsampling_conv has minimum number of filters
+    # minimum is set to 12, as at least 3 color channels are needed for
+    # correct upsampling
+    assert nb_upsampling_conv >= 12 and nb_upsampling_conv % 4 == 0, \
+        'Parameter `upsampling_conv` number of channels must ' \
+        'be a positive number divisible by 4 and greater ' \
+        'than 12'
+=======
+        assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0'
+
+    # check if upsampling_conv has minimum number of filters
+    # minimum is set to 12, as at least 3 color channels are needed for correct upsampling
+    assert nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0, 'Parameter `upsampling_conv` number of channels must ' \
+                                                                    'be a positive number divisible by 4 and greater ' \
+                                                                    'than 12'
+>>>>>>> master
+
+    # layers in each dense block
+    if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
+        nb_layers = list(nb_layers_per_block)  # Convert tuple to list
+
+<<<<<<< HEAD
+        assert len(nb_layers) == (nb_dense_block + 1), \
+            'If list, nb_layer is used as provided. ' \
+            'Note that list size must be (nb_dense_block + 1)'
+=======
+        assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. ' \
+                                                       'Note that list size must be (nb_dense_block + 1)'
+>>>>>>> master
+
+        bottleneck_nb_layers = nb_layers[-1]
+        rev_layers = nb_layers[::-1]
+        nb_layers.extend(rev_layers[1:])
+    else:
+        bottleneck_nb_layers = nb_layers_per_block
+        nb_layers = [nb_layers_per_block] * (2 * nb_dense_block + 1)
+
+    # compute compression factor
+    compression = 1.0 - reduction
+
+    # Initial convolution
+<<<<<<< HEAD
+    x = Conv2D(init_conv_filters, (3, 3), kernel_initializer="he_uniform",
+               padding="same", name="initial_conv2D", use_bias=False,
+               kernel_regularizer=l2(weight_decay))(img_input)
+=======
+    x = Conv2D(init_conv_filters, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D',
+               use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+>>>>>>> master
+
+    nb_filter = init_conv_filters
+
+    skip_list = []
+
+    # Add dense blocks and transition down block
+    for block_idx in range(nb_dense_block):
+        x, nb_filter = __dense_block(x, nb_layers[block_idx],
+                                     nb_filter, growth_rate,
+                                     dropout_rate=dropout_rate,
+                                     weight_decay=weight_decay)
+
+        # Skip connection
+        skip_list.append(x)
+
+        # add transition_block
+        x = __transition_block(x, nb_filter, compression=compression,
+                               dropout_rate=dropout_rate,
+                               weight_decay=weight_decay)
+
+        # this is calculated inside transition_down_block
+        nb_filter = int(nb_filter * compression)
+
+    # The last dense_block does not have a transition_down_block
+    # return the concatenated feature maps without the concatenation of the
+    # input
+    _, nb_filter, concat_list = __dense_block(x, bottleneck_nb_layers,
+                                              nb_filter,
+                                              growth_rate,
+                                              dropout_rate=dropout_rate,
+                                              weight_decay=weight_decay,
+                                              return_concat_list=True)
+
+    skip_list = skip_list[::-1]  # reverse the skip list
+
+<<<<<<< HEAD
+    if K.image_data_format() == 'channels_first':
+        out_shape = [batchsize, nb_filter, rows // 16, cols // 16]
+    else:
+        out_shape = [batchsize, rows // 16, cols // 16, nb_filter]
+
+=======
+>>>>>>> master
+    # Add dense blocks and transition up block
+    for block_idx in range(nb_dense_block):
+        n_filters_keep = growth_rate * nb_layers[nb_dense_block + block_idx]
+
+<<<<<<< HEAD
+        if K.image_data_format() == 'channels_first':
+            out_shape[1] = n_filters_keep
+        else:
+            out_shape[3] = n_filters_keep
+
+        # upsampling block must upsample only the
+        # feature maps (concat_list[1:]),
+        # not the concatenation of the input with the
+        # feature maps (concat_list[0]).
+        l = concatenate(concat_list[1:], axis=concat_axis)
+
+        t = __transition_up_block(l, nb_filters=n_filters_keep,
+                                  type=upsampling_type,
+                                  output_shape=out_shape)
+=======
+        # upsampling block must upsample only the feature maps (concat_list[1:]),
+        # not the concatenation of the input with the feature maps (concat_list[0]).
+        l = concatenate(concat_list[1:], axis=concat_axis)
+
+        t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type)
+>>>>>>> master
+
+        # concatenate the skip connection with the transition block
+        x = concatenate([t, skip_list[block_idx]], axis=concat_axis)
+
+<<<<<<< HEAD
+        if K.image_data_format() == 'channels_first':
+            out_shape[2] *= 2
+            out_shape[3] *= 2
+        else:
+            out_shape[1] *= 2
+            out_shape[2] *= 2
+
+        # Dont allow the feature map size to grow in upsampling dense blocks
+        _, nb_filter, concat_list = \
+            __dense_block(x,
+                          nb_layers[nb_dense_block + block_idx + 1],
+                          nb_filter=growth_rate, growth_rate=growth_rate,
+                          dropout_rate=dropout_rate,
+                          weight_decay=weight_decay,
+                          return_concat_list=True, grow_nb_filters=False)
+
+    if include_top and top is 'segmentation':
+        x = Conv2D(nb_classes, (1, 1), activation='linear',
+                   padding='same', kernel_regularizer=l2(weight_decay),
+=======
+        # Dont allow the feature map size to grow in upsampling dense blocks
+        _, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1], nb_filter=growth_rate,
+                                                  growth_rate=growth_rate, dropout_rate=dropout_rate,
+                                                  weight_decay=weight_decay,
+                                                  return_concat_list=True, grow_nb_filters=False)
+
+    if include_top:
+        x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', kernel_regularizer=l2(weight_decay),
+>>>>>>> master
+                   use_bias=False)(x)
+
+        if K.image_data_format() == 'channels_first':
+            channel, row, col = input_shape
+        else:
+            row, col, channel = input_shape
+
+        x = Reshape((row * col, nb_classes))(x)
+        x = Activation(activation)(x)
+        x = Reshape((row, col, nb_classes))(x)
+
+    return x
diff --git a/keras_contrib/applications/densenet121.py b/keras_contrib/applications/densenet121.py
new file mode 100644
index 000000000..0f3281a61
--- /dev/null
+++ b/keras_contrib/applications/densenet121.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+
+from keras.optimizers import SGD
+from keras.layers import Input, merge, ZeroPadding2D
+from keras.layers.core import Dense, Dropout, Activation
+from keras.layers.convolutional import Convolution2D
+from keras.layers.pooling import AveragePooling2D, GlobalAveragePooling2D, MaxPooling2D
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+import keras.backend as K
+
+from sklearn.metrics import log_loss
+
+from custom_layers.scale_layer import Scale
+
+from load_cifar10 import load_cifar10_data
+
+def densenet121_model(img_rows, img_cols, color_type=1, nb_dense_block=4, growth_rate=32, nb_filter=64, reduction=0.5, dropout_rate=0.0, weight_decay=1e-4, num_classes=None):
+    '''
+    Build DenseNet 121 with ImageNet weights and a fresh softmax head for fine-tuning
+
+    Model Schema is based on
+    https://github.com/flyyufelix/DenseNet-Keras
+
+    ImageNet Pretrained Weights (loaded from the relative imagenet_models/ directory)
+    Theano: https://drive.google.com/open?id=0Byy2AcGyEVxfMlRYb3YzV210VzQ
+    TensorFlow: https://drive.google.com/open?id=0Byy2AcGyEVxfSTA4SHJVOHNuTXc
+
+    # Arguments
+        img_rows, img_cols: height and width of the input images
+        color_type: number of input channels
+        nb_dense_block: number of dense blocks to build
+        growth_rate: number of filters added by each conv_block
+        nb_filter: ignored; the initial filter count is reset to 64 below
+        reduction: reduction factor of transition blocks (compression = 1 - reduction)
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor, forwarded to the block helpers
+        num_classes: number of units of the replacement softmax layer
+    # Returns
+        A compiled Keras model instance.
+    '''
+    eps = 1.1e-5
+    # compute compression factor
+    compression = 1.0 - reduction
+
+    # Handle Dimension Ordering for different backends (the block helpers read this global)
+    global concat_axis
+    if K.image_dim_ordering() == 'tf':
+        concat_axis = 3
+        img_input = Input(shape=(img_rows, img_cols, color_type), name='data')
+    else:
+        concat_axis = 1
+        img_input = Input(shape=(color_type, img_rows, img_cols), name='data')
+
+    # From architecture for ImageNet (Table 1 in the paper)
+    nb_filter = 64
+    nb_layers = [6, 12, 24, 16]  # For DenseNet-121
+
+    # Initial convolution
+    x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
+    x = Convolution2D(nb_filter, 7, 7, subsample=(2, 2), name='conv1', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=concat_axis, name='conv1_bn')(x)
+    x = Scale(axis=concat_axis, name='conv1_scale')(x)
+    x = Activation('relu', name='relu1')(x)
+    x = ZeroPadding2D((1, 1), name='pool1_zeropadding')(x)
+    x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1')(x)
+
+    # Add dense blocks, each one followed by its transition_block
+    for block_idx in range(nb_dense_block - 1):
+        stage = block_idx + 2
+        x, nb_filter = dense_block(x, stage, nb_layers[block_idx], nb_filter, growth_rate, dropout_rate=dropout_rate, weight_decay=weight_decay)
+        x = transition_block(x, stage, nb_filter, compression=compression, dropout_rate=dropout_rate, weight_decay=weight_decay)
+        nb_filter = int(nb_filter * compression)
+
+    # Last dense block (no transition). Its stage index comes from nb_dense_block, not from
+    # the loop variable, which is unbound when the loop above never runs (nb_dense_block == 1).
+    final_stage = nb_dense_block + 1
+    x, nb_filter = dense_block(x, final_stage, nb_layers[-1], nb_filter, growth_rate, dropout_rate=dropout_rate, weight_decay=weight_decay)
+
+    x = BatchNormalization(epsilon=eps, axis=concat_axis, name='conv'+str(final_stage)+'_blk_bn')(x)
+    x = Scale(axis=concat_axis, name='conv'+str(final_stage)+'_blk_scale')(x)
+    x = Activation('relu', name='relu'+str(final_stage)+'_blk')(x)
+
+    x_fc = GlobalAveragePooling2D(name='pool'+str(final_stage))(x)
+    x_fc = Dense(1000, name='fc6')(x_fc)
+    x_fc = Activation('softmax', name='prob')(x_fc)
+
+    model = Model(img_input, x_fc, name='densenet')
+
+    if K.image_dim_ordering() == 'th':
+        # Use pre-trained weights for Theano backend
+        weights_path = 'imagenet_models/densenet121_weights_th.h5'
+    else:
+        # Use pre-trained weights for Tensorflow backend
+        weights_path = 'imagenet_models/densenet121_weights_tf.h5'
+
+    model.load_weights(weights_path, by_name=True)
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    x_newfc = GlobalAveragePooling2D(name='pool'+str(final_stage))(x)
+    x_newfc = Dense(num_classes, name='fc6')(x_newfc)
+    x_newfc = Activation('softmax', name='prob')(x_newfc)
+
+    model = Model(img_input, x_newfc)
+
+    # Learning rate is changed to 0.001
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return model
+
+
+def conv_block(x, stage, branch, nb_filter, dropout_rate=None, weight_decay=1e-4):
+    '''Bottleneck layer: BN-Scale-ReLU-Conv(1x1) then BN-Scale-ReLU-Conv(3x3), optional dropout after each conv
+        # Arguments
+            x: input tensor
+            stage: index for dense block
+            branch: layer index within each dense block
+            nb_filter: number of filters of the 3x3 convolution (the 1x1 uses 4x as many)
+            dropout_rate: dropout rate
+            weight_decay: weight decay factor (not used by this block)
+        # Returns
+            Output tensor of the 3x3 convolution (after the optional dropout).
+    '''
+    bn_eps = 1.1e-5
+    conv_prefix = 'conv%s_%s' % (stage, branch)
+    relu_prefix = 'relu%s_%s' % (stage, branch)
+
+    # 1x1 Convolution (Bottleneck layer) producing 4 * nb_filter channels
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_prefix + '_x1_bn')(x)
+    out = Scale(axis=concat_axis, name=conv_prefix + '_x1_scale')(out)
+    out = Activation('relu', name=relu_prefix + '_x1')(out)
+    out = Convolution2D(nb_filter * 4, 1, 1, name=conv_prefix + '_x1', bias=False)(out)
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    # 3x3 Convolution applied to a 1-pixel zero-padded input
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_prefix + '_x2_bn')(out)
+    out = Scale(axis=concat_axis, name=conv_prefix + '_x2_scale')(out)
+    out = Activation('relu', name=relu_prefix + '_x2')(out)
+    out = ZeroPadding2D((1, 1), name=conv_prefix + '_x2_zeropadding')(out)
+    out = Convolution2D(nb_filter, 3, 3, name=conv_prefix + '_x2', bias=False)(out)
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    return out
+
+
+
+def transition_block(x, stage, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4):
+    ''' Transition layer: BN-Scale-ReLU, 1x1 Convolution, optional dropout, 2x2 average pooling
+        # Arguments
+            x: input tensor
+            stage: index for dense block
+            nb_filter: number of filters
+            compression: calculated as 1 - reduction. The 1x1 convolution outputs int(nb_filter * compression) feature maps.
+            dropout_rate: dropout rate
+            weight_decay: weight decay factor (not used by this block)
+        # Returns
+            The pooled output tensor.
+    '''
+    stage_tag = str(stage)
+    blk_name = 'conv' + stage_tag + '_blk'
+    relu_name = 'relu' + stage_tag + '_blk'
+
+    out = BatchNormalization(epsilon=1.1e-5, axis=concat_axis, name=blk_name + '_bn')(x)
+    out = Scale(axis=concat_axis, name=blk_name + '_scale')(out)
+    out = Activation('relu', name=relu_name)(out)
+    out = Convolution2D(int(nb_filter * compression), 1, 1, name=blk_name, bias=False)(out)
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    # 2x2 average pooling with stride 2
+    out = AveragePooling2D((2, 2), strides=(2, 2), name='pool' + stage_tag)(out)
+
+    return out
+
+
+def dense_block(x, stage, nb_layers, nb_filter, growth_rate, dropout_rate=None, weight_decay=1e-4, grow_nb_filters=True):
+    ''' Build a dense_block where the output of each conv_block is fed to subsequent ones
+        # Arguments
+            x: input tensor
+            stage: index for dense block
+            nb_layers: the number of layers of conv_block to append to the model.
+            nb_filter: number of filters
+            growth_rate: growth rate
+            dropout_rate: dropout rate
+            weight_decay: weight decay factor
+            grow_nb_filters: flag to decide to allow number of filters to grow
+        # Returns
+            Tuple of (concatenated feature tensor, updated nb_filter).
+    '''
+    concat_feat = x
+
+    for i in range(nb_layers):
+        branch = i + 1
+        # Each conv_block receives the concatenation of the block input and all earlier outputs
+        x = conv_block(concat_feat, stage, branch, growth_rate, dropout_rate, weight_decay)
+        concat_feat = merge([concat_feat, x], mode='concat', concat_axis=concat_axis, name='concat_'+str(stage)+'_'+str(branch))
+        if grow_nb_filters:
+            nb_filter += growth_rate
+
+    return concat_feat, nb_filter
+
+if __name__ == '__main__':
+
+    # Example to fine-tune on 3000 samples from Cifar10
+    img_rows, img_cols = 224, 224  # Resolution of inputs
+    channel = 3
+    num_classes = 10
+    batch_size = 16
+    nb_epoch = 10
+
+    # Load Cifar10 data. Please implement your own load_data() module for your own dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Load our model
+    model = densenet121_model(img_rows=img_rows, img_cols=img_cols, color_type=channel, num_classes=num_classes)
+
+    # Start Fine-tuning
+    model.fit(X_train, Y_train,
+              batch_size=batch_size,
+              nb_epoch=nb_epoch,
+              shuffle=True,
+              verbose=1,
+              validation_data=(X_valid, Y_valid),
+              )
+
+    # Make predictions
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy loss score on the validation data; report it rather than discard it
+    score = log_loss(Y_valid, predictions_valid)
+    print('Validation log loss: %s' % score)
diff --git a/keras_contrib/applications/densenet161.py b/keras_contrib/applications/densenet161.py
new file mode 100644
index 000000000..8f43beb8f
--- /dev/null
+++ b/keras_contrib/applications/densenet161.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+
+from keras.optimizers import SGD
+from keras.layers import Input, merge, ZeroPadding2D
+from keras.layers.core import Dense, Dropout, Activation
+from keras.layers.convolutional import Convolution2D
+from keras.layers.pooling import AveragePooling2D, GlobalAveragePooling2D, MaxPooling2D
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+import keras.backend as K
+
+from sklearn.metrics import log_loss
+
+from custom_layers.scale_layer import Scale
+
+from load_cifar10 import load_cifar10_data
+
+def densenet161_model(img_rows, img_cols, color_type=1, nb_dense_block=4, growth_rate=48, nb_filter=96, reduction=0.5, dropout_rate=0.0, weight_decay=1e-4, num_classes=None):
+    '''
+    DenseNet 161 model for Keras, set up for fine-tuning from ImageNet weights.
+
+    Model schema is based on
+    https://github.com/flyyufelix/DenseNet-Keras
+    ImageNet pretrained weights (read from the imagenet_models/ directory)
+    Theano: https://drive.google.com/open?id=0Byy2AcGyEVxfVnlCMlBGTDR3RGs
+    TensorFlow: https://drive.google.com/open?id=0Byy2AcGyEVxfUDZwVjU2cFNidTA
+
+    # Arguments
+        img_rows, img_cols: spatial resolution of the input images
+        color_type: ignored; the input always has 3 channels
+        nb_dense_block: number of dense blocks to add to end
+        growth_rate: number of filters added by each conv_block
+        nb_filter: ignored; the initial number of filters is fixed at 96
+        reduction: reduction factor of transition blocks.
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor, forwarded to the block helpers
+        num_classes: number of classes of the replacement softmax layer
+    # Returns
+        A compiled Keras model instance.
+    '''
+    eps = 1.1e-5
+
+    # compute compression factor
+    compression = 1.0 - reduction
+
+    # Handle dimension ordering; concat_axis is a module global read by the block helpers
+    global concat_axis
+    if K.image_dim_ordering() == 'tf':
+        concat_axis = 3
+        img_input = Input(shape=(img_rows, img_cols, 3), name='data')
+    else:
+        concat_axis = 1
+        img_input = Input(shape=(3, img_rows, img_cols), name='data')
+
+    # From architecture for ImageNet (Table 1 in the paper); this overrides the nb_filter argument
+    nb_filter = 96
+    nb_layers = [6, 12, 36, 24]  # For DenseNet-161
+
+    # Initial convolution
+    x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
+    x = Convolution2D(nb_filter, 7, 7, subsample=(2, 2), name='conv1', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=concat_axis, name='conv1_bn')(x)
+    x = Scale(axis=concat_axis, name='conv1_scale')(x)
+    x = Activation('relu', name='relu1')(x)
+    x = ZeroPadding2D((1, 1), name='pool1_zeropadding')(x)
+    x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1')(x)
+
+    # Add dense blocks, each followed by a transition block
+    for block_idx in range(nb_dense_block - 1):
+        stage = block_idx + 2
+        x, nb_filter = dense_block(x, stage, nb_layers[block_idx], nb_filter, growth_rate, dropout_rate=dropout_rate, weight_decay=weight_decay)
+
+        # Add transition_block
+        x = transition_block(x, stage, nb_filter, compression=compression, dropout_rate=dropout_rate, weight_decay=weight_decay)
+        nb_filter = int(nb_filter * compression)
+
+    final_stage = nb_dense_block + 1  # equals the last stage + 1, and is defined even when the loop body never runs
+    x, nb_filter = dense_block(x, final_stage, nb_layers[-1], nb_filter, growth_rate, dropout_rate=dropout_rate, weight_decay=weight_decay)
+
+    x = BatchNormalization(epsilon=eps, axis=concat_axis, name='conv'+str(final_stage)+'_blk_bn')(x)
+    x = Scale(axis=concat_axis, name='conv'+str(final_stage)+'_blk_scale')(x)
+    x = Activation('relu', name='relu'+str(final_stage)+'_blk')(x)
+
+    x_fc = GlobalAveragePooling2D(name='pool'+str(final_stage))(x)
+    x_fc = Dense(1000, name='fc6')(x_fc)
+    x_fc = Activation('softmax', name='prob')(x_fc)
+
+    model = Model(img_input, x_fc, name='densenet')
+
+    if K.image_dim_ordering() == 'th':
+        # Use pre-trained weights for Theano backend
+        weights_path = 'imagenet_models/densenet161_weights_th.h5'
+    else:
+        # Use pre-trained weights for Tensorflow backend
+        weights_path = 'imagenet_models/densenet161_weights_tf.h5'
+
+    model.load_weights(weights_path, by_name=True)
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    x_newfc = GlobalAveragePooling2D(name='pool'+str(final_stage))(x)
+    x_newfc = Dense(num_classes, name='fc6')(x_newfc)
+    x_newfc = Activation('softmax', name='prob')(x_newfc)
+
+    model = Model(img_input, x_newfc)
+
+    # Learning rate is changed to 0.001
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return model
+
+
+def conv_block(x, stage, branch, nb_filter, dropout_rate=None, weight_decay=1e-4):
+    '''Apply BN-Scale-ReLU-Conv twice: a 1x1 bottleneck, then a 3x3 convolution
+        # Arguments
+            x: input tensor
+            stage: index of the dense block, used to build layer names
+            branch: index of this unit within the dense block, used in layer names
+            nb_filter: filters of the 3x3 convolution (the 1x1 one uses 4x as many)
+            dropout_rate: when truthy, each convolution is followed by Dropout
+            weight_decay: accepted for signature compatibility; not used in the body
+    '''
+    bn_eps = 1.1e-5
+    conv_prefix = 'conv%s_%s' % (stage, branch)
+    relu_prefix = 'relu%s_%s' % (stage, branch)
+
+    # Bottleneck: 1x1 convolution producing 4 * nb_filter feature maps
+    bottleneck_filters = nb_filter * 4
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_prefix + '_x1_bn')(x)
+    out = Scale(axis=concat_axis, name=conv_prefix + '_x1_scale')(out)
+    out = Activation('relu', name=relu_prefix + '_x1')(out)
+    out = Convolution2D(bottleneck_filters, 1, 1, name=conv_prefix + '_x1', bias=False)(out)
+
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    # 3x3 convolution on the zero-padded bottleneck output
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_prefix + '_x2_bn')(out)
+    out = Scale(axis=concat_axis, name=conv_prefix + '_x2_scale')(out)
+    out = Activation('relu', name=relu_prefix + '_x2')(out)
+    out = ZeroPadding2D((1, 1), name=conv_prefix + '_x2_zeropadding')(out)
+    out = Convolution2D(nb_filter, 3, 3, name=conv_prefix + '_x2', bias=False)(out)
+
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    return out
+
+
+def transition_block(x, stage, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4):
+    ''' BN-Scale-ReLU, a 1x1 convolution with compressed width, optional dropout, 2x2 average pooling
+        # Arguments
+            x: input tensor
+            stage: index of the preceding dense block, used to build layer names
+            nb_filter: number of filters before compression
+            compression: the 1x1 convolution gets int(nb_filter * compression) filters
+            dropout_rate: when truthy, Dropout is applied after the convolution
+            weight_decay: accepted for signature compatibility; not used in the body
+    '''
+
+    bn_eps = 1.1e-5
+    block_tag = str(stage) + '_blk'
+    conv_name = 'conv' + block_tag
+    relu_name = 'relu' + block_tag
+
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_name + '_bn')(x)
+    out = Scale(axis=concat_axis, name=conv_name + '_scale')(out)
+    out = Activation('relu', name=relu_name)(out)
+    out = Convolution2D(int(nb_filter * compression), 1, 1, name=conv_name, bias=False)(out)
+
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    # 2x2 average pooling with stride 2 closes the transition
+    out = AveragePooling2D((2, 2), strides=(2, 2), name='pool' + str(stage))(out)
+    return out
+
+
+def dense_block(x, stage, nb_layers, nb_filter, growth_rate, dropout_rate=None, weight_decay=1e-4, grow_nb_filters=True):
+    ''' Stack conv_blocks, feeding each one the concatenation of the block input and all earlier outputs
+        # Arguments
+            x: input tensor
+            stage: index of this dense block, used to build layer names
+            nb_layers: number of conv_blocks to chain
+            nb_filter: running filter count on entry to the block
+            growth_rate: filters produced by every conv_block
+            dropout_rate: dropout rate handed to conv_block
+            weight_decay: weight decay factor handed to conv_block
+            grow_nb_filters: when true, nb_filter grows by growth_rate per conv_block
+        # Returns
+            (concatenated tensor, updated nb_filter)
+    '''
+    features = x
+
+    # conv_blocks are numbered from 1 within the stage
+    for branch in range(1, nb_layers + 1):
+        new_maps = conv_block(features, stage, branch, growth_rate, dropout_rate, weight_decay)
+        merge_name = 'concat_' + str(stage) + '_' + str(branch)
+        features = merge([features, new_maps], mode='concat', concat_axis=concat_axis, name=merge_name)
+        if grow_nb_filters:
+            nb_filter += growth_rate
+
+    return features, nb_filter
+
+if __name__ == '__main__':
+    # Demo: fine-tune DenseNet-161 on a small CIFAR-10 subset
+    # (3000 samples in the original example).
+
+    # Input resolution, channel count and training hyper-parameters
+    img_rows = img_cols = 224
+    channel = 3
+    num_classes = 10
+    batch_size = 8
+    nb_epoch = 10
+
+    # Load the CIFAR-10 splits; write your own loader for a different dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Build the DenseNet-161 model
+    model = densenet161_model(img_rows=img_rows, img_cols=img_cols,
+                              color_type=channel, num_classes=num_classes)
+
+    # Fine-tune on the training split, validating on the held-out split
+    model.fit(
+        X_train, Y_train,
+        batch_size=batch_size, nb_epoch=nb_epoch,
+        shuffle=True, verbose=1,
+        validation_data=(X_valid, Y_valid))
+
+    # Run the fine-tuned model on the validation images
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy (log loss) of those predictions against the true labels
+    score = log_loss(Y_valid, predictions_valid)
diff --git a/keras_contrib/applications/densenet169.py b/keras_contrib/applications/densenet169.py
new file mode 100644
index 000000000..ab2516c78
--- /dev/null
+++ b/keras_contrib/applications/densenet169.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+
+from keras.optimizers import SGD
+from keras.layers import Input, merge, ZeroPadding2D
+from keras.layers.core import Dense, Dropout, Activation
+from keras.layers.convolutional import Convolution2D
+from keras.layers.pooling import AveragePooling2D, GlobalAveragePooling2D, MaxPooling2D
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+import keras.backend as K
+
+from sklearn.metrics import log_loss
+
+from custom_layers.scale_layer import Scale
+
+from load_cifar10 import load_cifar10_data
+
+def densenet169_model(img_rows, img_cols, color_type=1, nb_dense_block=4, growth_rate=32, nb_filter=64, reduction=0.5, dropout_rate=0.0, weight_decay=1e-4, num_classes=None):
+    '''
+    DenseNet 169 model for Keras, set up for fine-tuning from ImageNet weights.
+
+    Model schema is based on
+    https://github.com/flyyufelix/DenseNet-Keras
+    ImageNet pretrained weights (read from the imagenet_models/ directory)
+    Theano: https://drive.google.com/open?id=0Byy2AcGyEVxfN0d3T1F1MXg0NlU
+    TensorFlow: https://drive.google.com/open?id=0Byy2AcGyEVxfSEc5UC1ROUFJdmM
+
+    # Arguments
+        img_rows, img_cols: spatial resolution of the input images
+        color_type: ignored; the input always has 3 channels
+        nb_dense_block: number of dense blocks to add to end
+        growth_rate: number of filters added by each conv_block
+        nb_filter: ignored; the initial number of filters is fixed at 64
+        reduction: reduction factor of transition blocks.
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor, forwarded to the block helpers
+        num_classes: number of classes of the replacement softmax layer
+    # Returns
+        A compiled Keras model instance.
+    '''
+    eps = 1.1e-5
+
+    # compute compression factor
+    compression = 1.0 - reduction
+
+    # Handle dimension ordering; concat_axis is a module global read by the block helpers
+    global concat_axis
+    if K.image_dim_ordering() == 'tf':
+        concat_axis = 3
+        img_input = Input(shape=(img_rows, img_cols, 3), name='data')
+    else:
+        concat_axis = 1
+        img_input = Input(shape=(3, img_rows, img_cols), name='data')
+
+    # From architecture for ImageNet (Table 1 in the paper); this overrides the nb_filter argument
+    nb_filter = 64
+    nb_layers = [6, 12, 32, 32]  # For DenseNet-169
+
+    # Initial convolution
+    x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
+    x = Convolution2D(nb_filter, 7, 7, subsample=(2, 2), name='conv1', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=concat_axis, name='conv1_bn')(x)
+    x = Scale(axis=concat_axis, name='conv1_scale')(x)
+    x = Activation('relu', name='relu1')(x)
+    x = ZeroPadding2D((1, 1), name='pool1_zeropadding')(x)
+    x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1')(x)
+
+    # Add dense blocks, each followed by a transition block
+    for block_idx in range(nb_dense_block - 1):
+        stage = block_idx + 2
+        x, nb_filter = dense_block(x, stage, nb_layers[block_idx], nb_filter, growth_rate, dropout_rate=dropout_rate, weight_decay=weight_decay)
+
+        # Add transition_block
+        x = transition_block(x, stage, nb_filter, compression=compression, dropout_rate=dropout_rate, weight_decay=weight_decay)
+        nb_filter = int(nb_filter * compression)
+
+    final_stage = nb_dense_block + 1  # equals the last stage + 1, and is defined even when the loop body never runs
+    x, nb_filter = dense_block(x, final_stage, nb_layers[-1], nb_filter, growth_rate, dropout_rate=dropout_rate, weight_decay=weight_decay)
+
+    x = BatchNormalization(epsilon=eps, axis=concat_axis, name='conv'+str(final_stage)+'_blk_bn')(x)
+    x = Scale(axis=concat_axis, name='conv'+str(final_stage)+'_blk_scale')(x)
+    x = Activation('relu', name='relu'+str(final_stage)+'_blk')(x)
+
+    x_fc = GlobalAveragePooling2D(name='pool'+str(final_stage))(x)
+    x_fc = Dense(1000, name='fc6')(x_fc)
+    x_fc = Activation('softmax', name='prob')(x_fc)
+
+    model = Model(img_input, x_fc, name='densenet')
+
+    if K.image_dim_ordering() == 'th':
+        # Use pre-trained weights for Theano backend
+        weights_path = 'imagenet_models/densenet169_weights_th.h5'
+    else:
+        # Use pre-trained weights for Tensorflow backend
+        weights_path = 'imagenet_models/densenet169_weights_tf.h5'
+
+    model.load_weights(weights_path, by_name=True)
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    x_newfc = GlobalAveragePooling2D(name='pool'+str(final_stage))(x)
+    x_newfc = Dense(num_classes, name='fc6')(x_newfc)
+    x_newfc = Activation('softmax', name='prob')(x_newfc)
+
+    model = Model(img_input, x_newfc)
+
+    # Learning rate is changed to 0.001
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return model
+
+
+def conv_block(x, stage, branch, nb_filter, dropout_rate=None, weight_decay=1e-4):
+    '''Apply BN-Scale-ReLU-Conv twice: a 1x1 bottleneck, then a 3x3 convolution
+        # Arguments
+            x: input tensor
+            stage: index of the dense block, used to build layer names
+            branch: index of this unit within the dense block, used in layer names
+            nb_filter: filters of the 3x3 convolution (the 1x1 one uses 4x as many)
+            dropout_rate: when truthy, each convolution is followed by Dropout
+            weight_decay: accepted for signature compatibility; not used in the body
+    '''
+    bn_eps = 1.1e-5
+    conv_prefix = 'conv%s_%s' % (stage, branch)
+    relu_prefix = 'relu%s_%s' % (stage, branch)
+
+    # Bottleneck: 1x1 convolution producing 4 * nb_filter feature maps
+    bottleneck_filters = nb_filter * 4
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_prefix + '_x1_bn')(x)
+    out = Scale(axis=concat_axis, name=conv_prefix + '_x1_scale')(out)
+    out = Activation('relu', name=relu_prefix + '_x1')(out)
+    out = Convolution2D(bottleneck_filters, 1, 1, name=conv_prefix + '_x1', bias=False)(out)
+
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    # 3x3 convolution on the zero-padded bottleneck output
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_prefix + '_x2_bn')(out)
+    out = Scale(axis=concat_axis, name=conv_prefix + '_x2_scale')(out)
+    out = Activation('relu', name=relu_prefix + '_x2')(out)
+    out = ZeroPadding2D((1, 1), name=conv_prefix + '_x2_zeropadding')(out)
+    out = Convolution2D(nb_filter, 3, 3, name=conv_prefix + '_x2', bias=False)(out)
+
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    return out
+
+
+def transition_block(x, stage, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4):
+    ''' BN-Scale-ReLU, a 1x1 convolution with compressed width, optional dropout, 2x2 average pooling
+        # Arguments
+            x: input tensor
+            stage: index of the preceding dense block, used to build layer names
+            nb_filter: number of filters before compression
+            compression: the 1x1 convolution gets int(nb_filter * compression) filters
+            dropout_rate: when truthy, Dropout is applied after the convolution
+            weight_decay: accepted for signature compatibility; not used in the body
+    '''
+
+    bn_eps = 1.1e-5
+    block_tag = str(stage) + '_blk'
+    conv_name = 'conv' + block_tag
+    relu_name = 'relu' + block_tag
+
+    out = BatchNormalization(epsilon=bn_eps, axis=concat_axis, name=conv_name + '_bn')(x)
+    out = Scale(axis=concat_axis, name=conv_name + '_scale')(out)
+    out = Activation('relu', name=relu_name)(out)
+    out = Convolution2D(int(nb_filter * compression), 1, 1, name=conv_name, bias=False)(out)
+
+    if dropout_rate:
+        out = Dropout(dropout_rate)(out)
+
+    # 2x2 average pooling with stride 2 closes the transition
+    out = AveragePooling2D((2, 2), strides=(2, 2), name='pool' + str(stage))(out)
+    return out
+
+
+def dense_block(x, stage, nb_layers, nb_filter, growth_rate, dropout_rate=None, weight_decay=1e-4, grow_nb_filters=True):
+    ''' Stack conv_blocks, feeding each one the concatenation of the block input and all earlier outputs
+        # Arguments
+            x: input tensor
+            stage: index of this dense block, used to build layer names
+            nb_layers: number of conv_blocks to chain
+            nb_filter: running filter count on entry to the block
+            growth_rate: filters produced by every conv_block
+            dropout_rate: dropout rate handed to conv_block
+            weight_decay: weight decay factor handed to conv_block
+            grow_nb_filters: when true, nb_filter grows by growth_rate per conv_block
+        # Returns
+            (concatenated tensor, updated nb_filter)
+    '''
+    features = x
+
+    # conv_blocks are numbered from 1 within the stage
+    for branch in range(1, nb_layers + 1):
+        new_maps = conv_block(features, stage, branch, growth_rate, dropout_rate, weight_decay)
+        merge_name = 'concat_' + str(stage) + '_' + str(branch)
+        features = merge([features, new_maps], mode='concat', concat_axis=concat_axis, name=merge_name)
+        if grow_nb_filters:
+            nb_filter += growth_rate
+
+    return features, nb_filter
+
+if __name__ == '__main__':
+    # Demo: fine-tune DenseNet-169 on a small CIFAR-10 subset
+    # (3000 samples in the original example).
+
+    # Input resolution, channel count and training hyper-parameters
+    img_rows = img_cols = 224
+    channel = 3
+    num_classes = 10
+    batch_size = 16
+    nb_epoch = 10
+
+    # Load the CIFAR-10 splits; write your own loader for a different dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Build the DenseNet-169 model
+    model = densenet169_model(img_rows=img_rows, img_cols=img_cols,
+                              color_type=channel, num_classes=num_classes)
+
+    # Fine-tune on the training split, validating on the held-out split
+    model.fit(
+        X_train, Y_train,
+        batch_size=batch_size, nb_epoch=nb_epoch,
+        shuffle=True, verbose=1,
+        validation_data=(X_valid, Y_valid))
+
+    # Run the fine-tuned model on the validation images
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy (log loss) of those predictions against the true labels
+    score = log_loss(Y_valid, predictions_valid)
diff --git a/keras_contrib/applications/inception_v3.py b/keras_contrib/applications/inception_v3.py
new file mode 100644
index 000000000..a8daad33e
--- /dev/null
+++ b/keras_contrib/applications/inception_v3.py
@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+
+from keras.models import Sequential
+from keras.optimizers import SGD
+from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+from keras import backend as K
+
+from sklearn.metrics import log_loss
+
+from load_cifar10 import load_cifar10_data
+
+def conv2d_bn(x, nb_filter, nb_row, nb_col,
+              border_mode='same', subsample=(1, 1),
+              name=None):
+    """
+    Run `x` through a ReLU-activated Convolution2D, then BatchNormalization.
+
+    If `name` is given the two layers are named `<name>_conv` and `<name>_bn`;
+    otherwise None is passed as both layer names.
+    Batch normalization is applied over axis 1.
+    """
+    has_name = name is not None
+    bn_name = name + '_bn' if has_name else None
+    conv_name = name + '_conv' if has_name else None
+    conv_out = Convolution2D(nb_filter, nb_row, nb_col,
+                             subsample=subsample,
+                             activation='relu',
+                             border_mode=border_mode,
+                             name=conv_name)(x)
+    # The axis is hard-coded to 1, matching the (channel, rows, cols) input of inception_v3_model
+    return BatchNormalization(axis=1, name=bn_name)(conv_out)
+
+def inception_v3_model(img_rows, img_cols, channel=1, num_classes=None):
+    """
+    Inception-V3 model for Keras with ImageNet weights and a new softmax head.
+
+    Model schema is based on
+    https://github.com/fchollet/deep-learning-models/blob/master/inception_v3.py
+
+    ImageNet pretrained weights (loaded from the imagenet_models/ directory)
+    https://github.com/fchollet/deep-learning-models/releases/download/v0.2/inception_v3_weights_th_dim_ordering_th_kernels.h5
+
+    Parameters:
+      img_rows, img_cols - resolution of inputs
+      channel - 1 for grayscale, 3 for color; the input shape is (channel, img_rows, img_cols)
+      num_classes - number of class labels for our classification task
+    """
+    channel_axis = 1  # branches are concatenated along axis 1, the channel position of the Input below
+    img_input = Input(shape=(channel, img_rows, img_cols))
+    x = conv2d_bn(img_input, 32, 3, 3, subsample=(2, 2), border_mode='valid')
+    x = conv2d_bn(x, 32, 3, 3, border_mode='valid')
+    x = conv2d_bn(x, 64, 3, 3)
+    x = MaxPooling2D((3, 3), strides=(2, 2))(x)
+
+    x = conv2d_bn(x, 80, 1, 1, border_mode='valid')
+    x = conv2d_bn(x, 192, 3, 3, border_mode='valid')
+    x = MaxPooling2D((3, 3), strides=(2, 2))(x)
+
+    # mixed 0, 1, 2: 35 x 35 x 256
+    for i in range(3):
+        branch1x1 = conv2d_bn(x, 64, 1, 1)
+
+        branch5x5 = conv2d_bn(x, 48, 1, 1)
+        branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
+
+        branch3x3dbl = conv2d_bn(x, 64, 1, 1)
+        branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+        branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+
+        branch_pool = AveragePooling2D(
+            (3, 3), strides=(1, 1), border_mode='same')(x)
+        branch_pool = conv2d_bn(branch_pool, 32, 1, 1)
+        x = merge([branch1x1, branch5x5, branch3x3dbl, branch_pool],
+                  mode='concat', concat_axis=channel_axis,
+                  name='mixed' + str(i))
+
+    # mixed 3: 17 x 17 x 768
+    branch3x3 = conv2d_bn(x, 384, 3, 3, subsample=(2, 2), border_mode='valid')
+
+    branch3x3dbl = conv2d_bn(x, 64, 1, 1)
+    branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+    branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3,
+                             subsample=(2, 2), border_mode='valid')
+
+    branch_pool = MaxPooling2D((3, 3), strides=(2, 2))(x)
+    x = merge([branch3x3, branch3x3dbl, branch_pool],
+              mode='concat', concat_axis=channel_axis,
+              name='mixed3')
+
+    # mixed 4: 17 x 17 x 768
+    branch1x1 = conv2d_bn(x, 192, 1, 1)
+
+    branch7x7 = conv2d_bn(x, 128, 1, 1)
+    branch7x7 = conv2d_bn(branch7x7, 128, 1, 7)
+    branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
+
+    branch7x7dbl = conv2d_bn(x, 128, 1, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+
+    branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(x)
+    branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+    x = merge([branch1x1, branch7x7, branch7x7dbl, branch_pool],
+              mode='concat', concat_axis=channel_axis,
+              name='mixed4')
+
+    # mixed 5, 6: 17 x 17 x 768
+    for i in range(2):
+        branch1x1 = conv2d_bn(x, 192, 1, 1)
+
+        branch7x7 = conv2d_bn(x, 160, 1, 1)
+        branch7x7 = conv2d_bn(branch7x7, 160, 1, 7)
+        branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
+
+        branch7x7dbl = conv2d_bn(x, 160, 1, 1)
+        branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
+        branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7)
+        branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
+        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+
+        branch_pool = AveragePooling2D(
+            (3, 3), strides=(1, 1), border_mode='same')(x)
+        branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+        x = merge([branch1x1, branch7x7, branch7x7dbl, branch_pool],
+                  mode='concat', concat_axis=channel_axis,
+                  name='mixed' + str(5 + i))
+
+    # mixed 7: 17 x 17 x 768
+    branch1x1 = conv2d_bn(x, 192, 1, 1)
+
+    branch7x7 = conv2d_bn(x, 192, 1, 1)
+    branch7x7 = conv2d_bn(branch7x7, 192, 1, 7)
+    branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
+
+    branch7x7dbl = conv2d_bn(x, 160, 1, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+
+    branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(x)
+    branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+    x = merge([branch1x1, branch7x7, branch7x7dbl, branch_pool],
+              mode='concat', concat_axis=channel_axis,
+              name='mixed7')
+
+    # mixed 8: 8 x 8 x 1280
+    branch3x3 = conv2d_bn(x, 192, 1, 1)
+    branch3x3 = conv2d_bn(branch3x3, 320, 3, 3,
+                          subsample=(2, 2), border_mode='valid')
+
+    branch7x7x3 = conv2d_bn(x, 192, 1, 1)
+    branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7)
+    branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1)
+    branch7x7x3 = conv2d_bn(branch7x7x3, 192, 3, 3,
+                            subsample=(2, 2), border_mode='valid')
+
+    branch_pool = AveragePooling2D((3, 3), strides=(2, 2))(x)
+    x = merge([branch3x3, branch7x7x3, branch_pool],
+              mode='concat', concat_axis=channel_axis,
+              name='mixed8')
+
+    # mixed 9, 10: 8 x 8 x 2048
+    for i in range(2):
+        branch1x1 = conv2d_bn(x, 320, 1, 1)
+
+        branch3x3 = conv2d_bn(x, 384, 1, 1)
+        branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3)
+        branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1)
+        branch3x3 = merge([branch3x3_1, branch3x3_2],
+                          mode='concat', concat_axis=channel_axis,
+                          name='mixed9_' + str(i))
+
+        branch3x3dbl = conv2d_bn(x, 448, 1, 1)
+        branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3)
+        branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3)
+        branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1)
+        branch3x3dbl = merge([branch3x3dbl_1, branch3x3dbl_2],
+                             mode='concat', concat_axis=channel_axis)
+
+        branch_pool = AveragePooling2D(
+            (3, 3), strides=(1, 1), border_mode='same')(x)
+        branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+        x = merge([branch1x1, branch3x3, branch3x3dbl, branch_pool],
+                  mode='concat', concat_axis=channel_axis,
+                  name='mixed' + str(9 + i))
+
+    # 1000-way softmax head, built before the ImageNet weights are loaded
+    x_fc = AveragePooling2D((8, 8), strides=(8, 8), name='avg_pool')(x)
+    x_fc = Flatten(name='flatten')(x_fc)
+    x_fc = Dense(1000, activation='softmax', name='predictions')(x_fc)
+
+    # Create model
+    model = Model(img_input, x_fc)
+
+    # Load ImageNet pre-trained weights (NOTE(review): no by_name here, so layers presumably match by order - confirm)
+    model.load_weights('imagenet_models/inception_v3_weights_th_dim_ordering_th_kernels.h5')
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    x_newfc = AveragePooling2D((8, 8), strides=(8, 8), name='avg_pool')(x)
+    x_newfc = Flatten(name='flatten')(x_newfc)
+    x_newfc = Dense(num_classes, activation='softmax', name='predictions')(x_newfc)
+
+    # Create another model with our customized softmax
+    model = Model(img_input, x_newfc)
+
+    # Compile with SGD at learning rate 0.001 (Nesterov momentum 0.9, decay 1e-6)
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return model 
+
+if __name__ == '__main__':
+    # Demo: fine-tune Inception-V3 on a small CIFAR-10 subset
+    # (3000 samples in the original example).
+
+    # Input resolution, channel count and training hyper-parameters
+    img_rows = img_cols = 299
+    channel = 3
+    num_classes = 10
+    batch_size = 16
+    nb_epoch = 10
+
+    # Load the CIFAR-10 splits; write your own loader for a different dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Build the Inception-V3 model with a num_classes-way softmax
+    model = inception_v3_model(img_rows, img_cols, channel, num_classes)
+
+    # Fine-tune on the training split, validating on the held-out split
+    model.fit(
+        X_train, Y_train,
+        batch_size=batch_size, nb_epoch=nb_epoch,
+        shuffle=True, verbose=1,
+        validation_data=(X_valid, Y_valid),
+    )
+
+    # Run the fine-tuned model on the validation images
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy (log loss) of those predictions against the true labels
+    score = log_loss(Y_valid, predictions_valid)
+
diff --git a/keras_contrib/applications/inception_v4.py b/keras_contrib/applications/inception_v4.py
new file mode 100644
index 000000000..09a07864f
--- /dev/null
+++ b/keras_contrib/applications/inception_v4.py
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+
+from keras.models import Sequential
+from keras.optimizers import SGD
+from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+from keras import backend as K
+
+from sklearn.metrics import log_loss
+
+from load_cifar10 import load_cifar10_data
+
+def conv2d_bn(x, nb_filter, nb_row, nb_col,
+              border_mode='same', subsample=(1, 1), bias=False):
+    """Apply Convolution2D -> BatchNormalization -> ReLU to `x`.
+
+    `nb_filter`, `nb_row`, `nb_col`, `subsample`, `border_mode` and `bias`
+    are forwarded unchanged to Convolution2D. Returns the activated tensor.
+
+    (Slightly modified from https://github.com/fchollet/keras/blob/master/keras/applications/inception_v3.py)
+    """
+    # Batch-normalise over axis 1 for "th" ordering, the last axis otherwise.
+    axis = 1 if K.image_dim_ordering() == "th" else -1
+    conv = Convolution2D(nb_filter, nb_row, nb_col,
+                         subsample=subsample,
+                         border_mode=border_mode,
+                         bias=bias)(x)
+    normed = BatchNormalization(axis=axis)(conv)
+    return Activation('relu')(normed)
+
+def block_inception_a(input):
+    """Inception-A block: four parallel branches concatenated on channels.
+
+    Each branch ends in 96 filters and uses stride 1 with 'same' padding,
+    so all four outputs share the spatial size of `input`.
+    """
+    concat_axis = 1 if K.image_dim_ordering() == "th" else -1
+
+    tower_1x1 = conv2d_bn(input, 96, 1, 1)
+
+    tower_3x3 = conv2d_bn(conv2d_bn(input, 64, 1, 1), 96, 3, 3)
+
+    tower_dbl = conv2d_bn(input, 64, 1, 1)
+    tower_dbl = conv2d_bn(conv2d_bn(tower_dbl, 96, 3, 3), 96, 3, 3)
+
+    tower_pool = AveragePooling2D((3,3), strides=(1,1), border_mode='same')(input)
+    tower_pool = conv2d_bn(tower_pool, 96, 1, 1)
+
+    towers = [tower_1x1, tower_3x3, tower_dbl, tower_pool]
+    return merge(towers, mode='concat', concat_axis=concat_axis)
+
+
+def block_reduction_a(input):
+    """Reduction-A block: downsample with three stride-2 'valid' branches.
+
+    The conv branches emit 384 and 256 filters; the max-pool branch passes
+    the input channels through. All three are concatenated on channels.
+    """
+    concat_axis = 1 if K.image_dim_ordering() == "th" else -1
+
+    tower_conv = conv2d_bn(input, 384, 3, 3, subsample=(2,2), border_mode='valid')
+
+    tower_deep = conv2d_bn(conv2d_bn(input, 192, 1, 1), 224, 3, 3)
+    tower_deep = conv2d_bn(tower_deep, 256, 3, 3, subsample=(2,2), border_mode='valid')
+
+    tower_pool = MaxPooling2D((3,3), strides=(2,2), border_mode='valid')(input)
+    towers = [tower_conv, tower_deep, tower_pool]
+    return merge(towers, mode='concat', concat_axis=concat_axis)
+
+
+def block_inception_b(input):
+    """Inception-B block built from factorised 1x7 / 7x1 convolutions.
+
+    Four stride-1 branches (384, 256, 256 and 128 filters) are concatenated
+    on the channel axis.
+    """
+    concat_axis = 1 if K.image_dim_ordering() == "th" else -1
+
+    tower_1x1 = conv2d_bn(input, 384, 1, 1)
+
+    tower_7 = conv2d_bn(input, 192, 1, 1)
+    tower_7 = conv2d_bn(conv2d_bn(tower_7, 224, 1, 7), 256, 7, 1)
+
+    tower_dbl7 = conv2d_bn(input, 192, 1, 1)
+    # (filters, rows, cols) of the four stacked asymmetric convolutions.
+    for nb_filter, rows, cols in ((192, 7, 1), (224, 1, 7), (224, 7, 1), (256, 1, 7)):
+        tower_dbl7 = conv2d_bn(tower_dbl7, nb_filter, rows, cols)
+
+    tower_pool = AveragePooling2D((3,3), strides=(1,1), border_mode='same')(input)
+    tower_pool = conv2d_bn(tower_pool, 128, 1, 1)
+
+    towers = [tower_1x1, tower_7, tower_dbl7, tower_pool]
+    return merge(towers, mode='concat', concat_axis=concat_axis)
+
+
+def block_reduction_b(input):
+    """Reduction-B block: downsample with three stride-2 'valid' branches.
+
+    Conv branches emit 192 and 320 filters; the max-pool branch keeps the
+    input channels. The results are concatenated on the channel axis.
+    """
+    concat_axis = 1 if K.image_dim_ordering() == "th" else -1
+
+    tower_3x3 = conv2d_bn(input, 192, 1, 1)
+    tower_3x3 = conv2d_bn(tower_3x3, 192, 3, 3, subsample=(2, 2), border_mode='valid')
+
+    tower_7 = conv2d_bn(conv2d_bn(input, 256, 1, 1), 256, 1, 7)
+    tower_7 = conv2d_bn(tower_7, 320, 7, 1)
+    tower_7 = conv2d_bn(tower_7, 320, 3, 3, subsample=(2,2), border_mode='valid')
+
+    tower_pool = MaxPooling2D((3, 3), strides=(2, 2), border_mode='valid')(input)
+    towers = [tower_3x3, tower_7, tower_pool]
+    return merge(towers, mode='concat', concat_axis=concat_axis)
+
+
+def block_inception_c(input):
+    """Inception-C block with branches that fork into 1x3 and 3x1 halves.
+
+    Two of the four branches split their last stage into parallel 1x3 and
+    3x1 convolutions whose outputs are concatenated before the final
+    channel-wise concatenation of all branches.
+    """
+    concat_axis = 1 if K.image_dim_ordering() == "th" else -1
+
+    tower_1x1 = conv2d_bn(input, 256, 1, 1)
+
+    stem_a = conv2d_bn(input, 384, 1, 1)
+    # List literals evaluate left to right, so the 1x3 layer is built first.
+    fork_a = [conv2d_bn(stem_a, 256, 1, 3), conv2d_bn(stem_a, 256, 3, 1)]
+    tower_a = merge(fork_a, mode='concat', concat_axis=concat_axis)
+
+    stem_b = conv2d_bn(input, 384, 1, 1)
+    stem_b = conv2d_bn(conv2d_bn(stem_b, 448, 3, 1), 512, 1, 3)
+    fork_b = [conv2d_bn(stem_b, 256, 1, 3), conv2d_bn(stem_b, 256, 3, 1)]
+    tower_b = merge(fork_b, mode='concat', concat_axis=concat_axis)
+
+    tower_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(input)
+    tower_pool = conv2d_bn(tower_pool, 256, 1, 1)
+
+    towers = [tower_1x1, tower_a, tower_b, tower_pool]
+    return merge(towers, mode='concat', concat_axis=concat_axis)
+
+
+def inception_v4_base(input):
+    """Build the Inception-v4 trunk: stem, then the A/B/C block stacks.
+
+    `input` is the image tensor; returns the final Inception-C output.
+    """
+    channel_axis = 1 if K.image_dim_ordering() == "th" else -1
+    # Input Shape is 299 x 299 x 3 (tf) or 3 x 299 x 299 (th)
+    net = conv2d_bn(input, 32, 3, 3, subsample=(2,2), border_mode='valid')
+    net = conv2d_bn(net, 32, 3, 3, border_mode='valid')
+    net = conv2d_bn(net, 64, 3, 3)
+
+    branch_0 = MaxPooling2D((3,3), strides=(2,2), border_mode='valid')(net)
+
+    branch_1 = conv2d_bn(net, 96, 3, 3, subsample=(2,2), border_mode='valid')
+
+    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)
+
+    branch_0 = conv2d_bn(net, 64, 1, 1)
+    branch_0 = conv2d_bn(branch_0, 96, 3, 3, border_mode='valid')
+
+    branch_1 = conv2d_bn(net, 64, 1, 1)
+    branch_1 = conv2d_bn(branch_1, 64, 1, 7)
+    branch_1 = conv2d_bn(branch_1, 64, 7, 1)
+    branch_1 = conv2d_bn(branch_1, 96, 3, 3, border_mode='valid')
+
+    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)
+
+    branch_0 = conv2d_bn(net, 192, 3, 3, subsample=(2,2), border_mode='valid')
+    branch_1 = MaxPooling2D((3,3), strides=(2,2), border_mode='valid')(net)
+
+    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)
+
+    # 35 x 35 x 384
+    # 4 x Inception-A blocks
+    for _ in range(4):  # range, not xrange: xrange does not exist on Python 3
+        net = block_inception_a(net)
+
+    # 35 x 35 x 384
+    # Reduction-A block
+    net = block_reduction_a(net)
+
+    # 17 x 17 x 1024
+    # 7 x Inception-B blocks
+    for _ in range(7):
+        net = block_inception_b(net)
+
+    # 17 x 17 x 1024
+    # Reduction-B block
+    net = block_reduction_b(net)
+
+    # 8 x 8 x 1536
+    # 3 x Inception-C blocks
+    for _ in range(3):
+        net = block_inception_c(net)
+
+    return net
+
+
+def inception_v4_model(img_rows, img_cols, color_type=1, num_classes=None, dropout_keep_prob=0.2):
+    '''
+    Inception V4 Model for Keras
+
+    Model Schema is based on
+    https://github.com/kentsommer/keras-inceptionV4
+
+    ImageNet Pretrained Weights
+    Theano: https://github.com/kentsommer/keras-inceptionV4/releases/download/2.0/inception-v4_weights_th_dim_ordering_th_kernels.h5
+    TensorFlow: https://github.com/kentsommer/keras-inceptionV4/releases/download/2.0/inception-v4_weights_tf_dim_ordering_tf_kernels.h5
+
+    Parameters:
+      img_rows, img_cols, color_type - currently unused; the input tensor is
+        fixed at 299 x 299 with 3 channels
+      num_classes - number of class labels for our classification task
+      dropout_keep_prob - passed unchanged to Dropout(), whose argument is
+        the fraction of units to drop
+    Returns: Model with a new num_classes-way softmax head, compiled with SGD
+    '''
+
+    # Input Shape is 299 x 299 x 3 (tf) or 3 x 299 x 299 (th)
+    if K.image_dim_ordering() == 'th':
+        inputs = Input((3, 299, 299))
+    else:
+        inputs = Input((299, 299, 3))
+
+    # Make inception base
+    net = inception_v4_base(inputs)
+
+    # Final pooling and prediction
+    # 8 x 8 x 1536
+    net_old = AveragePooling2D((8,8), border_mode='valid')(net)
+    # 1 x 1 x 1536
+    net_old = Dropout(dropout_keep_prob)(net_old)
+    net_old = Flatten()(net_old)
+
+    # 1536
+    predictions = Dense(output_dim=1001, activation='softmax')(net_old)
+
+    model = Model(inputs, predictions, name='inception_v4')
+
+    if K.image_dim_ordering() == 'th':
+        # Use pre-trained weights for Theano backend
+        weights_path = 'imagenet_models/inception-v4_weights_th_dim_ordering_th_kernels.h5'
+    else:
+        # Use pre-trained weights for Tensorflow backend
+        weights_path = 'imagenet_models/inception-v4_weights_tf_dim_ordering_tf_kernels.h5'
+
+    model.load_weights(weights_path, by_name=True)
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    net_ft = AveragePooling2D((8,8), border_mode='valid')(net)
+    net_ft = Dropout(dropout_keep_prob)(net_ft)
+    net_ft = Flatten()(net_ft)
+    predictions_ft = Dense(output_dim=num_classes, activation='softmax')(net_ft)
+
+    model = Model(inputs, predictions_ft, name='inception_v4')
+
+    # Learning rate is changed to 0.001
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return model
+
+if __name__ == '__main__':
+    # Example to fine-tune on 3000 samples from Cifar10
+
+    img_rows, img_cols = 299, 299  # Resolution of inputs
+    channel = 3
+    num_classes = 10
+    batch_size = 16
+    nb_epoch = 10
+
+    # Load Cifar10 data. Please implement your own load_data() module for your own dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Load our model
+    model = inception_v4_model(img_rows, img_cols, channel, num_classes, dropout_keep_prob=0.2)
+
+    # Start Fine-tuning
+    model.fit(X_train, Y_train,
+              batch_size=batch_size,
+              nb_epoch=nb_epoch,
+              shuffle=True,
+              verbose=1,
+              validation_data=(X_valid, Y_valid),
+              )
+
+    # Make predictions
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy loss score; print it so the example reports its result
+    score = log_loss(Y_valid, predictions_valid)
+    print('Validation log loss: %s' % score)
diff --git a/keras_contrib/applications/resnet_101.py b/keras_contrib/applications/resnet_101.py
new file mode 100644
index 000000000..b757bd250
--- /dev/null
+++ b/keras_contrib/applications/resnet_101.py
@@ -0,0 +1,206 @@
+# -*- coding: utf-8 -*-
+
+from keras.models import Sequential
+from keras.optimizers import SGD
+from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+from keras import backend as K
+
+from sklearn.metrics import log_loss
+
+from custom_layers.scale_layer import Scale
+
+from load_cifar10 import load_cifar10_data
+
+import sys
+sys.setrecursionlimit(3000)
+
+def identity_block(input_tensor, kernel_size, filters, stage, block):
+    '''The identity_block is the block that has no conv layer at shortcut
+    # Arguments
+        input_tensor: input tensor
+        kernel_size: default 3, the kernel size of middle conv layer at main path
+        filters: list of integers, the nb_filters of 3 conv layer at main path
+        stage, block: integer stage and 'a','b'... block label, used for layer names
+    # Returns: the ReLU-activated sum of the main path and `input_tensor`
+    '''
+    eps = 1.1e-5
+    bn_axis = 3 if K.image_dim_ordering() == 'tf' else 1  # computed here so no module global is needed
+    nb_filter1, nb_filter2, nb_filter3 = filters
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+    scale_name_base = 'scale' + str(stage) + block + '_branch'
+
+    x = Convolution2D(nb_filter1, 1, 1, name=conv_name_base + '2a', bias=False)(input_tensor)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
+    x = Activation('relu', name=conv_name_base + '2a_relu')(x)
+
+    x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
+    x = Convolution2D(nb_filter2, kernel_size, kernel_size,
+                      name=conv_name_base + '2b', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
+    x = Activation('relu', name=conv_name_base + '2b_relu')(x)
+
+    x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
+    x = merge([x, input_tensor], mode='sum', name='res' + str(stage) + block)
+    x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
+    return x
+
+def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
+    '''conv_block is the block that has a conv layer at shortcut
+    # Arguments
+        input_tensor: input tensor
+        kernel_size: default 3, the kernel size of middle conv layer at main path
+        filters: list of integers, the nb_filters of 3 conv layer at main path
+        stage, block: integer stage and 'a','b'... block label, used for layer names
+        strides: subsample of the first main-path conv and of the shortcut conv
+    Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
+    And the shortcut should have subsample=(2,2) as well
+    '''
+    eps = 1.1e-5
+    bn_axis = 3 if K.image_dim_ordering() == 'tf' else 1  # computed here so no module global is needed
+    nb_filter1, nb_filter2, nb_filter3 = filters
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+    scale_name_base = 'scale' + str(stage) + block + '_branch'
+
+    x = Convolution2D(nb_filter1, 1, 1, subsample=strides,
+                      name=conv_name_base + '2a', bias=False)(input_tensor)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
+    x = Activation('relu', name=conv_name_base + '2a_relu')(x)
+
+    x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
+    x = Convolution2D(nb_filter2, kernel_size, kernel_size,
+                      name=conv_name_base + '2b', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
+    x = Activation('relu', name=conv_name_base + '2b_relu')(x)
+
+    x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
+
+    shortcut = Convolution2D(nb_filter3, 1, 1, subsample=strides,
+                             name=conv_name_base + '1', bias=False)(input_tensor)
+    shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut)
+    shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut)
+    x = merge([x, shortcut], mode='sum', name='res' + str(stage) + block)
+    x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
+    return x
+
+def resnet101_model(img_rows, img_cols, color_type=1, num_classes=None):
+    """
+    Resnet 101 Model for Keras, set up for fine-tuning
+
+    Model Schema and layer naming follow that of the original Caffe implementation
+    https://github.com/KaimingHe/deep-residual-networks
+
+    ImageNet Pretrained Weights
+    Theano: https://drive.google.com/file/d/0Byy2AcGyEVxfdUV1MHJhelpnSG8/view?usp=sharing
+    TensorFlow: https://drive.google.com/file/d/0Byy2AcGyEVxfTmRRVmpGWDczaXM/view?usp=sharing
+    Parameters:
+      img_rows, img_cols - resolution of inputs
+      color_type - number of input channels (1 for grayscale, 3 for color)
+      num_classes - number of class labels for our classification task
+    Returns the Model with a new num_classes-way softmax, compiled with SGD.
+    """
+    eps = 1.1e-5
+
+    # Handle Dimension Ordering for different backends (bn_axis is set as a module global)
+    global bn_axis
+    if K.image_dim_ordering() == 'tf':
+      bn_axis = 3
+      img_input = Input(shape=(img_rows, img_cols, color_type), name='data')
+    else:
+      bn_axis = 1
+      img_input = Input(shape=(color_type, img_rows, img_cols), name='data')
+
+    x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
+    x = Convolution2D(64, 7, 7, subsample=(2, 2), name='conv1', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x)
+    x = Scale(axis=bn_axis, name='scale_conv1')(x)
+    x = Activation('relu', name='conv1_relu')(x)
+    x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1')(x)
+
+    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
+
+    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
+    for i in range(1,3):
+      x = identity_block(x, 3, [128, 128, 512], stage=3, block='b'+str(i))
+
+    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
+    for i in range(1,23):
+      x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b'+str(i))
+
+    x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
+    # Original 1000-way head; the model is rebuilt without it after the weights load
+    x_fc = AveragePooling2D((7, 7), name='avg_pool')(x)
+    x_fc = Flatten()(x_fc)
+    x_fc = Dense(1000, activation='softmax', name='fc1000')(x_fc)
+
+    model = Model(img_input, x_fc)
+
+    if K.image_dim_ordering() == 'th':
+      # Use pre-trained weights for Theano backend
+      weights_path = 'imagenet_models/resnet101_weights_th.h5'
+    else:
+      # Use pre-trained weights for Tensorflow backend
+      weights_path = 'imagenet_models/resnet101_weights_tf.h5'
+
+    model.load_weights(weights_path, by_name=True)
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    x_newfc = AveragePooling2D((7, 7), name='avg_pool')(x)
+    x_newfc = Flatten()(x_newfc)
+    x_newfc = Dense(num_classes, activation='softmax', name='fc8')(x_newfc)
+
+    model = Model(img_input, x_newfc)
+
+    # Learning rate is changed to 0.001
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return model
+
+if __name__ == '__main__':
+    # Example to fine-tune on 3000 samples from Cifar10
+
+    img_rows, img_cols = 224, 224  # Resolution of inputs
+    channel = 3
+    num_classes = 10
+    batch_size = 16
+    nb_epoch = 10
+
+    # Load Cifar10 data. Please implement your own load_data() module for your own dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Load our model
+    model = resnet101_model(img_rows, img_cols, channel, num_classes)
+
+    # Start Fine-tuning
+    model.fit(X_train, Y_train,
+              batch_size=batch_size,
+              nb_epoch=nb_epoch,
+              shuffle=True,
+              verbose=1,
+              validation_data=(X_valid, Y_valid),
+              )
+
+    # Make predictions
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy loss score; print it so the example reports its result
+    score = log_loss(Y_valid, predictions_valid)
+    print('Validation log loss: %s' % score)
diff --git a/keras_contrib/applications/resnet_152.py b/keras_contrib/applications/resnet_152.py
new file mode 100644
index 000000000..0662e37c4
--- /dev/null
+++ b/keras_contrib/applications/resnet_152.py
@@ -0,0 +1,206 @@
+# -*- coding: utf-8 -*-
+
+from keras.models import Sequential
+from keras.optimizers import SGD
+from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+from keras import backend as K
+
+from sklearn.metrics import log_loss
+
+from custom_layers.scale_layer import Scale
+
+from load_cifar10 import load_cifar10_data
+
+import sys
+sys.setrecursionlimit(3000)
+
+def identity_block(input_tensor, kernel_size, filters, stage, block):
+    '''The identity_block is the block that has no conv layer at shortcut
+    # Arguments
+        input_tensor: input tensor
+        kernel_size: default 3, the kernel size of middle conv layer at main path
+        filters: list of integers, the nb_filters of 3 conv layer at main path
+        stage, block: integer stage and 'a','b'... block label, used for layer names
+    # Returns: the ReLU-activated sum of the main path and `input_tensor`
+    '''
+    eps = 1.1e-5
+    bn_axis = 3 if K.image_dim_ordering() == 'tf' else 1  # computed here so no module global is needed
+    nb_filter1, nb_filter2, nb_filter3 = filters
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+    scale_name_base = 'scale' + str(stage) + block + '_branch'
+
+    x = Convolution2D(nb_filter1, 1, 1, name=conv_name_base + '2a', bias=False)(input_tensor)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
+    x = Activation('relu', name=conv_name_base + '2a_relu')(x)
+
+    x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
+    x = Convolution2D(nb_filter2, kernel_size, kernel_size,
+                      name=conv_name_base + '2b', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
+    x = Activation('relu', name=conv_name_base + '2b_relu')(x)
+
+    x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
+    x = merge([x, input_tensor], mode='sum', name='res' + str(stage) + block)
+    x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
+    return x
+
+def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
+    '''conv_block is the block that has a conv layer at shortcut
+    # Arguments
+        input_tensor: input tensor
+        kernel_size: default 3, the kernel size of middle conv layer at main path
+        filters: list of integers, the nb_filters of 3 conv layer at main path
+        stage, block: integer stage and 'a','b'... block label, used for layer names
+        strides: subsample of the first main-path conv and of the shortcut conv
+    Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
+    And the shortcut should have subsample=(2,2) as well
+    '''
+    eps = 1.1e-5
+    bn_axis = 3 if K.image_dim_ordering() == 'tf' else 1  # computed here so no module global is needed
+    nb_filter1, nb_filter2, nb_filter3 = filters
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+    scale_name_base = 'scale' + str(stage) + block + '_branch'
+
+    x = Convolution2D(nb_filter1, 1, 1, subsample=strides,
+                      name=conv_name_base + '2a', bias=False)(input_tensor)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
+    x = Activation('relu', name=conv_name_base + '2a_relu')(x)
+
+    x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
+    x = Convolution2D(nb_filter2, kernel_size, kernel_size,
+                      name=conv_name_base + '2b', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
+    x = Activation('relu', name=conv_name_base + '2b_relu')(x)
+
+    x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
+    x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
+
+    shortcut = Convolution2D(nb_filter3, 1, 1, subsample=strides,
+                             name=conv_name_base + '1', bias=False)(input_tensor)
+    shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut)
+    shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut)
+    x = merge([x, shortcut], mode='sum', name='res' + str(stage) + block)
+    x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
+    return x
+
+def resnet152_model(img_rows, img_cols, color_type=1, num_classes=None):
+    """
+    Resnet 152 Model for Keras, set up for fine-tuning
+
+    Model Schema and layer naming follow that of the original Caffe implementation
+    https://github.com/KaimingHe/deep-residual-networks
+
+    ImageNet Pretrained Weights
+    Theano: https://drive.google.com/file/d/0Byy2AcGyEVxfZHhUT3lWVWxRN28/view?usp=sharing
+    TensorFlow: https://drive.google.com/file/d/0Byy2AcGyEVxfeXExMzNNOHpEODg/view?usp=sharing
+    Parameters:
+      img_rows, img_cols - resolution of inputs
+      color_type - number of input channels (1 for grayscale, 3 for color)
+      num_classes - number of class labels for our classification task
+    Returns the Model with a new num_classes-way softmax, compiled with SGD.
+    """
+    eps = 1.1e-5
+
+    # Handle Dimension Ordering for different backends (bn_axis is set as a module global)
+    global bn_axis
+    if K.image_dim_ordering() == 'tf':
+      bn_axis = 3
+      img_input = Input(shape=(img_rows, img_cols, color_type), name='data')
+    else:
+      bn_axis = 1
+      img_input = Input(shape=(color_type, img_rows, img_cols), name='data')
+
+    x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
+    x = Convolution2D(64, 7, 7, subsample=(2, 2), name='conv1', bias=False)(x)
+    x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x)
+    x = Scale(axis=bn_axis, name='scale_conv1')(x)
+    x = Activation('relu', name='conv1_relu')(x)
+    x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1')(x)
+
+    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
+
+    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
+    for i in range(1,8):
+      x = identity_block(x, 3, [128, 128, 512], stage=3, block='b'+str(i))
+
+    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
+    for i in range(1,36):
+      x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b'+str(i))
+
+    x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
+    # Original 1000-way head; the model is rebuilt without it after the weights load
+    x_fc = AveragePooling2D((7, 7), name='avg_pool')(x)
+    x_fc = Flatten()(x_fc)
+    x_fc = Dense(1000, activation='softmax', name='fc1000')(x_fc)
+
+    model = Model(img_input, x_fc)
+
+    if K.image_dim_ordering() == 'th':
+      # Use pre-trained weights for Theano backend
+      weights_path = 'imagenet_models/resnet152_weights_th.h5'
+    else:
+      # Use pre-trained weights for Tensorflow backend
+      weights_path = 'imagenet_models/resnet152_weights_tf.h5'
+
+    model.load_weights(weights_path, by_name=True)
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    x_newfc = AveragePooling2D((7, 7), name='avg_pool')(x)
+    x_newfc = Flatten()(x_newfc)
+    x_newfc = Dense(num_classes, activation='softmax', name='fc8')(x_newfc)
+
+    model = Model(img_input, x_newfc)
+
+    # Learning rate is changed to 0.001
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return model
+
+if __name__ == '__main__':
+    # Example to fine-tune on 3000 samples from Cifar10
+
+    img_rows, img_cols = 224, 224  # Resolution of inputs
+    channel = 3
+    num_classes = 10
+    batch_size = 8
+    nb_epoch = 10
+
+    # Load Cifar10 data. Please implement your own load_data() module for your own dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Load our model
+    model = resnet152_model(img_rows, img_cols, channel, num_classes)
+
+    # Start Fine-tuning
+    model.fit(X_train, Y_train,
+              batch_size=batch_size,
+              nb_epoch=nb_epoch,
+              shuffle=True,
+              verbose=1,
+              validation_data=(X_valid, Y_valid),
+              )
+
+    # Make predictions
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy loss score; print it so the example reports its result
+    score = log_loss(Y_valid, predictions_valid)
+    print('Validation log loss: %s' % score)
diff --git a/keras_contrib/applications/resnet_50.py b/keras_contrib/applications/resnet_50.py
new file mode 100644
index 000000000..2889a7bdf
--- /dev/null
+++ b/keras_contrib/applications/resnet_50.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+
+from keras.models import Sequential
+from keras.optimizers import SGD
+from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
+from keras.layers.normalization import BatchNormalization
+from keras.models import Model
+from keras import backend as K
+
+from sklearn.metrics import log_loss
+
+from load_cifar10 import load_cifar10_data
+
+def identity_block(input_tensor, kernel_size, filters, stage, block):
+    """
+    Residual block with no conv layer at shortcut: returns relu(main path output + input_tensor).
+    # Arguments
+        input_tensor: input tensor; it is summed unchanged with the main path output
+        kernel_size: kernel size of the middle conv layer at main path (callers in this file pass 3)
+        filters: list of 3 integers, the nb_filters of the 3 conv layers at main path
+        stage: integer, current stage label, used for generating layer names
+        block: 'a','b'..., current block label, used for generating layer names
+    """
+    bn_axis = 3 if K.image_dim_ordering() == 'tf' else 1  # resolved here, not read from a module global
+    nb_filter1, nb_filter2, nb_filter3 = filters
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+    x = Convolution2D(nb_filter1, 1, 1, name=conv_name_base + '2a')(input_tensor)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Activation('relu')(x)
+
+    x = Convolution2D(nb_filter2, kernel_size, kernel_size,
+                      border_mode='same', name=conv_name_base + '2b')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Activation('relu')(x)
+
+    x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
+
+    x = merge([x, input_tensor], mode='sum')  # shortcut is the untouched input
+    x = Activation('relu')(x)
+    return x
+
+def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
+    """
+    Residual block with a conv + batch-norm layer at shortcut: returns relu(main path output + shortcut).
+    # Arguments
+        input_tensor: input tensor
+        kernel_size: kernel size of the middle conv layer at main path (callers in this file pass 3)
+        filters: list of 3 integers, the nb_filters of the 3 conv layers at main path
+        stage: integer, current stage label, used for generating layer names
+        block: 'a','b'..., current block label, used for generating layer names
+        strides: subsample of the first main-path conv and of the shortcut conv, default (2, 2)
+    Note that both paths receive the same subsample; resnet50_model passes (1, 1) for stage 2 only.
+    """
+    bn_axis = 3 if K.image_dim_ordering() == 'tf' else 1  # resolved here, not read from a module global
+    nb_filter1, nb_filter2, nb_filter3 = filters
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+    x = Convolution2D(nb_filter1, 1, 1, subsample=strides,
+                      name=conv_name_base + '2a')(input_tensor)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Activation('relu')(x)
+
+    x = Convolution2D(nb_filter2, kernel_size, kernel_size, border_mode='same',
+                      name=conv_name_base + '2b')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Activation('relu')(x)
+
+    x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
+
+    shortcut = Convolution2D(nb_filter3, 1, 1, subsample=strides,
+                             name=conv_name_base + '1')(input_tensor)
+    shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
+
+    x = merge([x, shortcut], mode='sum')  # shortcut was projected to nb_filter3 channels above
+    x = Activation('relu')(x)
+    return x
+
+def resnet50_model(img_rows, img_cols, color_type=1, num_classes=None):
+    """
+    Build a ResNet-50, load ImageNet weights into it, then return a compiled model with a new softmax layer.
+
+    Model Schema is based on
+    https://github.com/fchollet/deep-learning-models/blob/master/resnet50.py
+
+    ImageNet Pretrained Weights (this function reads them from a local imagenet_models/ directory)
+    https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_th_dim_ordering_th_kernels.h5
+
+    Parameters:
+      img_rows, img_cols - resolution of inputs
+      color_type - input channels: 1 for grayscale, 3 for color (NOTE(review): confirm the weights load when this is 1)
+      num_classes - number of class labels; passed straight to the new Dense layer (the None default is not handled)
+    """
+
+    # Handle Dimension Ordering for different backends
+    global bn_axis  # module-level name; code outside this function can read the axis chosen here
+    if K.image_dim_ordering() == 'tf':
+      bn_axis = 3
+      img_input = Input(shape=(img_rows, img_cols, color_type))
+    else:
+      bn_axis = 1
+      img_input = Input(shape=(color_type, img_rows, img_cols))
+
+    x = ZeroPadding2D((3, 3))(img_input)
+    x = Convolution2D(64, 7, 7, subsample=(2, 2), name='conv1')(x)
+    x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
+    x = Activation('relu')(x)
+    x = MaxPooling2D((3, 3), strides=(2, 2))(x)
+
+    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))  # only stage 2 overrides the (2, 2) default
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
+
+    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
+    x = identity_block(x, 3, [128, 128, 512], stage=3, block='b')
+    x = identity_block(x, 3, [128, 128, 512], stage=3, block='c')
+    x = identity_block(x, 3, [128, 128, 512], stage=3, block='d')
+
+    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f')
+
+    x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
+
+    # Fully Connected Softmax Layer: the original 1000-class head, built so weights are loaded into the full model
+    x_fc = AveragePooling2D((7, 7), name='avg_pool')(x)
+    x_fc = Flatten()(x_fc)
+    x_fc = Dense(1000, activation='softmax', name='fc1000')(x_fc)
+
+    # Create model
+    model = Model(img_input, x_fc)
+
+    # Load ImageNet pre-trained weights; the file name depends on the dimension ordering
+    if K.image_dim_ordering() == 'th':
+      # Use pre-trained weights for Theano backend
+      weights_path = 'imagenet_models/resnet50_weights_th_dim_ordering_th_kernels.h5'
+    else:
+      # Use pre-trained weights for Tensorflow backend
+      weights_path = 'imagenet_models/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
+
+    model.load_weights(weights_path)  # NOTE(review): no by_name here, unlike the ResNet-152 loader in this patch
+
+    # Truncate and replace softmax layer for transfer learning
+    # Cannot use model.layers.pop() since model is not of Sequential() type
+    # The method below works since pre-trained weights are stored in layers but not in the model
+    x_newfc = AveragePooling2D((7, 7), name='avg_pool')(x)
+    x_newfc = Flatten()(x_newfc)
+    x_newfc = Dense(num_classes, activation='softmax', name='fc10')(x_newfc)  # NOTE(review): name is 'fc10' for any num_classes
+
+    # Create another model with our customized softmax
+    model = Model(img_input, x_newfc)
+
+    # Learning rate is changed to 0.001
+    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
+    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
+  
+    return model
+
+if __name__ == '__main__':
+    # Example to fine-tune on 3000 samples from Cifar10
+
+    img_rows, img_cols = 224, 224  # Resolution of inputs
+    channel = 3
+    num_classes = 10
+    batch_size = 16
+    nb_epoch = 10
+
+    # Load Cifar10 data. Please implement your own load_data() module for your own dataset
+    X_train, Y_train, X_valid, Y_valid = load_cifar10_data(img_rows, img_cols)
+
+    # Load our model
+    model = resnet50_model(img_rows, img_cols, channel, num_classes)
+
+    # Start Fine-tuning
+    model.fit(X_train, Y_train,
+              batch_size=batch_size,
+              nb_epoch=nb_epoch,
+              shuffle=True,
+              verbose=1,
+              validation_data=(X_valid, Y_valid),
+              )
+
+    # Make predictions
+    predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
+
+    # Cross-entropy loss score; print it so the run does not end without reporting its result
+    score = log_loss(Y_valid, predictions_valid)
+    print('Validation log loss: %s' % score)
+