diff --git a/docs/src/tutorial/mnist.md b/docs/src/tutorial/mnist.md index 3047a9c1d3ca..096d7dd0310f 100644 --- a/docs/src/tutorial/mnist.md +++ b/docs/src/tutorial/mnist.md @@ -28,7 +28,7 @@ data = mx.Variable(:data) and then cascading fully-connected layers and activation functions: -``` {.sourceCode .julia} +```julia fc1 = mx.FullyConnected(data = data, name=:fc1, num_hidden=128) act1 = mx.Activation(data = fc1, name=:relu1, act_type=:relu) fc2 = mx.FullyConnected(data = act1, name=:fc2, num_hidden=64) diff --git a/docs/src/user-guide/overview.md b/docs/src/user-guide/overview.md index 85814cdc63fe..9a7d8e514894 100644 --- a/docs/src/user-guide/overview.md +++ b/docs/src/user-guide/overview.md @@ -59,7 +59,7 @@ The followings are common ways to create NDArray objects: - `mx.empty(shape[, context])`: create on uninitialized array of a given shape on a specific device. For example, - `` mx.empty(2,3)`, `mx.((2,3), mx.gpu(2)) ``. + `mx.empty(2,3)`, `mx.empty((2,3), mx.gpu(2))`. - `mx.zeros(shape[, context])` and `mx.ones(shape[, context])`: similar to the Julia's built-in `zeros` and `ones`. - `mx.copy(jl_arr, context)`: copy the contents of a Julia `Array` to diff --git a/src/callback.jl b/src/callback.jl index c3e1d299403e..6203f9f51f87 100644 --- a/src/callback.jl +++ b/src/callback.jl @@ -28,7 +28,7 @@ end """ every_n_batch(callback :: Function, n :: Int; call_on_0 = false) -A convenient function to construct a callback that runs every ``n`` mini-batches. +A convenient function to construct a callback that runs every `n` mini-batches. # Arguments * `call_on_0::Bool`: keyword argument, default false. Unless set, the callback @@ -64,7 +64,7 @@ end """ speedometer(; frequency=50) -Create an :class:`AbstractBatchCallback` that measure the training speed +Create an `AbstractBatchCallback` that measures the training speed (number of samples processed per second) every k mini-batches. # Arguments @@ -95,7 +95,7 @@ end """ every_n_epoch(callback :: Function, n :: Int; call_on_0 = false) -A convenient function to construct a callback that runs every ``n`` full data-passes. +A convenient function to construct a callback that runs every `n` full data-passes. * Int call_on_0: keyword argument, default false. Unless set, the callback will **not** be run on epoch 0. Epoch 0 means no training has been performed @@ -120,7 +120,7 @@ end """ do_checkpoint(prefix; frequency=1, save_epoch_0=false) -Create an :class:`AbstractEpochCallback` that save checkpoints of the model to disk. +Create an `AbstractEpochCallback` that saves checkpoints of the model to disk. The checkpoints can be loaded back later on. # Arguments diff --git a/src/context.jl b/src/context.jl index 908d542f5ba3..410a80ca8b4c 100644 --- a/src/context.jl +++ b/src/context.jl @@ -19,7 +19,7 @@ end """ cpu(dev_id) -Get a CPU context with a specific id. ``cpu()`` is usually the default context for many +Get a CPU context with a specific id. `cpu()` is usually the default context for many operations when no context is specified. # Arguments diff --git a/src/executor.jl b/src/executor.jl index 0bdccc942d22..8e8bdd3bec93 100644 --- a/src/executor.jl +++ b/src/executor.jl @@ -1,7 +1,7 @@ """ Executor -An executor is a realization of a symbolic architecture defined by a :class:`SymbolicNode`. +An executor is a realization of a symbolic architecture defined by a `SymbolicNode`. The actual forward and backward computation specified by the network architecture can be carried out with an executor.
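+
+For illustration only, a minimal hypothetical sketch of obtaining an executor by binding a
+tiny network (the argument names and shapes below are made up for the example and follow
+Julia's column-major convention):
+
+```julia
+data = mx.Variable(:data)
+net  = mx.FullyConnected(data = data, name = :fc1, num_hidden = 10)
+# one NDArray must be supplied for every argument reported by list_arguments(net)
+exec = mx.bind(net, mx.cpu(),
+               Dict(:data       => mx.zeros(4, 8),
+                    :fc1_weight => mx.zeros(4, 10),
+                    :fc1_bias   => mx.zeros(10)))
+```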
""" @@ -68,12 +68,12 @@ end """ bind(sym, ctx, args; args_grad=Dict(), aux_states=Dict(), grad_req=GRAD_WRITE) -Create an :class:`Executor` by binding a :class:`SymbolicNode` to concrete :class:`NDArray`. +Create an `Executor` by binding a `SymbolicNode` to concrete `NDArray`. # Arguments * `sym::SymbolicNode`: the network architecture describing the computation graph. * `ctx::Context`: the context on which the computation should run. -* `args`: either a list of :class:`NDArray` or a dictionary of name-array pairs. Concrete +* `args`: either a list of `NDArray` or a dictionary of name-array pairs. Concrete arrays for all the inputs in the network architecture. The inputs typically include network parameters (weights, bias, filters, etc.), data and labels. See :func:`list_arguments` and :func:`infer_shape`. diff --git a/src/initializer.jl b/src/initializer.jl index 7ee9920a9d12..dacb06f349c3 100644 --- a/src/initializer.jl +++ b/src/initializer.jl @@ -64,7 +64,7 @@ end """ UniformInitializer(scale=0.07) -Construct a :class:`UniformInitializer` with the specified scale. +Construct a `UniformInitializer` with the specified scale. """ UniformInitializer() = UniformInitializer(0.07) @@ -84,7 +84,7 @@ end """ NormalIninitializer(; mu=0, sigma=0.01) -Construct a :class:`NormalInitializer` with mean ``mu`` and variance ``sigma``. +Construct a `NormalInitializer` with mean `mu` and variance `sigma`. """ NormalInitializer(; mu=0, sigma=0.01) = NormalInitializer(mu, sigma) @@ -106,9 +106,9 @@ a normal distribution with μ = 0 and σ² or a uniform distribution from -σ to Several different ways of calculating the variance are given in the literature or are used by various libraries. -* [Bengio and Glorot 2010]: ``mx.XavierInitializer(distribution = mx.xv_uniform, regularization = mx.xv_avg, magnitude = 1)`` -* [K. He, X. Zhang, S. Ren, and J. Sun 2015]: ``mx.XavierInitializer(distribution = mx.xv_gaussian, regularization = mx.xv_in, magnitude = 2)`` -* caffe_avg: ``mx.XavierInitializer(distribution = mx.xv_uniform, regularization = mx.xv_avg, magnitude = 3)`` +* [Bengio and Glorot 2010]: `mx.XavierInitializer(distribution = mx.xv_uniform, regularization = mx.xv_avg, magnitude = 1)` +* [K. He, X. Zhang, S. Ren, and J. Sun 2015]: `mx.XavierInitializer(distribution = mx.xv_gaussian, regularization = mx.xv_in, magnitude = 2)` +* caffe_avg: `mx.XavierInitializer(distribution = mx.xv_uniform, regularization = mx.xv_avg, magnitude = 3)` """ @enum XavierDistribution xv_uniform xv_normal diff --git a/src/io.jl b/src/io.jl index f240388ffbda..56b463a1a6c3 100644 --- a/src/io.jl +++ b/src/io.jl @@ -25,7 +25,7 @@ The root type for all data provider. A data provider should implement the follow training stage, both *data* and *label* will be feeded into the model, while during prediction stage, only *data* is loaded. Otherwise, they could be anything, with any names, and of any shapes. The provided data and label names here should match the input names in a target - :class:`SymbolicNode`. + `SymbolicNode`. A data provider should also implement the Julia iteration interface, in order to allow iterating through the data set. The provider will be called in the following way: @@ -48,7 +48,7 @@ The root type for all data provider. A data provider should implement the follow By default, :func:`eachbatch` simply returns the provider itself, so the iterator interface is implemented on the provider type itself. But the extra layer of abstraction allows us to - implement a data provider easily via a Julia ``Task`` coroutine. 
See the data provider defined in :doc:`the char-lstm example ` for an example of using coroutine to define data providers. @@ -58,7 +58,7 @@ The detailed interface functions for the iterator API is listed below: Base.eltype(provider) -> AbstractDataBatch :param AbstractDataProvider provider: the data provider. - :return: the specific subtype representing a data batch. See :class:`AbstractDataBatch`. + :return: the specific subtype representing a data batch. See `AbstractDataBatch`. Base.start(provider) -> AbstractDataProviderState @@ -91,7 +91,7 @@ case, you can safely assume that not be called. With those assumptions, it will be relatively easy to adapt any existing iterator. See the implementation -of the built-in :class:`MXDataProvider` for example. +of the built-in `MXDataProvider` for an example. .. caution:: @@ -137,7 +137,7 @@ abstract AbstractDataProviderState :return: a vector of data in this batch, should be in the same order as declared in :func:`provide_data() `. - The last dimension of each :class:`NDArray` should always match the batch_size, even when + The last dimension of each `NDArray` should always match the batch_size, even when :func:`count_samples` returns a value less than the batch size. In this case, the data provider is free to pad the remaining contents with any value. @@ -167,7 +167,7 @@ abstract AbstractDataProviderState :type targets: Vector{Vector{SlicedNDArray}} The targets is a list of the same length as number of data provided by this provider. - Each element in the list is a list of :class:`SlicedNDArray`. This list described a + Each element in the list is a list of `SlicedNDArray`. This list describes a spliting scheme of this data batch into different slices, each slice is specified by a slice-ndarray pair, where *slice* specify the range of samples in the mini-batch that should be loaded into the corresponding *ndarray*. @@ -189,7 +189,7 @@ abstract AbstractDataBatch """ DataBatch - A basic subclass of :class:`AbstractDataBatch`, that implement the interface by + A basic subclass of `AbstractDataBatch` that implements the interface by accessing member fields. """ type DataBatch <: AbstractDataBatch @@ -204,7 +204,7 @@ get_label{Provider<:AbstractDataProvider}(::Provider, batch :: DataBatch) = batc """ SlicedNDArray - A alias type of ``Tuple{UnitRange{Int},NDArray}``. + An alias type of `Tuple{UnitRange{Int},NDArray}`. """ typealias SlicedNDArray Tuple{UnitRange{Int},NDArray} @@ -257,7 +257,7 @@ eachbatch(provider :: AbstractDataProvider) = provider """ ArrayDataProvider - A convenient tool to iterate :class:`NDArray` or Julia ``Array``. + A convenient tool to iterate over `NDArray` or Julia `Array`. """ type ArrayDataProvider <: AbstractDataProvider data_arrays :: Vector{Array{MX_float}} @@ -277,16 +277,16 @@ end """ ArrayDataProvider(data[, label]; batch_size, shuffle, data_padding, label_padding) - Construct a data provider from :class:`NDArray` or Julia Arrays. + Construct a data provider from `NDArray` or Julia Arrays. :param data: the data, could be - - a :class:`NDArray`, or a Julia Array. This is equivalent to ``:data => data``. - - a name-data pair, like ``:mydata => array``, where ``:mydata`` is the name of the data - and ``array`` is an :class:`NDArray` or a Julia Array. + - an `NDArray` or a Julia Array. This is equivalent to `:data => data`. + - a name-data pair, like `:mydata => array`, where `:mydata` is the name of the data + and `array` is an `NDArray` or a Julia Array.
- a list of name-data pairs. - :param label: the same as the ``data`` parameter. When this argument is omitted, the constructed + :param label: the same as the `data` parameter. When this argument is omitted, the constructed provider will provide no labels. :param Int batch_size: the batch size, default is 0, which means treating the whole array as a single mini-batch. @@ -294,9 +294,9 @@ end :param Real data_padding: when the mini-batch goes beyond the dataset boundary, there might be less samples to include than a mini-batch. This value specify a scalar to pad the contents of all the missing data points. - :param Real label_padding: the same as ``data_padding``, except for the labels. + :param Real label_padding: the same as `data_padding`, except for the labels. - TODO: remove ``data_padding`` and ``label_padding``, and implement rollover that copies + TODO: remove `data_padding` and `label_padding`, and implement rollover that copies the last or first several training samples to feed the padding. """ # Julia's type system is sometimes very frustrating. You cannot specify a function @@ -563,16 +563,16 @@ function _define_data_iter_creator(hdr :: MX_handle; gen_docs::Bool=false) if gen_docs if endswith(string(iter_name), "Iter") - f_desc = "Can also be called with the alias ``$(string(iter_name)[1:end-4] * "Provider")``.\n" + f_desc = "Can also be called with the alias `$(string(iter_name)[1:end-4] * "Provider")`.\n" else f_desc = "" end f_desc *= unsafe_string(ref_desc[]) * "\n\n" - f_desc *= ":param Base.Symbol data_name: keyword argument, default ``:data``. The name of the data.\n" - f_desc *= ":param Base.Symbol label_name: keyword argument, default ``:softmax_label``. " * - "The name of the label. Could be ``nothing`` if no label is presented in this dataset.\n\n" + f_desc *= ":param Base.Symbol data_name: keyword argument, default `:data`. The name of the data.\n" + f_desc *= ":param Base.Symbol label_name: keyword argument, default `:softmax_label`. " * + "The name of the label. Could be `nothing` if no label is presented in this dataset.\n\n" f_desc *= _format_docstring(Int(ref_narg[]), ref_arg_names, ref_arg_types, ref_arg_descs) - f_desc *= ":return: the constructed :class:`MXDataProvider`." + f_desc *= ":return: the constructed `MXDataProvider`." return (iter_name, f_desc) end diff --git a/src/metric.jl b/src/metric.jl index 21d5e4e34b14..aea8a0af628e 100644 --- a/src/metric.jl +++ b/src/metric.jl @@ -22,8 +22,8 @@ interfaces. Get the accumulated metrics. - :return: ``Vector{Tuple{Base.Symbol, Real}}``, a list of name-value pairs. For - example, ``[(:accuracy, 0.9)]``. + :return: `Vector{Tuple{Base.Symbol, Real}}`, a list of name-value pairs. For + example, `[(:accuracy, 0.9)]`. """ abstract AbstractEvalMetric diff --git a/src/model.jl b/src/model.jl index f47d49d1a82d..d6752e24ebd3 100644 --- a/src/model.jl +++ b/src/model.jl @@ -48,9 +48,9 @@ end FeedForward(arch :: SymbolicNode, ctx) * arch: the architecture of the network constructed using the symbolic API. -* ctx: the devices on which this model should do computation. It could be a single :class:`Context` - or a list of :class:`Context` objects. In the latter case, data parallelization will be used - for training. If no context is provided, the default context ``cpu()`` will be used. +* ctx: the devices on which this model should do computation. It could be a single `Context` + or a list of `Context` objects. In the latter case, data parallelization will be used + for training. 
If no context is provided, the default context `cpu()` will be used. """ function FeedForward(arch :: SymbolicNode; context :: Union{Context, Vector{Context}, Void} = nothing) if isa(context, Void) @@ -74,7 +74,7 @@ end * AbstractInitializer initializer: an initializer describing how the weights should be initialized. * Bool overwrite: keyword argument, force initialization even when weights already exists. * input_shapes: the shape of all data and label inputs to this model, given as keyword arguments. - For example, ``data=(28,28,1,100), label=(100,)``. + For example, `data=(28,28,1,100), label=(100,)`. """ function init_model(self :: FeedForward, initializer :: AbstractInitializer; overwrite::Bool=false, input_shapes...) # all arg names, including data, label, and parameters @@ -177,12 +177,12 @@ end * FeedForward self: the model. * AbstractDataProvider data: the data to perform prediction on. -* Bool overwrite: an :class:`Executor` is initialized the first time predict is called. The memory - allocation of the :class:`Executor` depends on the mini-batch size of the test +* Bool overwrite: an `Executor` is initialized the first time predict is called. The memory + allocation of the `Executor` depends on the mini-batch size of the test data provider. If you call predict twice with data provider of the same batch-size, - then the executor can be potentially be re-used. So, if ``overwrite`` is false, - we will try to re-use, and raise an error if batch-size changed. If ``overwrite`` - is true (the default), a new :class:`Executor` will be created to replace the old one. + then the executor can be potentially be re-used. So, if `overwrite` is false, + we will try to re-use, and raise an error if batch-size changed. If `overwrite` + is true (the default), a new `Executor` will be created to replace the old one. .. note:: @@ -196,9 +196,9 @@ end .. note:: - If you perform further after prediction. The weights are not automatically synchronized if ``overwrite`` + If you perform further after prediction. The weights are not automatically synchronized if `overwrite` is set to false and the old predictor is re-used. In this case - setting ``overwrite`` to true (the default) will re-initialize the predictor the next time you call + setting `overwrite` to true (the default) will re-initialize the predictor the next time you call predict and synchronize the weights again. :seealso: :func:`train`, :func:`fit`, :func:`init_model`, :func:`load_checkpoint` @@ -319,28 +319,28 @@ end """ fit(model :: FeedForward, optimizer, data; kwargs...) -Train the ``model`` on ``data`` with the ``optimizer``. +Train the `model` on `data` with the `optimizer`. * FeedForward model: the model to be trained. * AbstractOptimizer optimizer: the optimization algorithm to use. * AbstractDataProvider data: the training data provider. * Int n_epoch: default 10, the number of full data-passes to run. -* AbstractDataProvider eval_data: keyword argument, default ``nothing``. The data provider for +* AbstractDataProvider eval_data: keyword argument, default `nothing`. The data provider for the validation set. -* AbstractEvalMetric eval_metric: keyword argument, default ``Accuracy()``. The metric used - to evaluate the training performance. If ``eval_data`` is provided, the same metric is also +* AbstractEvalMetric eval_metric: keyword argument, default `Accuracy()`. The metric used + to evaluate the training performance. If `eval_data` is provided, the same metric is also calculated on the validation set. 
-* kvstore: keyword argument, default ``:local``. The key-value store used to synchronize gradients +* kvstore: keyword argument, default `:local`. The key-value store used to synchronize gradients and parameters when multiple devices are used for training. - :type kvstore: :class:`KVStore` or ``Base.Symbol`` -* AbstractInitializer initializer: keyword argument, default ``UniformInitializer(0.01)``. + :type kvstore: `KVStore` or `Base.Symbol` +* AbstractInitializer initializer: keyword argument, default `UniformInitializer(0.01)`. * Bool force_init: keyword argument, default false. By default, the random initialization using the - provided ``initializer`` will be skipped if the model weights already exists, maybe from a previous + provided `initializer` will be skipped if the model weights already exists, maybe from a previous call to :func:`train` or an explicit call to :func:`init_model` or :func:`load_checkpoint`. When this option is set, it will always do random initialization at the begining of training. -* callbacks: keyword argument, default ``[]``. Callbacks to be invoked at each epoch or mini-batch, - see :class:`AbstractCallback`. - :type callbacks: ``Vector{AbstractCallback}`` +* callbacks: keyword argument, default `[]`. Callbacks to be invoked at each epoch or mini-batch, + see `AbstractCallback`. + :type callbacks: `Vector{AbstractCallback}` """ function fit(self :: FeedForward, optimizer :: AbstractOptimizer, data :: AbstractDataProvider; kwargs...) opts = TrainingOptions(; kwargs...) diff --git a/src/ndarray.jl b/src/ndarray.jl index 49555b50f09b..14fdee01564f 100644 --- a/src/ndarray.jl +++ b/src/ndarray.jl @@ -70,13 +70,10 @@ end """ NDArray - Wrapper of the ``NDArray`` type in ``libmxnet``. This is the basic building block - of tensor-based computation. - - .. _ndarray-shape-note: - - .. note:: +Wrapper of the `NDArray` type in `libmxnet`. This is the basic building block +of tensor-based computation. +!!! note since C/C++ use row-major ordering for arrays while Julia follows a column-major ordering. To keep things consistent, we keep the underlying data in their original layout, but use *language-native* convention when we talk @@ -113,7 +110,7 @@ Base.cconvert(t::Type{MX_handle}, obj::NDArray) = Base.unsafe_convert(t, obj) """ context(arr :: NDArray) - Get the context that this :class:`NDArray` lives on. +Get the context that this `NDArray` lives on. """ function context(arr :: NDArray) ref_typeid = Ref{Cint}(0) @@ -130,7 +127,7 @@ end empty(DType, shape :: Tuple) empty(DType, dim1, dim2, ...) - Allocate memory for an uninitialized :class:`NDArray` with a specified type. +Allocate memory for an uninitialized `NDArray` with a specified type. """ function empty{N,T<:DType}(::Type{T}, shape :: NTuple{N, Int}) empty(T, shape, cpu()) @@ -148,7 +145,7 @@ end empty(shape :: Tuple) empty(dim1, dim2, ...) - Allocate memory for an uninitialized :class:`NDArray` with specific shape of type Float32. +Allocate memory for an uninitialized `NDArray` with specific shape of type Float32. """ function empty{N}(shape :: NTuple{N, Int}) empty(shape, cpu()) @@ -165,7 +162,7 @@ end zeros(DType, shape :: Tuple) zeros(DType, dim1, dim2, ...) -Create zero-ed :class:`NDArray` with specific shape and type +Create zero-ed `NDArray` with specific shape and type """ function zeros{N,T<:DType}(:: Type{T}, shape :: NTuple{N, Int}) zeros(T, shape, cpu()) @@ -184,7 +181,7 @@ end zeros(shape :: Tuple) zeros(dim1, dim2, ...) -Create zero-ed :class:`NDArray` with specific shape. 
+Create zero-ed `NDArray` with specific shape. """ function zeros{N}(shape :: NTuple{N, Int}) zeros(shape, cpu()) @@ -203,7 +200,7 @@ end ones(DType, shape :: Tuple) ones(DType, dim1, dim2, ...) -Create an :class:`NDArray` with specific shape & type, and initialize with 1. +Create an `NDArray` with specific shape & type, and initialize with 1. """ function ones{N,T<:DType}(:: Type{T}, shape :: NTuple{N, Int}) ones(T, shape, cpu()) @@ -222,7 +219,7 @@ end ones(shape :: Tuple) ones(dim1, dim2, ...) -Create an :class:`NDArray` with specific shape and initialize with 1. +Create an `NDArray` with specific shape and initialize with 1. """ function ones{N}(shape :: NTuple{N, Int}) ones(shape, cpu()) @@ -243,8 +240,8 @@ import Base: size, length, ndims, eltype size(arr :: NDArray) size(arr :: NDArray, dim :: Int) - Get the shape of an :class:`NDArray`. The shape is in Julia's column-major convention. See - also the :ref:`notes on NDArray shapes `. +Get the shape of an `NDArray`. The shape is in Julia's column-major convention. See +also the notes on NDArray shapes in [`NDArray`](@ref). """ function size(arr :: NDArray) ref_ndim = Ref{MX_uint}(0) @@ -260,7 +257,7 @@ end """ length(arr :: NDArray) - Get the number of elements in an :class:`NDArray`. +Get the number of elements in an `NDArray`. """ function length(arr :: NDArray) prod(size(arr)) @@ -269,7 +266,7 @@ end """ ndims(arr :: NDArray) - Get the number of dimensions of an :class:`NDArray`. Is equivalent to ``length(size(arr))``. +Get the number of dimensions of an `NDArray`. It is equivalent to `length(size(arr))`. """ function ndims(arr :: NDArray) length(size(arr)) @@ -278,7 +275,7 @@ end """ eltype(arr :: NDArray) - Get the element type of an :class:`NDArray`. +Get the element type of an `NDArray`. """ function eltype{T <: Union{NDArray, MX_NDArrayHandle}}(arr :: T) dtype_ref = Ref{Cint}(0) @@ -299,11 +296,11 @@ import Base: slice """ slice(arr :: NDArray, start:stop) - Create a view into a sub-slice of an :class:`NDArray`. Note only slicing at the slowest - changing dimension is supported. In Julia's column-major perspective, this is the last - dimension. For example, given an :class:`NDArray` of shape (2,3,4), ``slice(array, 2:3)`` will create - a :class:`NDArray` of shape (2,3,2), sharing the data with the original array. This operation is - used in data parallelization to split mini-batch into sub-batches for different devices. +Create a view into a sub-slice of an `NDArray`. Note that only slicing at the slowest +changing dimension is supported. In Julia's column-major perspective, this is the last +dimension. For example, given an `NDArray` of shape (2,3,4), `slice(array, 2:3)` will create +an `NDArray` of shape (2,3,2), sharing the data with the original array. This operation is +used in data parallelization to split mini-batch into sub-batches for different devices. """ function slice(arr :: NDArray, ::Colon) arr @@ -329,13 +326,13 @@ import Base: setindex! """ setindex!(arr :: NDArray, val, idx) - Assign values to an :class:`NDArray`. Elementwise assignment is not implemented, only the following - scenarios are supported +Assign values to an `NDArray`. Elementwise assignment is not implemented, only the following +scenarios are supported - - ``arr[:] = val``: whole array assignment, ``val`` could be a scalar or an array (Julia ``Array`` - or :class:`NDArray`) of the same shape. - - ``arr[start:stop] = val``: assignment to a *slice*, ``val`` could be a scalar or an array of - the same shape to the slice. See also :func:`slice`.
+* `arr[:] = val`: whole array assignment, `val` could be a scalar or an array (Julia `Array` + or `NDArray`) of the same shape. +* `arr[start:stop] = val`: assignment to a *slice*, `val` could be a scalar or an array of + the same shape to the slice. See also [`slice`](@ref). """ function setindex!(arr :: NDArray, val :: Real, ::Colon) @assert(arr.writable) @@ -356,36 +353,35 @@ import Base: getindex """ getindex(arr :: NDArray, idx) -Shortcut for :func:`slice`. A typical use is to write - .. code-block:: julia - arr[:] += 5 +Shortcut for [`slice`](@ref). A typical use is to write - which translates into +```julia + arr[:] += 5 +``` - .. code-block:: julia +which translates into - arr[:] = arr[:] + 5 +```julia + arr[:] = arr[:] + 5 +``` - which furthur translates into +which further translates into - .. code-block:: julia +```julia + setindex!(getindex(arr, Colon()), 5, Colon()) +``` - setindex!(getindex(arr, Colon()), 5, Colon()) - - .. note:: - - The behavior is quite different from indexing into Julia's ``Array``. For example, ``arr[2:5]`` - create a **copy** of the sub-array for Julia ``Array``, while for :class:`NDArray`, this is - a *slice* that shares the memory. +!!! note + The behavior is quite different from indexing into Julia's `Array`. For example, `arr[2:5]` + creates a **copy** of the sub-array for Julia `Array`, while for `NDArray`, this is + a *slice* that shares the memory. """ function getindex(arr :: NDArray, ::Colon) return arr end """ -Shortcut for `slice`. **NOTE** the behavior for Julia's built-in index slicing is to create a +Shortcut for [`slice`](@ref). **NOTE** the behavior for Julia's built-in index slicing is to create a copy of the sub-array, while here we simply call `slice`, which shares the underlying memory. """ function getindex(arr :: NDArray, idx::UnitRange{Int}) @@ -397,7 +393,7 @@ import Base: copy!, copy, convert .. function:: copy!(dst :: Union{NDArray, Array}, src :: Union{NDArray, Array}) -Copy contents of ``src`` into ``dst``. +Copy contents of `src` into `dst`. """ function copy!(dst :: NDArray, src :: NDArray) @assert(dst.writable) @@ -441,13 +437,12 @@ end """ -.. function:: - copy(arr :: NDArray) - copy(arr :: NDArray, ctx :: Context) - copy(arr :: Array, ctx :: Context) + copy(arr :: NDArray) + copy(arr :: NDArray, ctx :: Context) + copy(arr :: Array, ctx :: Context) - Create a copy of an array. When no :class:`Context` is given, create a Julia ``Array``. - Otherwise, create an :class:`NDArray` on the specified context. +Create a copy of an array. When no `Context` is given, create a Julia `Array`. +Otherwise, create an `NDArray` on the specified context. """ # Create copy: NDArray -> Julia Array function copy(arr :: NDArray) @@ -470,7 +465,7 @@ end """ convert(::Type{Array{T}}, arr :: NDArray) - Convert an :class:`NDArray` into a Julia ``Array`` of specific type. Data will be copied. +Convert an `NDArray` into a Julia `Array` of specific type. Data will be copied. """ # Convert copy: NDArray -> Julia Array function convert{T<:Real}(t::Type{Array{T}}, arr :: NDArray) @@ -480,25 +475,25 @@ end """ @inplace -Julia does not support re-definiton of ``+=`` operator (like ``__iadd__`` in python), -When one write ``a += b``, it gets translated to ``a = a+b``. ``a+b`` will allocate new -memory for the results, and the newly allocated :class:`NDArray` object is then assigned +Julia does not support re-definition of the `+=` operator (like `__iadd__` in Python). +When one writes `a += b`, it gets translated to `a = a+b`.
`a+b` will allocate new +memory for the results, and the newly allocated `NDArray` object is then assigned back to a, while the original contents in a is discarded. This is very inefficient when we want to do inplace update. This macro is a simple utility to implement this behavior. Write - .. code-block:: julia +```julia + @mx.inplace a += b +``` - @mx.inplace a += b +will translate into - will translate into +```julia + mx.add_to!(a, b) +``` - .. code-block:: julia - - mx.add_to!(a, b) - - which will do inplace adding of the contents of ``b`` into ``a``. +which will do inplace adding of the contents of `b` into `a`. """ macro inplace(stmt) if stmt.head == :+= || stmt.head == :.+= @@ -517,7 +512,7 @@ end """ add_to!(dst :: NDArray, args :: Union{Real, NDArray}...) -Add a bunch of arguments into ``dst``. Inplace updating. +Add a bunch of arguments into `dst`. Inplace updating. """ function add_to!(dst :: NDArray, args :: Union{Real, NDArray}...) @assert dst.writable @@ -537,8 +532,8 @@ import Base: +, .+ +(args...) .+(args...) -Summation. Multiple arguments of either scalar or :class:`NDArray` could be -added together. Note at least the first or second argument needs to be an :class:`NDArray` to +Summation. Multiple arguments of either scalar or `NDArray` could be +added together. Note at least the first or second argument needs to be an `NDArray` to avoid ambiguity of built-in summation. """ function +(arg0 :: NDArray, args :: Union{Real, NDArray}...) @@ -558,7 +553,7 @@ end """ sub_from!(dst :: NDArray, args :: Union{Real, NDArray}...) - Subtract a bunch of arguments from ``dst``. Inplace updating. +Subtract a bunch of arguments from `dst`. Inplace updating. """ function sub_from!(dst :: NDArray, arg :: Union{Real, NDArray}) @assert dst.writable @@ -576,8 +571,8 @@ import Base: -, .- -(arg0) .-(arg0, arg1) -Subtraction ``arg0 - arg1``, of scalar types or :class:`NDArray`. Or create -the negative of ``arg0``. +Subtraction `arg0 - arg1`, of scalar types or `NDArray`. Or create +the negative of `arg0`. """ function -(arg0 :: NDArray, arg1 :: Union{Real, NDArray}) ret = copy(arg0, context(arg0)) @@ -602,8 +597,8 @@ end """ mul_to!(dst :: NDArray, arg :: Union{Real, NDArray}) - Elementwise multiplication into ``dst`` of either a scalar or an :class:`NDArray` of the same shape. - Inplace updating. +Elementwise multiplication into `dst` of either a scalar or an `NDArray` of the same shape. +Inplace updating. """ function mul_to!(dst :: NDArray, arg :: Union{Real, NDArray}) @assert dst.writable @@ -620,7 +615,7 @@ import Base: .*, * """ .*(arg0, arg1) -Elementwise multiplication of ``arg0`` and ``arg``, could be either scalar or :class:`NDArray`. +Elementwise multiplication of `arg0` and `arg`, could be either scalar or `NDArray`. """ function .*(arg0 :: NDArray, arg :: Union{Real, NDArray}) ret = copy(arg0, context(arg0)) @@ -633,7 +628,7 @@ end """ *(arg0, arg1) -Currently only multiplication a scalar with an :class:`NDArray` is implemented. Matrix multiplication +Currently only multiplication a scalar with an `NDArray` is implemented. Matrix multiplication is to be added soon. """ function *(arg0 :: NDArray, arg :: Real) @@ -647,7 +642,7 @@ end """ div_from!(dst :: NDArray, arg :: Union{Real, NDArray}) -Elementwise divide a scalar or an :class:`NDArray` of the same shape from ``dst``. Inplace updating. +Elementwise divide a scalar or an `NDArray` of the same shape from `dst`. Inplace updating. 
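+
+A small illustrative sketch of the in-place helpers (values chosen arbitrarily):
+
+```julia
+a = mx.ones(2, 3)
+b = mx.ones(2, 3)
+@mx.inplace a += b   # rewritten to mx.add_to!(a, b); no new NDArray is allocated
+mx.div_from!(a, 2)   # divide every element of a by 2, updating a in place
+```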
""" function div_from!(dst :: NDArray, arg :: Union{Real, NDArray}) @assert dst.writable @@ -662,7 +657,7 @@ import Base: ./, / """ ./(arg0 :: NDArray, arg :: Union{Real, NDArray}) -Elementwise dividing an :class:`NDArray` by a scalar or another :class:`NDArray` of the same shape. +Elementwise dividing an `NDArray` by a scalar or another `NDArray` of the same shape. """ function ./(arg0 :: NDArray, arg :: Union{Real, NDArray}) ret = copy(arg0, context(arg0)) @@ -672,7 +667,7 @@ end """ /(arg0 :: NDArray, arg :: Real) -Divide an :class:`NDArray` by a scalar. Matrix division (solving linear systems) is not implemented yet. +Divide an `NDArray` by a scalar. Matrix division (solving linear systems) is not implemented yet. """ function /(arg0 :: NDArray, arg :: Real) ./(arg0, arg) @@ -685,42 +680,41 @@ Manipulating as Julia Arrays @nd_as_jl(captures..., statement) - A convenient macro that allows to operate :class:`NDArray` as Julia Arrays. For example, +A convenient macro that allows to operate `NDArray` as Julia Arrays. For example, - .. code-block:: julia +```julia + x = mx.zeros(3,4) + y = mx.ones(3,4) + z = mx.zeros((3,4), mx.gpu()) - x = mx.zeros(3,4) - y = mx.ones(3,4) - z = mx.zeros((3,4), mx.gpu()) - - @mx.nd_as_jl ro=(x,y) rw=z begin - # now x, y, z are just ordinary Julia Arrays - z[:,1] = y[:,2] - z[:,2] = 5 - end - - Under the hood, the macro convert all the declared captures from :class:`NDArray` into Julia - Arrays, by using :func:`try_get_shared`. And automatically commit the modifications back into - the :class:`NDArray` that is declared as ``rw``. This is useful for fast prototyping and when - implement non-critical computations, such as :class:`AbstractEvalMetric`. - - .. note:: - - - Multiple ``rw`` and / or ``ro`` capture declaration could be made. - - The macro does **not** check to make sure that ``ro`` captures are not modified. If the - original :class:`NDArray` lives in CPU memory, then it is very likely the corresponding - Julia Array shares data with the :class:`NDArray`, so modifying the Julia Array will also - modify the underlying :class:`NDArray`. - - More importantly, since the :class:`NDArray` is - asynchronized, we will wait for *writing* for ``rw`` variables but wait only for *reading* - in ``ro`` variables. If we write into those ``ro`` variables, **and** if the memory is - shared, racing condition might happen, and the behavior is undefined. - - When an :class:`NDArray` is declared to be captured as ``rw``, its contents is always sync - back in the end. - - The execution results of the expanded macro is always ``nothing``. - - The statements are wrapped in a ``let``, thus locally introduced new variables will not be - available after the statements. So you will need to declare the variables before calling the - macro if needed. + @mx.nd_as_jl ro=(x,y) rw=z begin + # now x, y, z are just ordinary Julia Arrays + z[:,1] = y[:,2] + z[:,2] = 5 + end +``` + +Under the hood, the macro convert all the declared captures from `NDArray` into Julia +Arrays, by using `try_get_shared`. And automatically commit the modifications back into +the `NDArray` that is declared as `rw`. This is useful for fast prototyping and when +implement non-critical computations, such as `AbstractEvalMetric`. + +!!! note +* Multiple `rw` and / or `ro` capture declaration could be made. +* The macro does **not** check to make sure that `ro` captures are not modified. 
If the + original `NDArray` lives in CPU memory, then it is very likely the corresponding + Julia Array shares data with the `NDArray`, so modifying the Julia Array will also + modify the underlying `NDArray`. +* More importantly, since the `NDArray` is + asynchronized, we will wait for *writing* for `rw` variables but wait only for *reading* + in `ro` variables. If we write into those `ro` variables, **and** if the memory is + shared, racing condition might happen, and the behavior is undefined. +* When an `NDArray` is declared to be captured as `rw`, its contents is always sync + back in the end. +* The execution results of the expanded macro is always `nothing`. +* The statements are wrapped in a `let`, thus locally introduced new variables will not be + available after the statements. So you will need to declare the variables before calling the + macro if needed. """ macro nd_as_jl(m_args...) @assert(length(m_args) > 0) @@ -812,14 +806,15 @@ end """ try_get_shared(arr) - Try to create a Julia array by sharing the data with the underlying :class:`NDArray`. +Try to create a Julia array by sharing the data with the underlying `NDArray`. -* NDArray arr: the array to be shared. +# Arguments: +* `arr::NDArray`: the array to be shared. .. warning:: - The returned array does not guarantee to share data with the underlying :class:`NDArray`. - In particular, data sharing is possible only when the :class:`NDArray` lives on CPU. + The returned array does not guarantee to share data with the underlying `NDArray`. + In particular, data sharing is possible only when the `NDArray` lives on CPU. """ function try_get_shared(arr :: NDArray) if context(arr).device_type == CPU @@ -834,10 +829,11 @@ end """ is_shared(j_arr, arr) - Test whether ``j_arr`` is sharing data with ``arr``. +Test whether `j_arr` is sharing data with `arr`. +# Arguments: * Array j_arr: the Julia Array. -* NDArray arr: the :class:`NDArray`. +* NDArray arr: the `NDArray`. """ function is_shared(j_arr :: Array, arr :: NDArray) false @@ -857,13 +853,16 @@ end Load NDArrays from binary file. -* AbstractString filename: the path of the file to load. It could be S3 or HDFS address. - :return: Either ``Dict{Base.Symbol, NDArray}`` or ``Vector{NDArray}``. +# Arguments: +* `filename::String`: the path of the file to load. It could be S3 or HDFS address. + +Returns either `Dict{Symbol, NDArray}` or `Vector{NDArray}`. -If the ``libmxnet`` is built with the corresponding component enabled. Examples -* ``s3://my-bucket/path/my-s3-ndarray`` -* ``hdfs://my-bucket/path/my-hdfs-ndarray`` -* ``/path-to/my-local-ndarray`` +`filename` can point to `s3` or `hdfs` resources if the `libmxnet` is built with the +corresponding components enabled. Examples: +* `s3://my-bucket/path/my-s3-ndarray` +* `hdfs://my-bucket/path/my-hdfs-ndarray` +* `/path-to/my-local-ndarray` """ function load(filename::AbstractString, ::Type{NDArray}) out_size = Ref{MX_uint}(0) @@ -886,12 +885,11 @@ end """ save(filename :: AbstractString, data) -Save NDarrays to binary file. Filename could be S3 or HDFS address, if ``libmxnet`` is built -with corresponding support. +Save NDarrays to binary file. Filename could be S3 or HDFS address, if `libmxnet` is built +with corresponding support (see `load`). -* AbstractString filename: path to the binary file to write to. -* data: data to save to file. - :type data: :class:`NDArray`, or a ``Vector{NDArray}`` or a ``Dict{Base.Symbol, NDArray}``. +* `filename::String`: path to the binary file to write to. +* `data`: data to save to file. 
Data can be an `NDArray`, a `Vector{NDArray}`, or a `Dict{Base.Symbol, NDArray}`. """ function save(filename::AbstractString, data::NDArray) save(filename, [data]) @@ -928,22 +926,22 @@ end import Base: sqrt """ -The libxmnet APIs are automatically imported from ``libmxnet.so``. The functions listed -here operate on :class:`NDArray` objects. The arguments to the functions are typically ordered +The libmxnet APIs are automatically imported from `libmxnet.so`. The functions listed +here operate on `NDArray` objects. The arguments to the functions are typically ordered as .. code-block:: julia func_name(arg_in1, arg_in2, ..., scalar1, scalar2, ..., arg_out1, arg_out2, ...) -unless ``NDARRAY_ARG_BEFORE_SCALAR`` is not set. In this case, the scalars are put before the input arguments: +unless `NDARRAY_ARG_BEFORE_SCALAR` is not set, in which case the scalars are put before the input arguments: .. code-block:: julia func_name(scalar1, scalar2, ..., arg_in1, arg_in2, ..., arg_out1, arg_out2, ...) -If ``ACCEPT_EMPTY_MUTATE_TARGET`` is set. An overloaded function without the output arguments will also be defined: +If `ACCEPT_EMPTY_MUTATE_TARGET` is set, an overloaded function without the output arguments will also be defined: .. code-block:: julia @@ -952,7 +950,7 @@ If ``ACCEPT_EMPTY_MUTATE_TARGET`` is set. An overloaded function without the out Upon calling, the output arguments will be automatically initialized with empty NDArrays. Those functions always return the output arguments. If there is only one output (the typical situation), that -object (:class:`NDArray`) is returned. Otherwise, a tuple containing all the outputs will be returned. +object (`NDArray`) is returned. Otherwise, a tuple containing all the outputs will be returned. """ function _get_ndarray_functions() diff --git a/src/nn-factory.jl b/src/nn-factory.jl index b170214a1af9..a60a4716bfef 100644 --- a/src/nn-factory.jl +++ b/src/nn-factory.jl @@ -1,24 +1,25 @@ """ - MLP(input, spec) + MLP(input, spec; hidden_activation = :relu, prefix) Construct a multi-layer perceptron. A MLP is a multi-layer neural network with fully connected layers. -* SymbolicNode input: the input to the mlp. -* spec: the mlp specification, a list of hidden dimensions. For example, - ``[128, (512, :sigmoid), 10]``. The number in the list indicate the +# Arguments: +* `input::SymbolicNode`: the input to the MLP. +* `spec`: the MLP specification, a list of hidden dimensions. For example, + `[128, (512, :sigmoid), 10]`. The numbers in the list indicate the number of hidden units in each layer. A tuple could be used to specify the activation of each layer. Otherwise, the default activation will be used (except for the last layer). -* Base.Symbol hidden_activation: keyword argument, default ``:relu``, indicating +* `hidden_activation::Symbol`: keyword argument, default `:relu`, indicating the default activation for hidden layers. The specification here could be overwritten - by layer-wise specification in the ``spec`` argument. Also activation is not + by layer-wise specification in the `spec` argument. Also activation is not applied to the last, i.e. the prediction layer. See :func:`Activation` for a list of supported activation types. -* prefix: keyword argument, default ``gensym()``, used as the prefix to +* `prefix`: keyword argument, default `gensym()`, used as the prefix to name the constructed layers. - :return: the constructed MLP. +Returns the constructed MLP.
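+
+For illustration, a hypothetical three-layer network (dimensions chosen arbitrarily):
+
+```julia
+data = mx.Variable(:data)
+# a 128-unit layer with the default :relu activation, a 64-unit layer with :sigmoid,
+# and a 10-unit prediction layer (no activation is applied to the last layer)
+net = mx.MLP(data, [128, (64, :sigmoid), 10])
+```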
""" function MLP(input, spec; hidden_activation::Base.Symbol=:relu, prefix=gensym()) spec = convert(Vector{Union{Int,Tuple}}, spec) diff --git a/src/optimizer.jl b/src/optimizer.jl index e2fc44338a2f..c5d4b29aa308 100644 --- a/src/optimizer.jl +++ b/src/optimizer.jl @@ -85,7 +85,7 @@ get_learning_rate(self :: Fixed, state :: OptimizationState) = self.learning_rat LearningRate.Exp :math:`\eta_t = \eta_0\gamma^t`. Here :math:`t` is the epoch count, or the iteration - count if ``decay_on_iteration`` is set to true. + count if `decay_on_iteration` is set to true. """ type Exp <: AbstractLearningRateScheduler learning_rate :: Float64 @@ -102,7 +102,7 @@ get_learning_rate(self :: Exp, state :: OptimizationState) = LearningRate.Inv :math:`\eta_t = \eta_0 * (1 + \gamma * t)^(-power)`. - Here :math:`t` is the epoch count, or the iteration count if ``decay_on_iteration`` + Here :math:`t` is the epoch count, or the iteration count if `decay_on_iteration` is set to true. """ type Inv <: AbstractLearningRateScheduler @@ -207,7 +207,7 @@ abstract AbstractOptimizerOptions normalized_gradient(opts, state, grad) * AbstractOptimizerOptions opts: options for the optimizer, should contain the field - ``grad_scale``, ``grad_clip`` and ``weight_decay``. + `grad_scale`, `grad_clip` and `weight_decay`. * OptimizationState state: the current optimization state. * NDArray weight: the trainable weights. * NDArray grad: the original gradient of the weights. diff --git a/src/symbolic-node.jl b/src/symbolic-node.jl index 600bc75b3aeb..b71733ace71b 100644 --- a/src/symbolic-node.jl +++ b/src/symbolic-node.jl @@ -37,7 +37,7 @@ end call(self :: SymbolicNode, args :: SymbolicNode...) call(self :: SymbolicNode; kwargs...) -Make a new node by composing ``self`` with ``args``. Or the arguments +Make a new node by composing `self` with `args`. Or the arguments can be specified using keyword arguments. """ @compat function (self::SymbolicNode)(args :: SymbolicNode...) @@ -66,11 +66,11 @@ end list_arguments(self :: SymbolicNode) List all the arguments of this node. The argument for a node contains both -the inputs and parameters. For example, a :class:`FullyConnected` node will +the inputs and parameters. For example, a `FullyConnected` node will have both data and weights in its arguments. A composed node (e.g. a MLP) will list all the arguments for intermediate nodes. - :return: A list of symbols indicating the names of the arguments. +Returns a list of symbols indicating the names of the arguments. """ function list_arguments(self :: SymbolicNode) @_list_symbol_info(self, :MXSymbolListArguments) @@ -81,7 +81,7 @@ end List all the outputs of this node. - :return: A list of symbols indicating the names of the outputs. +Returns a list of symbols indicating the names of the outputs. """ function list_outputs(self :: SymbolicNode) @_list_symbol_info(self, :MXSymbolListOutputs) @@ -99,7 +99,7 @@ and do not have gradient. But still be useful for the specific operations. A common example of auxiliary state is the moving_mean and moving_variance in BatchNorm. Most operators do not have Auxiliary states. - :return: A list of symbols indicating the names of the auxiliary states. +Returns a list of symbols indicating the names of the auxiliary states. 
""" function list_auxiliary_states(self :: SymbolicNode) @_list_symbol_info(self, :MXSymbolListAuxiliaryStates) @@ -108,8 +108,8 @@ end """ get_internals(self :: SymbolicNode) -Get a new grouped :class:`SymbolicNode` whose output contains all the internal outputs of -this :class:`SymbolicNode`. +Get a new grouped `SymbolicNode` whose output contains all the internal outputs of +this `SymbolicNode`. """ function get_internals(self :: SymbolicNode) ref_hdr = Ref{MX_handle}(0) @@ -120,8 +120,9 @@ end """ get_attr(self :: SymbolicNode, key :: Symbol) -Get attribute attached to this :class:`SymbolicNode` belonging to key. -:return: The value belonging to key as a :class:`Nullable`. +Get attribute attached to this `SymbolicNode` belonging to key. + +Returns the value belonging to key as a `Nullable`. """ function get_attr(self :: SymbolicNode, key :: Symbol) key_s = string(key) @@ -140,7 +141,8 @@ end list_attr(self :: SymbolicNode) Get all attributes from a symbol. -:return: Dictionary of attributes. + +Returns a dictionary of attributes. """ function list_attr(self :: SymbolicNode) ref_sz = Ref{MX_uint}(0) @@ -162,7 +164,8 @@ end list_all_attr(self :: SymbolicNode) Get all attributes from the symbol graph. -:return: Dictionary of attributes. + +Returns a dictionary of attributes. """ function list_all_attr(self :: SymbolicNode) ref_sz = Ref{MX_uint}(0) @@ -183,12 +186,12 @@ end """ set_attr(self:: SymbolicNode, key :: Symbol, value :: AbstractString) -Set the attribute key to value for this :class:`SymbolicNode`. +Set the attribute key to value for this `SymbolicNode`. # Warning It is encouraged not to call this function directly, unless you know exactly what you are doing. The -recommended way of setting attributes is when creating the :class:`SymbolicNode`. Changing -the attributes of a :class:`SymbolicNode` that is already been used somewhere else might +recommended way of setting attributes is when creating the `SymbolicNode`. Changing +the attributes of a `SymbolicNode` that is already been used somewhere else might cause unexpected behavior and inconsistency. """ function set_attr(self :: SymbolicNode, key :: Symbol, value :: AbstractString) @@ -205,7 +208,7 @@ Create a symbolic variable with the given name. This is typically used as a plac For example, the data node, acting as the starting point of a network architecture. # Arguments -* Dict{Symbol, AbstractString} attrs: The attributes associated with this :class:`Variable`. +* Dict{Symbol, AbstractString} attrs: The attributes associated with this `Variable`. """ function Variable(name :: Union{Symbol, AbstractString}; attrs = Dict()) attrs = convert(Dict{Symbol, AbstractString}, attrs) @@ -221,7 +224,7 @@ end """ Group(nodes :: SymbolicNode...) -Create a :class:`SymbolicNode` by grouping nodes together. +Create a `SymbolicNode` by grouping nodes together. """ function Group(nodes :: SymbolicNode...) handles = MX_handle[nodes...] @@ -283,9 +286,9 @@ as a list of shapes, which should specify the shapes of inputs in the same order the arguments returned by :func:`list_arguments`. Alternatively, the shape information could be specified via keyword arguments. -:return: A 3-tuple containing shapes of all the arguments, shapes of all the outputs and - shapes of all the auxiliary variables. If shape inference failed due to incomplete - or incompatible inputs, the return value will be ``(nothing, nothing, nothing)``. 
+Returns a 3-tuple containing shapes of all the arguments, shapes of all the outputs and +shapes of all the auxiliary variables. If shape inference failed due to incomplete +or incompatible inputs, the return value will be `(nothing, nothing, nothing)`. """ function infer_shape(self :: SymbolicNode; kwargs...) sdata = MX_uint[] @@ -351,9 +354,9 @@ as a list of types, which should specify the types of inputs in the same order a the arguments returned by :func:`list_arguments`. Alternatively, the type information could be specified via keyword arguments. -:return: A 3-tuple containing types of all the arguments, types of all the outputs and - types of all the auxiliary variables. If type inference failed due to incomplete - or incompatible inputs, the return value will be ``(nothing, nothing, nothing)``. +Returns a 3-tuple containing types of all the arguments, types of all the outputs and +types of all the auxiliary variables. If type inference failed due to incomplete +or incompatible inputs, the return value will be `(nothing, nothing, nothing)`. """ function infer_type(self :: SymbolicNode; kwargs...) types = Cint[toTypeFlag(x[2]) for x in kwargs] @@ -524,7 +527,7 @@ end """ to_json(self :: SymbolicNode) -Convert a :class:`SymbolicNode` into a JSON string. +Convert a `SymbolicNode` into a JSON string. """ function to_json(self :: SymbolicNode) ref_json = Ref{char_p}(0) @@ -535,7 +538,7 @@ end """ from_json(repr :: AbstractString, ::Type{SymbolicNode}) -Load a :class:`SymbolicNode` from a JSON string representation. +Load a `SymbolicNode` from a JSON string representation. """ function from_json(repr :: AbstractString, ::Type{SymbolicNode}) ref_hdr = Ref{MX_handle}(0) @@ -546,7 +549,7 @@ end """ load(filename :: AbstractString, ::Type{SymbolicNode}) -Load a :class:`SymbolicNode` from a JSON file. +Load a `SymbolicNode` from a JSON file. """ function load(filename :: AbstractString, ::Type{SymbolicNode}) ref_hdr = Ref{MX_handle}(0) @@ -557,7 +560,7 @@ end """ save(filename :: AbstractString, node :: SymbolicNode) -Save a :class:`SymbolicNode` to a JSON file. +Save a `SymbolicNode` to a JSON file. """ function save(filename :: AbstractString, node :: SymbolicNode) @mxcall(:MXSymbolSaveToFile, (MX_handle, char_p), node, filename) diff --git a/src/visualize.jl b/src/visualize.jl index 15f23c6bfffc..c60868430a9c 100644 --- a/src/visualize.jl +++ b/src/visualize.jl @@ -6,10 +6,10 @@ import JSON * SymbolicNode network: the network to visualize. * AbstractString title: keyword argument, default "Network Visualization", the title of the GraphViz graph. -* input_shapes: keyword argument, default ``nothing``. If provided, +* input_shapes: keyword argument, default `nothing`. If provided, will run shape inference and plot with the shape information. Should be either a dictionary of name-shape mapping or an array of shapes. - :return: the graph description in GraphViz ``dot`` language. + :return: the graph description in GraphViz `dot` language. """ function to_graphviz(network :: SymbolicNode; title="Network Visualization", input_shapes=nothing) if !isa(input_shapes, Void)