Remove references to DIST, MGPU tests with < 2 devices
scttl committed Jul 16, 2015
1 parent a12c5ba commit c07814d
Showing 16 changed files with 71 additions and 146 deletions.
16 changes: 7 additions & 9 deletions Makefile
@@ -19,7 +19,6 @@
DEV := $(strip $(shell grep -i '^ *DEV *=' setup.cfg | cut -f 2 -d '='))
CPU := $(strip $(shell grep -i '^ *CPU *=' setup.cfg | cut -f 2 -d '='))
GPU := $(strip $(shell grep -i '^ *GPU *=' setup.cfg | cut -f 2 -d '='))
DIST := $(strip $(shell grep -i '^ *DIST *=' setup.cfg | cut -f 2 -d '='))

# get release version info
RELEASE := $(strip $(shell grep '^VERSION *=' setup.py | cut -f 2 -d '=' \
@@ -84,9 +83,8 @@ endif
default: build

build: clean_pyc
@echo "Running build(DEV=$(DEV) CPU=$(CPU) GPU=$(GPU) DIST=$(DIST))..."
@python setup.py neon --dev $(DEV) --cpu $(CPU) --gpu $(GPU) --dist $(DIST) \
build
@echo "Running build(DEV=$(DEV) CPU=$(CPU) GPU=$(GPU))..."
@python setup.py neon --dev $(DEV) --cpu $(CPU) --gpu $(GPU) build

pip_check:
ifeq (, $(shell which pip))
@@ -114,11 +112,11 @@ ifdef INSTALL_REQUIRES
endif

develop: deps_install
@echo "Running develop(DEV=$(DEV) CPU=$(CPU) GPU=$(GPU) DIST=$(DIST))..."
@echo "Running develop(DEV=$(DEV) CPU=$(CPU) GPU=$(GPU))..."
@pip install -e .

install: deps_install
@echo "Running install(DEV=$(DEV) CPU=$(CPU) GPU=$(GPU) DIST=$(DIST))..."
@echo "Running install(DEV=$(DEV) CPU=$(CPU) GPU=$(GPU))..."
@pip install .

uninstall: pip_check
@@ -131,7 +129,7 @@ test: build

test_all:
@echo "Running test_all..."
@tox -- -e CPU=$(CPU) GPU=$(GPU) DIST=$(DIST)
@tox -- -e CPU=$(CPU) GPU=$(GPU)

integration: build
@echo "Running integration checks (this may take 10-20 minutes)..."
@@ -144,12 +142,12 @@ serialize: build
sanity: build
@echo "Running sanity checks..."
@PYTHONPATH=${PYTHONPATH}:./ python neon/tests/sanity_check.py \
--cpu $(CPU) --gpu $(GPU) --datapar $(DIST) --modelpar $(DIST)
--cpu $(CPU) --gpu $(GPU)

speed: build
@echo "Running speed checks..."
@PYTHONPATH=${PYTHONPATH}:./ python neon/tests/speed_check.py \
--cpu $(CPU) --gpu $(GPU) --datapar $(DIST) --modelpar $(DIST)
--cpu $(CPU) --gpu $(GPU)

grad: build
@echo "Running gradient checks..."
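For reference, a rough Python equivalent of the ``grep``/``cut`` extraction near the top of this Makefile (an illustrative sketch, not part of the build tooling), showing how the remaining DEV/CPU/GPU flags get read out of ``setup.cfg``:

```python
# Illustrative stand-in for: grep -i '^ *FLAG *=' setup.cfg | cut -f 2 -d '='
import re

def read_flag(name, path="setup.cfg"):
    """Return the value after the first '=' on a 'FLAG = value' line, or ''."""
    pattern = re.compile(r"^\s*" + re.escape(name) + r"\s*=(.*)$", re.IGNORECASE)
    with open(path) as cfg:
        for line in cfg:
            match = pattern.match(line)
            if match:
                return match.group(1).strip()
    return ""

if __name__ == "__main__":
    for flag in ("DEV", "CPU", "GPU"):
        print(flag, "=", read_flag(flag))
```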
10 changes: 6 additions & 4 deletions README.md
@@ -4,14 +4,16 @@
Deep Learning framework. We have designed it with the following
functionality in mind:

* YAML for easy model specification (inspired by [pylearn2](https://github.com/lisa-lab/pylearn2))
* YAML for easy model specification (inspired by
[pylearn2](https://github.com/lisa-lab/pylearn2))
* Python for easily adding models and support for many data formats
* Support for commonly used models: convnets, MLPs, RNNs, LSTMs, autoencoders,
RBMs
* Support for common learning rules, activation functions and cost functions
* Comparing performance of alternate numeric representations with 32-bit floating point (fp32) for
Deep Learning
* Support for using [spearmint](https://github.com/JasperSnoek/spearmint) for hyperparameter optimization
* Comparing performance of alternate numeric representations with 32-bit
floating point (fp32) for Deep Learning
* Support for using [spearmint](https://github.com/JasperSnoek/spearmint) for
hyperparameter optimization
* Swappable hardware backends: write code once and then deploy on CPUs, GPUs,
or Nervana hardware

2 changes: 1 addition & 1 deletion doc/source/api.rst
@@ -64,6 +64,7 @@ Nervana GPU
:toctree: generated/

neon.backends.gpu.GPU
neon.backends.mgpu.MGPU

Nervana Hardware
----------------
@@ -185,7 +186,6 @@ Convolutional Layer
:toctree: generated/

neon.layers.convolutional.ConvLayer
neon.layers.convolutional.SubConvLayer

Pooling Layers
---------------
4 changes: 1 addition & 3 deletions doc/source/backends.rst
@@ -49,9 +49,7 @@ Current Implementations
neon.backends.cpu.CPU
neon.backends.gpu.GPU
neon.backends.cc2.GPU
neon.backends.par.NoPar
neon.backends.par.DataPar
neon.backends.par.ModelPar
neon.backends.mgpu.MGPU

Adding a new Backend
--------------------
65 changes: 37 additions & 28 deletions doc/source/distributed.rst
@@ -24,7 +24,8 @@ Note that we only support parallel computation with multiple GPUs and not on
multiple CPUs. Moreover, multi-GPU computation is only supported via our
``nervanagpu`` backend, which requires Maxwell architecture devices.

The parallel implementation used in neon has been tested on up to 8 GPUs.
The parallel implementation used in neon has been tested on up to 8 GPUs. All
devices must be housed in the same machine.

Parallelization Model
---------------------
@@ -47,8 +48,8 @@ the replica activations that are used for the next layer.

Requirements
------------
In order to parallelize across ``N`` nodes, the following conditions must be
satisfied:
In order to parallelize across ``N`` GPU device nodes, the following
conditions must be satisfied:

- In data parallel mode, the minibatch size must be a multiple of ``N``.
- In model parallel mode, the number of output units of each fully connected
@@ -59,12 +60,42 @@ For example, an MLP with no convolutional layers that has 3 hidden layers with
``GCD(6, 200, 20) == 2``). If the first layer had 12 hidden nodes, the model
could be parallelized across 4 GPUs.

Since AlexNet [AK2012]_has fully connected layers with outputs of 4096, 4096,
Since AlexNet [AK2012]_ has fully connected layers with outputs of 4096, 4096,
and 1000, it can be split across up to 8 GPUs (``GCD(4096, 1000) = 8``) as long
as the minibatch supplied is divisible by 8.
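
For illustration, this divisibility rule can be checked with a few lines of Python
(an ad-hoc sketch, not neon code):

.. code-block:: python

   from functools import reduce
   from math import gcd

   def max_model_parallel_gpus(fc_output_sizes):
       # largest N that evenly divides every fully connected layer's output count
       return reduce(gcd, fc_output_sizes)

   print(max_model_parallel_gpus([6, 200, 20]))        # 2
   print(max_model_parallel_gpus([12, 200, 20]))       # 4
   print(max_model_parallel_gpus([4096, 4096, 1000]))  # 8 (AlexNet)

Remember that the chosen ``N`` must also divide the minibatch size when running in
data parallel mode.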


Usage
=====

The following example illustrates how to train a convnet on the MNIST dataset
across 2 GPUs (devices selected by default):

.. code-block:: bash

   neon --gpu nervanagpu2 examples/convnet/mnist-small.yaml

The following example illustrates how to train the same convnet with 2 GPUs,
but specifying devices 1 and 2 (Note that the device_ids specified here do not
necessarily correspond to how they appear when running ``nvidia-smi``):

.. code-block:: bash

   neon --gpu nervanagpu2 examples/convnet/mnist-small.yaml --device_id 1 2

The following example illustrates how to train a convnet on the i1k alexnet
model included with neon across 4 GPUs:

.. code-block:: bash

   neon --gpu nervanagpu4 examples/convnet/i1k-alexnet-fp32.yaml

Known Issues
============

Dropout Layers
--------------
Dropout layers occur between fully connected layers, which have replicated
@@ -89,31 +120,9 @@ consistent across model replicas.
In fully connected layers, since activations are replicated on each device, the
batch normalization parameters should be identical without need for sharing.
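
One way to keep such per-replica state (for instance dropout masks) in sync is to
seed every device's generator identically; a toy numpy illustration of that idea,
not necessarily how neon implements it:

.. code-block:: python

   import numpy as np

   keep_prob = 0.5
   shape = (4, 6)

   # each "device" seeds its generator identically before drawing the mask
   masks = [np.random.RandomState(1234).binomial(1, keep_prob, size=shape)
            for _device in range(2)]

   assert np.array_equal(masks[0], masks[1])  # replicas see the same mask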

Usage
-----

The following example illustrates how to train a convnet on the MNIST dataset
across 2 GPUs (devices selected by default):

.. code-block:: bash

   neon --gpu nervanagpu2 examples/convnet/mnist-small.yaml

The following example illustrates how to train the same convnet with 2 GPUs,
but specifying devices 1 and 2 (Note that the device_ids specified here do not
necessarily correspond to how they appear when running ``nvidia-smi``):

.. code-block:: bash

   neon --gpu nervanagpu2 examples/convnet/mnist-small.yaml --device_id 1 2

The following example illustrates how to train a convnet on the i1k alexnet
model included with neon across 4 GPUs:

.. code-block:: bash

   neon --gpu nervanagpu4 examples/convnet/i1k-alexnet-fp32.yaml

References
==========

.. [AK2014] Alex Krizhevsky, One weird trick for parallelizing convolutional neural networks. http://arxiv.org/abs/1404.5997
.. [AK2012] Alex Krizhevsky, Ilya Sutskever, Geoffrey Hinton, ImageNet classification with deep convolutional neural networks. http://www.cs.toronto.edu/~kriz/imgnet-paper-2012.pdf
69 changes: 5 additions & 64 deletions doc/source/installation.rst
@@ -25,7 +25,7 @@ Overview
git clone https://github.com/NervanaSystems/neon.git
cd neon
# configure optional backends like GPU, distributed processing by editing
# configure optional backends like GPU by editing
# setup.cfg with a text editor.
nano setup.cfg
@@ -71,6 +71,8 @@ GPU=nervanagpu
nervanagpu backend
* `maxas <https://github.com/NervanaSystems/maxas/>`_ Assembler for NVIDIA
Maxwell architecture. Required for installing the nervanagpu backend.
* Multiple GPU devices can be utilized in parallel, as described in the
:doc:`distributed` section.

GPU=cudanet
^^^^^^^^^^^
@@ -103,13 +105,6 @@ DEV=1
* `matplotlib <http://matplotlib.org>`_ Currently used for some basic
visualizations like RNN features.

DIST=1
^^^^^^

* `mpi4py <https://github.com/mpi4py/mpi4py>`_ for creation of distributed
Tensors in data and model parallel models.
* `openmpi <http://www.open-mpi.org/>`_ required for mpi4py


Configuration Setup
-------------------
@@ -127,9 +122,9 @@ to the ``make`` command. Below is an example showing the default values for
As shown, the default set of options is fairly restrictive, so only the CPU
based backend will be available:

* Set ``GPU=nervanagpu`` (maxwell) or ``GPU=cudanet`` (kepler), if you have a CUDA capable GPU
* Set ``GPU=nervanagpu`` (maxwell) or ``GPU=cudanet`` (kepler), if you have a
CUDA capable GPU
* Set ``DEV=1``, if you plan to run unit tests, build documentation or develop neon
* Set ``DIST=1``, if you would like to run your model training in parallel via MPI

To override what is defined in ``setup.cfg``, one can pass the appropriate
options on the command-line (useful when doing in-place development). Here's
@@ -140,60 +135,6 @@ an example:
make -e GPU=cudanet DEV=1 test
.. _mpi_install:

Installing MPI on an Ubuntu cluster (for distributed models)
------------------------------------------------------------
neon provides distributed implementations of convnets and MLPs in addition to the non-distributed implementations.
It has been tested with
`OpenMPI 1.8.1 <http://www.open-mpi.org/software/ompi/v1.8/>`_ and
`mpi4py <https://github.com/mpi4py/mpi4py>`_.

1. Install OpenMPI:

.. code-block:: bash

   cd <openmpi_source_dir>
   ./configure --prefix=/<path_to_install_openmpi> --with-cuda
   make all
   sudo make install

Make sure that ``PATH`` includes ``/<path_to_openmpi>/bin`` and
``LD_LIBRARY_PATH`` includes ``/<path_to_openmpi>/lib``

2. Install mpi4py:

.. code-block:: bash

   # set DIST=1 in setup.cfg then run:
   make install
   # or
   make -e DIST=1 install
   # or
   cd <mpi4py_source_dir>
   sudo python setup.py build --configure install

3. Setup ``/etc/hosts`` with IPs of the nodes.
e.g.:

.. code-block:: bash

   192.168.1.1 host1
   192.168.1.2 host2

4. Setup a hosts file to use with the MPI ``-hostfile`` option.
For additional info refer to `this document <http://cs.calvin.edu/curriculum/cs/374/homework/MPI/01/multicoreHostFiles.html>`_.
e.g.:

.. code-block:: bash

   host1 slots=2
   host2 slots=2

5. Read through the :doc:`distributed` section to see how to run neon in data or
model parallel mode using MPI.


Virtualenv
----------
If you are doing work on a multi-user system, don't have sudo access, or just
1 change: 0 additions & 1 deletion doc/source/layers.rst
@@ -40,7 +40,6 @@ Available Layers
neon.layers.fully_connected.FCLayer

neon.layers.convolutional.ConvLayer
neon.layers.convolutional.SubConvLayer

neon.layers.pooling.PoolingLayer
neon.layers.pooling.CrossMapPoolingLayer
2 changes: 1 addition & 1 deletion doc/source/using_neon.rst
@@ -70,7 +70,7 @@ process to train and run inference on a toy network:
Parallelization
---------------
Read through the :doc:`distributed` section to see how to run model training in
data and model parallel modes using MPI.
data and model parallel modes using multiple GPU devices.


.. _train_models:
6 changes: 3 additions & 3 deletions neon/backends/gpu.py
@@ -1109,9 +1109,9 @@ def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
vs_item, the updated velocity.
us_item, used as a temp buffer.
"""
vs_item[:] = vs_item * momentum_coef \
- us_item * learning_rate \
- ps_item * learning_rate * wd
vs_item[:] = (vs_item * momentum_coef -
us_item * learning_rate -
ps_item * learning_rate * wd)
ps_item[:] = ps_item + vs_item

def exp_mavg(self, mavg, newval, rho):
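For reference, the reformatted update above is gradient descent with momentum and
weight decay; a plain numpy sketch of the same arithmetic (illustrative only, with
``grad`` standing in for ``us_item``):

```python
import numpy as np

def gdmwd_step(params, grad, velocity, momentum_coef, learning_rate, wd):
    """One update step: decay the velocity, subtract the scaled gradient and
    weight-decay term, then apply the velocity to the parameters."""
    velocity[:] = (velocity * momentum_coef -
                   grad * learning_rate -
                   params * learning_rate * wd)
    params[:] = params + velocity

params = np.array([1.0, -2.0])
velocity = np.zeros_like(params)
gdmwd_step(params, np.array([0.5, 0.5]), velocity,
           momentum_coef=0.9, learning_rate=0.1, wd=0.01)
print(params)  # [ 0.949 -2.048]
```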
6 changes: 5 additions & 1 deletion neon/backends/tests/test_mgputensor.py
@@ -32,7 +32,11 @@ class TestGPU(object):
def setup(self):
from neon.backends.mgpu import MGPU, MGPUTensor
# this code gets called prior to each test
self.be = MGPU(rng_seed=0, num_dev=2)
try:
self.be = MGPU(rng_seed=0, num_dev=2)
except AssertionError:
# likely that only one GPU device is available
self.be = MGPU(rng_seed=0, num_dev=1)
self.gpt = MGPUTensor

@attr('bbx')
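An alternative to falling back to a single device would be to skip multi-device
tests when fewer than two GPUs are visible. A sketch of that approach, assuming
``pycuda`` is installed (the helper below is hypothetical, not part of the test
suite):

```python
import pytest

def gpu_device_count():
    """Number of visible CUDA devices, or 0 if CUDA cannot be initialized."""
    try:
        import pycuda.driver as drv
        drv.init()
        return drv.Device.count()
    except Exception:
        return 0

@pytest.mark.skipif(gpu_device_count() < 2,
                    reason="requires at least two GPU devices")
def test_two_device_backend():
    from neon.backends.mgpu import MGPU
    be = MGPU(rng_seed=0, num_dev=2)  # same constructor call as in setup()
    assert be is not None
```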
3 changes: 2 additions & 1 deletion neon/datasets/imageset.py
@@ -274,7 +274,8 @@ def init_mini_batch_producer(self, batch_size, setname, predict=False):
for i in range(self.macro_num_decode_buf)]
self.target_macro = [None for i in range(self.macro_num_decode_buf)]
self.lbl_one_hot = [{lbl: self.backend.alloc_host_mem(
(self.macro_size, self.nclass[lbl]), dtype=ibetype)
(self.macro_size, self.nclass[lbl]),
dtype=ibetype)
for lbl in self.label_list}
for i in range(self.macro_num_decode_buf)]

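The rewrapped allocation above reserves one host buffer of shape
``(macro_size, nclass[lbl])`` per label and decode buffer. A toy numpy sketch of
that layout, with made-up shapes and label names for illustration:

```python
import numpy as np

macro_size = 4
nclass = {'l_id': 3}
label_list = ['l_id']

# one (macro_size, nclass[lbl]) buffer per label, mirroring the allocation above
lbl_one_hot = {lbl: np.zeros((macro_size, nclass[lbl]), dtype=np.float32)
               for lbl in label_list}

labels = np.array([0, 2, 1, 2])           # integer labels for one macro batch
buf = lbl_one_hot['l_id']
buf[np.arange(macro_size), labels] = 1.0  # one-hot encode in place
print(buf)
```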
15 changes: 1 addition & 14 deletions neon/tests/sanity_check.py
@@ -29,10 +29,6 @@ def parse_args():
type=int)
parser.add_argument('--gpu', default="", help='Run GPU sanity check '
'(specify one of cudanet or nervanagpu')
parser.add_argument('--datapar', default=0, type=int,
help='Run data parallel sanity check')
parser.add_argument('--modelpar', default=0, type=int,
help='Run model parallel sanity check')
return parser.parse_args()


@@ -54,22 +50,13 @@ def sanity_check(conf_file, result, **be_args):
check_file = os.path.join(script_dir, '..', '..', 'examples',
'convnet', 'synthetic-sanity_check.yaml')
expected_result = 0.5390625
# TODO: modelpar currently broken on synthetic-sanity_check.yaml
# (dimensions not aligned), so skipping for the moment.
# for be in ["cpu", "gpu", "datapar", "modelpar"]:
for be in ["cpu", "gpu", "datapar"]:
for be in ["cpu", "gpu"]:
be_args = {'rng_seed': 0}
if (args.__dict__[be] != 0 and args.__dict__[be] != "" and
args.__dict__[be] != "0"):
if be == "gpu":
be_args[be] = args.__dict__[be]
elif be == "datapar":
be_args[be] = 1
print('{} check '.format(be)),
if be == "datapar":
# temporary hack because we are not running via mpirun.
os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = '0'
os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
sanity_check(check_file, expected_result, **be_args)
print('OK')
sys.exit(res)