diff --git a/matlab/caffe/ilsvrc_2012_mean.mat b/matlab/caffe/ilsvrc_2012_mean.mat
new file mode 100644
index 00000000000..f1da25c84a1
Binary files /dev/null and b/matlab/caffe/ilsvrc_2012_mean.mat differ
diff --git a/matlab/caffe/matcaffe.cpp b/matlab/caffe/matcaffe.cpp
index 1f11a2bc66a..d137b31e812 100644
--- a/matlab/caffe/matcaffe.cpp
+++ b/matlab/caffe/matcaffe.cpp
@@ -11,99 +11,78 @@
 
 using namespace caffe;
 
-// A simple wrapper over CaffeNet that runs the forward process.
-struct CaffeNet
-{
-  // The pointer to the internal caffe::Net instance
-	shared_ptr<Net<float> > net_;
-
-  CaffeNet() {}
-  
-  void init(string param_file, string pretrained_param_file) {
-    net_.reset(new Net<float>(param_file));
-    net_->CopyTrainedLayersFrom(pretrained_param_file);
-  }
-
-  virtual ~CaffeNet() {}
-
-  /*
-  inline void check_array_against_blob(
-      PyArrayObject* arr, Blob<float>* blob) {
-    CHECK(PyArray_FLAGS(arr) & NPY_ARRAY_C_CONTIGUOUS);
-    CHECK_EQ(PyArray_NDIM(arr), 4);
-    CHECK_EQ(PyArray_ITEMSIZE(arr), 4);
-    npy_intp* dims = PyArray_DIMS(arr);
-    CHECK_EQ(dims[0], blob->num());
-    CHECK_EQ(dims[1], blob->channels());
-    CHECK_EQ(dims[2], blob->height());
-    CHECK_EQ(dims[3], blob->width());
+// The pointer to the internal caffe::Net instance
+static shared_ptr<Net<float> > net_;
+
+// Five things to be aware of:
+//   caffe uses row-major order
+//   matlab uses column-major order
+//   caffe uses BGR color channel order
+//   matlab uses RGB color channel order
+//   images need to have the data mean subtracted
+//
+// Data coming in from matlab must be single precision and, read in caffe's
+//   row-major order, laid out as [batch_images, channels, height, width],
+// i.e. a matlab array of size [width, height, channels, batch_images].
+// Here is the rough matlab for putting image data into the correct
+// format:
+//   % convert from uint8 to single
+//   im = single(im);
+//   % reshape to a fixed size (e.g., 227x227)
+//   im = imresize(im, [IMAGE_DIM IMAGE_DIM], 'bilinear');
+//   % permute from RGB to BGR and subtract the data mean (already in BGR)
+//   im = im(:,:,[3 2 1]) - data_mean;
+//   % flip width and height to make width the fastest dimension
+//   im = permute(im, [2 1 3]);
+//
+// If you have multiple images, cat them with cat(4, ...)
+//
+// The actual forward function. It takes a cell array with one 4-D array per
+// input blob and returns a cell array with one column vector per output blob.
+static mxArray* do_forward(const mxArray* const bottom) {
+  vector<Blob<float>*>& input_blobs = net_->input_blobs();
+  if (!mxIsCell(bottom) || mxGetNumberOfElements(bottom) != input_blobs.size())
+    mexErrMsgTxt("Input must be a cell array with one entry per input blob");
+  for (unsigned int i = 0; i < input_blobs.size(); ++i) {  // fill input blobs
+    const mxArray* const elem = mxGetCell(bottom, i);  // NULL if cell is unset
+    if (!elem || !mxIsSingle(elem)) mexErrMsgTxt("Inputs must be single");
+    const float* const data_ptr = static_cast<const float*>(mxGetData(elem));
+    switch (Caffe::mode()) {
+    case Caffe::CPU:
+      memcpy(input_blobs[i]->mutable_cpu_data(), data_ptr,
+          sizeof(float) * input_blobs[i]->count());
+      break;
+    case Caffe::GPU:
+      cudaMemcpy(input_blobs[i]->mutable_gpu_data(), data_ptr,
+          sizeof(float) * input_blobs[i]->count(), cudaMemcpyHostToDevice);
+      break;
+    default:
+      LOG(FATAL) << "Unknown Caffe mode.";
+    }  // switch (Caffe::mode())
   }
-  */
-
-  // Data needs to be [images, channels, height, width] where width is the fastest dimension
-  // 
-  // In matlab, reading an image gives [height, width, channels] where height is the fastest dimension
-  //  - want to have the order as [width, height, channels, images]
-  //    (channels in BGR order)
-  //  - 
-  //
-  // The matlab model is: 
-  //   - bottom is a cell array of 4D tensors in the correct format
-  //   - top is allocated in here as a cell array of outputs
-  //
-  // The actual forward function. It takes in a python list of numpy arrays as
-  // input and a python list of numpy arrays as output. The input and output
-  // should all have correct shapes, are single-precisionabcdnt- and c contiguous.
-  //
-  //
-  mxArray* Forward(const mxArray* const bottom) {
-    vector<Blob<float>*>& input_blobs = net_->input_blobs();
-    CHECK_EQ(static_cast<unsigned int>(mxGetDimensions(bottom)[0]), 
-        input_blobs.size());
-    for (unsigned int i = 0; i < input_blobs.size(); ++i) {
-      const mxArray* const elem = mxGetCell(bottom, i);
-      const float* const data_ptr = 
-          reinterpret_cast<const float* const>(mxGetPr(elem));
-      //check_array_against_blob(arr, input_blobs[i]);
-      switch (Caffe::mode()) {
-      case Caffe::CPU:
-        memcpy(input_blobs[i]->mutable_cpu_data(), data_ptr,
-            sizeof(float) * input_blobs[i]->count());
-        break;
-      case Caffe::GPU:
-        cudaMemcpy(input_blobs[i]->mutable_gpu_data(), data_ptr,
-            sizeof(float) * input_blobs[i]->count(), cudaMemcpyHostToDevice);
-        break;
-      default:
-        LOG(FATAL) << "Unknown Caffe mode.";
-      }  // switch (Caffe::mode())
-    }
-    const vector<Blob<float>*>& output_blobs = net_->ForwardPrefilled();
-    mxArray* mx_out = mxCreateCellMatrix(output_blobs.size(), 1);
-    for (unsigned int i = 0; i < output_blobs.size(); ++i) {
-      mxArray* mx_blob = mxCreateNumericMatrix(output_blobs[i]->count(), 
-          1, mxSINGLE_CLASS, mxREAL);
-      mxSetCell(mx_out, i, mx_blob);
-      float* data_ptr = reinterpret_cast<float*>(mxGetPr(mx_blob));
-      //check_array_against_blob(arr, output_blobs[i]);
-      switch (Caffe::mode()) {
-      case Caffe::CPU:
-        memcpy(data_ptr, output_blobs[i]->cpu_data(),
-            sizeof(float) * output_blobs[i]->count());
-        break;
-      case Caffe::GPU:
-        cudaMemcpy(data_ptr, output_blobs[i]->gpu_data(),
-            sizeof(float) * output_blobs[i]->count(), cudaMemcpyDeviceToHost);
-        break;
-      default:
-        LOG(FATAL) << "Unknown Caffe mode.";
-      }  // switch (Caffe::mode())
-    }
-
-    return mx_out;
+  const vector<Blob<float>*>& output_blobs = net_->ForwardPrefilled();
+  mxArray* mx_out = mxCreateCellMatrix(output_blobs.size(), 1);
+  for (unsigned int i = 0; i < output_blobs.size(); ++i) {
+    mxArray* mx_blob = mxCreateNumericMatrix(output_blobs[i]->count(), 
+        1, mxSINGLE_CLASS, mxREAL);
+    mxSetCell(mx_out, i, mx_blob);
+    float* data_ptr = reinterpret_cast<float*>(mxGetPr(mx_blob));
+    switch (Caffe::mode()) {
+    case Caffe::CPU:
+      memcpy(data_ptr, output_blobs[i]->cpu_data(),
+          sizeof(float) * output_blobs[i]->count());
+      break;
+    case Caffe::GPU:
+      cudaMemcpy(data_ptr, output_blobs[i]->gpu_data(),
+          sizeof(float) * output_blobs[i]->count(), cudaMemcpyDeviceToHost);
+      break;
+    default:
+      LOG(FATAL) << "Unknown Caffe mode.";
+    }  // switch (Caffe::mode())
   }
 
-};
+  return mx_out;
+}
 
 // The caffe::Caffe utility functions.
 static void set_mode_cpu(MEX_ARGS) { 
@@ -123,19 +102,38 @@ static void set_phase_test(MEX_ARGS) {
 }
 
 static void set_device(MEX_ARGS) { 
+  if (nrhs != 1) {
+    LOG(ERROR) << "Only given " << nrhs << " arguments";
+    mexErrMsgTxt("Wrong number of arguments");
+  }
+
   int device_id = static_cast<int>(mxGetScalar(prhs[0]));
   Caffe::SetDevice(device_id); 
 }
 
-static CaffeNet net;
+static void init(MEX_ARGS) {
+  if (nrhs != 2) {
+    LOG(ERROR) << "Only given " << nrhs << " arguments";
+    mexErrMsgTxt("Wrong number of arguments");
+  }
+  // mxArrayToString returns NULL when an argument is not a char array.
+  char* param_file = mxArrayToString(prhs[0]);
+  char* model_file = mxArrayToString(prhs[1]);
 
-static void net_init(MEX_ARGS) {
-  net.init("/home/rbg/working/caffe/examples/imagenet_deploy.prototxt", 
-           "/home/rbg/working/caffe/examples/alexnet_train_iter_470000");
+  if (!param_file || !model_file) mexErrMsgTxt("File names must be strings");
+  net_.reset(new Net<float>(string(param_file)));
+  net_->CopyTrainedLayersFrom(string(model_file));
+  mxFree(param_file);
+  mxFree(model_file);
 }
 
-static void net_forward(MEX_ARGS) {
-  plhs[0] = net.Forward(prhs[0]);
+static void forward(MEX_ARGS) {
+  if (nrhs != 1) {
+    LOG(ERROR) << "Only given " << nrhs << " arguments";
+    mexErrMsgTxt("Wrong number of arguments");
+  }
+  if (net_.get() == NULL) mexErrMsgTxt("Call init before forward");
+  plhs[0] = do_forward(prhs[0]);  // net_ is known to be set at this point
 }
 
 /** -----------------------------------------------------------------
@@ -148,8 +146,8 @@ struct handler_registry {
 
 static handler_registry handlers[] = {
   // Public API functions
-  { "forward",            net_forward     },
-  { "init",               net_init        },
+  { "forward",            forward         },
+  { "init",               init            },
   { "set_mode_cpu",       set_mode_cpu    },
   { "set_mode_gpu",       set_mode_gpu    },
   { "set_phase_train",    set_phase_train },
@@ -164,19 +162,27 @@ static handler_registry handlers[] = {
  ** matlab entry point: caffe(api_command, arg1, arg2, ...)
  **/
 void mexFunction(MEX_ARGS) {
-  // TODO: check args
+  if (nrhs == 0) {
+    LOG(ERROR) << "No API command given";
+    mexErrMsgTxt("An API command is required");  // does not return
+  }
+  // NOTE: mxArrayToString below yields NULL when prhs[0] is not a char array.
   { // Handle input command
     char *cmd = mxArrayToString(prhs[0]);
-    //bool dispatched = false;
+    bool dispatched = false;
+    if (cmd == NULL) mexErrMsgTxt("API command must be a string");
     // Dispatch to cmd handler
     for (int i = 0; handlers[i].func != NULL; i++) {
       if (handlers[i].cmd.compare(cmd) == 0) {
         handlers[i].func(nlhs, plhs, nrhs-1, prhs+1);
-        //dispatched = true;
+        dispatched = true;
         break;
       }
     }
+    if (!dispatched) {
+      LOG(ERROR) << "Unknown command `" << cmd << "'";
+      mexErrMsgTxt("API command not recognized");
+    }
     mxFree(cmd);
-    //checkM(dispatched, "Command not found!");
   }
 }
diff --git a/matlab/caffe/matcaffe_demo.m b/matlab/caffe/matcaffe_demo.m
index 6b4ca2f79e1..d070268412c 100644
--- a/matlab/caffe/matcaffe_demo.m
+++ b/matlab/caffe/matcaffe_demo.m
@@ -1,46 +1,62 @@
-function res = matcaffe_demo(im, gpu)
+function scores = matcaffe_demo(im, use_gpu)
+% scores = matcaffe_demo(im, use_gpu)
+% 
+% Demo of the matlab wrapper using the ILSVRC network.
+%
+% input
+%   im       color image as uint8 HxWx3
+%   use_gpu  1 to use the GPU, 0 to use the CPU
+%
+% output
+%   scores   1000-dimensional ILSVRC score vector
 
-% load image net mean
-%  // In matlab, reading an image gives [height, width, channels] where height is the fastest dimension
-%  //  - want to have the order as [width, height, channels, images]
-%  //    (channels in BGR order)
-%  //  - 
+model_def_file = '../../examples/imagenet_deploy.prototxt';
+% NOTE: you'll have to get the pre-trained ILSVRC network
+model_file = '../../examples/alexnet_train_iter_470000';
 
-% 1: swap channel order to BGR
-% 2: extract 5 crops and their flips
-% 3: swap rows and columns and concat along 4th dim
-% 4: wrap in cell aray
+% init caffe network (spews logging info)
+caffe('init', model_def_file, model_file);
 
-caffe('init');
-if gpu
+% set to use GPU or CPU
+if exist('use_gpu', 'var') && use_gpu
   caffe('set_mode_gpu');
 else
   caffe('set_mode_cpu');
 end
+
+% put into test mode
 caffe('set_phase_test');
+
+% prepare oversampled input
 tic;
-blob = {prepare_image(im)};
+input_data = {prepare_image(im)};
 toc;
+
+% do forward pass to get scores
 tic;
-res = caffe('forward', blob);
+scores = caffe('forward', input_data);
 toc;
-res = reshape(res{1}, [1000 10]);
-res = mean(res, 2);
+
+% average output scores
+scores = reshape(scores{1}, [1000 10]);
+scores = mean(scores, 2);
 
 
+% ------------------------------------------------------------------------
 function images = prepare_image(im)
+% ------------------------------------------------------------------------
 d = load('ilsvrc_2012_mean');
-image_mean = d.image_mean;
+IMAGE_MEAN = d.image_mean;
 IMAGE_DIM = 256;
 CROPPED_DIM = 227;
 
 % resize to fixed input size
 im = single(im);
 im = imresize(im, [IMAGE_DIM IMAGE_DIM], 'bilinear');
-% permute from RGB to BGR
-im = im(:,:,[3 2 1]) - image_mean;
+% permute from RGB to BGR (IMAGE_MEAN is already BGR)
+im = im(:,:,[3 2 1]) - IMAGE_MEAN;
 
-% oversample
+% oversample (4 corners, center, and their x-axis flips)
 images = zeros(CROPPED_DIM, CROPPED_DIM, 3, 10, 'single');
 indices = [0 IMAGE_DIM-CROPPED_DIM] + 1;
 curr = 1;