@@ -483,15 +483,13 @@ class Adam(Optimizer):
     clip_gradient : float, optional
         clip gradient in range [-clip_gradient, clip_gradient]
     """
-    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8,
+    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                  decay_factor=(1 - 1e-8), **kwargs):
         super(Adam, self).__init__(learning_rate=learning_rate, **kwargs)
         self.beta1 = beta1
         self.beta2 = beta2
         self.epsilon = epsilon
         self.decay_factor = decay_factor
-        self.time = 0
-        self.time_first_index = None
 
     def create_state(self, index, weight):
         """Create additional optimizer state: mean, variance
@@ -502,7 +500,6 @@ def create_state(self, index, weight):
             The weight data
 
         """
-        self.time_first_index = None  # time is incremented only on the first index
         return (zeros(weight.shape, weight.context, dtype=weight.dtype),  # mean
                 zeros(weight.shape, weight.context, dtype=weight.dtype))  # variance
 
@@ -528,37 +525,25 @@ def update(self, index, weight, grad, state):
         lr = self._get_lr(index)
         self._update_count(index)
 
+        t = self._index_update_count[index]
         mean, variance = state
 
-        # increment time only when the first parameters is called
-        if self.time_first_index is None:
-            self.time_first_index = index
-            self.time = 0  # all parameters share the same time
-        elif self.time_first_index == index:
-            self.time += 1
+        grad *= self.rescale_grad
+        if self.clip_gradient is not None:
+            clip(grad, -self.clip_gradient, self.clip_gradient, out=grad)
 
-        t1 = self.time + 1
-        learning_rate = (lr *
-                         math.sqrt(1. - self.beta2**t1) /
-                         (1. - self.beta1**t1))
-        beta_1t = self.beta1 * self.decay_factor**(t1 - 1)
+        mean[:] = self.beta1 * mean + (1. - self.beta1) * grad
+        variance[:] = self.beta2 * variance + (1. - self.beta2) * grad * grad
 
-        grad = grad * self.rescale_grad
-        if self.clip_gradient is not None:
-            grad = clip(grad, -self.clip_gradient, self.clip_gradient)
+        coef1 = 1. - self.beta1**t
+        coef2 = 1. - self.beta2**t
+        lr *= math.sqrt(coef2) / coef1
+
+        weight[:] -= lr * mean / (sqrt(variance) + self.epsilon)
 
-        mean_t = beta_1t * mean + (1. - beta_1t) * grad
-        variance_t = (self.beta2 * variance +
-                      (1. - self.beta2) * grad * grad)
-        step = (learning_rate * mean_t /
-                (sqrt(variance_t) + self.epsilon))
         wd = self._get_wd(index)
         if wd > 0.:
-            step += lr * wd * weight
-
-        weight[:] += -step
-        mean[:] = mean_t
-        variance[:] = variance_t
+            weight[:] -= (lr * wd) * weight
 
 @register
 class AdaGrad(Optimizer):
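For reference, the replacement update folds both Adam bias-correction terms into the effective step size instead of maintaining a shared time counter across parameters. The following is a minimal NumPy sketch of that rule, assuming plain arrays stand in for NDArrays; the function name adam_step and its argument names are illustrative, not MXNet API.

# Minimal sketch of the new update rule (assumed NumPy stand-in, not MXNet NDArray ops).
import math
import numpy as np

def adam_step(weight, grad, mean, variance, t,
              lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
              wd=0.0, rescale_grad=1.0, clip_gradient=None):
    grad = grad * rescale_grad
    if clip_gradient is not None:
        grad = np.clip(grad, -clip_gradient, clip_gradient)
    # exponential moving averages of the gradient and the squared gradient
    mean[:] = beta1 * mean + (1. - beta1) * grad
    variance[:] = beta2 * variance + (1. - beta2) * grad * grad
    # fold both bias-correction terms into the effective step size
    lr_t = lr * math.sqrt(1. - beta2**t) / (1. - beta1**t)
    weight[:] -= lr_t * mean / (np.sqrt(variance) + epsilon)
    # weight decay applied as a separate shrinkage step, as in the new code
    if wd > 0.:
        weight[:] -= (lr_t * wd) * weight
    return weight, mean, variance

Here t is the per-parameter update count, which replaces the removed self.time / self.time_first_index bookkeeping.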