@@ -198,18 +198,19 @@ def apply(self, grad_params, gpu=0):


 class ClipAdamOptimizer(optimizer.Optimizer):
-    def __init__(self, learning_rate=3e-4, beta1=0.9, beta2=0.999, epsilon=1e-8, clip_sigmas=0.0, grad_scale=1.0, zero_nans=True, name="ClipAdam"):
+    def __init__(self, learning_rate=3e-4, beta1=0.9, beta2=0.999, epsilon=1e-8, clip_sigmas=0.0, grad_scale=1.0, sat_infs=None, zero_nans=True, name="ClipAdam"):
         super().__init__(False, name)
         self.beta1 = beta1
         self.beta2 = beta2
         self.epsilon = epsilon
+        self.sat_infs = sat_infs
         self.zero_nans = zero_nans
         self.name = name

         with tf.device("/cpu:0"), tf.variable_scope("adam_beta"):

-            if type(learning_rate) is float:
-                learning_rate = tf.constant(learning_rate)
+            if type(learning_rate) in (float, int):
+                learning_rate = tf.constant(float(learning_rate))
             if type(clip_sigmas) in (float, int):
                 clip_sigmas = tf.constant(float(clip_sigmas))
             if type(grad_scale) in (float, int):
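
For orientation, a hedged usage sketch of the widened constructor. The import path, the toy `w`/`loss`, and the TF1-style setup are assumptions (not part of this patch), and the fused CUDA kernels must be built for the op to actually run:

```python
# Sketch only: assumes the blocksparse package is importable and its CUDA ops are compiled;
# the module path below is a guess based on the class names in this diff.
import tensorflow as tf
from blocksparse.optimize import ClipAdamOptimizer  # hypothetical import path

w = tf.get_variable("w", shape=[1024, 1024], dtype=tf.float32)  # placeholder parameter
loss = tf.reduce_sum(tf.square(w))                              # placeholder loss

# sat_infs is simply threaded through the constructor; existing call sites keep working
opt = ClipAdamOptimizer(learning_rate=3e-4, clip_sigmas=3.0, sat_infs=True)
train_op = opt.minimize(loss)  # minimize() comes from the tf.train.Optimizer base class
```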
@@ -240,9 +241,12 @@ def _apply_dense(self, grad, param):
         m = self.get_slot(param, "m")
         v = self.get_slot(param, "v")

+        # a float32 grad could still contain infs from upstream fp16 math
+        sat_infs = grad.dtype is tf.float16 if self.sat_infs is None else self.sat_infs
+
         return adam_op(grad, param, m, v, self.lr, self.grad_scale, self.clip_sigma,
             decay_mean=self.beta1, decay_var=self.beta2, epsilon=self.epsilon,
-            zero_nans=self.zero_nans, lazy_update=hasattr(grad, "lazy")).out_param
+            sat_infs=sat_infs, zero_nans=self.zero_nans, lazy_update=hasattr(grad, "lazy")).out_param

     def _apply_sparse(self, grad, param):
         raise NotImplementedError("Sparse gradient updates are not supported.")
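
A minimal, standalone sketch of the default established by the new `sat_infs` line (the `resolve_sat_infs` helper is hypothetical, not part of the patch; only TensorFlow is assumed): inf saturation is on by default only for fp16 gradients, and a caller whose fp32 gradients come from fp16 math can opt in explicitly.

```python
import tensorflow as tf

def resolve_sat_infs(grad, sat_infs=None):
    # Hypothetical mirror of the default added in _apply_dense:
    # with sat_infs=None, saturation turns on only for fp16 gradients;
    # pass sat_infs=True when fp32 grads were produced by fp16 math upstream.
    return grad.dtype == tf.float16 if sat_infs is None else sat_infs

g16 = tf.zeros([4], dtype=tf.float16)
g32 = tf.zeros([4], dtype=tf.float32)
assert resolve_sat_infs(g16) is True
assert resolve_sat_infs(g32) is False
assert resolve_sat_infs(g32, sat_infs=True) is True
```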
@@ -255,3 +259,93 @@ def _finish(self, update_ops, name_scope):

         return tf.group(*update_ops + [update_beta1, update_beta2], name=name_scope)

+
+class AdamOptimizer(ClipAdamOptimizer):
+    def __init__(self, learning_rate=3e-4, beta1=0.9, beta2=0.999, epsilon=1e-8, grad_scale=1.0, sat_infs=None, zero_nans=True, name="Adam"):
+        super().__init__(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, grad_scale=grad_scale, sat_infs=sat_infs, zero_nans=zero_nans, name=name)
+
+
+############################## AdafactorOptimizer #####################################
+
+adafactor1d_op = _op_module.adafactor1d
+adafactor2d_op = _op_module.adafactor2d
+
+class AdafactorOptimizer(optimizer.Optimizer):
+    def __init__(self, learning_rate=5e-4, beta2=0.999, epsilon=1e-30, clip_thresh=1.0, grad_scale=1.0, sat_infs=None, zero_nans=True, name="Adafactor"):
+        super().__init__(False, name)
+        self.epsilon = epsilon
+        self.sat_infs = sat_infs
+        self.zero_nans = zero_nans
+        self.name = name
+
+        with tf.device("/cpu:0"), tf.variable_scope("adafactor_decay"):
+
+            if type(learning_rate) in (float, int):
+                learning_rate = tf.constant(float(learning_rate))
+            if type(clip_thresh) in (float, int):
+                clip_thresh = tf.constant(float(clip_thresh))
+            if type(grad_scale) in (float, int):
+                grad_scale = tf.constant(float(grad_scale))
+            one = tf.constant(1.0)
+
+            self.decay1_power = tf.Variable(initial_value=beta2, name="decay1_power", trainable=False)
+            self.decay2_power = tf.Variable(initial_value=beta2*beta2, name="decay2_power", trainable=False)
+            self.learn_rate = learning_rate
+            self.clip_thresh = clip_thresh
+            self.grad_scale = grad_scale
+            self.decay_t = tf.constant(beta2)
+            self.decay = self.decay_t * (one - self.decay1_power) / (one - self.decay2_power)
+
+    def _get_beta_accumulators(self):
+        return self.decay1_power, self.decay2_power
+
+    def _non_slot_variables(self):
+        return self._get_beta_accumulators()
+
+    def _create_slots(self, params):
+        # Create slots for the first and second moments.
+        for param in params:
+            if param.shape.ndims == 2 and param.shape[0].value > 1:
+                self._get_or_make_slot(param, tf.zeros(param.shape[1].value), "cv", self.name + "CV")
+                self._get_or_make_slot(param, tf.zeros(param.shape[0].value), "rv", self.name + "RV")
+            elif param.shape.ndims == 1 or (param.shape.ndims == 2 and param.shape[0].value == 1):
+                self._get_or_make_slot(param, tf.zeros(param.shape.num_elements()), "cv", self.name + "CV")
+            else:
+                raise ValueError("only 1 or 2d params are supported")
+
+    def _apply_dense(self, grad, param):
+
+        # a float32 grad could still contain infs from upstream fp16 math
+        sat_infs = grad.dtype is tf.float16 if self.sat_infs is None else self.sat_infs
+
+        if param.shape.ndims == 2 and param.shape[0].value > 1:
+
+            cv = self.get_slot(param, "cv")
+            rv = self.get_slot(param, "rv")
+
+            return adafactor2d_op(param, cv, rv, grad,
+                self.decay, self.learn_rate, self.grad_scale, self.clip_thresh,
+                epsilon=self.epsilon, sat_infs=sat_infs, zero_nans=self.zero_nans).out_param
+
+        elif param.shape.ndims == 1 or (param.shape.ndims == 2 and param.shape[0].value == 1):
+
+            cv = self.get_slot(param, "cv")
+
+            return adafactor1d_op(param, cv, grad,
+                self.decay, self.learn_rate, self.grad_scale, self.clip_thresh,
+                epsilon=self.epsilon, sat_infs=sat_infs, zero_nans=self.zero_nans).out_param
+        else:
+            raise ValueError("only 1 or 2d params are supported")
+
+    def _apply_sparse(self, grad, param):
+        raise NotImplementedError("Sparse gradient updates are not supported.")
+
+    def _finish(self, update_ops, name_scope):
+        # Update the power accumulators.
+        with ops.control_dependencies([self.decay]), tf.device("/cpu:0"):
+            update_decay1 = self.decay1_power.assign(self.decay1_power * self.decay_t)
+            update_decay2 = self.decay2_power.assign(self.decay2_power * self.decay_t)
+
+        return tf.group(*update_ops + [update_decay1, update_decay2], name=name_scope)
+
+
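
As a rough illustration of what the new `AdafactorOptimizer` stores per parameter (a plain-Python sketch of the shapes `_create_slots` allocates; the helper below is hypothetical): a full 2-D weight gets a column-variance vector `cv` and a row-variance vector `rv`, while 1-D or `[1, n]` parameters get only `cv`, so the factored second moment costs O(rows + cols) memory rather than O(rows * cols).

```python
def adafactor_slot_shapes(shape):
    """Hypothetical helper mirroring the slot layout of AdafactorOptimizer._create_slots."""
    if len(shape) == 2 and shape[0] > 1:
        # full matrix: factored second moment, one variance vector per axis
        return {"cv": (shape[1],), "rv": (shape[0],)}
    elif len(shape) == 1 or (len(shape) == 2 and shape[0] == 1):
        # vector (or 1 x n) parameter: a single variance vector
        n = shape[0] if len(shape) == 1 else shape[1]
        return {"cv": (n,)}
    raise ValueError("only 1 or 2d params are supported")

# e.g. a 1024 x 4096 weight keeps 1024 + 4096 statistics instead of ~4.2M
print(adafactor_slot_shapes((1024, 4096)))  # {'cv': (4096,), 'rv': (1024,)}
print(adafactor_slot_shapes((4096,)))       # {'cv': (4096,)}
```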