Commit 4be9016

lots of fixes + blocksparse transformer ops for volta fp16 (see enwiki8 example)
1 parent 245b43a commit 4be9016

27 files changed: +4670, -110 lines

Makefile

Lines changed: 56 additions & 3 deletions

@@ -48,14 +48,15 @@ CCFLAGS=-std=c++11 -O3 -fPIC -DGOOGLE_CUDA=1 -D_GLIBCXX_USE_CXX11_ABI=$(TF_ABI)
 
 NVCCFLAGS=-DGOOGLE_CUDA=1 -D_GLIBCXX_USE_CXX11_ABI=$(TF_ABI) -O3 -Xcompiler -fPIC -std=c++11 --prec-div=false --prec-sqrt=false \
 	-arch=sm_61 \
-	-gencode=arch=compute_35,code=sm_35 \
-	-gencode=arch=compute_50,code=sm_50 \
-	-gencode=arch=compute_52,code=sm_52 \
 	-gencode=arch=compute_60,code=sm_60 \
 	-gencode=arch=compute_61,code=sm_61 \
 	-gencode=arch=compute_70,code=sm_70 \
 	-gencode=arch=compute_61,code=compute_61
 
+# -gencode=arch=compute_35,code=sm_35 \
+# -gencode=arch=compute_50,code=sm_50 \
+# -gencode=arch=compute_52,code=sm_52 \
+
 OBJS=\
 	$(TARGET)/batch_norm_op.o \
 	$(TARGET)/blocksparse_conv_op.o \

@@ -80,6 +81,7 @@ CU_OBJS=\
 	$(TARGET)/blocksparse_l2_norm_op_gpu.cu.o \
 	$(TARGET)/blocksparse_matmul_op_gpu.cu.o \
 	$(TARGET)/blocksparse_matmul_gated_op_gpu.cu.o \
+	$(TARGET)/blocksparse_transformer_op_gpu.cu.o \
 	$(TARGET)/cwise_linear_op_gpu.cu.o \
 	$(TARGET)/edge_bias_op_gpu.cu.o \
 	$(TARGET)/ew_op_gpu.cu.o \

@@ -108,3 +110,54 @@ $(TARGET)/%.o: src/%.cc src/*.h $(TARGET)/blocksparse_kernels.h
 	g++ $(CCFLAGS) -c $< -o $@
 
 
+# bazel-0.17.1-installer-linux-x86_64.sh (--user)
+# NVIDIA-Linux-x86_64-396.37.run
+# cuda_9.2.148_396.37_linux
+# cudnn-9.2-linux-x64-v7.2.1.38.tgz
+# nccl_2.2.13-1+cuda9.2_x86_64.txz
+
+# apt-get install mpich
+
+# uncomment:
+# https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/core/kernels/batch_matmul_op_real.cc#L35
+
+
+# ls -l /usr/local
+# lrwxrwxrwx 1 root root 19 Jul 14 13:11 cuda -> /usr/local/cuda-9.2/
+# drwxr-xr-x 18 root root 4096 Sep 14 16:12 cuda-9.2/
+# lrwxrwxrwx 1 root root 39 Jul 12 17:01 nccl -> /usr/local/nccl_2.2.13-1+cuda9.2_x86_64/
+# drwxr-xr-x 4 root root 4096 Jul 12 16:27 nccl_2.2.13-1+cuda9.2_x86_64/
+
+# export TF_NEED_CUDA=1
+# export TF_NEED_MKL=0
+# export TF_NEED_GCP=0
+# export TF_NEED_HDFS=0
+# export TF_NEED_OPENCL=0
+# export TF_NEED_AWS=0
+# export TF_NEED_JEMALLOC=0
+# export TF_NEED_KAFKA=0
+# export TF_NEED_OPENCL_SYCL=0
+# export TF_NEED_COMPUTECPP=0
+# export TF_CUDA_CLANG=0
+# export TF_NEED_TENSORRT=0
+# export TF_ENABLE_XLA=0
+# export TF_NEED_GDR=0
+# export TF_NEED_VERBS=0
+# export TF_NEED_MPI=0
+# export TF_CUDA_VERSION="9.2"
+# export TF_CUDNN_VERSION="7.2"
+# export TF_NCCL_VERSION="2.2"
+# export TF_CUDA_COMPUTE_CAPABILITIES="6.0,7.0"
+# export GCC_HOST_COMPILER_PATH="/usr/bin/gcc"
+# export CUDA_TOOLKIT_PATH="/usr/local/cuda"
+# export CUDNN_INSTALL_PATH="/usr/local/cuda"
+# export NCCL_INSTALL_PATH="/usr/local/nccl"
+
+# alias tfbuild0="bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package"
+# alias tfbuild1="bazel-bin/tensorflow/tools/pip_package/build_pip_package ~"
+# alias tfbuild2="pip uninstall tensorflow"
+# alias tfbuild3="pip install ~/tensorflow-*.whl"
+
+# git clone blocksparse
+# make compile
+# pip install dist/*.whl
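
The commented notes above record the environment used for this build: bazel 0.17.1, CUDA 9.2, cuDNN 7.2, NCCL 2.2, a TensorFlow 1.10 pip package built from source, then make compile and installing the blocksparse wheel. A quick post-install sanity check (an illustrative sketch, not part of the commit) could look like:

# Illustrative only: importing a blocksparse submodule loads the compiled op
# library; then check that TensorFlow sees a CUDA device. The new transformer
# kernels target sm_70 (Volta) fp16, so a V100-class GPU is assumed here.
import tensorflow as tf
import blocksparse.ewops as ew

print("TF version:", tf.VERSION)
print("GPU available:", tf.test.is_gpu_available())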

blocksparse/ewops.py

Lines changed: 10 additions & 0 deletions

@@ -149,6 +149,16 @@ def ew_z_xb_grad(op, dz):
 
     raise ValueError("bad op code")
 
+############################## Filter Infinity/Nans + scale #####################################
+
+filter_infinity_op = _op_module.filter_infinity
+
+def filter_infinity(x, scale=1.0, zero_nans=True):
+    return filter_infinity_op(x, scale, zero_nans=zero_nans)
+
+@ops.RegisterGradient("FilterInfinity")
+def filter_infinity_grad(op, dy):
+    return filter_infinity_op(dy, op.inputs[1], zero_nans=op.get_attr("zero_nans")), None
 
 ############################## Float Cast #####################################
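
The new filter_infinity op scrubs non-finite values and applies a scale in one pass, and its registered gradient pushes dy through the same op with the same scale. A hedged usage sketch (the shapes and the 1/128 scale are illustrative assumptions, not from the commit):

import tensorflow as tf
from blocksparse.ewops import filter_infinity

# an fp16 tensor that may pick up infs/NaNs from loss-scaled fp16 math
x = tf.cast(tf.random_normal([8, 1024]), tf.float16)

# undo a 128x loss scale; zero_nans=True also zeroes any NaNs
y = filter_infinity(x, scale=1.0 / 128.0, zero_nans=True)

with tf.Session() as sess:
    print(sess.run(y).shape)  # (8, 1024)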

blocksparse/nccl.py

Lines changed: 5 additions & 2 deletions

@@ -43,7 +43,7 @@ def allreduce(x, sync_size=0, num_comms=2, logfile="", rank=0, prereduce=0, name
     return ret
 
 
-def group_allreduce(grads, parms, search_strings=None, cast_map=None, num_comms=2, prereduce=0):
+def group_allreduce(grads, parms, search_strings=None, cast_map=None, cast_all=None, num_comms=2, prereduce=0):
 
     # if no grouping specified, create one group to reduce at the end (no overlap with compute)
     if search_strings is None:

@@ -55,7 +55,10 @@ def group_allreduce(grads, parms, search_strings=None, cast_map=None, num_comms=
         for name, group16, group32 in groups:
             if name == search_strings[-1] or name in param.name:
 
-                if cast_map is not None and name in cast_map:
+                if cast_all is not None:
+                    grad = float_cast(grad, dtype=cast_all)
+
+                elif cast_map is not None and name in cast_map:
                     grad = float_cast(grad, dtype=cast_map[name])
 
                 if grad.dtype.base_dtype is tf.float16:
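
The new cast_all argument casts every gradient with float_cast before the NCCL reduction, instead of only the groups named in cast_map. A hedged sketch of how it might be used (toy model; MPI/NCCL setup omitted; it is assumed from the surrounding source that the reduced gradients are written back into the grads list):

import tensorflow as tf
from blocksparse.nccl import group_allreduce

x = tf.random_normal([32, 256])
w = tf.get_variable("w", [256, 256])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

params = tf.trainable_variables()
grads  = tf.gradients(loss, params)

# reduce everything in fp16 to cut allreduce bandwidth (assumes an fp16
# reduction is acceptable for this model)
group_allreduce(grads, params, cast_all=tf.float16, num_comms=2)

# grads now holds the reduced (fp16) gradients, ready for an optimizer that
# accepts fp16 gradients, such as the ClipAdamOptimizer in blocksparse/optimize.py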

blocksparse/norms.py

Lines changed: 8 additions & 1 deletion

@@ -267,4 +267,11 @@ def _magic64u(d):
     magic, shift = _magic32u(nmax, d)
     if magic != 1:
         shift -= 32
-    return (magic, shift)
+    return (magic, shift)
+
+# for d in range(1,1000000):
+#     magic, shift = _magic32u(0x7fffffff, d)
+#     if shift < 32 or len(hex(magic)) > 10:
+#         if magic != 1:
+#             print(d, magic, shift)
+
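
The commented-out loop above scans for divisors whose magic numbers would be awkward for the GPU kernels (a shift below 32, or a magic wider than 32 bits). For context, a small illustrative sketch (not part of the commit) of the identity these magic numbers provide: unsigned division by a fixed d becomes a multiply and a shift.

from blocksparse.norms import _magic32u

nmax, d = 0x7fffffff, 1000
magic, shift = _magic32u(nmax, d)

# the multiply-and-shift form of division that the CUDA kernels rely on
for n in (0, 1, 999, 1000, 123456789, nmax):
    assert (n * magic) >> shift == n // d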

blocksparse/optimize.py

Lines changed: 98 additions & 4 deletions

@@ -198,18 +198,19 @@ def apply(self, grad_params, gpu=0):
 
 
 class ClipAdamOptimizer(optimizer.Optimizer):
-    def __init__(self, learning_rate=3e-4, beta1=0.9, beta2=0.999, epsilon=1e-8, clip_sigmas=0.0, grad_scale=1.0, zero_nans=True, name="ClipAdam"):
+    def __init__(self, learning_rate=3e-4, beta1=0.9, beta2=0.999, epsilon=1e-8, clip_sigmas=0.0, grad_scale=1.0, sat_infs=None, zero_nans=True, name="ClipAdam"):
         super().__init__(False, name)
         self.beta1 = beta1
         self.beta2 = beta2
         self.epsilon = epsilon
+        self.sat_infs = sat_infs
         self.zero_nans = zero_nans
         self.name = name
 
         with tf.device("/cpu:0"), tf.variable_scope("adam_beta"):
 
-            if type(learning_rate) is float:
-                learning_rate = tf.constant(learning_rate)
+            if type(learning_rate) in (float, int):
+                learning_rate = tf.constant(float(learning_rate))
             if type(clip_sigmas) in (float, int):
                 clip_sigmas = tf.constant(float(clip_sigmas))
             if type(grad_scale) in (float, int):

@@ -240,9 +241,12 @@ def _apply_dense(self, grad, param):
         m = self.get_slot(param, "m")
         v = self.get_slot(param, "v")
 
+        # a float32 grad could still contain infs from upstream fp16 math
+        sat_infs = grad.dtype is tf.float16 if self.sat_infs is None else self.sat_infs
+
         return adam_op(grad, param, m, v, self.lr, self.grad_scale, self.clip_sigma,
             decay_mean=self.beta1, decay_var=self.beta2, epsilon=self.epsilon,
-            zero_nans=self.zero_nans, lazy_update=hasattr(grad, "lazy")).out_param
+            sat_infs=sat_infs, zero_nans=self.zero_nans, lazy_update=hasattr(grad, "lazy")).out_param
 
     def _apply_sparse(self, grad, param):
         raise NotImplementedError("Sparse gradient updates are not supported.")

@@ -255,3 +259,93 @@ def _finish(self, update_ops, name_scope):
 
         return tf.group(*update_ops + [update_beta1, update_beta2], name=name_scope)
 
+
+class AdamOptimizer(ClipAdamOptimizer):
+    def __init__(self, learning_rate=3e-4, beta1=0.9, beta2=0.999, epsilon=1e-8, grad_scale=1.0, sat_infs=None, zero_nans=True, name="Adam"):
+        super().__init__(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, grad_scale=grad_scale, sat_infs=sat_infs, zero_nans=zero_nans, name=name)
+
+
+############################## AdafactorOptimizer #####################################
+
+adafactor1d_op = _op_module.adafactor1d
+adafactor2d_op = _op_module.adafactor2d
+
+class AdafactorOptimizer(optimizer.Optimizer):
+    def __init__(self, learning_rate=5e-4, beta2=0.999, epsilon=1e-30, clip_thresh=1.0, grad_scale=1.0, sat_infs=None, zero_nans=True, name="Adafactor"):
+        super().__init__(False, name)
+        self.epsilon = epsilon
+        self.sat_infs = sat_infs
+        self.zero_nans = zero_nans
+        self.name = name
+
+        with tf.device("/cpu:0"), tf.variable_scope("adafactor_decay"):
+
+            if type(learning_rate) in (float, int):
+                learning_rate = tf.constant(float(learning_rate))
+            if type(clip_thresh) in (float, int):
+                clip_thresh = tf.constant(float(clip_thresh))
+            if type(grad_scale) in (float, int):
+                grad_scale = tf.constant(float(grad_scale))
+            one = tf.constant(1.0)
+
+            self.decay1_power = tf.Variable(initial_value=beta2, name="decay1_power", trainable=False)
+            self.decay2_power = tf.Variable(initial_value=beta2*beta2, name="decay2_power", trainable=False)
+            self.learn_rate = learning_rate
+            self.clip_thresh = clip_thresh
+            self.grad_scale = grad_scale
+            self.decay_t = tf.constant(beta2)
+            self.decay = self.decay_t * (one - self.decay1_power) / (one - self.decay2_power)
+
+    def _get_beta_accumulators(self):
+        return self.decay1_power, self.decay2_power
+
+    def _non_slot_variables(self):
+        return self._get_beta_accumulators()
+
+    def _create_slots(self, params):
+        # Create slots for the first and second moments.
+        for param in params:
+            if param.shape.ndims == 2 and param.shape[0].value > 1:
+                self._get_or_make_slot(param, tf.zeros(param.shape[1].value), "cv", self.name + "CV")
+                self._get_or_make_slot(param, tf.zeros(param.shape[0].value), "rv", self.name + "RV")
+            elif param.shape.ndims == 1 or (param.shape.ndims == 2 and param.shape[0].value == 1):
+                self._get_or_make_slot(param, tf.zeros(param.shape.num_elements()), "cv", self.name + "CV")
+            else:
+                raise ValueError("only 1 or 2d params are supported")
+
+    def _apply_dense(self, grad, param):
+
+        # a float32 grad could still contain infs from upstream fp16 math
+        sat_infs = grad.dtype is tf.float16 if self.sat_infs is None else self.sat_infs
+
+        if param.shape.ndims == 2 and param.shape[0].value > 1:
+
+            cv = self.get_slot(param, "cv")
+            rv = self.get_slot(param, "rv")
+
+            return adafactor2d_op(param, cv, rv, grad,
+                self.decay, self.learn_rate, self.grad_scale, self.clip_thresh,
+                epsilon=self.epsilon, sat_infs=sat_infs, zero_nans=self.zero_nans).out_param
+
+        elif param.shape.ndims == 1 or (param.shape.ndims == 2 and param.shape[0].value == 1):
+
+            cv = self.get_slot(param, "cv")
+
+            return adafactor1d_op(param, cv, grad,
+                self.decay, self.learn_rate, self.grad_scale, self.clip_thresh,
+                epsilon=self.epsilon, sat_infs=sat_infs, zero_nans=self.zero_nans).out_param
+        else:
+            raise ValueError("only 1 or 2d params are supported")
+
+    def _apply_sparse(self, grad, param):
+        raise NotImplementedError("Sparse gradient updates are not supported.")
+
+    def _finish(self, update_ops, name_scope):
+        # Update the power accumulators.
+        with ops.control_dependencies([ self.decay ]), tf.device("/cpu:0"):
+            update_decay1 = self.decay1_power.assign(self.decay1_power * self.decay_t)
+            update_decay2 = self.decay2_power.assign(self.decay2_power * self.decay_t)
+
+        return tf.group(*update_ops + [update_decay1, update_decay2], name=name_scope)
+
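
A hedged usage sketch of the optimizers added above, in a loss-scaled mixed-precision setup (the toy model and the 128x loss scale are illustrative assumptions; the precise inf/NaN handling behind sat_infs and zero_nans lives in the adam/adafactor CUDA kernels):

import tensorflow as tf
from blocksparse.optimize import AdamOptimizer, AdafactorOptimizer

loss_scale = 128.0

x = tf.random_normal([32, 256])
w = tf.get_variable("w", [256, 1024])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

params = tf.trainable_variables()
grads  = tf.gradients(loss * loss_scale, params)

# grad_scale undoes the loss scale inside the update op; sat_infs defaults to
# True for fp16 gradients (see _apply_dense above) and can be forced here.
opt = AdafactorOptimizer(learning_rate=5e-4, grad_scale=1.0 / loss_scale)
# opt = AdamOptimizer(learning_rate=3e-4, grad_scale=1.0 / loss_scale, sat_infs=True)

train_op = opt.apply_gradients(zip(grads, params))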
