[Embedding] Support EmbeddingVariable and gather/apply ops place on GPU. (DeepRec-AI#21)
huangjin1995 · Apr 16, 2022 · a3d1e5f · a3d1e5f
1 parent d7e00b1
commit a3d1e5f
Show file tree

Hide file tree

Showing 29 changed files with 2,864 additions and 20 deletions.
diff --git a/cibuild/cpu-ut/cpu-python-ut.sh b/cibuild/cpu-ut/cpu-python-ut.sh
@@ -60,6 +60,7 @@ export TF_BUILD_BAZEL_TARGET="$TF_ALL_TARGETS "\
 "-//tensorflow/python:work_queue_test "\
 "-//tensorflow/python/keras:metrics_test "\
 "-//tensorflow/python/keras:training_test "\
+"-//tensorflow/python:embedding_variable_ops_gpu_test "\
 
 for i in $(seq 1 3); do
     [ $i -gt 1 ] && echo "WARNING: cmd execution failed, will retry in $((i-1)) times later" && sleep 2

diff --git a/cibuild/gpu-ut.sh b/cibuild/gpu-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-c-ut.sh b/cibuild/gpu-ut/gpu-c-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-cc-ut.sh b/cibuild/gpu-ut/gpu-cc-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-contrib-ut.sh b/cibuild/gpu-ut/gpu-contrib-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-core-ut.sh b/cibuild/gpu-ut/gpu-core-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-examples-ut.sh b/cibuild/gpu-ut/gpu-examples-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-java-ut.sh b/cibuild/gpu-ut/gpu-java-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-js-ut.sh b/cibuild/gpu-ut/gpu-js-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-python-ut.sh b/cibuild/gpu-ut/gpu-python-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/cibuild/gpu-ut/gpu-stream_executor-ut.sh b/cibuild/gpu-ut/gpu-stream_executor-ut.sh
@@ -16,6 +16,7 @@
 
 set -eo pipefail
 
+export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
 export TF_NEED_TENSORRT=0
 export TF_NEED_ROCM=0
 export TF_NEED_COMPUTECPP=0

diff --git a/docs/Embedding-Variable-GPU.md b/docs/Embedding-Variable-GPU.md
@@ -0,0 +1,29 @@
+# Embedding Variable GPU支持
+## 功能介绍
+GPU具有强大的并行计算能力，对于EmbeddingVariable底层的Hash Table查找、插入等操作也具有明显的加速作用。同时，对于模型计算部分若使用GPU，则使用GPU上的EmbeddingVariable也可避免Host和Device上的数据拷贝，提高整体性能。因此我们增加了EmbeddingVariable的GPU支持。
+
+当前版本的EmbeddingVariable GPU实现暂时只支持部分基础功能。对于特征淘汰、特征准入、特征统计等功能暂未支持。对应的优化器现在提供了Adagrad以及FtrlOptimizer的支持。
+
+
+## 使用方法
+使用开启了GPU支持的DeepRec版本，在拥有NVIDIA GPU的环境下，EmbeddingVariable会自动被放置在GPU device上。
+
+我们也可手动指定device，将其放置于GPU上
+```python
+with tf.device('/gpu:0'):
+    var = tf.get_embedding_variable("var_0",
+                                    embedding_dim=3,
+                                    initializer=tf.ones_initializer(tf.float32),
+                                    partitioner=tf.fixed_size_partitioner(num_shards=4))
+```
+
+或者使用feature_column
+```python
+columns = tf.feature_column.categorical_column_with_embedding("col_emb", dtype=tf.dtypes.int64)
+with tf.device('/gpu:0'):
+    W = tf.feature_column.embedding_column(categorical_column=columns,
+                dimension=3,
+                initializer=tf.ones_initializer(tf.dtypes.float32))
+```
+
+注意：GPU版本的EmbeddingVariable暂时无法和TensorFlow自带Saver一起使用，我们后面会修复这个问题。
diff --git a/docs/index.md b/docs/index.md
@@ -27,6 +27,7 @@ Feature-Filter
 Dynamic-dimension-Embedding-Variable
 Adaptive-Embedding
 Multi-Hash-Variable
+Embedding-Variable-GPU
 ```
 
 ```{toctree}

diff --git a/modelzoo/features/EmbeddingVariable/WDL/README.md b/modelzoo/features/EmbeddingVariable/WDL/README.md
@@ -109,6 +109,14 @@ input:                                          |
 - `image`: where nodes can pull the docker image.
 - `claimName`: PVC name.
 
+### Use GPU
+  In an environment with NVIDIA GPUs, the EmbeddingVariables, along with other Ops that have GPU support, will automatically be placed on GPU.
+  Since the current implementation of GPU EmbeddingVariable cannot work with TensorFlow Saver, we need to run with:
+  ```
+  python train.py --no_saver
+  ```
+  (Note: Since some Ops in this model do not have GPU support, performance on GPUs is not good.)
+
 ## Benchmark
 ### Stand-alone Training
 #### Test Environment

diff --git a/modelzoo/features/EmbeddingVariable/WDL/train.py b/modelzoo/features/EmbeddingVariable/WDL/train.py
@@ -434,7 +434,9 @@ def get_arg_parser():
     parser.add_argument('--saved_model_path',
                         type=str,
                         default="")
-
+    parser.add_argument('--no_saver',
+                        help='not add saver to the model.',
+                        action='store_true')
     return parser
 
 
@@ -581,10 +583,11 @@ def main(tf_config=None, server=None):
                 sess.run(tf.local_variables_initializer())
                 merged = tf.summary.merge_all()
                 writer = tf.summary.FileWriter(checkpoint_dir, sess.graph)
-                saver = tf.train.Saver(tf.global_variables(),
-                                       max_to_keep=args.keep_checkpoint_max,
-                                       incremental_save_restore=True)
-                incr_saver = tf_incr_saver._get_incremental_saver(True, saver)
+                if not args.no_saver:
+                    saver = tf.train.Saver(tf.global_variables(),
+                                        max_to_keep=args.keep_checkpoint_max,
+                                        incremental_save_restore=True)
+                    incr_saver = tf_incr_saver._get_incremental_saver(True, saver)
                 options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                 run_metadata = tf.RunMetadata()
 
@@ -599,20 +602,22 @@ def main(tf_config=None, server=None):
                         _, train_loss, events = sess.run(
                             [model.train_op, model.loss, merged])
                         writer.add_summary(events, _in)
-                        checkpoint_path = saver.save(
-                            sess,
-                            save_path=os.path.join(checkpoint_dir,
-                                                   'WIDE_AND_DEEP-checkpoint'),
-                            global_step=_in)
-                        print("Save checkpoint to %s" % checkpoint_path)
+                        if not args.no_saver:
+                            checkpoint_path = saver.save(
+                                sess,
+                                save_path=os.path.join(checkpoint_dir,
+                                                    'WIDE_AND_DEEP-checkpoint'),
+                                global_step=_in)
+                            print("Save checkpoint to %s" % checkpoint_path)
                     elif args.incr_save_steps > 0 and _in % args.incr_save_steps == 0:
                         _, train_loss = sess.run(
                             [model.train_op, model.loss])
-                        incr_checkpoint_path = incr_saver.incremental_save(
-                            sess,
-                            os.path.join(checkpoint_dir, '.incremental_checkpoint/incr-WIDE_AND_DEEP-checkpoint'),
-                            global_step=_in)
-                        print("Save incremental checkpoint to %s" % incr_checkpoint_path)
+                        if not args.no_saver:
+                            incr_checkpoint_path = incr_saver.incremental_save(
+                                sess,
+                                os.path.join(checkpoint_dir, '.incremental_checkpoint/incr-WIDE_AND_DEEP-checkpoint'),
+                                global_step=_in)
+                            print("Save incremental checkpoint to %s" % incr_checkpoint_path)
                     elif (args.timeline > 0 and _in % args.timeline == 0):
                         _, train_loss = sess.run([model.train_op, model.loss],
                                                  options=options,
@@ -728,4 +733,4 @@ def main(tf_config=None, server=None):
                      server=server)
         else:
             print("Task type or index error.")
-            sys.exit()
+            sys.exit()
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
@@ -2824,6 +2824,10 @@ tf_kernel_library(
     name = "kv_variable_ops",
     hdrs = ["kv_variable_ops.h"],
     srcs = ["kv_variable_ops.cc"],
+    gpu_srcs = [
+        "kv_variable_ops_gpu.cu.cc",
+        "kv_variable_ops_gpu.h",
+    ],
     copts = ["-g"],
     nocopts = "-fno-exceptions",
     deps = [
@@ -2843,6 +2847,7 @@ tf_kernel_library(
         "@sparsehash_c11//:dense_hash_map",
         "@libcuckoo//:libcuckoo",
         "@com_github_google_leveldb//:leveldb",
+        "@cuCollections//:cuco_hash_table",
     ],
 )
 
@@ -6188,8 +6193,15 @@ tf_cc_test(
 
 tf_kernel_library(
     name = "training_ali_ops",
-    prefix = "training_ali_ops",
-    hdrs = ["training_ali_op_helpers.h"],
+    hdrs = [
+        "training_ali_ops.h",
+        "training_ali_op_helpers.h"
+    ],
+    srcs = ["training_ali_ops.cc"],
+    gpu_srcs = [
+        "training_ali_ops_gpu.cu.cc",
+        "training_ali_ops_gpu.h",
+    ],
     deps = [
         ":bounds_check",
         ":training_op_helpers",