Skip to content

Commit 7ff79c9

Browse files
authored
Merge branch 'develop' into fix_dataloader_memory_leak
2 parents 28f6253 + 3fd34a0 commit 7ff79c9

File tree

1,727 files changed

+85691
-24561
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,727 files changed

+85691
-24561
lines changed

AUTHORS.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
| Github account | name |
22
|---|---|
33
| abhinavarora | Abhinav Arora |
4+
| andreazanetti | Andrea Zanetti |
5+
| arlesniak | Artur Lesniak |
6+
| arogowie-intel | Adam Osewski |
47
| backyes | Yan-Fei Wang |
58
| baiyfbupt | Yi-Fan Bai |
69
| beckett1124 | Bin Qi |
710
| ChengduoZH | Cheng-Duo Zhao|
811
| chengxiaohua1105 | Xiao-Hua Cheng |
912
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
1013
| cxysteven | Xing-Yi Cheng |
14+
| ddokupil | Dariusz Dokupil |
1115
| dzhwinter | Zhi-Hong Dong |
1216
| dragonwarrior | Long Wang |
1317
| dyning | Yuning Du |
@@ -21,6 +25,7 @@
2125
| hedaoyuan | Dao-Yuan He |
2226
| helinwang | He-Lin Wang |
2327
| jacquesqiao | Long-Fei Qiao |
28+
| jakpiase | Jakub Piasecki |
2429
| [jczaja](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Jacek Czaja |
2530
| JiayiFeng | Jia-Yi Feng |
2631
| kbinias | Krzysztof Binias |
@@ -42,6 +47,7 @@
4247
| pakchoi | Chuan-Jiang Song |
4348
| panyx0718 | Xin Pan |
4449
| pengli09 | Peng Li |
50+
| pmajchrzak |Piotr Majchrzak |
4551
| pkuyym | Ya-Ming Yang |
4652
| pzelazko-intel | Pawel Zelazko |
4753
| [pawelpiotrowicz](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Pawel Piotrowicz |
@@ -72,3 +78,6 @@
7278
| zhaopu7 | Pu Zhao |
7379
| zhouxiao-coder | Xiao Zhou |
7480
| Zrachel | Rui-Qing Zhang |
81+
| jeng1220 | Bai-Cheng(Ryan) Jeng (NVIDIA) |
82+
| mingxu1067 | Ming Huang (NVIDIA) |
83+
| zlsh80826 | Reese Wang (NVIDIA) |

CMakeLists.txt

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,27 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License
1414

15-
cmake_minimum_required(VERSION 3.10)
16-
cmake_policy(VERSION 3.10)
15+
if(APPLE AND WITH_ARM)
16+
# cmake 3.19.2 version starts to support M1
17+
cmake_minimum_required(VERSION 3.19.2)
18+
cmake_policy(VERSION 3.19.2)
19+
else(APPLE AND WITH_ARM)
20+
cmake_minimum_required(VERSION 3.10)
21+
cmake_policy(VERSION 3.10)
22+
endif(APPLE AND WITH_ARM)
1723
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
1824
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
1925
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
2026

2127
include(system)
2228

29+
# Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug
30+
if(NOT CMAKE_BUILD_TYPE)
31+
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
32+
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
33+
FORCE)
34+
endif()
35+
2336
project(paddle CXX C)
2437

2538
# enable language CUDA
@@ -66,6 +79,11 @@ if(WITH_MUSL)
6679
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
6780
endif()
6881

82+
if(APPLE AND WITH_ARM)
83+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
84+
set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
85+
endif()
86+
6987
if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
7088
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
7189
endif()
@@ -90,10 +108,6 @@ if(WIN32)
90108

91109
if (MSVC_STATIC_CRT)
92110
message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
93-
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd")
94-
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT")
95-
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
96-
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
97111
foreach(flag_var
98112
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
99113
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
@@ -105,9 +119,7 @@ if(WIN32)
105119
endforeach(flag_var)
106120
endif()
107121

108-
# NOTE(zhouwei25): temporarily change MP to 1 for reducing CPU & memory utilization
109-
set(PROCESS_MAX 1)
110-
#math(EXPR PROCESS_MAX "${CPU_CORES} * 1 / 2")
122+
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
111123

112124
# windows build turn off warnings, use parallel compiling.
113125
foreach(flag_var
@@ -116,7 +128,10 @@ if(WIN32)
116128
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
117129
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
118130
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
119-
set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
131+
# NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling
132+
if(NOT WITH_GPU)
133+
set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
134+
endif()
120135
endforeach(flag_var)
121136
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
122137
set(${flag_var} "${${flag_var}} /w")
@@ -208,16 +223,10 @@ option(WITH_STRIP "Strip so files of Whl packages" OFF)
208223

209224
# PY_VERSION
210225
if(NOT PY_VERSION)
211-
set(PY_VERSION 2.7)
226+
set(PY_VERSION 3.6)
212227
endif()
213228
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
214229

215-
# CMAKE_BUILD_TYPE
216-
if(NOT CMAKE_BUILD_TYPE)
217-
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
218-
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
219-
FORCE)
220-
endif()
221230

222231
# the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. Default: OFF
223232
if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
@@ -282,6 +291,27 @@ if(WITH_GPU)
282291
endif()
283292
endif()
284293

294+
if(WITH_ROCM)
295+
include(hip)
296+
include(miopen) # set miopen libraries, must before configure
297+
endif(WITH_ROCM)
298+
299+
if (NOT WITH_ROCM AND WITH_RCCL)
300+
MESSAGE(WARNING
301+
"Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
302+
set(WITH_RCCL OFF CACHE STRING
303+
"Disable RCCL when compiling without ROCM" FORCE)
304+
endif()
305+
306+
if(WITH_RCCL)
307+
add_definitions("-DPADDLE_WITH_RCCL")
308+
include(rccl)
309+
else()
310+
if(WITH_ROCM)
311+
MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
312+
endif()
313+
endif()
314+
285315
include(third_party) # download, build, install third_party, Contains about 20+ dependencies
286316

287317
include(flags) # set paddle compile flags
@@ -306,35 +336,16 @@ include(configure) # add paddle env configuration
306336

307337
include_directories("${PADDLE_SOURCE_DIR}")
308338

309-
if(WITH_ROCM)
310-
include(hip)
311-
endif(WITH_ROCM)
312-
313-
if (NOT WITH_ROCM AND WITH_RCCL)
314-
MESSAGE(WARNING
315-
"Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
316-
set(WITH_RCCL OFF CACHE STRING
317-
"Disable RCCL when compiling without ROCM" FORCE)
318-
endif()
319-
320-
if(WITH_RCCL)
321-
add_definitions("-DPADDLE_WITH_RCCL")
322-
include(rccl)
323-
else()
324-
if(WITH_ROCM)
325-
MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
326-
endif()
327-
endif()
328-
329339
if(WITH_NV_JETSON)
330340
set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE)
331341
endif()
332342

333343
if(WITH_ARM)
334344
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
335345
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
336-
set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE)
346+
set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE)
337347
set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE)
348+
set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." FORCE)
338349
add_definitions(-DPADDLE_WITH_ARM)
339350
endif()
340351

@@ -352,6 +363,11 @@ if (WITH_MIPS)
352363
add_definitions(-DPADDLE_WITH_MIPS)
353364
endif()
354365

366+
if (WITH_HETERPS)
367+
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
368+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new")
369+
endif()
370+
endif()
355371
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
356372

357373
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

README.md

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
<p align="center">
32
<img align="center" src="doc/imgs/logo.png", width=1600>
43
<p>
@@ -22,7 +21,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm
2221

2322
## Installation
2423

25-
### Latest PaddlePaddle Release: [v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0)
24+
### Latest PaddlePaddle Release: [v2.1](https://github.com/PaddlePaddle/Paddle/tree/release/2.1)
2625

2726
Our vision is to enable deep learning for everyone via PaddlePaddle.
2827
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
@@ -34,9 +33,9 @@ pip install paddlepaddle
3433
pip install paddlepaddle-gpu
3534
3635
```
37-
More infomation about installation, please view [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
36+
For more information about installation, please view [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
3837

39-
Now our developers can acquire Tesla V100 online computing resources for free. If you create a program by AI Studio, you will obtain 10 hours to train models online per day. [Click here to start](https://aistudio.baidu.com/aistudio/index).
38+
Now our developers can acquire Tesla V100 online computing resources for free. If you create a program by AI Studio, you will obtain 8 hours to train models online per day. [Click here to start](https://aistudio.baidu.com/aistudio/index).
4039

4140
## FOUR LEADING TECHNOLOGIES
4241

@@ -47,14 +46,13 @@ Now our developers can acquire Tesla V100 online computing resources for free. I
4746

4847
- **Support Ultra-Large-Scale Training of Deep Neural Networks**
4948

50-
PaddlePaddle has made breakthroughs in ultra-large-scale deep neural networks training. It launched the world's first large-scale open-source training platform that supports the training of deep networks with 100 billions of features and trillions of parameters using data sources distributed over hundreds of nodes. PaddlePaddle overcomes the online deep learning challenges for ultra-large-scale deep learning models, and further achieved the real-time model updating with more than 1 trillion parameters.
49+
PaddlePaddle has made breakthroughs in ultra-large-scale deep neural networks training. It launched the world's first large-scale open-source training platform that supports the training of deep networks with 100 billion features and trillions of parameters using data sources distributed over hundreds of nodes. PaddlePaddle overcomes the online deep learning challenges for ultra-large-scale deep learning models, and further achieved real-time model updating with more than 1 trillion parameters.
5150
[Click here to learn more](https://github.com/PaddlePaddle/Fleet)
5251

5352

54-
- **Accelerated High-Performance Inference over Ubiquitous Deployments**
53+
- **High-Performance Inference Engines for Comprehensive Deployment Environments**
5554

56-
PaddlePaddle is not only compatible with other open-source frameworks for models training, but also works well on the ubiquitous developments, varying from platforms to devices. More specifically, PaddlePaddle accelerates the inference procedure with the fastest speed-up. Note that, a recent breakthrough of inference speed has been made by PaddlePaddle on Huawei's Kirin NPU, through the hardware/software co-optimization.
57-
[Click here to learn more](https://github.com/PaddlePaddle/Paddle-Lite)
55+
PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks, but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.
5856

5957

6058
- **Industry-Oriented Models and Libraries with Open Source Repositories**
@@ -87,8 +85,13 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide
8785
## Communication
8886

8987
- [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
90-
- QQ discussion group: 778260830 (PaddlePaddle).
88+
- QQ discussion group: 793866180 (PaddlePaddle).
9189
- [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
90+
91+
## Courses
92+
93+
- [Server Deployments](https://aistudio.baidu.com/aistudio/course/introduce/19084): Courses introducing high performance server deployments via local and remote services.
94+
- [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): Courses introducing edge deployments from mobile, IoT to web and applets.
9295

9396
## Copyright and License
9497
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).

README_cn.md

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
22
<p align="center">
33
<img align="center" src="doc/imgs/logo.png", width=1600>
44
<p>
@@ -19,7 +19,7 @@
1919

2020
## 安装
2121

22-
### PaddlePaddle最新版本: [v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0)
22+
### PaddlePaddle最新版本: [v2.1](https://github.com/PaddlePaddle/Paddle/tree/release/2.1)
2323

2424
跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
2525

@@ -32,7 +32,7 @@ pip install paddlepaddle-gpu
3232
```
3333
更多安装信息详见官网 [安装说明](https://www.paddlepaddle.org.cn/install/quick)
3434

35-
PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送10小时**[前往使用免费算力](https://aistudio.baidu.com/aistudio/index)
35+
PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送8小时**[前往使用免费算力](https://aistudio.baidu.com/aistudio/index)
3636

3737
## 四大领先技术
3838

@@ -47,10 +47,9 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
4747
[查看详情](https://github.com/PaddlePaddle/Fleet)
4848

4949

50-
- **多端多平台部署的高性能推理引擎**
50+
- **支持多端多平台的高性能推理部署工具**
5151

52-
飞桨不仅兼容其他开源框架训练的模型,还可以轻松地部署到不同架构的平台设备上。同时,飞桨的推理速度也是全面领先的。尤其经过了跟华为麒麟NPU的软硬一体优化,使得飞桨在NPU上的推理速度进一步突破。
53-
[查看详情](https://github.com/PaddlePaddle/Paddle-Lite)
52+
飞桨不仅广泛兼容第三方开源框架训练的模型部署,并且为不同的场景的生产环境提供了完备的推理引擎,包括适用于高性能服务器及云端推理的原生推理库 [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html),面向分布式、流水线生产环境下自动上云、A/B测试等高阶功能的服务化推理框架 [Paddle Serving](https://github.com/PaddlePaddle/Serving),针对于移动端、物联网场景的轻量化推理引擎 [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite),以及在浏览器、小程序等环境下使用的前端推理引擎 [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs)。同时,透过与不同场景下的主流硬件高度适配优化及异构计算的支持, 飞桨的推理性能也领先绝大部分的主流实现。
5453

5554

5655
- **面向产业应用,开源开放覆盖多领域的工业级模型库。**
@@ -83,8 +82,13 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
8382
## 交流与反馈
8483

8584
- 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
86-
- QQ群: 778260830 (PaddlePaddle)
85+
- QQ群: 793866180 (PaddlePaddle)
8786
- [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
87+
88+
## 课程
89+
90+
- [服务器部署](https://aistudio.baidu.com/aistudio/course/introduce/19084): 详细介绍高性能服务器端部署实操,包含本地端及服务化Serving部署等
91+
- [端侧部署](https://aistudio.baidu.com/aistudio/course/introduce/22690): 详细介绍端侧多场景部署实操,从移动端设备、IoT、网页到小程序部署
8892

8993
## 版权和许可证
9094
PaddlePaddle由[Apache-2.0 license](LICENSE)提供

cmake/cblas.cmake

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,21 @@ if(NOT DEFINED CBLAS_PROVIDER)
6969
PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
7070

7171
if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB)
72-
set(CBLAS_PROVIDER OPENBLAS)
73-
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
74-
set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
75-
76-
add_definitions(-DPADDLE_USE_OPENBLAS)
77-
add_definitions(-DLAPACK_FOUND)
78-
79-
message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
80-
message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
72+
file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file)
73+
string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file})
74+
string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp})
75+
76+
if (${ver} VERSION_GREATER_EQUAL "0.3.7")
77+
set(CBLAS_PROVIDER OPENBLAS)
78+
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
79+
set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
80+
81+
add_definitions(-DPADDLE_USE_OPENBLAS)
82+
add_definitions(-DLAPACK_FOUND)
83+
84+
message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
85+
message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
86+
endif()
8187
endif()
8288
endif()
8389

cmake/ccache.cmake

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,29 @@
11
# Use ccache if found ccache program
22

3-
find_program(CCACHE_PATH ccache)
3+
if(NOT WIN32)
4+
find_program(CCACHE_PATH ccache)
5+
if(CCACHE_PATH)
6+
execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output)
7+
execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory)
8+
string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output})
9+
message(STATUS "ccache is founded, use ccache to speed up compile on Unix.")
10+
# show statistics summary of ccache
11+
message("ccache version\t\t\t " ${ccache_version} "\n" ${cache_directory})
12+
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
13+
set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
14+
endif(CCACHE_PATH)
15+
elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja")
16+
# (Note:zhouwei25) Only Ninja Generator can support sccache now
17+
find_program(SCCACHE_PATH sccache)
418

5-
if(CCACHE_PATH)
6-
execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output)
7-
execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory)
8-
string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output})
9-
message(STATUS "Ccache is founded, use ccache to speed up compile.")
10-
# show statistics summary of ccache
11-
message("ccache version\t\t\t " ${ccache_version} "\n" ${cache_directory})
12-
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
13-
set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
14-
endif(CCACHE_PATH)
19+
if(SCCACHE_PATH)
20+
execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version)
21+
message(STATUS "${sccache_version} is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.")
22+
23+
set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
24+
set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
25+
# (Note:zhouwei25) sccache for cuda compiler has bug so that it can't be hit
26+
# refer to https://github.com/mozilla/sccache/issues/1017, so we fix it
27+
set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH})
28+
endif(SCCACHE_PATH)
29+
endif()

0 commit comments

Comments
 (0)