Skip to content

Commit ae76b89

Browse files
authored
Byshiue patch 2 (NVIDIA#788)
[FasterTransformer] feat: Update FasterTransformer v3.1
1 parent 99b1c89 commit ae76b89

File tree

397 files changed

+454835
-7
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

397 files changed

+454835
-7
lines changed

FasterTransformer/README.md

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ This repository provides a script and recipe to run the highly optimized transfo
1010
- [FasterTransformer v2](#fastertransformer-v2)
1111
- [FasterTransformer v2.1](#fastertransformer-v21)
1212
- [FasterTransformer v3.0](#fastertransformer-v30)
13+
- [FasterTransformer v3.1](#fastertransformer-v31)
1314
- [Architecture matrix](#architecture-matrix)
1415
- [Release notes](#release-notes)
1516
- [Changelog](#changelog)
@@ -33,16 +34,21 @@ FasterTransformer v2.1 optimizes some kernels of encoder and decoder, adding the
3334

3435
FasterTransformer v3.0 adds support for INT8 quantization of the cpp and TensorFlow encoder models on Turing and Ampere GPUs.
3536

37+
### FasterTransformer v3.1
38+
39+
First, FasterTransformer v3.1 adds support for INT8 quantization of the PyTorch encoder model on Turing and Ampere GPUs. Second, v3.1 improves the performance of the encoder on FP16 and INT8. Compared to v3.0, v3.1 provides at most 1.2x speedup on T4 FP16, and 1.7x speedup on T4 INT8. Third, v3.1 supports inference of the GPT-2 model.
40+
3641
### Architecture matrix
3742

38-
The following matrix shows the Architecture Differences between the model.
43+
The following matrix shows the architecture differences between the versions.
3944

40-
| Architecure | Encoder | Encoder INT8 quantization |Decoder | Decoding with beam search | Decoding with sampling |
41-
|---------------------------|-------------------|----------------------------|--------------------|---------------------------|------------------------|
42-
|FasterTransformer v1 | Yes | No | No | No | No |
43-
|FasterTransformer v2 | Yes | No | Yes | Yes | No |
44-
|FasterTransformer v2.1 | Yes | No | Yes | Yes | Yes |
45-
|FasterTransformer v3.0 | Yes | Yes | Yes | Yes | Yes |
45+
| Architecture | Encoder | Encoder INT8 quantization | Decoder | Decoding with beam search | Decoding with sampling | GPT-2 |
46+
|---------------------------|-------------------|----------------------------|---------------------|---------------------------|------------------------|-------|
47+
| v1 | Yes | No | No | No | No | No |
48+
| v2 | Yes | No | Yes | Yes | No | No |
49+
| v2.1 | Yes | No | Yes | Yes | Yes | No |
50+
| v3.0 | Yes | Yes | Yes | Yes | Yes | No |
51+
| v3.1 | Yes | Yes | Yes | Yes | Yes | Yes |
4652

4753
## Release notes
4854

@@ -52,9 +58,23 @@ FasterTransformer v2 will be deprecated on Dec 2020.
5258

5359
FasterTransformer v2.1 will be deprecated on July 2021.
5460

61+
FasterTransformer v3.0 will be deprecated on Sep 2021.
62+
5563
### Changelog
5664

65+
Dec 2020
66+
- **Release the FasterTransformer 3.1**
67+
68+
Nov 2020
69+
- Optimize the INT8 inference.
70+
- Support PyTorch INT8 inference.
71+
- Provide PyTorch INT8 quantization tools.
72+
- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer.
73+
- Add unit test of SQuAD.
74+
- Update the missed NGC checkpoints.
75+
5776
Sep 2020
77+
- Support GPT2
5878
- **Release the FasterTransformer 3.0**
5979
- Support INT8 quantization of encoder of cpp and TensorFlow op.
6080
- Add bert-tf-quantization tool.
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# Project bootstrap.
# Note from the original authors: for PyTorch extensions, the CMake version
# should be greater than 3.13.
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(FasterTransformer LANGUAGES CXX CUDA)

# The CUDA toolkit (version 10.1 or newer) is mandatory.
find_package(CUDA 10.1 REQUIRED)

# Optional framework integrations; each one is OFF unless requested with
# -D<OPTION>=ON on the command line.
option(BUILD_TRT "Build in TensorRT mode"          OFF)
option(BUILD_TF  "Build in TensorFlow mode"        OFF)
option(BUILD_THE "Build in PyTorch eager mode"     OFF)
option(BUILD_THS "Build in TorchScript class mode" OFF)
23+
24+
# TorchScript class mode: validate the NVIDIA PyTorch container release.
# The check only runs when NVIDIA_PYTORCH_VERSION is present in the
# environment at configure time.
#   * older than 20.03   -> configuration is aborted
#   * exactly 20.03      -> LEGACY_THS=1 is defined for all targets
if(BUILD_THS)
  if(DEFINED ENV{NVIDIA_PYTORCH_VERSION})
    # The expansion is quoted so that a variable which is defined but empty
    # still yields a well-formed if() expression.
    if("$ENV{NVIDIA_PYTORCH_VERSION}" VERSION_LESS "20.03")
      message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.")
    endif()
    if("$ENV{NVIDIA_PYTORCH_VERSION}" VERSION_EQUAL "20.03")
      add_definitions(-DLEGACY_THS=1)
    endif()
  endif()
endif()
34+
35+
# C++ standard used for host and device code (cache entry, default "11").
set(CXX_STD "11" CACHE STRING "C++ standard")

# Short alias for the CUDA toolkit root.
set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})

# TensorFlow location: must be supplied whenever TensorFlow mode is enabled.
set(TF_PATH "" CACHE STRING "TensorFlow path")
if(BUILD_TF)
  if(NOT TF_PATH)
    message(FATAL_ERROR "TF_PATH must be set if BUILD_TF(=TensorFlow mode) is on.")
  endif()
endif()

# TensorRT location: must be supplied whenever TensorRT mode is enabled.
set(TRT_PATH "" CACHE STRING "TensorRT path")
if(BUILD_TRT)
  if(NOT TRT_PATH)
    message(FATAL_ERROR "TRT_PATH must be set if BUILD_TRT(=TensorRT mode) is on.")
  endif()
endif()
50+
51+
# Add the CUDA library directory to the CMake module search path.
list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)

# Define CUDA11_MODE for all targets when the toolkit is CUDA 11.0 or newer.
# CUDA_VERSION is a dotted version string, so it is compared with
# VERSION_GREATER_EQUAL (component-wise) rather than as a real number, and it
# is referenced by name so an empty value cannot produce a malformed if().
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
  message(STATUS "Add DCUDA11_MODE")
  add_definitions("-DCUDA11_MODE")
endif()
57+
58+
# ---- Base compiler flags ---------------------------------------------------
set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler -Wall")

# ---- GPU architecture selection --------------------------------------------
# When SM names one of the architectures below, code is generated for that
# single architecture. Any other value (or no value at all) selects the
# default pair sm_70 + sm_75.
set(_ft_selectable_sm 60 61 70 75 80 86)
# Architectures for which the WMMA macro is defined.
set(_ft_wmma_sm 70 75 80 86)

if(SM IN_LIST _ft_selectable_sm)
  # Variant with relocatable device code, kept for reference:
  #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
  string(APPEND CMAKE_CUDA_FLAGS " -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")

  if(SM IN_LIST _ft_wmma_sm)
    foreach(_ft_flags_var CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_CUDA_FLAGS)
      string(APPEND ${_ft_flags_var} " -DWMMA")
    endforeach()
  endif()

  if(BUILD_THE OR BUILD_THS)
    # Export the architecture as "<major>.<minor>" through the
    # TORCH_CUDA_ARCH_LIST environment variable (first and second digit of SM).
    string(SUBSTRING ${SM} 0 1 SM_MAJOR)
    string(SUBSTRING ${SM} 1 1 SM_MINOR)
    set(ENV{TORCH_CUDA_ARCH_LIST} "${SM_MAJOR}.${SM_MINOR}")
  endif()

  message("-- Assign GPU architecture (sm=${SM})")
else()
  # Default: generate code for both sm_70 and sm_75 (-rdc=true stays disabled).
  foreach(_ft_arch 70 75)
    string(APPEND CMAKE_CUDA_FLAGS " -gencode=arch=compute_${_ft_arch},code=\\\"sm_${_ft_arch},compute_${_ft_arch}\\\"")
  endforeach()
  string(APPEND CMAKE_CUDA_FLAGS " ")

  foreach(_ft_flags_var CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_CUDA_FLAGS)
    string(APPEND ${_ft_flags_var} " -DWMMA")
  endforeach()

  if(BUILD_THE OR BUILD_THS)
    set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5")
  endif()

  message("-- Assign GPU architecture (sm=70,75)")
endif()
97+
98+
# ---- Debug configuration: all warnings, no optimisation --------------------
string(APPEND CMAKE_C_FLAGS_DEBUG   " -Wall -O0")
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -Wall -O0")
# More verbose alternative, kept for reference:
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage")
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -O0 -G -Xcompiler -Wall")

# ---- Language standard (driven by the CXX_STD cache entry) -----------------
set(CMAKE_CXX_STANDARD "${CXX_STD}")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
string(APPEND CMAKE_CUDA_FLAGS
       " --expt-extended-lambda"
       " --expt-relaxed-constexpr"
       " --std=c++${CXX_STD}")

# ---- Release configuration -------------------------------------------------
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
# More verbose alternative, kept for reference:
# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose")
string(APPEND CMAKE_CUDA_FLAGS_RELEASE " -Xcompiler -O3")

# ---- Build artefact locations: libraries in lib/, executables in bin/ ------
foreach(_ft_artifact_kind ARCHIVE LIBRARY)
  set(CMAKE_${_ft_artifact_kind}_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
endforeach()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
116+
117+
# Include and library search directories shared by every target.
# They start with the project root and the CUDA toolkit; the TensorFlow and
# TensorRT locations are appended only when the matching mode is enabled.
set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDA_PATH}/include)
set(COMMON_LIB_DIRS ${CUDA_PATH}/lib64)

if(BUILD_TF)
  list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
  list(APPEND COMMON_LIB_DIRS    ${TF_PATH})
endif()

if(BUILD_TRT)
  list(APPEND COMMON_HEADER_DIRS ${TRT_PATH}/include)
  list(APPEND COMMON_LIB_DIRS    ${TRT_PATH}/lib)
endif()
135+
136+
# Interpreter used to query the Python / PyTorch installation at configure time.
set(PYTHON_PATH "python" CACHE STRING "Python path")

if(BUILD_THS)
  # TorchScript class mode requires PyTorch >= 1.5.0.
  execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
                  RESULT_VARIABLE _PYTHON_SUCCESS
                  OUTPUT_VARIABLE TORCH_VERSION)
  # If the interpreter could not be run or torch could not be imported,
  # TORCH_VERSION is empty; report that directly instead of letting the
  # version test below misreport it as an outdated PyTorch.
  if(NOT _PYTHON_SUCCESS EQUAL 0)
    message(FATAL_ERROR "Torch version query Error.")
  endif()
  if(TORCH_VERSION VERSION_LESS "1.5.0")
    message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.")
  endif()
endif()

if(BUILD_THE OR BUILD_THS)
  # Exit codes are compared numerically (EQUAL 0). A regex test such as
  # `MATCHES 0` would also accept any result containing the digit 0
  # (for example exit code 10).

  # Directory of the installed torch package, used to locate TorchConfig.cmake.
  execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch; print(os.path.dirname(torch.__file__),end='');"
                  RESULT_VARIABLE _PYTHON_SUCCESS
                  OUTPUT_VARIABLE TORCH_DIR)
  if(NOT _PYTHON_SUCCESS EQUAL 0)
    message(FATAL_ERROR "Torch config Error.")
  endif()
  list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
  find_package(Torch REQUIRED)

  # Python include directory and extension-module suffix, one per output line.
  # EXT_SUFFIX is preferred; the legacy 'SO' config variable is only used as a
  # fallback for interpreters that do not provide EXT_SUFFIX.
  execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_inc()); print(sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO'));"
                  RESULT_VARIABLE _PYTHON_SUCCESS
                  OUTPUT_VARIABLE _PYTHON_VALUES)
  if(NOT _PYTHON_SUCCESS EQUAL 0)
    message(FATAL_ERROR "Python config Error.")
  endif()
  # Turn the newline-separated output into a CMake list, escaping any literal
  # semicolons first so they are not mistaken for list separators.
  string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES "${_PYTHON_VALUES}")
  string(REGEX REPLACE "\n" ";" _PYTHON_VALUES "${_PYTHON_VALUES}")
  list(GET _PYTHON_VALUES 0 PY_INCLUDE_DIR)
  list(GET _PYTHON_VALUES 1 PY_SUFFIX)
  list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR})

  # Linker flags as reported by torch.utils.cpp_extension.
  # NOTE(review): _prepare_ldflags is a private PyTorch helper; its signature
  # may differ between PyTorch releases - confirm when upgrading.
  execute_process(COMMAND ${PYTHON_PATH} "-c" "from torch.utils import cpp_extension; print(' '.join(cpp_extension._prepare_ldflags([],True,False)),end='');"
                  RESULT_VARIABLE _PYTHON_SUCCESS
                  OUTPUT_VARIABLE TORCH_LINK)
  if(NOT _PYTHON_SUCCESS EQUAL 0)
    message(FATAL_ERROR "PyTorch link config Error.")
  endif()
endif()
177+
178+
179+
# Apply the shared search paths to every target defined below this point.
include_directories(${COMMON_HEADER_DIRS})
link_directories(${COMMON_LIB_DIRS})

# Descend into the sub-projects, in this order.
foreach(_ft_subproject fastertransformer tools sample)
  add_subdirectory(${_ft_subproject})
endforeach()
190+
191+
# Copy the framework sample scripts into the build tree on every build.
# Both branches attach their commands to one always-built target named `copy`.
# The target is created only once, so TensorFlow mode and a PyTorch mode can
# be enabled in the same build tree without a duplicate-target error.
# Options are placed before the operands of `cp` so the commands do not depend
# on GNU-style option permutation.
if(BUILD_TF)
  add_custom_target(copy ALL COMMENT "Copying tensorflow test scripts")
  add_custom_command(TARGET copy
    POST_BUILD
    COMMAND cp -r ${PROJECT_SOURCE_DIR}/sample/tensorflow/ ${PROJECT_BINARY_DIR}
  )
endif()

if(BUILD_THE OR BUILD_THS)
  if(NOT TARGET copy)
    add_custom_target(copy ALL COMMENT "Copying pytorch test scripts")
  endif()
  add_custom_command(TARGET copy
    POST_BUILD
    COMMAND cp -r ${PROJECT_SOURCE_DIR}/sample/pytorch/ ${PROJECT_BINARY_DIR}
    COMMAND mkdir -p ${PROJECT_BINARY_DIR}/pytorch/translation/data/
    # The translation test data lives in the TensorFlow sample tree; the
    # wildcard is expanded by the shell that runs the command.
    COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/utils/translation/test.* ${PROJECT_BINARY_DIR}/pytorch/translation/data/
  )
endif()

0 commit comments

Comments
 (0)