Skip to content

Commit ede109a

Browse files
committed
Merge branch 'main' into wangjial/benchgc_op
2 parents 8999233 + 1f1eef9 commit ede109a

30 files changed

+1131
-9
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
2828

2929
option(GC_LEGACY_ENABLE ON)
3030
option(GC_TEST_ENABLE "Build the tests" ON)
31+
option(GC_USE_GPU "Enable GPU backend" OFF)
3132
option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON)
3233
option(GC_BENCH_ENABLE "Build benchgc. Only available when GC_TEST_ENABLE and GC_ENABLE_BINDINGS_PYTHON is enabled" ON)
3334
option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
@@ -53,6 +54,13 @@ include(AddLLVM)
5354
include(AddMLIR)
5455
include(HandleLLVMOptions)
5556

57+
if(GC_USE_GPU)
58+
include(imex)
59+
if(GC_DEV_LINK_LLVM_DYLIB)
60+
message(WARN "GPU backend may not be compatible with dynamic linking to LLVM")
61+
endif()
62+
endif()
63+
5664
if(GC_ENABLE_BINDINGS_PYTHON AND NOT MLIR_ENABLE_BINDINGS_PYTHON)
5765
message(STATUS "Failed to enable Python API due to the 'MLIR_ENABLE_BINDINGS_PYTHON' for LLVM is not ON.")
5866
set(GC_ENABLE_BINDINGS_PYTHON OFF CACHE BOOL "" FORCE)

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ cmake --build build --target install
3232
```
3333

3434
Notes
35-
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
35+
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above **if you are building for CPU only**. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
3636
* The option `-DLLVM_INSTALL_GTEST=ON` is optional, if the tests of graph-compiler are disabled (see `GC_TEST_ENABLE` below).
37+
* If you would like to enable GPU components of Graph Compiler, please make sure to statically link Graph Compiler and LLVM(MLIR). It is a known issue that LLVM shared library cannot be linked together with IGC (Intel's low level GPU compiler). Make sure `LLVM_BUILD_LLVM_DYLIB` and `LLVM_LINK_LLVM_DYLIB` are `OFF` (they are off by default). Also make sure Graph Compiler's cmake option `GC_DEV_LINK_LLVM_DYLIB` is `OFF` when configuring Graph Compiler (see below).
3738

3839
We have now installed LLVM at `llvm-project/llvm-install`.
3940

@@ -57,6 +58,7 @@ cmake --build . --target gc-check
5758
Notes:
5859
* `/PATH/TO/llvm-project/llvm-install` should be the install path of LLVM. If you installed LLVM elsewhere by `-DCMAKE_INSTALL_PREFIX` option when building LLVM, you need to change the path in `-DMLIR_DIR` accordingly.
5960
* The cmake option `-DLLVM_EXTERNAL_LIT` is for the tests of this project. It requires the `lit` tool to be installed in the system. You can install it via `pip install lit`. If you don't need to run the tests of this repo, you can omit this option in the command line.
61+
* If GPU components are on (`-DGC_USE_GPU=ON`), make sure the Level-zero runtime is installed in your system. Either install Level-zero runtime via system package managers (e.g. `apt`), or follow the instructions of [IMEX](https://github.com/intel/mlir-extensions).
6062

6163
Graph Compiler supports the following build-time options.
6264

@@ -67,4 +69,5 @@ Graph Compiler supports the following build-time options.
6769
| GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic link LLVM/MLIR libraries, mainly for developer |
6870
| GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API |
6971
| GC_BENCH_ENABLE | **ON**, OFF | Controls building benchgc. The configuration will only take effect when both GC_ENABLE_BINDING_PYTHON and GC_TEST_ENABLE are ON. |
72+
| GC_USE_GPU | ON, **OFF** | Whether to enable the GPU components |
7073

cmake/functions.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function(gc_fetch_content
3131
FetchContent_Declare(
3232
${name}
3333
SOURCE_DIR ${GC_${uname}_SRC_DIR}
34-
CMAKE_ARGS ${${uname}_CMAKE_ARGS}
34+
CMAKE_ARGS ${GC_${uname}_CMAKE_ARGS}
3535
)
3636
else ()
3737
if (DEFINED GC_${uname}_VERSION)

cmake/imex.cmake

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
include_guard()
2+
3+
get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
4+
if (NOT DEFINED IMEX_INCLUDES)
5+
include(functions)
6+
set(IMEX_CHECK_LLVM_VERSION ON)
7+
set(IMEX_ENABLE_L0_RUNTIME 1)
8+
# TODO: Change to main https://github.com/intel/mlir-extensions.git when all the
9+
# required functionality is merged.
10+
gc_fetch_content(imex 496b240093b5e132b60c5ee69878300fe69be300 https://github.com/Menooker/mlir-extensions
11+
CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=1"
12+
)
13+
14+
set(IMEX_INCLUDES
15+
${imex_BINARY_DIR}/include
16+
${imex_SOURCE_DIR}/include
17+
${imex_SOURCE_DIR}/src
18+
)
19+
set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
20+
endif ()

docs/CPU_pipeline_overview.md

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Graph Compiler CPU Compilation Flow Overview
2+
3+
Graph Compiler is an MLIR based end-to-end DL compiler. The entire compilation process is divided into front-end, middle-end and back-end. Different compilation stages will use different combinations of dialects, and together with various transformation passes to perform various optimizations and graph lowering transformations. The entire process will transform IR from hardware-independent abstract expression to hardware-related concrete expression, and finally generate an executable kernel.
4+
5+
Meanwhile, as an MLIR downstream project, Graph Compiler's implementation not only uses the existing dialects and passes from MLIR upstream, but also defines new dialects and passes. Most of the new implementations are upstreamable, and we will upstream them in the future.
6+
7+
The content of this document does not represent the currently implemented status, but rather the target status after the implementation is completed.
8+
9+
### Front-End
10+
11+
The Graph Compiler front-end takes the oneDNN Graph dialect as input. The oneDNN Graph dialect is a newly defined dialect, which aims to describe the computation graph defined by oneDNN Graph. The ops in this dialect follow the [oneDNN Graph specification](https://oneapi-src.github.io/oneDNN/graph_supported_operations.html).
12+
13+
oneDNN graph dialect example:
14+
15+
```mlir
16+
func.func @mlp(%in: tensor<128x512xbf16>,
17+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
18+
// layer 0
19+
%0 = onednn_graph.matmul %in, %weight0, %bias0 : (tensor<128x512xbf16>, tensor<512x256xbf16>, tensor<256xbf16>) -> tensor<128x256xbf16>
20+
%1 = onednn_graph.relu %0 : (tensor<128x256xbf16>) -> tensor<128x256xbf16>
21+
return %1 : tensor<128x256xbf16>
22+
}
23+
```
24+
25+
There are no planned optimization passes in the front-end. The only transformation pass lowers the oneDNN Graph dialect into the Linalg dialect.
26+
27+
### Middle-End
28+
29+
Middle-end is mainly responsible for general optimizations that are independent of the target hardware, and most of the transformations apply to both CPU and GPU. Some of the transformations need to query target hardware information, such as cache level and capacity. The hardware abstraction layer (HAL) is the interface for abstracting and describing the target hardware information. Therefore, the same pass can generate different optimization results for different hardware under the guidance of HAL.
30+
31+
According to the different dialect combinations used, middle-end is divided into the following stages:
32+
33+
#### Linalg on Tensor
34+
35+
This is the intermediate representation closest to the framework calculation graph. The example IR looks like:
36+
37+
```mlir
38+
func.func @mlp(%in: tensor<128x512xbf16>,
39+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
40+
%0 = tensor.empty() : tensor<128x256xbf16>
41+
%cst = arith.constant 0.000000e+00 : bf16
42+
%1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
43+
%2 = linalg.matmul ins(%in, %weight0 : tensor<128x512xbf16>, tensor<512x256xbf16>) outs(%1 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
44+
%3 = tensor.empty() : tensor<128x256xbf16>
45+
%broadcasted = linalg.broadcast ins(%bias0 : tensor<256xbf16>) outs(%3 : tensor<128x256xbf16>) dimensions = [0]
46+
%4 = tensor.empty() : tensor<128x256xbf16>
47+
%5 = linalg.add ins(%2, %broadcasted : tensor<128x256xbf16>, tensor<128x256xbf16>) outs(%4: tensor<128x256xbf16>) -> tensor<128x256xbf16>
48+
%6 = tensor.empty() : tensor<128x256xbf16>
49+
%7 = linalgx.relu ins(%5 : tensor<128x256xbf16>) outs(%6 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
50+
return %7 : tensor<128x256xbf16>
51+
}
52+
```
53+
54+
In this stage, GC will perform some analysis and transformation related to the whole graph. The main transformations include:
55+
56+
* Padding propagation : insert tensor.pad ops to adjust tensor shapes if a shape is not divisible by the target tiling size.
57+
* Layout propagation : insert tensor.pack and tensor.unpack to adjust tensor layout if blocking layout is preferred.
58+
* Tensor constant propagation : identify folding with constant tensor and build folding block.
59+
* Matmul lowering : lower Linalg.matmul into scf.forall with linalg.batch_reduce_matmul.
60+
* Fine-grain fusion: fuse element-wise/broadcast/reduce/movement ops into base op(e.g. matmul).
61+
* Lower linalg to arith/math on virtual vector : lower Linalg to Arith/Math and tiling tensor to virtual vector.
62+
63+
### Tensor and scf loop with arith/math on virtual vector
64+
65+
In this stage, most of the Linalg ops are lowered to Scf loops with Arith and Math ops. Both Arith and Math ops use tile tensors as input and output. The tile tensor here can be a multi-dimensional tensor of any shape, regardless of the hardware register width. The tile size is chosen based on L1 cache capacity; that is, it is a good abstraction to partition the problem size to this granularity, since the microkernel, pre-op, and post-op all work on tensor sizes fitting within the L1 cache. Meanwhile, converting Linalg into Arith and Math further exposes the implementation details of the Linalg ops, which allows us to further simplify the computation after fusion.
66+
67+
IR example:
68+
69+
```mlir
70+
func.func @add_tensor(%arg0: tensor<4x8x31xf32>, %arg1: tensor<4x8x31xf32>) -> tensor<4x8x31xf32> {
71+
%0 = tensor.empty() : tensor<4x8x31xf32>
72+
%init = arith.constant 0: index
73+
%c1 = arith.constant 1: index
74+
%first_dim = arith.constant 4: index
75+
%second_dim = arith.constant 8: index
76+
// assume our tile shape is [31]
77+
%third_dim = arith.constant 31: index
78+
scf.for %c5 = %init to %first_dim step %c1 {
79+
scf.for %c6 = %init to %second_dim step %c1 {
80+
scf.for %c7 = %init to %third_dim step %c1 {
81+
%1 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
82+
%2 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
83+
%3 = arith.add %1, %2 : vector<31xf32>
84+
vector.transfer_write %3, %0[%c5, %c6, %c7] : vector<31xf32>, tensor<31xf32>
85+
}
86+
}
87+
}
88+
return %0: tensor<4x8x31xf32>
89+
}
90+
```
91+
92+
The main transformations in this stage include:
93+
* Bfloat16 promotion and cast elimination : legalize the Arith and Math ops by inserting `arith.extf` and `arith.truncf` pairs if the target device doesn't support bfloat16, and remove redundant `arith.extf`/`arith.truncf` pairs to improve performance and accuracy.
94+
* Lower to physical vector : Lower virtual vector to physical vector based on physical register width of target device.
95+
96+
### Back-End
97+
98+
Back-end is responsible for device-dependent optimization. The dialects used will vary with the target device. This document focuses on the back-end implementation for CPU.
99+
100+
The implementation of BRGEMM is the key to CPU performance. In GC we plan to introduce two different implementations:
101+
102+
* The BRGEMM provided by the library, such as onednn. In order to better abstract and describe the kernel provided by the library, we introduced the microkernel dialect.
103+
104+
* The BRGEMM generated by MLIR. In this approach, The AMX dialect will be used to simplify tile config processing and optimization.
105+
106+
By default GC will use openmp dialect to handle task parallelism. But for better performance and support for non-openmp threadpools, we also introduced the CPURuntime dialect. This dialect also introduces some runtime function calls specifically designed for the CPU, such as thread-local memory allocator, which can improve performance on the CPU.
107+
108+
The main transformations are:
109+
* Memref lowering and scheduling : lower tensor dialect to memref dialect and perform memory related optimization including memory hoist and rescheduling.
110+
* Microkernel dialect and lowering : lower linalg.batch_reduce_matmul to microkernel dialect and further lower to a function call to dnnl brgemm, or an MLIR-based brgemm implementation.
111+
* Parallelcpu dialect and lowering : lower to the parallelcpu dialect for nested parallel loop support and other CPU runtime calls.
112+
113+
In the last step, everything will lower to LLVM dialect. We don't plan to introduce any transformation on LLVM dialect, just leverage the upstream implementation for this.

docs/dialect_overview.png

83.1 KB
Loading

src/dnnl/JsonParser.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,11 @@ inline mlir::Attribute JsonParser::readAttr() {
245245
} else if (_str == "s64[]") {
246246
_ia64.clear();
247247
readNumArray(_ia64);
248-
attr = _builder.getI64ArrayAttr(_ia64);
248+
attr = _builder.getDenseI64ArrayAttr(_ia64);
249249
} else if (_str == "f32[]") {
250250
_fa32.clear();
251251
readNumArray(_fa32);
252-
attr = _builder.getF32ArrayAttr(_fa32);
252+
attr = _builder.getDenseF32ArrayAttr(_fa32);
253253
} else if (_str == "string") {
254254
_reader.read_string(&_str);
255255
attr = _builder.getStringAttr(_str);

src/dnnl/JsonParser.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,13 @@
2727
#include <stdfloat>
2828
#else
2929
namespace std {
30+
#if defined(__SIZEOF_FLOAT__) && __SIZEOF_FLOAT__ == 4
3031
using float32_t = float;
32+
#elif defined(__SIZEOF_DOUBLE__) && __SIZEOF_DOUBLE__ == 4
33+
using float32_t = double;
34+
#else
35+
static_assert(false, "Unable to determine 32-bit floating point type");
36+
#endif
3137
} // namespace std
3238
#endif
3339

@@ -145,8 +151,16 @@ class JsonParser {
145151
}
146152
std::unordered_map<std::string, OpBuilderFn> _opBuilders{
147153
GC_OP("Add", mlir::onednn_graph::AddOp),
154+
GC_OP("Divide", mlir::onednn_graph::DivOp),
148155
GC_OP("MatMul", mlir::onednn_graph::MatMulOp),
156+
GC_OP("Multiply", mlir::onednn_graph::MulOp),
157+
GC_OP("Pow", mlir::onednn_graph::PowOp),
158+
GC_OP("ReduceMean", mlir::onednn_graph::ReduceMeanOp),
159+
GC_OP("ReduceSum", mlir::onednn_graph::ReduceSumOp),
149160
GC_OP("ReLU", mlir::onednn_graph::ReLUOp),
161+
GC_OP("Sigmoid", mlir::onednn_graph::SigmoidOp),
162+
GC_OP("Subtract", mlir::onednn_graph::SubOp),
163+
GC_OP("Typecast", mlir::onednn_graph::TypeCastOp),
150164
};
151165
#undef GC_OP
152166

src/gc-opt/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,13 @@ set(gc_opt_libs
3636
GCPasses
3737
GCGPUPasses)
3838

39+
if(GC_USE_GPU)
40+
add_definitions(-DGC_USE_GPU=1)
41+
get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
42+
include_directories(${IMEX_INCLUDES})
43+
list(APPEND gc_opt_libs IMEXGPUXDialect IMEXXeTileDialect IMEXRegionDialect IMEXRegionTransforms
44+
IMEXTransforms IMEXGPUToGPUX IMEXGPUToSPIRV IMEXGPUXToLLVM IMEXXeGPUToVC IMEXXeTileToXeGPU IMEXUtil)
45+
endif()
3946
if(GC_MLIR_CXX_FLAGS)
4047
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}")
4148
endif()

src/gc-opt/gc-opt.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,21 @@
2525
#include "mlir/InitAllPasses.h"
2626
#include "mlir/Tools/mlir-opt/MlirOptMain.h"
2727

28+
#ifdef GC_USE_GPU
29+
#include <imex/InitIMEXDialects.h>
30+
#include <imex/InitIMEXPasses.h>
31+
#endif
32+
2833
int main(int argc, char *argv[]) {
34+
#ifdef GC_USE_GPU
35+
imex::registerTransformsPasses();
36+
// Conversion passes
37+
imex::registerConvertGPUToGPUX();
38+
imex::registerConvertGPUXToLLVM();
39+
imex::registerConvertGPUXToSPIRV();
40+
imex::registerConvertXeGPUToVC();
41+
imex::registerConvertXeTileToXeGPU();
42+
#endif
2943
mlir::registerAllPasses();
3044
mlir::gc::registerGraphCompilerPasses();
3145
mlir::cpuruntime::registerCPURuntimePasses();
@@ -34,6 +48,9 @@ int main(int argc, char *argv[]) {
3448
registry.insert<mlir::cpuruntime::CPURuntimeDialect>();
3549
registry.insert<mlir::linalgx::LinalgxDialect>();
3650
mlir::registerAllDialects(registry);
51+
#ifdef GC_USE_GPU
52+
registry.insert<::imex::xetile::XeTileDialect, ::imex::gpux::GPUXDialect>();
53+
#endif
3754
mlir::cpuruntime::registerConvertCPURuntimeToLLVMInterface(registry);
3855
return mlir::asMainReturnCode(mlir::MlirOptMain(
3956
argc, argv, "Graph Compiler modular optimizer driver\n", registry));

test/dnnl/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ foreach (TEST_SOURCE ${TEST_SOURCES})
1212
target_include_directories(${TEST_NAME} PRIVATE ${GC_LIB_INCLUDES})
1313
if (${TEST_NAME} MATCHES "^TestApi.*")
1414
# The API tests are linked with the shared lib
15-
target_link_libraries(${TEST_NAME} PRIVATE graph_compiler)
15+
target_link_libraries(${TEST_NAME} PRIVATE LLVMSupport graph_compiler)
1616
else ()
1717
# The other tests are linked with the static lib and have non-public includes
1818
target_link_libraries(${TEST_NAME} PRIVATE graph_compiler_static)

test/dnnl/DnnlTestUtils.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,21 @@
2121
#include <sstream>
2222
#include <string>
2323

24-
static std::string read_str_resource(const std::string &name) {
24+
#if __cplusplus > 202002L
25+
#include <stdfloat>
26+
#else
27+
namespace std {
28+
#if defined(__SIZEOF_FLOAT__) && __SIZEOF_FLOAT__ == 4
29+
using float32_t = float;
30+
#elif defined(__SIZEOF_DOUBLE__) && __SIZEOF_DOUBLE__ == 4
31+
using float32_t = double;
32+
#else
33+
static_assert(false, "No 32-bit floating point type available");
34+
#endif
35+
} // namespace std
36+
#endif
37+
38+
static std::string readStrResource(const std::string &name) {
2539
std::filesystem::path res_dir{"resources"};
2640
auto path = std::filesystem::absolute(res_dir / name);
2741
std::ifstream file(path);

test/dnnl/TestApiBasic.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#include "graph/backend/elyzor/include/dnnl_graph_compiler.h"
2424

2525
TEST(TestApiBasic, basicWorkflow) {
26-
auto json = read_str_resource("add.json");
26+
auto json = readStrResource("add.json");
2727

2828
const struct dnnl_graph_compiler_context ctx = {.num_threads = 4};
2929
const struct dnnl_graph_compiler *gc;

0 commit comments

Comments
 (0)