Skip to content

Commit ede109a

Browse files
committed
Merge branch 'main' into wangjial/benchgc_op
2 parents 8999233 + 1f1eef9 commit ede109a

30 files changed

+1131
-9
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
2828

2929
option(GC_LEGACY_ENABLE ON)
3030
option(GC_TEST_ENABLE "Build the tests" ON)
31+
option(GC_USE_GPU "Enable GPU backend" OFF)
3132
option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON)
3233
option(GC_BENCH_ENABLE "Build benchgc. Only available when GC_TEST_ENABLE and GC_ENABLE_BINDINGS_PYTHON is enabled" ON)
3334
option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
@@ -53,6 +54,13 @@ include(AddLLVM)
5354
include(AddMLIR)
5455
include(HandleLLVMOptions)
5556

57+
if(GC_USE_GPU)
58+
include(imex)
59+
if(GC_DEV_LINK_LLVM_DYLIB)
60+
message(WARN "GPU backend may not be compatible with dynamic linking to LLVM")
61+
endif()
62+
endif()
63+
5664
if(GC_ENABLE_BINDINGS_PYTHON AND NOT MLIR_ENABLE_BINDINGS_PYTHON)
5765
message(STATUS "Failed to enable Python API due to the 'MLIR_ENABLE_BINDINGS_PYTHON' for LLVM is not ON.")
5866
set(GC_ENABLE_BINDINGS_PYTHON OFF CACHE BOOL "" FORCE)

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ cmake --build build --target install
3232
```
3333

3434
Notes
35-
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
35+
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above **if you are building for CPU only**. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
3636
* The option `-DLLVM_INSTALL_GTEST=ON` is optional, if the tests of graph-compiler are disabled (see `GC_TEST_ENABLE` below).
37+
* If you would like to enable GPU components of Graph Compiler, please make sure to statically link Graph Compiler and LLVM(MLIR). It is a known issue that LLVM shared library cannot be linked together with IGC (Intel's low level GPU compiler). Make sure `LLVM_BUILD_LLVM_DYLIB` and `LLVM_LINK_LLVM_DYLIB` are `OFF` (they are off by default). Also make sure Graph Compiler's cmake option `GC_DEV_LINK_LLVM_DYLIB` is `OFF` when configuring Graph Compiler (see below).
3738

3839
We have now installed LLVM at `llvm-project/llvm-install`.
3940

@@ -57,6 +58,7 @@ cmake --build . --target gc-check
5758
Notes:
5859
* `/PATH/TO/llvm-project/llvm-install` should be the install path of LLVM. If you installed LLVM elsewhere by `-DCMAKE_INSTALL_PREFIX` option when building LLVM, you need to change the path in `-DMLIR_DIR` accordingly.
5960
* The cmake option `-DLLVM_EXTERNAL_LIT` is for the tests of this project. It requires the `lit` tool to be installed in the system. You can install it via `pip install lit`. If you don't need to run the tests of this repo, you can omit this option in the command line.
61+
* If GPU components are on (`-DGC_USE_GPU=ON`), make sure the Level-zero runtime is installed in your system. Either install Level-zero runtime via system package managers (e.g. `apt`), or follow the instructions of [IMEX](https://github.com/intel/mlir-extensions).
6062

6163
Graph Compiler supports the following build-time options.
6264

@@ -67,4 +69,5 @@ Graph Compiler supports the following build-time options.
6769
| GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic link LLVM/MLIR libraries, mainly for developer |
6870
| GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API |
6971
| GC_BENCH_ENABLE | **ON**, OFF | Controls building benchgc. The configuration will only take effect when both GC_ENABLE_BINDING_PYTHON and GC_TEST_ENABLE are ON. |
72+
| GC_USE_GPU | ON, **OFF** | Whether to enable the GPU components |
7073

cmake/functions.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function(gc_fetch_content
3131
FetchContent_Declare(
3232
${name}
3333
SOURCE_DIR ${GC_${uname}_SRC_DIR}
34-
CMAKE_ARGS ${${uname}_CMAKE_ARGS}
34+
CMAKE_ARGS ${GC_${uname}_CMAKE_ARGS}
3535
)
3636
else ()
3737
if (DEFINED GC_${uname}_VERSION)

cmake/imex.cmake

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
include_guard()
2+
3+
get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
4+
if (NOT DEFINED IMEX_INCLUDES)
5+
include(functions)
6+
set(IMEX_CHECK_LLVM_VERSION ON)
7+
set(IMEX_ENABLE_L0_RUNTIME 1)
8+
# TODO: Change to main https://github.com/intel/mlir-extensions.git when all the
9+
# required functionality is merged.
10+
gc_fetch_content(imex 496b240093b5e132b60c5ee69878300fe69be300 https://github.com/Menooker/mlir-extensions
11+
CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=1"
12+
)
13+
14+
set(IMEX_INCLUDES
15+
${imex_BINARY_DIR}/include
16+
${imex_SOURCE_DIR}/include
17+
${imex_SOURCE_DIR}/src
18+
)
19+
set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
20+
endif ()

docs/CPU_pipeline_overview.md

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Graph Compiler CPU Compilation Flow Overview
2+
3+
Graph Compiler is an MLIR based end-to-end DL compiler. The entire compilation process is divided into front-end, middle-end and back-end. Different compilation stages will use different combinations of dialects, and together with various transformation passes to perform various optimizations and graph lowering transformations. The entire process will transform IR from hardware-independent abstract expression to hardware-related concrete expression, and finally generate an executable kernel.
4+
5+
Meanwhile, as an MLIR downstream project, Graph Compiler's implementation not only uses the existing dialects and passes from MLIR upstream, but also defines new dialects and passes. Most of the new implementations are upstreamable, and we will upstream them in the future.
6+
7+
The content of this document does not represent the currently implemented status, but rather the target status after the implementation is completed.
8+
9+
### Front-End
10+
11+
The Graph Compiler front-end takes the oneDNN Graph dialect as input. The oneDNN Graph dialect is a newly defined dialect, which aims to describe the computation graph defined by oneDNN Graph. The ops in this dialect follow the [oneDNN Graph specification](https://oneapi-src.github.io/oneDNN/graph_supported_operations.html).
12+
13+
oneDNN graph dialect example:
14+
15+
```mlir
16+
func.func @mlp(%in: tensor<128x512xbf16>,
17+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
18+
// layer 0
19+
%0 = onednn_graph.matmul %in, %weight0, %bias0 : (tensor<128x512xbf16>, tensor<512x256xbf16>, tensor<256xbf16>) -> tensor<128x256xbf16>
20+
%1 = onednn_graph.relu %0 : (tensor<128x256xbf16>) -> tensor<128x256xbf16>
21+
return %1 : tensor<128x256xbf16>
22+
}
23+
```
24+
25+
There are no planned optimization passes in the front-end. The only transformation pass lowers the oneDNN Graph dialect into the Linalg dialect.
26+
27+
### Middle-End
28+
29+
Middle-end is mainly responsible for general optimizations that are independent of the target hardware, and most of the transformations apply to both CPU and GPU. Some of the transformations need to query target hardware information, such as cache level and capacity. The hardware abstraction layer (HAL) is the interface for abstracting and describing the target hardware information. Therefore, the same pass can generate different optimization results for different hardware under the guidance of HAL.
30+
31+
According to the different dialect combinations used, middle-end is divided into the following stages:
32+
33+
#### Linalg on Tensor
34+
35+
This is the intermediate representation closest to the framework calculation graph. The example IR looks like:
36+
37+
```mlir
38+
func.func @mlp(%in: tensor<128x512xbf16>,
39+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
40+
%0 = tensor.empty() : tensor<128x256xbf16>
41+
%cst = arith.constant 0.000000e+00 : bf16
42+
%1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
43+
%2 = linalg.matmul ins(%in, %weight0 : tensor<128x512xbf16>, tensor<512x256xbf16>) outs(%1 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
44+
%3 = tensor.empty() : tensor<128x256xbf16>
45+
%broadcasted = linalg.broadcast ins(%bias0 : tensor<256xbf16>) outs(%3 : tensor<128x256xbf16>) dimensions = [0]
46+
%4 = tensor.empty() : tensor<128x256xbf16>
47+
%5 = linalg.add ins(%2, %broadcasted : tensor<128x256xbf16>, tensor<128x256xbf16>) outs(%4: tensor<128x256xbf16>) -> tensor<128x256xbf16>
48+
%6 = tensor.empty() : tensor<128x256xbf16>
49+
%7 = linalgx.relu ins(%5 : tensor<128x256xbf16>) outs(%6 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
50+
return %7 : tensor<128x256xbf16>
51+
}
52+
```
53+
54+
In this stage, GC will perform some analysis and transformation related to the whole graph. The main transformations include:
55+
56+
* Padding propagation : insert tensor.pad ops to adjust tensor shapes if a shape is not divisible by the target tiling size.
57+
* Layout propagation : insert tensor.pack and tensor.unpack to adjust tensor layout if blocking layout is preferred.
58+
* Tensor constant propagation : identify folding with constant tensor and build folding block.
59+
* Matmul lowering : lower Linalg.matmul into scf.forall with linalg.batch_reduce_matmul.
60+
* Fine-grain fusion: fuse element-wise/broadcast/reduce/movement ops into base op(e.g. matmul).
61+
* Lower linalg to arith/math on virtual vector : lower Linalg to Arith/Math and tiling tensor to virtual vector.
62+
63+
### Tensor and scf loop with arith/math on virtual vector
64+
65+
In this stage, most of the Linalg ops are lowered to Scf loops with Arith and Math ops. Both Arith and Math ops use tile tensors as input and output. The tile tensor here can be a multi-dimensional tensor of any shape, regardless of the hardware register width. The tile size is chosen based on L1 cache capacity; that is, it is a good abstraction to partition the problem size to this granularity, since the microkernel, pre-op, and post-op all work on tensor sizes fitting within the L1 cache. Meanwhile, converting Linalg into Arith and Math further exposes the implementation details of the Linalg ops, which allows us to further simplify the computation after fusion.
66+
67+
IR example:
68+
69+
```mlir
70+
func.func @add_tensor(%arg0: tensor<4x8x31xf32>, %arg1: tensor<4x8x31xf32>) -> tensor<4x8x31xf32> {
71+
%0 = tensor.empty() : tensor<4x8x31xf32>
72+
%init = arith.constant 0: index
73+
%c1 = arith.constant 1: index
74+
%first_dim = arith.constant 4: index
75+
%second_dim = arith.constant 8: index
76+
// assume our tile shape is [31]
77+
%third_dim = arith.constant 31: index
78+
scf.for %c5 = %init to %first_dim step %c1 {
79+
scf.for %c6 = %init to %second_dim step %c1 {
80+
scf.for %c7 = %init to %third_dim step %c1 {
81+
%1 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
82+
%2 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
83+
%3 = arith.add %1, %2 : vector<31xf32>
84+
vector.transfer_write %3, %0[%c5, %c6, %c7] : vector<31xf32>, tensor<31xf32>
85+
}
86+
}
87+
}
88+
return %0: tensor<4x8x31xf32>
89+
}
90+
```
91+
92+
The main transformations in this stage include:
93+
* Bfloat16 promotion and cast elimination : legalize the Arith and Math ops by inserting `arith.extf` and `arith.truncf` pairs if the target device doesn't support bfloat16, and remove redundant `arith.extf`/`arith.truncf` pairs to improve performance and accuracy.
94+
* Lower to physical vector : Lower virtual vector to physical vector based on physical register width of target device.
95+
96+
### Back-End
97+
98+
Back-end is responsible for device-dependent optimization. The dialects used will vary with the target device. This document focuses on the back-end implementation for CPU.
99+
100+
The implementation of BRGEMM is the key to CPU performance. In GC we plan to introduce two different implementations:
101+
102+
* The BRGEMM provided by the library, such as onednn. In order to better abstract and describe the kernel provided by the library, we introduced the microkernel dialect.
103+
104+
* The BRGEMM generated by MLIR. In this approach, The AMX dialect will be used to simplify tile config processing and optimization.
105+
106+
By default GC will use openmp dialect to handle task parallelism. But for better performance and support for non-openmp threadpools, we also introduced the CPURuntime dialect. This dialect also introduces some runtime function calls specifically designed for the CPU, such as thread-local memory allocator, which can improve performance on the CPU.
107+
108+
The main transformations are:
109+
* Memref lowering and scheduling : lower tensor dialect to memref dialect and perform memory related optimization including memory hoist and rescheduling.
110+
* Microkernel dialect and lowering : lower linalg.batch_reduce_matmul to microkernel dialect and further lower to a function call to dnnl brgemm, or an MLIR-based brgemm implementation.
111+
* Parallelcpu dialect and lowering : lower to the parallelcpu dialect for nested parallel loop support and other CPU runtime calls.
112+
113+
In the last step, everything will lower to LLVM dialect. We don't plan to introduce any transformation on LLVM dialect, just leverage the upstream implementation for this.

docs/dialect_overview.png

83.1 KB
Loading

src/dnnl/JsonParser.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,11 @@ inline mlir::Attribute JsonParser::readAttr() {
245245
} else if (_str == "s64[]") {
246246
_ia64.clear();
247247
readNumArray(_ia64);
248-
attr = _builder.getI64ArrayAttr(_ia64);
248+
attr = _builder.getDenseI64ArrayAttr(_ia64);
249249
} else if (_str == "f32[]") {
250250
_fa32.clear();
251251
readNumArray(_fa32);
252-
attr = _builder.getF32ArrayAttr(_fa32);
252+
attr = _builder.getDenseF32ArrayAttr(_fa32);
253253
} else if (_str == "string") {
254254
_reader.read_string(&_str);
255255
attr = _builder.getStringAttr(_str);

src/dnnl/JsonParser.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,13 @@
2727
#include <stdfloat>
2828
#else
2929
namespace std {
30+
#if defined(__SIZEOF_FLOAT__) && __SIZEOF_FLOAT__ == 4
3031
using float32_t = float;
32+
#elif defined(__SIZEOF_DOUBLE__) && __SIZEOF_DOUBLE__ == 4
33+
using float32_t = double;
34+
#else
35+
static_assert(false, "Unable to determine 32-bit floating point type");
36+
#endif
3137
} // namespace std
3238
#endif
3339

@@ -145,8 +151,16 @@ class JsonParser {
145151
}
146152
std::unordered_map<std::string, OpBuilderFn> _opBuilders{
147153
GC_OP("Add", mlir::onednn_graph::AddOp),
154+
GC_OP("Divide", mlir::onednn_graph::DivOp),
148155
GC_OP("MatMul", mlir::onednn_graph::MatMulOp),
156+
GC_OP("Multiply", mlir::onednn_graph::MulOp),
157+
GC_OP("Pow", mlir::onednn_graph::PowOp),
158+
GC_OP("ReduceMean", mlir::onednn_graph::ReduceMeanOp),
159+
GC_OP("ReduceSum", mlir::onednn_graph::ReduceSumOp),
149160
GC_OP("ReLU", mlir::onednn_graph::ReLUOp),
161+
GC_OP("Sigmoid", mlir::onednn_graph::SigmoidOp),
162+
GC_OP("Subtract", mlir::onednn_graph::SubOp),
163+
GC_OP("Typecast", mlir::onednn_graph::TypeCastOp),
150164
};
151165
#undef GC_OP
152166

src/gc-opt/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,13 @@ set(gc_opt_libs
3636
GCPasses
3737
GCGPUPasses)
3838

39+
if(GC_USE_GPU)
40+
add_definitions(-DGC_USE_GPU=1)
41+
get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
42+
include_directories(${IMEX_INCLUDES})
43+
list(APPEND gc_opt_libs IMEXGPUXDialect IMEXXeTileDialect IMEXRegionDialect IMEXRegionTransforms
44+
IMEXTransforms IMEXGPUToGPUX IMEXGPUToSPIRV IMEXGPUXToLLVM IMEXXeGPUToVC IMEXXeTileToXeGPU IMEXUtil)
45+
endif()
3946
if(GC_MLIR_CXX_FLAGS)
4047
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}")
4148
endif()

src/gc-opt/gc-opt.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,21 @@
2525
#include "mlir/InitAllPasses.h"
2626
#include "mlir/Tools/mlir-opt/MlirOptMain.h"
2727

28+
#ifdef GC_USE_GPU
29+
#include <imex/InitIMEXDialects.h>
30+
#include <imex/InitIMEXPasses.h>
31+
#endif
32+
2833
int main(int argc, char *argv[]) {
34+
#ifdef GC_USE_GPU
35+
imex::registerTransformsPasses();
36+
// Conversion passes
37+
imex::registerConvertGPUToGPUX();
38+
imex::registerConvertGPUXToLLVM();
39+
imex::registerConvertGPUXToSPIRV();
40+
imex::registerConvertXeGPUToVC();
41+
imex::registerConvertXeTileToXeGPU();
42+
#endif
2943
mlir::registerAllPasses();
3044
mlir::gc::registerGraphCompilerPasses();
3145
mlir::cpuruntime::registerCPURuntimePasses();
@@ -34,6 +48,9 @@ int main(int argc, char *argv[]) {
3448
registry.insert<mlir::cpuruntime::CPURuntimeDialect>();
3549
registry.insert<mlir::linalgx::LinalgxDialect>();
3650
mlir::registerAllDialects(registry);
51+
#ifdef GC_USE_GPU
52+
registry.insert<::imex::xetile::XeTileDialect, ::imex::gpux::GPUXDialect>();
53+
#endif
3754
mlir::cpuruntime::registerConvertCPURuntimeToLLVMInterface(registry);
3855
return mlir::asMainReturnCode(mlir::MlirOptMain(
3956
argc, argv, "Graph Compiler modular optimizer driver\n", registry));

test/dnnl/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ foreach (TEST_SOURCE ${TEST_SOURCES})
1212
target_include_directories(${TEST_NAME} PRIVATE ${GC_LIB_INCLUDES})
1313
if (${TEST_NAME} MATCHES "^TestApi.*")
1414
# The API tests are linked with the shared lib
15-
target_link_libraries(${TEST_NAME} PRIVATE graph_compiler)
15+
target_link_libraries(${TEST_NAME} PRIVATE LLVMSupport graph_compiler)
1616
else ()
1717
# The other tests are linked with the static lib and have non-public includes
1818
target_link_libraries(${TEST_NAME} PRIVATE graph_compiler_static)

test/dnnl/DnnlTestUtils.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,21 @@
2121
#include <sstream>
2222
#include <string>
2323

24-
static std::string read_str_resource(const std::string &name) {
24+
#if __cplusplus > 202002L
25+
#include <stdfloat>
26+
#else
27+
namespace std {
28+
#if defined(__SIZEOF_FLOAT__) && __SIZEOF_FLOAT__ == 4
29+
using float32_t = float;
30+
#elif defined(__SIZEOF_DOUBLE__) && __SIZEOF_DOUBLE__ == 4
31+
using float32_t = double;
32+
#else
33+
static_assert(false, "No 32-bit floating point type available");
34+
#endif
35+
} // namespace std
36+
#endif
37+
38+
static std::string readStrResource(const std::string &name) {
2539
std::filesystem::path res_dir{"resources"};
2640
auto path = std::filesystem::absolute(res_dir / name);
2741
std::ifstream file(path);

test/dnnl/TestApiBasic.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#include "graph/backend/elyzor/include/dnnl_graph_compiler.h"
2424

2525
TEST(TestApiBasic, basicWorkflow) {
26-
auto json = read_str_resource("add.json");
26+
auto json = readStrResource("add.json");
2727

2828
const struct dnnl_graph_compiler_context ctx = {.num_threads = 4};
2929
const struct dnnl_graph_compiler *gc;

0 commit comments

Comments
 (0)