Merged

Dev #74

Changes from all commits
64 commits
bd6ad6b
feat(hardware): implement the MLU hardware functions; change the build to decide what gets compiled from directories named after each hardware target
kilinchange Jan 3, 2024
02939c9
feat: add the MLU build environment to CMake
kilinchange Jan 4, 2024
7f82d74
fix: drop the cncl build
kilinchange Jan 5, 2024
eb4fc4c
fix(frontend): fix getting inputs/outputs and memory sizes; add a compiler interface for setting inputs
PanZezhong1725 Jan 17, 2024
54ffa1a
Merge pull request #77 from InfiniTensor/front_fix
YdrMaster Jan 17, 2024
e76c1fc
add header file
zhangyue207 Jan 17, 2024
8b6333d
Merge pull request #79 from InfiniTensor/add-headfile
YdrMaster Jan 17, 2024
a7ed032
feat: add HardSigmoid cpu/cuda kernel
bitzyz Jan 16, 2024
3998833
feat: add hardswish cpu/cuda kernel
bitzyz Jan 11, 2024
1ce6d81
fix: fix mod kernel
kilinchange Jan 11, 2024
9e4789c
Merge pull request #78 from InfiniTensor/dev-hardsigmoid
YdrMaster Jan 18, 2024
afdfe93
style: tidy up the hardswish computation
YdrMaster Jan 18, 2024
a163eee
Merge pull request #73 from InfiniTensor/dev-hardswish
YdrMaster Jan 18, 2024
ec39fd7
feat(python_ffi): add a string-typed trace format
YdrMaster Jan 18, 2024
20788a3
fix(kernel): binary operations cannot have their operands swapped
YdrMaster Jan 18, 2024
41036a0
refactor(kernel): add a global device-memory cache to avoid repeated h2d copies
YdrMaster Jan 23, 2024
1375be3
refactor(python_ffi): let the frontend control whether operator types get the `onnx::` prefix, to allow custom operators
YdrMaster Jan 23, 2024
88e0712
build: add an llm subproject for frontend large-model custom operators
YdrMaster Jan 23, 2024
39538e7
docs: add the rms normalization operator definition
YdrMaster Jan 23, 2024
594e06b
docs: add an initial attention operator definition
YdrMaster Jan 24, 2024
4f9d09b
feat(kernel): add max cpu kernel
kilinchange Jan 24, 2024
5e13361
feat(kernel): add test for max/min cpu kernel
kilinchange Jan 25, 2024
6877516
refactor(frontend): turn Attributes into a class, and lift broadcasting and similar code into frontend so different operator libraries can share it
YdrMaster Jan 25, 2024
9dce4b3
feat(llm): add a MatMul that supports both transpose and broadcasting
YdrMaster Jan 25, 2024
eca56d8
feat(llm): add the frontend operator, graph operator, and cpu kernel for rms normalization
YdrMaster Jan 26, 2024
e15f28f
feat(llm): add the cuda kernel for rms normalization
YdrMaster Jan 26, 2024
39785bd
todo(llm): How can I use cub in NVRTC?
YdrMaster Jan 26, 2024
5f518a3
fix(kernel): correct the SliceInfo optimization computation
YdrMaster Jan 26, 2024
c462d7d
fix(kernel): bring in the latest cccl so nvrtc can be used together with cub
YdrMaster Jan 26, 2024
605c4c0
build: update backward-cpp to its master branch
YdrMaster Jan 26, 2024
0ebf34c
fix: format CMakeLists.txt
Chamberlain0w0 Jan 26, 2024
cc19556
feat(hardware): add cnnl
kilinchange Jan 26, 2024
d8a2dea
Merge branch 'dev' into dev-mlu-runtime
kilinchange Jan 26, 2024
f344ae8
feat: support pad operator | cpu/cuda kernel
bitzyz Jan 19, 2024
777d9c8
feat: optimize the Pad operator
bitzyz Jan 26, 2024
b3b7d09
fix(hardware): fix file encoding bug and include bug
kilinchange Jan 26, 2024
d91d89c
Merge pull request #87 from InfiniTensor/dev-mlu-runtime
YdrMaster Jan 26, 2024
85a8004
fix(kernel): fix the pad/slice diminfo; remove some comments
bitzyz Jan 26, 2024
0e3c71d
Merge pull request #86 from InfiniTensor/dev-pad
YdrMaster Jan 26, 2024
817cc8f
fix: fix for PRs
YdrMaster Jan 26, 2024
5f54011
fix(kernel): fix rms norm to support arbitrarily long BlockReduce
YdrMaster Jan 26, 2024
add61cb
feat(llm): register rms normalization
YdrMaster Jan 29, 2024
daeac28
feat: build out the basic structure of Attention at each layer
YdrMaster Jan 29, 2024
722e6d9
feat(llm): implement attention shape inference
YdrMaster Jan 29, 2024
12e264a
fix(hardware): typo
YdrMaster Jan 29, 2024
8595be4
refactor(kernel): refactor the transpose info computation
YdrMaster Jan 29, 2024
f4d063a
refactor(kernel): refactor the construction of transpose info and its kernel
YdrMaster Jan 29, 2024
bbed31e
refactor(kernel): support merging contiguous memory accesses in transpose
YdrMaster Jan 29, 2024
f1faf3e
feat: add the attention cuda implementation
YdrMaster Jan 29, 2024
9495516
build(kernel): add cublasLt
YdrMaster Jan 30, 2024
6630866
fix(kernel): support negative indices values for Gather
YdrMaster Jan 30, 2024
e237349
add max/min cuda kernel and test
kilinchange Jan 29, 2024
35dc6c8
Merge pull request #84 from InfiniTensor/add_max_min_kernel
YdrMaster Jan 31, 2024
27a8ad6
fix(frontend): filling in edge info must not accept missing input edges
YdrMaster Jan 31, 2024
eea2249
fix(frontend): adapt the frontend for torchvision models so that it calls the cpu kernels
bitzyz Jan 30, 2024
3f5b0ee
fix(frontend): fix the reduce operator's opset special-case bug for reducesum
bitzyz Jan 31, 2024
7db54c1
fix(kernel): fix a bug in transpose info
bitzyz Jan 31, 2024
b3e89a6
fix(kernel): handle more corner cases in TransposeInfo and add a unit test
YdrMaster Jan 31, 2024
d076c20
Merge pull request #88 from InfiniTensor/fix-torchvision-models
YdrMaster Feb 2, 2024
3199eb5
feat(kernel): add the exp operator
bitzyz Feb 4, 2024
e3febd9
fix(kernel): unreferenced output tensors must still be allocated, but are freed immediately after computation
YdrMaster Feb 19, 2024
16870d6
feat(dist): integrate the nccl communication library; allreduce operator
PanZezhong1725 Feb 19, 2024
34ed834
Merge pull request #92 from InfiniTensor/dist-merge
YdrMaster Feb 19, 2024
3cb6f5d
Merge pull request #91 from InfiniTensor/dev-exp
YdrMaster Feb 19, 2024
3 changes: 3 additions & 0 deletions .gitmodules
@@ -19,3 +19,6 @@
[submodule "src/09python_ffi/pybind11"]
path = src/09python_ffi/pybind11
url = git@github.com:pybind/pybind11.git
[submodule "3rd-party/cccl"]
path = 3rd-party/cccl
url = git@github.com:NVIDIA/cccl.git
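Note: a freshly added submodule has to be fetched before the build can see it; with the entry registered as above, `git submodule update --init 3rd-party/cccl` (or `--init --recursive` for all submodules) does that.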
2 changes: 1 addition & 1 deletion 3rd-party/backward-cpp
1 change: 1 addition & 0 deletions 3rd-party/cccl
Submodule cccl added at b7d422
34 changes: 34 additions & 0 deletions CMakeLists.txt
@@ -5,6 +5,7 @@ message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})
option(ABSL_PROPAGATE_CXX_STD "Abseil need this option" ON)
option(USE_CUDA "Support Nvidia GPU" OFF)
option(USE_KUNLUN "Support Baidu Kunlunxin" OFF)
option(USE_BANG "Support Cambricon MLU" OFF)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -41,6 +42,38 @@ if(USE_KUNLUN)
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
endif()

if (USE_BANG)
add_compile_definitions(USE_BANG)
include_directories(src/kernels/mlu/include)

# Neuware Environment
if ((NOT DEFINED NEUWARE_HOME) AND (NOT DEFINED ENV{NEUWARE_HOME}))
message(FATAL_ERROR "NEUWARE_HOME is not defined from cmake or env")
elseif (DEFINED NEUWARE_HOME)
set(NEUWARE_HOME ${NEUWARE_HOME} CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
else()
set(NEUWARE_HOME $ENV{NEUWARE_HOME} CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
endif()
message(STATUS "NEUWARE_HOME: ${NEUWARE_HOME}")

# cnrt cndrv cnnl
include_directories("${NEUWARE_HOME}/include")
find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall")

if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
elseif(DEFINED TARGET_CPU_ARCH)
set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
else()
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
endif()
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
endif()

add_compile_options(-march=native) # this will cause error in some machine
add_compile_options(-mtune=native)
add_compile_options(-Wall)
@@ -72,4 +105,5 @@ add_subdirectory(src/05computation)
add_subdirectory(src/06frontend)
add_subdirectory(src/07onnx)
add_subdirectory(src/08communication)
add_subdirectory(src/08-01llm)
add_subdirectory(src/09python_ffi)
3 changes: 2 additions & 1 deletion Makefile
@@ -3,13 +3,14 @@
TYPE ?= Debug
CUDA ?= OFF
KUNLUN ?= OFF
BANG ?= OFF

CMAKE_EXTRA =
# CMAKE_EXTRA += -DCMAKE_CXX_COMPILER=

build:
mkdir -p build
cmake -Bbuild -DCMAKE_BUILD_TYPE=$(TYPE) -DUSE_CUDA=$(CUDA) -DUSE_KUNLUN=$(KUNLUN) $(CMAKE_EXTRA)
cmake -Bbuild -DCMAKE_BUILD_TYPE=$(TYPE) -DUSE_CUDA=$(CUDA) -DUSE_KUNLUN=$(KUNLUN) -DUSE_BANG=$(BANG) $(CMAKE_EXTRA)
make -j -C build

install-python: build
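With the new flag, an MLU build can be driven entirely from make, e.g. `make BANG=ON TYPE=Release` with `NEUWARE_HOME` pointing at the Neuware installation (the flag names come from the diff above; the Release type is just an example).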
2 changes: 1 addition & 1 deletion README.md
@@ -168,7 +168,7 @@ executor = compiler.compile("cuda", "default", []) # -------- compile the model
- [fmt 10.1.1](https://github.com/fmtlib/fmt/releases/tag/10.1.0)
- [fmtlog v2.2.1](https://github.com/MengRao/fmtlog/releases/tag/v2.2.1)
- [googletest v1.14.0](https://github.com/google/googletest/releases/tag/v1.14.0)
- [backward-cpp v1.6](https://github.com/bombela/backward-cpp/releases/tag/v1.6)
- [backward-cpp master](https://github.com/bombela/backward-cpp)
- [result master](https://github.com/willowell/result)
- [abseil-cpp 20230802.1](https://github.com/abseil/abseil-cpp/releases/tag/20230802.1)

3 changes: 1 addition & 2 deletions src/00common/CMakeLists.txt
@@ -11,6 +11,5 @@ file(GLOB_RECURSE COMMON_TEST test/*.cpp)
if(COMMON_TEST)
add_executable(common_test ${COMMON_TEST})
add_test(common_test common_test)
target_link_libraries(common_test common GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(common_test)
target_link_libraries(common_test common GTest::gtest_main Backward::Object)
endif()
6 changes: 3 additions & 3 deletions src/00common/include/common/error_handler.h
@@ -30,9 +30,9 @@ namespace refactor {
std::abort()

#ifndef DISABLE_ASSERT
#define ASSERT(CONDITION, F, ...) \
{ \
if (!(CONDITION)) RUNTIME_ERROR(fmt::format("Assertion: " #F, ##__VA_ARGS__)); \
#define ASSERT(CONDITION, F, ...) \
{ \
if (!(CONDITION)) RUNTIME_ERROR(fmt::format("Assertion: " F, ##__VA_ARGS__)); \
}
#else
#define ASSERT(CONDITION, F)
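The `#F` → `F` change is subtle: `#F` re-stringifies the caller's format-string literal, so escaped quotes survive into the message. A minimal sketch of the difference, assuming fmt is available as pinned in this repo (the values are made up):

#include <fmt/core.h>

int main() {
    int x = 42;
    // old expansion, "Assertion: " #F — the caller's literal is stringified
    // again, so its quotes leak into the output
    fmt::print("{}\n", fmt::format("Assertion: " "\"x = {}\"", x)); // Assertion: "x = 42"
    // new expansion, "Assertion: " F — plain string-literal concatenation
    fmt::print("{}\n", fmt::format("Assertion: " "x = {}", x)); // Assertion: x = 42
    return 0;
}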
3 changes: 2 additions & 1 deletion src/00common/include/common/rc.hpp
@@ -2,6 +2,7 @@
#define RC_HPP

#include <functional>
#include <utility>

namespace refactor {

@@ -18,7 +19,7 @@ namespace refactor {
T *_value;
struct Counter {
size_t strong, weak;
} * _counter;
} *_counter;

Rc(T *ptr, Counter *counter) noexcept
: _value(ptr), _counter(counter) { inc(); }
3 changes: 1 addition & 2 deletions src/01graph_topo/CMakeLists.txt
@@ -11,6 +11,5 @@ file(GLOB_RECURSE GRAPH_TOPO_TEST test/*.cpp)
if(GRAPH_TOPO_TEST)
add_executable(graph_topo_test ${GRAPH_TOPO_TEST})
add_test(graph_topo_test graph_topo_test)
target_link_libraries(graph_topo_test graph_topo GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(graph_topo_test)
target_link_libraries(graph_topo_test graph_topo GTest::gtest_main Backward::Object)
endif()
3 changes: 1 addition & 2 deletions src/02hardware/CMakeLists.txt
@@ -15,6 +15,5 @@ file(GLOB_RECURSE HARDWARE_TEST test/*.cpp)
if(HARDWARE_TEST)
add_executable(hardware_test ${HARDWARE_TEST})
add_test(hardware_test hardware_test)
target_link_libraries(hardware_test hardware GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(hardware_test)
target_link_libraries(hardware_test hardware GTest::gtest_main Backward::Object)
endif()
2 changes: 2 additions & 0 deletions src/02hardware/include/hardware/device.h
@@ -11,6 +11,8 @@ namespace refactor::hardware {
enum class Type : int32_t {
Cpu,
Nvidia,
Mlu,
Kunlun,
};

protected:
19 changes: 19 additions & 0 deletions src/02hardware/include/hardware/devices/mlu.h
@@ -0,0 +1,19 @@
#ifndef HARDWARE_DEVICES_MLU_H
#define HARDWARE_DEVICES_MLU_H

#include "../device.h"

namespace refactor::hardware {

class Mlu final : public Device {
public:
explicit Mlu(int32_t card);
void setContext() const noexcept final;
Type type() const noexcept final {
return Type::Mlu;
}
};

}// namespace refactor::hardware

#endif// HARDWARE_DEVICES_MLU_H
2 changes: 2 additions & 0 deletions src/02hardware/src/device_manager.cpp
@@ -1,5 +1,6 @@
#include "hardware/device_manager.h"
#include "hardware/devices/cpu.h"
#include "hardware/devices/mlu.h"
#include "hardware/devices/nvidia.h"

namespace refactor::hardware::device {
@@ -37,6 +38,7 @@ namespace refactor::hardware::device {
using T = Device::Type;
// clang-format off
auto device = type == T::Nvidia ? std::make_shared<Nvidia>(card)
: type == T::Mlu ? std::make_shared<Mlu>(card)
: UNREACHABLEX(Arc<Device>, "");
// clang-format on
auto [kind, ok] = DEVICES.try_emplace(static_cast<int32_t>(type));
10 changes: 5 additions & 5 deletions src/02hardware/src/devices/cpu/memory.cc
@@ -5,19 +5,19 @@
namespace refactor::hardware {
using M = CpuMemory;

void *M::malloc(size_t size) noexcept {
void *M::malloc(size_t size) {
return std::malloc(size);
}
void M::free(void *ptr) noexcept {
void M::free(void *ptr) {
std::free(ptr);
}
void *M::copyHD(void *dst, void const *src, size_t bytes) const noexcept {
void *M::copyHD(void *dst, void const *src, size_t bytes) const {
return std::memcpy(dst, src, bytes);
}
void *M::copyDH(void *dst, void const *src, size_t bytes) const noexcept {
void *M::copyDH(void *dst, void const *src, size_t bytes) const {
return std::memcpy(dst, src, bytes);
}
void *M::copyDD(void *dst, void const *src, size_t bytes) const noexcept {
void *M::copyDD(void *dst, void const *src, size_t bytes) const {
return std::memcpy(dst, src, bytes);
}

10 changes: 5 additions & 5 deletions src/02hardware/src/devices/cpu/memory.hh
@@ -6,11 +6,11 @@
namespace refactor::hardware {

class CpuMemory final : public Memory {
void *malloc(size_t) noexcept final;
void free(void *) noexcept final;
void *copyHD(void *dst, void const *src, size_t bytes) const noexcept final;
void *copyDH(void *dst, void const *src, size_t bytes) const noexcept final;
void *copyDD(void *dst, void const *src, size_t bytes) const noexcept final;
void *malloc(size_t) final;
void free(void *) final;
void *copyHD(void *dst, void const *src, size_t bytes) const final;
void *copyDH(void *dst, void const *src, size_t bytes) const final;
void *copyDD(void *dst, void const *src, size_t bytes) const final;
};

}// namespace refactor::hardware
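The dropped noexcept across Memory and its CPU implementation looks deliberate: device backends such as MluMemory (below) report failures through BANG_ASSERT, which raises a runtime error, so the shared Memory interface can no longer promise not to throw.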
33 changes: 33 additions & 0 deletions src/02hardware/src/devices/mlu/device.cc
@@ -0,0 +1,33 @@
#include "functions.hh"
#include "hardware/devices/mlu.h"
#include "hardware/mem_pool.h"
#include "memory.hh"

namespace refactor::hardware {

static Arc<Memory> bangMemory(int32_t card) {
#ifdef USE_BANG
ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
setDevice(card);
auto [free, total] = getMemInfo();
auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
fmt::println("initializing Cambricon MLU {}, memory {} / {}, alloc {}",
card, free, total, size);
return std::make_shared<MemPool>(
std::make_shared<MluMemory>(),
size,
256ul);
#else
return nullptr;
#endif
}

Mlu::Mlu(int32_t card) : Device(card, bangMemory(card)) {}

void Mlu::setContext() const noexcept {
#ifdef USE_BANG
setDevice(_card);
#endif
}

}// namespace refactor::hardware
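The pool-size formula in bangMemory takes the larger of 5 GiB and 80% of total device memory, but never more than what is currently free. A worked instance (the 16/12 GiB card figures are hypothetical):

#include <algorithm>
#include <cstddef>

int main() {
    // hypothetical card: 16 GiB total, 12 GiB currently free
    std::size_t free = 12ul << 30, total = 16ul << 30;
    std::size_t size = std::min(free, std::max(5ul << 30, total * 4 / 5));
    // total * 4 / 5 == 12.8 GiB; max(5 GiB, 12.8 GiB) == 12.8 GiB;
    // min(12 GiB, 12.8 GiB) == 12 GiB -> the pool takes all free memory
    return 0;
}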
21 changes: 21 additions & 0 deletions src/02hardware/src/devices/mlu/functions.cc
@@ -0,0 +1,21 @@
#include "functions.hh"

namespace refactor::hardware {

#ifdef USE_BANG
int getDeviceCount() {
unsigned deviceCount;
BANG_ASSERT(cnrtGetDeviceCount(&deviceCount));
return static_cast<int>(deviceCount);
}
void setDevice(int device) {
BANG_ASSERT(cnrtSetDevice(device));
}
MemInfo getMemInfo() {
MemInfo memInfo;
BANG_ASSERT(cnrtMemGetInfo(&memInfo.free, &memInfo.total));
return memInfo;
}
#endif

}// namespace refactor::hardware
28 changes: 28 additions & 0 deletions src/02hardware/src/devices/mlu/functions.hh
@@ -0,0 +1,28 @@
#ifndef HARDWARE_DEVICES_MLU_FUNCTIONS_CUH
#define HARDWARE_DEVICES_MLU_FUNCTIONS_CUH

#include "common.h"

#ifdef USE_BANG
#include "cnrt.h"

#define BANG_ASSERT(STATUS) \
if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \
RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \
cnrtGetErrorStr(status), (int) status)); \
}
#endif

namespace refactor::hardware {

struct MemInfo {
size_t free, total;
};

int getDeviceCount();
void setDevice(int device);
MemInfo getMemInfo();

}// namespace refactor::hardware

#endif// HARDWARE_DEVICES_MLU_FUNCTIONS_CUH
33 changes: 33 additions & 0 deletions src/02hardware/src/devices/mlu/memory.cc
@@ -0,0 +1,33 @@
#include "memory.hh"
#include "functions.hh"

namespace refactor::hardware {
#ifdef USE_BANG
using M = MluMemory;

void *M::malloc(size_t size) {
void *ptr;
BANG_ASSERT(cnrtMalloc(&ptr, size));
return ptr;
}
void M::free(void *ptr) {
BANG_ASSERT(cnrtFree(ptr));
}
void *M::copyHD(void *dst, void const *src, size_t bytes) const {
BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_HOST2DEV));
return dst;
}
void *M::copyDH(void *dst, void const *src, size_t bytes) const {
BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_DEV2HOST));
return dst;
}
void *M::copyDD(void *dst, void const *src, size_t bytes) const {
BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_PEER2PEER));
return dst;
}
#endif

}// namespace refactor::hardware
18 changes: 18 additions & 0 deletions src/02hardware/src/devices/mlu/memory.hh
@@ -0,0 +1,18 @@
#ifndef HARDWARE_DEVICES_MLU_MEMORY_CUH
#define HARDWARE_DEVICES_MLU_MEMORY_CUH

#include "hardware/memory.h"

namespace refactor::hardware {

class MluMemory final : public Memory {
void *malloc(size_t) final;
void free(void *) final;
void *copyHD(void *dst, void const *src, size_t bytes) const final;
void *copyDH(void *dst, void const *src, size_t bytes) const final;
void *copyDD(void *dst, void const *src, size_t bytes) const final;
};

}// namespace refactor::hardware

#endif// HARDWARE_DEVICES_MLU_MEMORY_CUH
6 changes: 3 additions & 3 deletions src/02hardware/src/devices/nvidia/device.cc
@@ -23,9 +23,9 @@ namespace refactor::hardware {

size_t free, total;
CUDA_ASSERT(cudaMemGetInfo(&free, &total));
auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
auto size = free * 9 / 10;
cudaDeviceProp prop;
CUDA_ASSERT(cudaGetDeviceProperties(&prop, 0));
CUDA_ASSERT(cudaGetDeviceProperties(&prop, card));
size_t alignment = prop.textureAlignment;
fmt::println("initializing Nvidia GPU {}, memory {} / {}, alloc {}, alignment {}",
card, free, total, size, alignment);
@@ -34,7 +34,7 @@
size,
alignment);
#else
RUNTIME_ERROR("CUDA is not enabled");
return nullptr;
#endif
}

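Besides the new sizing rule (90% of free memory), this hunk carries two behavioral fixes: cudaGetDeviceProperties previously always queried device 0 and now queries `card`, so the alignment comes from the device actually selected; and a build without CUDA now returns nullptr, matching the MLU path, instead of raising a runtime error.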
3 changes: 1 addition & 2 deletions src/03runtime/CMakeLists.txt
@@ -11,6 +11,5 @@ file(GLOB_RECURSE RUNTIME_TEST test/*.cpp)
if(RUNTIME_TEST)
add_executable(runtime_test ${RUNTIME_TEST})
add_test(runtime_test runtime_test)
target_link_libraries(runtime_test runtime GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(runtime_test)
target_link_libraries(runtime_test runtime GTest::gtest_main Backward::Object)
endif()