Skip to content

Commit

Permalink
Merge branch 'dev' into dev-mlu-runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
kilinchange committed Jan 26, 2024
2 parents cc19556 + 605c4c0 commit d8a2dea
Show file tree
Hide file tree
Showing 129 changed files with 2,127 additions and 407 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@
[submodule "src/09python_ffi/pybind11"]
path = src/09python_ffi/pybind11
url = git@github.com:pybind/pybind11.git
[submodule "3rd-party/cccl"]
path = 3rd-party/cccl
url = git@github.com:NVIDIA/cccl.git
2 changes: 1 addition & 1 deletion 3rd-party/backward-cpp
1 change: 1 addition & 0 deletions 3rd-party/cccl
Submodule cccl added at b7d422
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,5 @@ add_subdirectory(src/05computation)
add_subdirectory(src/06frontend)
add_subdirectory(src/07onnx)
add_subdirectory(src/08communication)
add_subdirectory(src/08-01llm)
add_subdirectory(src/09python_ffi)
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ executor = compiler.compile("cuda", "default", []) # -------- 编译模型
- [fmt 10.1.1](https://github.com/fmtlib/fmt/releases/tag/10.1.1)
- [fmtlog v2.2.1](https://github.com/MengRao/fmtlog/releases/tag/v2.2.1)
- [googletest v1.14.0](https://github.com/google/googletest/releases/tag/v1.14.0)
- [backward-cpp v1.6](https://github.com/bombela/backward-cpp/releases/tag/v1.6)
- [backward-cpp master](https://github.com/bombela/backward-cpp)
- [result master](https://github.com/willowell/result)
- [abseil-cpp 20230802.1](https://github.com/abseil/abseil-cpp/releases/tag/20230802.1)

Expand Down
3 changes: 1 addition & 2 deletions src/00common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,5 @@ file(GLOB_RECURSE COMMON_TEST test/*.cpp)
if(COMMON_TEST)
add_executable(common_test ${COMMON_TEST})
add_test(common_test common_test)
target_link_libraries(common_test common GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(common_test)
target_link_libraries(common_test common GTest::gtest_main Backward::Object)
endif()
3 changes: 2 additions & 1 deletion src/00common/include/common/rc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define RC_HPP

#include <functional>
#include <utility>

namespace refactor {

Expand All @@ -18,7 +19,7 @@ namespace refactor {
T *_value;
struct Counter {
size_t strong, weak;
} * _counter;
} *_counter;

Rc(T *ptr, Counter *counter) noexcept
: _value(ptr), _counter(counter) { inc(); }
Expand Down
3 changes: 1 addition & 2 deletions src/01graph_topo/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,5 @@ file(GLOB_RECURSE GRAPH_TOPO_TEST test/*.cpp)
if(GRAPH_TOPO_TEST)
add_executable(graph_topo_test ${GRAPH_TOPO_TEST})
add_test(graph_topo_test graph_topo_test)
target_link_libraries(graph_topo_test graph_topo GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(graph_topo_test)
target_link_libraries(graph_topo_test graph_topo GTest::gtest_main Backward::Object)
endif()
13 changes: 5 additions & 8 deletions src/02hardware/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,18 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
project(hardware VERSION 0.0.0 LANGUAGES CXX)
message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})

# Source files
file(GLOB_RECURSE HARDWARE_SRC src/*.cc src/*.cpp)
add_library(hardware STATIC ${HARDWARE_SRC} ${HARDWARE_CUDA_SRC})
target_link_libraries(hardware PUBLIC common)
target_include_directories(hardware PUBLIC include)

if(USE_CUDA)
file(GLOB_RECURSE HARDWARE_CUDA_SRC src/devices/nvidia/*.cu)
target_include_directories(hardware PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
endif()

add_library(hardware STATIC ${HARDWARE_SRC} ${HARDWARE_CUDA_SRC} ${HARDWARE_BANG_SRC})
target_link_libraries(hardware PUBLIC common)
target_include_directories(hardware PUBLIC include)

file(GLOB_RECURSE HARDWARE_TEST test/*.cpp)
if(HARDWARE_TEST)
add_executable(hardware_test ${HARDWARE_TEST})
add_test(hardware_test hardware_test)
target_link_libraries(hardware_test hardware GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(hardware_test)
target_link_libraries(hardware_test hardware GTest::gtest_main Backward::Object)
endif()
2 changes: 1 addition & 1 deletion src/02hardware/include/hardware/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ namespace refactor::hardware {

virtual ~Device() = default;
virtual Type type() const noexcept = 0;
virtual void setContext() const noexcept;
virtual void setContext() const;

Arc<Blob> malloc(size_t);
Arc<Blob> absorb(Arc<Blob> &&);
Expand Down
2 changes: 1 addition & 1 deletion src/02hardware/include/hardware/devices/nvidia.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace refactor::hardware {
class Nvidia final : public Device {
public:
explicit Nvidia(int32_t card);
void setContext() const noexcept final;
void setContext() const final;
Type type() const noexcept final {
return Type::Nvidia;
}
Expand Down
2 changes: 1 addition & 1 deletion src/02hardware/src/device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ namespace refactor::hardware {
Device::Device(decltype(_card) card, decltype(_mem) mem)
: _card(card), _mem(std::move(mem)) {}

void Device::setContext() const noexcept {}
void Device::setContext() const {}
auto Device::malloc(size_t size) -> Arc<Blob> {
return Arc<Blob>(new Blob(this, size));
}
Expand Down
39 changes: 29 additions & 10 deletions src/02hardware/src/devices/nvidia/device.cc
Original file line number Diff line number Diff line change
@@ -1,31 +1,50 @@
#include "functions.cuh"
#include "hardware/devices/nvidia.h"
#include "hardware/mem_pool.h"
#include "memory.cuh"

#ifdef USE_CUDA
#include "memory.hh"
#include <cuda_runtime.h>

#define CUDA_ASSERT(STATUS) \
if (auto status = (STATUS); status != cudaSuccess) { \
RUNTIME_ERROR(fmt::format("cuda failed on \"" #STATUS "\" with \"{}\" ({})", \
cudaGetErrorString(status), (int) status)); \
}
#endif

namespace refactor::hardware {

static Arc<Memory> cudaMemory(int32_t card) {
#ifdef USE_CUDA
ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
setDevice(card);
auto [free, total] = getMemInfo();
auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
fmt::println("initializing Nvidia GPU {}, memory {} / {}, alloc {}",
card, free, total, size);
int deviceCount;
CUDA_ASSERT(cudaGetDeviceCount(&deviceCount));
ASSERT(0 <= card && card < deviceCount, "Invalid card id: {}", card);
CUDA_ASSERT(cudaSetDevice(card));

size_t free, total;
CUDA_ASSERT(cudaMemGetInfo(&free, &total));
auto size = free * 9 / 10;
cudaDeviceProp prop;
CUDA_ASSERT(cudaGetDeviceProperties(&prop, 0));
size_t alignment = prop.textureAlignment;
fmt::println("initializing Nvidia GPU {}, memory {} / {}, alloc {}, alignment {}",
card, free, total, size, alignment);
return std::make_shared<MemPool>(
std::make_shared<NvidiaMemory>(),
size,
256ul);
alignment);
#else
return nullptr;
#endif
}

Nvidia::Nvidia(int32_t card) : Device(card, cudaMemory(card)) {}

void Nvidia::setContext() const noexcept {
setDevice(_card);
void Nvidia::setContext() const {
#ifdef USE_CUDA
CUDA_ASSERT(cudaSetDevice(_card));
#endif
}

}// namespace refactor::hardware
19 changes: 0 additions & 19 deletions src/02hardware/src/devices/nvidia/functions.cu

This file was deleted.

24 changes: 0 additions & 24 deletions src/02hardware/src/devices/nvidia/functions.cuh

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
#include "functions.cuh"
#include "memory.cuh"
#ifdef USE_CUDA

#include "memory.hh"
#include "common.h"
#include <cuda_runtime.h>

#define CUDA_ASSERT(STATUS) \
if (auto status = (STATUS); status != cudaSuccess) { \
RUNTIME_ERROR(fmt::format("cuda failed on \"" #STATUS "\" with \"{}\" ({})", \
cudaGetErrorString(status), (int) status)); \
}

namespace refactor::hardware {
using M = NvidiaMemory;
Expand Down Expand Up @@ -29,3 +38,5 @@ namespace refactor::hardware {
}

}// namespace refactor::hardware

#endif
File renamed without changes.
3 changes: 1 addition & 2 deletions src/03runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,5 @@ file(GLOB_RECURSE RUNTIME_TEST test/*.cpp)
if(RUNTIME_TEST)
add_executable(runtime_test ${RUNTIME_TEST})
add_test(runtime_test runtime_test)
target_link_libraries(runtime_test runtime GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(runtime_test)
target_link_libraries(runtime_test runtime GTest::gtest_main Backward::Object)
endif()
6 changes: 4 additions & 2 deletions src/03runtime/include/runtime/stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ namespace refactor::runtime {
decltype(_device));

decltype(_graph) const &graph() const noexcept { return _graph; }
void setData(count_t, void const *, size_t);
auto setData(count_t, size_t) -> Arc<hardware::Device::Blob>;
void setData(count_t, Arc<hardware::Device::Blob>);
bool getData(count_t, void *, size_t) const;
auto getData(count_t) const -> Arc<hardware::Device::Blob>;
void setData(count_t, void const *, size_t);
bool copyData(count_t, void *, size_t) const;
void run();
auto bench(void (*sync)()) -> std::vector<std::chrono::nanoseconds>;
void trace(std::function<void(count_t, void const *const *, void const *const *)>);
Expand Down
12 changes: 9 additions & 3 deletions src/03runtime/src/stream.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,21 @@ namespace refactor::runtime {
std::move(edges),
} {}

auto Stream::setData(count_t i, size_t size) -> Arc<hardware::Device::Blob> {
return _graph.edges[i].blob = _device->malloc(size);
}
void Stream::setData(count_t i, Arc<hardware::Device::Blob> blob) {
_graph.edges[i].blob = std::move(blob);
}
void Stream::setData(count_t i, void const *data, size_t size) {
auto blob = _device->malloc(size);
blob->copyFromHost(data, size);
_graph.edges[i].blob = std::move(blob);
}
void Stream::setData(count_t i, Arc<hardware::Device::Blob> blob) {
_graph.edges[i].blob = std::move(blob);
auto Stream::getData(count_t i) const -> Arc<hardware::Device::Blob> {
return _graph.edges[i].blob;
}
bool Stream::getData(count_t i, void *data, size_t size) const {
bool Stream::copyData(count_t i, void *data, size_t size) const {
if (!_graph.edges[i].blob) { return false; }
_graph.edges[i].blob->copyToHost(data, size);
return true;
Expand Down
3 changes: 1 addition & 2 deletions src/04kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,5 @@ file(GLOB_RECURSE KERNEL_TEST test/*.cpp)
if(KERNEL_TEST)
add_executable(kernel_test ${KERNEL_TEST})
add_test(kernel_test kernel_test)
target_link_libraries(kernel_test kernel GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(kernel_test)
target_link_libraries(kernel_test kernel GTest::gtest_main Backward::Object)
endif()
3 changes: 1 addition & 2 deletions src/04kernel/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,5 @@ file(GLOB_RECURSE KERNEL_CUDA_TEST test/*.cu)
if(KERNEL_CUDA_TEST)
add_executable(kernel_cuda_test ${KERNEL_CUDA_TEST})
add_test(kernel_cuda_test kernel_cuda_test)
target_link_libraries(kernel_cuda_test kernel_cuda GTest::gtest_main ${BACKWARD_ENABLE})
add_backward(kernel_cuda_test)
target_link_libraries(kernel_cuda_test kernel_cuda GTest::gtest_main Backward::Object)
endif()
20 changes: 20 additions & 0 deletions src/04kernel/include/kernel/collectors/hard_sigmoid.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef KERNEL_HARD_SIGMOID_H
#define KERNEL_HARD_SIGMOID_H

#include "../collector.h"

namespace refactor::kernel {

    /// Collects candidate kernel implementations for the HardSigmoid operator.
    /// alpha/beta are the operator's attributes forwarded to the selected kernel
    /// (NOTE(review): presumably the slope/offset of y = clip(alpha*x + beta, 0, 1)
    /// as in ONNX HardSigmoid — confirm against the kernel implementation).
    struct HardSigmoidCollector final : public InfoCollector {
        float alpha, beta;

        constexpr HardSigmoidCollector(decltype(_target) target, float alpha_, float beta_) noexcept
            : InfoCollector(target), alpha(alpha_), beta(beta_) {}

        /// Returns the kernels applicable to the given input/output tensors.
        std::vector<KernelBox>
        filter(TensorRefs inputs, TensorRefs outputs) const final;
    };

}// namespace refactor::kernel

#endif// KERNEL_HARD_SIGMOID_H

20 changes: 20 additions & 0 deletions src/04kernel/include/kernel/collectors/rms_normalization.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef KERNEL_RMS_NORMALIZATION_H
#define KERNEL_RMS_NORMALIZATION_H

#include "../collector.h"

namespace refactor::kernel {

    /// Collects candidate kernel implementations for the RMS normalization operator.
    struct RmsNormalizationCollector final : public InfoCollector {
        // Operator attribute forwarded to the selected kernel.
        // NOTE(review): presumably the stabilizing epsilon added inside the RMS
        // denominator — confirm against the kernel implementation.
        float epsilon;

        constexpr RmsNormalizationCollector(decltype(_target) target, float epsilon_) noexcept
            : InfoCollector(target), epsilon(epsilon_) {}

        /// Returns the kernels applicable to the given input/output tensors.
        std::vector<KernelBox> filter(TensorRefs inputs, TensorRefs outputs) const final;
    };

}// namespace refactor::kernel

#endif// KERNEL_RMS_NORMALIZATION_H
2 changes: 2 additions & 0 deletions src/04kernel/include/kernel/collectors/simple_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ namespace refactor::kernel {
And,
Or,
Xor,
Mod,
Fmod,
};

std::string_view opName(SimpleBinaryType type);
Expand Down
1 change: 1 addition & 0 deletions src/04kernel/include/kernel/collectors/simple_unary.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ namespace refactor::kernel {
Erf,
Neg,
Not,
HardSwish,
};

std::string_view unaryName(SimpleUnaryType type);
Expand Down
Loading

0 comments on commit d8a2dea

Please sign in to comment.