[BACKEND] initial llvm codegen for amdgpu #402
Changes from all commits
| Changes | File |
|---|---|
| +1 −1 | CMakeLists.txt |
| +1 −1 | cmake/Modules/FindHDFS.cmake |
| +8 −14 | include/dmlc/base.h |
| +4 −44 | include/dmlc/concurrency.h |
| +1 −5 | src/io/local_filesys.cc |
| +1 −5 | src/io/single_file_split.h |
| +1 −1 | tracker/dmlc_tracker/launcher.py |
| +1 −7 | tracker/dmlc_tracker/opts.py |
| +0 −65 | tracker/dmlc_tracker/slurm.py |
New file (`@@ -0,0 +1,50 @@`): utility for the ROCm backend, imported as `tvm.contrib.rocm`.

```python
"""Utility for ROCm backend"""
import subprocess
from . import util
from ..api import register_func


def rocm_link(in_file, out_file):
    """Link relocatable ELF object to shared ELF object using lld

    Parameters
    ----------
    in_file : str
        Input file name (relocatable ELF object file)

    out_file : str
        Output file name (shared ELF object file)
    """
    args = ["ld.lld", "-shared", in_file, "-o", out_file]
    proc = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    (out, _) = proc.communicate()

    if proc.returncode != 0:
        msg = "Linking error using ld.lld:\n"
        msg += str(out)
        raise RuntimeError(msg)


@register_func("tvm_callback_rocm_link")
def callback_rocm_link(obj_bin):
    """Links object file generated from LLVM to HSA Code Object

    Parameters
    ----------
    obj_bin : bytearray
        The object file

    Returns
    -------
    cobj_bin : bytearray
        The HSA Code Object
    """
    tmp_dir = util.tempdir()
    tmp_obj = tmp_dir.relpath("rocm_kernel.o")
    tmp_cobj = tmp_dir.relpath("rocm_kernel.co")
    with open(tmp_obj, "wb") as out_file:
        out_file.write(bytes(obj_bin))
    rocm_link(tmp_obj, tmp_cobj)
    cobj_bin = bytearray(open(tmp_cobj, "rb").read())
    return cobj_bin
```
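For reference, a minimal usage sketch of these helpers. The file names are hypothetical and `ld.lld` must be on `PATH`; this is not part of the PR itself.

```python
# Hypothetical usage of the ROCm link helpers above; "rocm_kernel.o" is a
# placeholder for a relocatable AMDGPU object emitted by the LLVM backend.
from tvm.contrib import rocm

# Direct linking: relocatable ELF object -> shared ELF object (HSA code object).
rocm.rocm_link("rocm_kernel.o", "rocm_kernel.co")

# Or go through the registered callback, which works on raw bytes and is what
# the C++ side invokes via "tvm_callback_rocm_link".
with open("rocm_kernel.o", "rb") as f:
    hsaco = rocm.callback_rocm_link(bytearray(f.read()))
print("HSA code object size:", len(hsaco))
```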
New file (`@@ -0,0 +1,188 @@`): `codegen_amdgpu.cc`, the AMDGPU code generator.

```cpp
/*!
 *  Copyright (c) 2017 by Contributors
 * \file codegen_amdgpu.cc
 * \brief AMDGPU code generator.
 */
#ifdef TVM_LLVM_VERSION
#if TVM_ROCM_RUNTIME

#include <tvm/runtime/device_api.h>
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/registry.h>
#include "./codegen_llvm.h"
#include "../build_common.h"
#include "../../pass/ir_util.h"
#include "../../runtime/rocm/rocm_module.h"

namespace tvm {
namespace codegen {

// AMDGPU code generator.
class CodeGenAMDGPU : public CodeGenLLVM {
 public:
  void AddFunction(const LoweredFunc& f) final {
    // add function as void return value
    CodeGenLLVM::AddFunctionInternal(f, true);
    function_->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
  }

  void VisitStmt_(const Allocate* op) final {
    CHECK(!is_zero(op->condition));
    llvm::Value* buf = nullptr;
    if (op->new_expr.defined()) {
      CHECK_EQ(op->free_function, "nop");
      buf = MakeValue(op->new_expr);
    } else {
      int32_t constant_size = op->constant_allocation_size();
      CHECK_GT(constant_size, 0)
          << "Can only handle constant size stack allocation in GPU";
      StorageInfo& info = alloc_storage_info_[op->buffer_var.get()];
      if (constant_size % 4 == 0 && info.alignment == 0) {
        info.alignment = GetTempAllocaAlignment(op->type, constant_size);
      }
      // maximum necessary alignment in the AMD devices
      if (info.alignment > 16) {
        info.alignment = 16;
      }
      if (info.scope.rank == 2) {
        // const int local_address_space = 5;
        // TODO(tqchen): for higher version of LLVM, local address space can be set.
        llvm::AllocaInst* alloca = builder_->CreateAlloca(
            LLVMType(op->type), ConstInt32(constant_size));
        if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
          alloca->setAlignment(info.alignment);
        }
        buf = alloca;
      } else {
        CHECK_EQ(info.scope.rank, 1)
            << "Can only allocate shared or local memory inside kernel";
        // Shared memory: address space == 3
        const unsigned shared_address_space = 3;
```
Review thread on the `shared_address_space` line above:

> **Member:** https://llvm.org/docs/AMDGPUUsage.html#address-space-mapping — change to the local memory space.
>
> **Author:** Yes, I am aware of it. Once I get an LLVM IR dump, I can get a better understanding of what to change, or even add more functionality.
>
> **Member:** I see, it seems this is something we will need frequently. Let us also simply print out the LLVM IR and save it to the (optional) code field in the ROCMModule, so we can access it with `module.get_source()`.
>
> **Author:** Updated the PR with the IR dump code. I see some places that need to be changed (which is why I am getting a Bus Error).
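The `get_source()` workflow discussed in this thread would look roughly like the sketch below. This is a rough, untested sketch against the TVM Python API of this era; it assumes a working LLVM AMDGPU backend and that the ROCm module stores the LLVM IR as its source, as proposed above.

```python
# Sketch: build a trivial kernel for the "rocm" target and inspect the LLVM IR
# saved into the device module, per the get_source() idea discussed above.
import tvm
from tvm.contrib import rocm  # imported for its side effect: registers tvm_callback_rocm_link

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

fadd = tvm.build(s, [A, B], "rocm", name="vadd")
# The host module imports the device (hsaco) module; its source field would
# carry the IR dump proposed in the review thread.
print(fadd.imported_modules[0].get_source())
```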
```cpp
        llvm::Type* type = llvm::ArrayType::get(LLVMType(op->type), constant_size);
        // Allocate shared memory in global, address_space = 3
        llvm::GlobalVariable *global = new llvm::GlobalVariable(
            *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared",
            nullptr, llvm::GlobalValue::NotThreadLocal, shared_address_space);
        global->setAlignment(info.alignment);
        buf = global;
      }
    }
    buf = builder_->CreatePointerCast(
        buf, LLVMType(op->type)->getPointerTo(
            buf->getType()->getPointerAddressSpace()));
    CHECK(!var_map_.count(op->buffer_var.get()));
    var_map_[op->buffer_var.get()] = buf;
    this->VisitStmt(op->body);
  }

  // Return the thread index via intrinsics.
  llvm::Value* GetThreadIndex(const IterVar& iv) final {
    runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag);
    llvm::Intrinsic::ID intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_x;
    if (ts.rank == 1) {
      switch (ts.dim_index) {
        case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_x; break;
        case 1: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_y; break;
        case 2: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_z; break;
        default: LOG(FATAL) << "unknown workitem idx";
      }
    } else {
      CHECK_EQ(ts.rank, 0);
      switch (ts.dim_index) {
        case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_x; break;
        case 1: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_y; break;
        case 2: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_z; break;
        default: LOG(FATAL) << "unknown workgroup idx";
      }
    }
    llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id);
    return builder_->CreateCall(f, {});
  }

  llvm::Value* CreateStorageSync(const Call* op) final {
    const std::string& sync = op->args[0].as<StringImm>()->value;
    if (sync == "warp") {
      // TODO(tqchen) warp sync in CUDA9
```
Review thread on the warp-sync TODO above:

> **Member:** Remove the comment here. Is there any need for a warp (wavefront) synchronizer on AMD GPUs?
>
> **Author:** There are sync commands for AMD GPUs, but if it is CUDA 9 specific, current-generation AMD GPUs don't support it.
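The `shared` branch that follows is the one typical GPU schedules exercise. A minimal, untested sketch (TVM 0.x-era API, not part of the PR) of a schedule that stages data through shared memory and therefore lowers to `amdgcn_s_barrier`:

```python
# Untested sketch: staging the input through "shared" scope makes the lowered
# IR contain storage_sync("shared"), which this backend maps to amdgcn_s_barrier.
import tvm

n = 1024
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] * 2.0, name="B")
s = tvm.create_schedule(B.op)
AA = s.cache_read(A, "shared", [B])
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
s[AA].compute_at(s[B], bx)
s[AA].bind(s[AA].op.axis[0], tvm.thread_axis("threadIdx.x"))
print(tvm.lower(s, [A, B], simple_mode=True))  # shows the storage_sync calls
```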
```cpp
      return nullptr;
    } else if (sync == "shared") {
      llvm::Function* f = llvm::Intrinsic::getDeclaration(
          module_.get(),
          ::llvm::Intrinsic::amdgcn_s_barrier);
      return builder_->CreateCall(f, {});
    } else {
      LOG(FATAL) << "Do not support sync " << sync;
      return nullptr;
    }
  }

  void InitPassManagerBuilder(llvm::PassManagerBuilder* builder) final {
    // Additional optimization hook to tweak the builder.
  }

  unsigned GetGlobalAddressSpace() {
    return 1;
  }

 protected:
  void InitTarget(llvm::TargetMachine* tm) final {
    // Maximum vector lane = float4
```
Review thread on the float4 comment above:

> **Member:** Double-check this.
>
> **Author:** Yes, float4 or dwordx4 is the perfect alignment.
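Since float4 (128 bits) is the preferred width, a schedule can be written to match the `native_vector_bits_` setting that follows by vectorizing a factor-4 inner axis. Untested TVM 0.x-era sketch, not part of the PR:

```python
# Untested sketch: split off a factor-4 inner axis and vectorize it so that
# loads/stores become 4 x float32 = 128 bits, matching native_vector_bits_.
import tvm

A = tvm.placeholder((1024,), name="A")
B = tvm.compute((1024,), lambda i: A[i] + 1.0, name="B")
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=256)
tx, vec = s[B].split(tx, factor=4)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
s[B].vectorize(vec)
```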
```cpp
    native_vector_bits_ = 4 * 32;
    CodeGenLLVM::InitTarget(tm);
  }
};

runtime::Module BuildAMDGPU(Array<LoweredFunc> funcs, std::string target) {
  CHECK(target.length() >= 4 &&
        target.substr(0, 4) == "rocm");
  llvm::TargetMachine* tm =
      GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900" +
                           target.substr(4, target.length() - 4));

  std::unique_ptr<CodeGenAMDGPU> cg(new CodeGenAMDGPU());
  std::unique_ptr<llvm::LLVMContext> ctx(new llvm::LLVMContext());
  cg->Init(funcs[0]->name, tm, ctx.get(), false, false);
  for (LoweredFunc f : funcs) {
    cg->AddFunction(f);
  }

  std::unique_ptr<llvm::Module> module = cg->Finish();

  llvm::SmallString<8> dataObj, data_ll, dataAsm;
  llvm::raw_svector_ostream destObj(dataObj), dest_ll(data_ll), destAsm(dataAsm);
  destObj.SetUnbuffered();
  dest_ll.SetUnbuffered();
  destAsm.SetUnbuffered();
  module->print(dest_ll, nullptr);
  std::unique_ptr<llvm::Module> mAsm = llvm::CloneModule(module.get());
  std::unique_ptr<llvm::Module> mObj = llvm::CloneModule(module.get());
```
Review thread on the `CloneModule` calls above:

> **Member:** Remove mObjFile and mAsmFile. We can consider holding two optional sources in the RocmModule, both ll and asm, and returning them when a different source suffix is requested; that might help you in debugging.
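If the module did keep both forms as suggested, they could presumably be requested by format from Python; a hypothetical sketch (the format strings "ll" and "asm" are assumptions, and `fadd` comes from the earlier build sketch):

```python
# Hypothetical: request different saved sources by format, if both were stored.
dev_mod = fadd.imported_modules[0]   # device module from the earlier build sketch
llvm_ir = dev_mod.get_source("ll")   # LLVM IR dump
gcn_asm = dev_mod.get_source("asm")  # AMDGCN assembly
```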
```cpp
  llvm::legacy::PassManager pass;

  CHECK(tm->addPassesToEmitFile(
      pass, destObj, llvm::TargetMachine::CGFT_ObjectFile) == 0)
      << "Cannot emit target CGFT_ObjectFile";
  pass.run(*mObj);
  std::string obj(dataObj.begin(), dataObj.end());

  const auto* f = tvm::runtime::Registry::Get("tvm_callback_rocm_link");
  CHECK(f != nullptr) << "Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm";

  TVMByteArray arr;
  arr.data = &obj[0];
  arr.size = obj.length();

  std::string hsaco = (*f)(arr);
  std::string ll(data_ll.begin(), data_ll.end());

  return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll);
}

TVM_REGISTER_API("codegen.build_rocm")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    *rv = BuildAMDGPU(args[0], args[1]);
  });

}  // namespace codegen
}  // namespace tvm
#endif  // TVM_ROCM_RUNTIME
#endif  // TVM_LLVM_VERSION
```
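Continuing the earlier build sketch, running the compiled `fadd` on a ROCm device would look roughly like this (untested; assumes a GPU and the ROCm runtime are available):

```python
# Untested sketch: execute the "rocm"-target function built earlier and verify
# the result against numpy on the host.
import numpy as np
import tvm

ctx = tvm.rocm(0)
a = tvm.nd.array(np.random.uniform(size=1024).astype("float32"), ctx)
b = tvm.nd.array(np.zeros(1024, dtype="float32"), ctx)
fadd(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)
```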
General review discussion:

> **Member:** Check if the address space is consistent with the AMD GPU backend.
>
> **Author:** I have to make another pass on the codegen part, as there are obvious differences between the nvptx and amdgcn codegen. Is there a way I can see the IR directly?
>
> **Member:** If you want to check the LLVM code, do `module_->dump()`; you have to insert it manually in the code, though. Otherwise, implement GetSource in the HIP module, which should give you the assembly.
>
> **Author:** I'll use `LOG(WARNING) << module_->dump();` to see it.
>
> **Author:** Hi @tqchen, I am getting the following error:
>
> **Member:** Usually I use `module_->dump()` without piping it to a stream, and it should work.

#404