/*!
 * Copyright (c) 2019 by Contributors
 * \file codegen_x86_64.cc
 * \brief X86-64 specific code generator
 */
#ifdef TVM_LLVM_VERSION
#include <string>
#include <vector>

#include "codegen_cpu.h"

#include "llvm/MC/MCSubtargetInfo.h"

namespace tvm {
namespace codegen {

namespace {
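// Returns whether the target machine enables the given subtarget feature
// (for example "f16c" or "avx512f").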
bool TargetHasFeature(const llvm::TargetMachine& tm, const std::string& feature) {
  // MCSubtargetInfo::checkFeatures was added in LLVM 6.0
#if TVM_LLVM_VERSION >= 60
  const auto* MCInfo = tm.getMCSubtargetInfo();
  return MCInfo->checkFeatures(std::string("+") + feature);
#else
  return false;
  // TODO(tulloch) - enable this block; need to figure out how to reimplement
  // this given visibility constraints, similar to
  // https://github.com/rust-lang/rust/pull/31709

  // Copied from
  // https://github.com/llvm-mirror/llvm/blob/5136df4/lib/MC/MCSubtargetInfo.cpp#L78-L88.

  // auto checkFeatures = [&](const std::string FS) {
  //   llvm::SubtargetFeatures T(FS);
  //   llvm::FeatureBitset Set, All;
  //   for (std::string F : T.getFeatures()) {
  //     llvm::SubtargetFeatures::ApplyFeatureFlag(Set, F, MCInfo->ProcFeatures);
  //     if (F[0] == '-') {
  //       F[0] = '+';
  //     }
  //     llvm::SubtargetFeatures::ApplyFeatureFlag(All, F, MCInfo->ProcFeatures);
  //   }
  //   return (MCInfo->getFeatureBits() & All) == Set;
  // };
  // return checkFeatures(std::string("+") + feature);
#endif
}
}  // namespace

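// X86-64 specific code generator: overrides Cast lowering so that fp16 -> fp32
// vector casts use the hardware VCVTPH2PS conversion when the target supports it.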
class CodeGenX86_64 final : public CodeGenCPU {
 public:
  llvm::Value* VisitExpr_(const Cast* op) override;

 private:
  llvm::Value* CallVectorIntrin(llvm::Intrinsic::ID id, size_t intrin_lanes, llvm::Type* result_ty,
                                const std::vector<llvm::Value*>& args);
};

llvm::Value* CodeGenX86_64::VisitExpr_(const Cast* op) {
  // LLVM does not automatically generate the correct instruction sequences for
  // half -> float conversion (i.e. the F16C/AVX-512 vectorized variants of
  // vcvtph2ps), so we explicitly generate them ourselves.
  const auto from = op->value.type();
  const auto to = op->type;
  if (from.is_float() && to.is_float() && from.bits() == 16 && to.bits() == 32) {
    CHECK_EQ(from.lanes(), to.lanes());
    CHECK_NOTNULL(target_machine_);

    const auto has_f16c = TargetHasFeature(*target_machine_, "f16c");
    const auto has_avx512 = TargetHasFeature(*target_machine_, "avx512f");

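    // With AVX-512F, VCVTPH2PS converts 16 half-precision lanes per instruction.
    // The masked intrinsic takes the i16 source lanes, a zero vector as the
    // pass-through value for masked-off lanes, an all-ones mask (-1, i.e. every
    // lane enabled), and rounding mode 4 (_MM_FROUND_CUR_DIRECTION).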
    if (from.lanes() >= 16 && has_avx512) {
      return CallVectorIntrin(
          ::llvm::Intrinsic::x86_avx512_mask_vcvtph2ps_512, 16, LLVMType(Float(32, from.lanes())),
          {
              MakeValue(ir::Call::make(Int(16, from.lanes()), ir::Call::reinterpret, {op->value},
                                       ir::Call::PureIntrinsic)),
              MakeValue(ir::Broadcast::make(ir::FloatImm::make(Float(32), 0), from.lanes())),
              /*mask=*/MakeValue(ir::IntImm::make(Int(16), -1)),
              /*rounding-mode=*/MakeValue(ir::IntImm::make(Int(32), 4)),
          });
    }

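    // With F16C, the 256-bit VCVTPH2PS converts 8 lanes at a time; CallVectorIntrin
    // splits wider vectors into 8-lane pieces and concatenates the results.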
    if (from.lanes() >= 8 && has_f16c) {
      return CallVectorIntrin(
          ::llvm::Intrinsic::x86_vcvtph2ps_256, 8, LLVMType(Float(32, from.lanes())),
          {MakeValue(ir::Call::make(Int(16, from.lanes()), ir::Call::reinterpret, {op->value},
                                    ir::Call::PureIntrinsic))});
    }
  }

  return CodeGenCPU::VisitExpr_(op);
}

llvm::Value* CodeGenX86_64::CallVectorIntrin(llvm::Intrinsic::ID id, size_t intrin_lanes,
                                             llvm::Type* result_ty,
                                             const std::vector<llvm::Value*>& args) {
  llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), id, {});
  if (intrin_lanes == result_ty->getVectorNumElements()) {
    return builder_->CreateCall(f, args);
  }

  // Otherwise, we split the vector into intrin_lanes-sized pieces (widening where
  // necessary), compute each result, and then concatenate the vectors (slicing the
  // result if necessary).
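  // For example, a 12-lane cast lowered with the 8-lane F16C intrinsic becomes two
  // 8-lane calls (the tail slice is padded out to 8 lanes); the 16-lane concatenation
  // is then sliced back down to 12 lanes.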
  CHECK_LT(intrin_lanes, result_ty->getVectorNumElements());
  std::vector<llvm::Value*> split_results;
  for (size_t i = 0; i < static_cast<size_t>(result_ty->getVectorNumElements());
       i += intrin_lanes) {
    std::vector<llvm::Value*> split_args;
    for (const auto& v : args) {
      if (v->getType()->isVectorTy()) {
        CHECK_EQ(v->getType()->getVectorNumElements(), result_ty->getVectorNumElements());
        split_args.push_back(CreateVecSlice(v, i, intrin_lanes));
      } else {
        split_args.push_back(v);
      }
    }
    split_results.push_back(CallVectorIntrin(
        id, intrin_lanes, llvm::VectorType::get(result_ty->getScalarType(), intrin_lanes),
        split_args));
  }
  return CreateVecSlice(CreateVecConcat(split_results), 0, result_ty->getVectorNumElements());
}

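// Register a factory that returns a CodeGenX86_64 instance (as an opaque pointer)
// under the target-specific name. The "tvm.codegen.llvm.target_<arch>" convention
// is presumably how the LLVM backend selects this generator for x86-64 targets.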
TVM_REGISTER_GLOBAL("tvm.codegen.llvm.target_x86-64")
.set_body([](const TVMArgs& targs, TVMRetValue* rv) {
    CodeGenLLVM* cg = new CodeGenX86_64();
    *rv = static_cast<void*>(cg);
  });

}  // namespace codegen
}  // namespace tvm
#endif  // TVM_LLVM_VERSION