From d8034721883dbd452123f8cf3bccb4285fd59f68 Mon Sep 17 00:00:00 2001
From: Jameson Nash
Date: Mon, 9 Jul 2018 15:02:18 -0400
Subject: [PATCH] codegen: fix optimization misses from IR changes

- the dereferenceable attribute requires an align attribute (if eltype is not
  sized), or it is simply ignored
- tbaa_const isn't necessarily sufficient to allow LICM hoisting: an unknown
  call instruction (such as julia.gc_alloc_obj) present in the same loop can
  prevent TBAA from declaring the memory to be immutable. However, the
  invariant.load metadata is much stronger: additionally mark all tbaa_const
  loads with MD_invariant_load, permitting llvm to reorder and lift them much
  more freely (requiring only dereferenceable).
---
 src/cgutils.cpp               |  56 ++++++++--------
 src/codegen.cpp               |   8 +--
 src/intrinsics.cpp            |   2 +-
 src/llvm-late-gc-lowering.cpp | 116 +++++----------------------------
 src/llvm-multiversioning.cpp  |   1 +
 src/llvm-ptls.cpp             |   7 +-
 6 files changed, 54 insertions(+), 136 deletions(-)

diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index 670619029d908..f7cfdf9645cbc 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -4,7 +4,9 @@
 
 static Instruction *tbaa_decorate(MDNode *md, Instruction *load_or_store)
 {
-    load_or_store->setMetadata( llvm::LLVMContext::MD_tbaa, md );
+    load_or_store->setMetadata(llvm::LLVMContext::MD_tbaa, md);
+    if (isa<LoadInst>(load_or_store) && md == tbaa_const)
+        load_or_store->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(md->getContext(), None));
     return load_or_store;
 }
 
@@ -358,20 +360,29 @@ static size_t dereferenceable_size(jl_value_t *jt)
     }
 }
 
+// If given alignment is 0 and LLVM's assumed alignment for a load/store via ptr
+// might be stricter than the Julia alignment for jltype, return the alignment of jltype.
+// Otherwise return the given alignment.
+static unsigned julia_alignment(jl_value_t *jltype)
+{
+    unsigned alignment = jl_datatype_align(jltype);
+    assert(alignment <= JL_HEAP_ALIGNMENT);
+    assert(JL_HEAP_ALIGNMENT % alignment == 0);
+    return alignment;
+}
+
 static inline void maybe_mark_argument_dereferenceable(Argument *A, jl_value_t *jt)
 {
-    auto F = A->getParent();
+    AttrBuilder B;
+    B.addAttribute(Attribute::NonNull);
     // The `dereferencable` below does not imply `nonnull` for non addrspace(0) pointers.
-#if JL_LLVM_VERSION >= 50000
-    F->addParamAttr(A->getArgNo(), Attribute::NonNull);
-#else
-    F->setAttributes(F->getAttributes().addAttribute(jl_LLVMContext, A->getArgNo() + 1,
-                                                     Attribute::NonNull));
-#endif
     size_t size = dereferenceable_size(jt);
-    if (!size)
-        return;
-    F->addDereferenceableAttr(A->getArgNo() + 1, size);
+    if (size) {
+        B.addDereferenceableAttr(size);
+        if (!A->getType()->getPointerElementType()->isSized()) // mimic LLVM Loads.cpp isAligned
+            B.addAlignmentAttr(julia_alignment(jt));
+    }
+    A->addAttrs(B);
 }
 
 static inline Instruction *maybe_mark_load_dereferenceable(Instruction *LI, bool can_be_null,
@@ -1229,19 +1240,6 @@ static Value *emit_bounds_check(jl_codectx_t &ctx, const jl_cgval_t &ainfo, jl_v
     return im1;
 }
 
-// If given alignment is 0 and LLVM's assumed alignment for a load/store via ptr
-// might be stricter than the Julia alignment for jltype, return the alignment of jltype.
-// Otherwise return the given alignment.
-static unsigned julia_alignment(jl_value_t *jltype, unsigned alignment)
-{
-    if (!alignment) {
-        alignment = jl_datatype_align(jltype);
-        assert(alignment <= JL_HEAP_ALIGNMENT);
-        assert(JL_HEAP_ALIGNMENT % alignment == 0);
-    }
-    return alignment;
-}
-
 static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_value_t *jt, Value* dest = NULL, MDNode *tbaa_dest = nullptr, bool isVolatile = false);
 
 static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, jl_value_t *jltype,
@@ -1267,8 +1265,8 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j
     //    elt = data;
     //}
     //else {
-        Instruction *load = ctx.builder.CreateAlignedLoad(data, isboxed ?
-            alignment : julia_alignment(jltype, alignment), false);
+        Instruction *load = ctx.builder.CreateAlignedLoad(data, isboxed || alignment ?
+            alignment : julia_alignment(jltype), false);
         if (isboxed)
             load = maybe_mark_load_dereferenceable(load, true, jltype);
         if (tbaa) {
@@ -1316,7 +1314,7 @@ static void typed_store(jl_codectx_t &ctx,
     }
     if (idx_0based)
         data = ctx.builder.CreateInBoundsGEP(r->getType(), data, idx_0based);
-    Instruction *store = ctx.builder.CreateAlignedStore(r, data, isboxed ? alignment : julia_alignment(jltype, alignment));
+    Instruction *store = ctx.builder.CreateAlignedStore(r, data, isboxed || alignment ? alignment : julia_alignment(jltype));
     if (tbaa)
         tbaa_decorate(tbaa, store);
 }
@@ -2229,7 +2227,7 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, MDNode *tbaa_dst, con
         else {
             Value *src_ptr = data_pointer(ctx, src);
             unsigned nb = jl_datatype_size(typ);
-            unsigned alignment = julia_alignment(typ, 0);
+            unsigned alignment = julia_alignment(typ);
             Value *nbytes = ConstantInt::get(T_size, nb);
             if (skip) {
                 // TODO: this Select is very bad for performance, but is necessary to work around LLVM bugs with the undef option that we want to use:
@@ -2256,7 +2254,7 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, MDNode *tbaa_dst, con
         bool allunboxed = for_each_uniontype_small(
                 [&](unsigned idx, jl_datatype_t *jt) {
                     unsigned nb = jl_datatype_size(jt);
-                    unsigned alignment = julia_alignment((jl_value_t*)jt, 0);
+                    unsigned alignment = julia_alignment((jl_value_t*)jt);
                     BasicBlock *tempBB = BasicBlock::Create(jl_LLVMContext, "union_move", ctx.f);
                     ctx.builder.SetInsertPoint(tempBB);
                     switchInst->addCase(ConstantInt::get(T_int8, idx), tempBB);
diff --git a/src/codegen.cpp b/src/codegen.cpp
index af98933bc71ca..0632b28d9af25 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -4186,12 +4186,12 @@ static void emit_cfunc_invalidate(
         }
         else {
             gf_ret = emit_bitcast(ctx, gf_ret, gfrt->getPointerTo());
-            ctx.builder.CreateRet(ctx.builder.CreateAlignedLoad(gf_ret, julia_alignment(astrt, 0)));
+            ctx.builder.CreateRet(ctx.builder.CreateAlignedLoad(gf_ret, julia_alignment(astrt)));
         }
         break;
     }
     case jl_returninfo_t::SRet: {
-        emit_memcpy(ctx, &*gf_thunk->arg_begin(), nullptr, gf_ret, nullptr, jl_datatype_size(astrt), julia_alignment(astrt, 0));
+        emit_memcpy(ctx, &*gf_thunk->arg_begin(), nullptr, gf_ret, nullptr, jl_datatype_size(astrt), julia_alignment(astrt));
         ctx.builder.CreateRetVoid();
         break;
     }
@@ -5038,7 +5038,7 @@ static Function *gen_invoke_wrapper(jl_method_instance_t *lam, const jl_returnin
         if (lty != NULL && !isboxed) {
             theArg = decay_derived(emit_bitcast(ctx, theArg, PointerType::get(lty, 0)));
             if (!lty->isAggregateType()) // keep "aggregate" type values in place as pointers
-                theArg = ctx.builder.CreateAlignedLoad(theArg, julia_alignment(ty, 0));
+                theArg = ctx.builder.CreateAlignedLoad(theArg, julia_alignment(ty));
         }
         assert(dyn_cast<UndefValue>(theArg) == NULL);
         args[idx] = theArg;
@@ -6134,7 +6134,7 @@ static std::unique_ptr<Module> emit_function(
             if (returninfo.cc == jl_returninfo_t::SRet) {
                 assert(jl_is_concrete_type(jlrettype));
                 emit_memcpy(ctx, sret, nullptr, retvalinfo, jl_datatype_size(jlrettype),
-                            julia_alignment(jlrettype, 0));
+                            julia_alignment(jlrettype));
             }
             else { // must be jl_returninfo_t::Union
                 emit_unionmove(ctx, sret, nullptr, retvalinfo, /*skip*/isboxed_union);
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index c7879992ae075..c6f82ccc3cc5d 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -367,7 +367,7 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va
         return NULL;
     }
 
-    unsigned alignment = julia_alignment(jt, 0);
+    unsigned alignment = julia_alignment(jt);
     Type *ptype = to->getPointerTo();
     if (dest) {
         emit_memcpy(ctx, dest, tbaa_dest, p, x.tbaa, jl_datatype_size(jt), alignment, false);
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 89c9ffb7e6de7..f83ba0d2bfa7a 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -31,49 +31,6 @@
 
 using namespace llvm;
 
-namespace {
-#if JL_LLVM_VERSION < 50000
-static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
-    GlobalVariable *GV = M.getGlobalVariable(Name);
-    SmallPtrSet<Constant *, 16> InitAsSet;
-    SmallVector<Constant *, 16> Init;
-    if (GV) {
-        ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
-        for (auto &Op : CA->operands()) {
-            Constant *C = cast_or_null<Constant>(Op);
-            if (InitAsSet.insert(C).second)
-                Init.push_back(C);
-        }
-        GV->eraseFromParent();
-    }
-
-    Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
-    for (auto *V : Values) {
-        Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
-        if (InitAsSet.insert(C).second)
-            Init.push_back(C);
-    }
-
-    if (Init.empty())
-        return;
-
-    ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
-    GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
-                                  ConstantArray::get(ATy, Init), Name);
-    GV->setSection("llvm.metadata");
-}
-
-static void append_to_compiler_used(Module &M, ArrayRef<GlobalValue *> Values) {
-    appendToUsedList(M, "llvm.compiler.used", Values);
-}
-#else
-static void append_to_compiler_used(Module &M, ArrayRef<GlobalValue *> Values)
-{
-    appendToCompilerUsed(M, Values);
-}
-#endif
-}
-
 /* Julia GC Root Placement pass. For a general overview of the design of
    GC root lowering, see the devdocs. This file is the actual implementation.
@@ -340,16 +297,6 @@ namespace llvm {
     void initializeLateLowerGCFramePass(PassRegistry &Registry);
 }
 
-template<typename T>
-static void addReturnAttr(T *f, Attribute::AttrKind Kind)
-{
-#if JL_LLVM_VERSION >= 50000
-    f->addAttribute(AttributeList::ReturnIndex, Kind);
-#else
-    f->addAttribute(AttributeSet::ReturnIndex, Kind);
-#endif
-}
-
 extern std::pair<MDNode*,MDNode*> tbaa_make_child(const char *name, MDNode *parent=nullptr, bool isConstant=false);
 struct LateLowerGCFrame: public FunctionPass {
     static char ID;
@@ -822,6 +769,8 @@ JL_USED_FUNC static void dumpLivenessState(Function &F, State &S) {
 // jtbaa_immut.
 static bool isLoadFromImmut(LoadInst *LI)
 {
+    if (LI->getMetadata(LLVMContext::MD_invariant_load))
+        return true;
     MDNode *TBAA = LI->getMetadata(LLVMContext::MD_tbaa);
     if (!TBAA)
         return false;
@@ -1573,7 +1522,6 @@ Value *LateLowerGCFrame::EmitLoadTag(IRBuilder<> &builder, Value *V)
 // that's initialized by `addrspacecast`. Such a global variable is not supported by the backend.
 // This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled
 // for NI pointers.
-#if JL_LLVM_VERSION >= 40000
 static SmallVector<int, 1> *FindRefinements(Value *V, State *S)
 {
     if (!S)
@@ -1593,12 +1541,6 @@ static bool IsPermRooted(Value *V, State *S)
         return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2;
     return false;
 }
-#else
-static bool IsPermRooted(Value *V, State *S)
-{
-    return false;
-}
-#endif
 
 static inline void UpdatePtrNumbering(Value *From, Value *To, State *S)
 {
@@ -1625,11 +1567,8 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
     AllocaInst *Frame = nullptr;
     if (T_prjlvalue) {
         T_pprjlvalue = T_prjlvalue->getPointerTo();
-        Frame = new AllocaInst(T_prjlvalue,
-#if JL_LLVM_VERSION >= 50000
-            0,
-#endif
-            ConstantInt::get(T_int32, maxframeargs), "", StartOff);
+        Frame = new AllocaInst(T_prjlvalue, 0,
+            ConstantInt::get(T_int32, maxframeargs), "", StartOff);
     }
     SmallVector<CallInst*, 0> write_barriers;
     for (BasicBlock &BB : F) {
@@ -1670,8 +1609,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
                     auto pool_osize = ConstantInt::get(T_int32, osize);
                     newI = builder.CreateCall(pool_alloc_func, {ptls, pool_offs, pool_osize});
                 }
-                addReturnAttr(newI, Attribute::NoAlias);
-                addReturnAttr(newI, Attribute::NonNull);
+                newI->setAttributes(newI->getCalledFunction()->getAttributes());
                 newI->takeName(CI);
                 auto store = builder.CreateStore(CI->getArgOperand(2),
                                                  EmitTagPtr(builder, T_prjlvalue, newI));
@@ -1726,26 +1664,10 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
             CallInst *NewCall = CallInst::Create(newFptr, ReplacementArgs, "", CI);
             NewCall->setTailCallKind(CI->getTailCallKind());
             auto old_attrs = CI->getAttributes();
-#if JL_LLVM_VERSION >= 50000
             NewCall->setAttributes(AttributeList::get(CI->getContext(),
                                                       old_attrs.getFnAttributes(),
                                                       old_attrs.getRetAttributes(), {}));
-#else
-            AttributeSet attr;
-            attr = attr.addAttributes(CI->getContext(), AttributeSet::ReturnIndex,
-                                      old_attrs.getRetAttributes())
-                       .addAttributes(CI->getContext(), AttributeSet::FunctionIndex,
-                                      old_attrs.getFnAttributes());
-            NewCall->setAttributes(attr);
-#endif
-#if JL_LLVM_VERSION >= 40000
             NewCall->copyMetadata(*CI);
-#else
-            SmallVector<std::pair<unsigned, MDNode*>, 1> MDs;
-            CI->getAllMetadata(MDs);
-            for (auto MD : MDs)
-                NewCall->setMetadata(MD.first, MD.second);
-#endif
             CI->replaceAllUsesWith(NewCall);
             UpdatePtrNumbering(CI, NewCall, S);
         } else if (CI->getNumArgOperands() == CI->getNumOperands()) {
@@ -1755,14 +1677,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) {
         } else {
             CallInst *NewCall = CallInst::Create(CI, None, CI);
             NewCall->takeName(CI);
-#if JL_LLVM_VERSION >= 40000
             NewCall->copyMetadata(*CI);
-#else
-            SmallVector<std::pair<unsigned, MDNode*>, 1> MDs;
-            CI->getAllMetadata(MDs);
-            for (auto MD : MDs)
-                NewCall->setMetadata(MD.first, MD.second);
-#endif
             CI->replaceAllUsesWith(NewCall);
             UpdatePtrNumbering(CI, NewCall, S);
         }
@@ -1910,11 +1825,8 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(std::vector<int> &Colors, State
     if (MaxColor != -1 || S.Allocas.size() != 0) {
         unsigned NRoots = MaxColor + 1 + S.Allocas.size();
         // Create GC Frame
-        AllocaInst *gcframe = new AllocaInst(T_prjlvalue,
-#if JL_LLVM_VERSION >= 50000
-            0,
-#endif
-            ConstantInt::get(T_int32, NRoots+2), "gcframe");
+        AllocaInst *gcframe = new AllocaInst(T_prjlvalue, 0,
+            ConstantInt::get(T_int32, NRoots + 2), "gcframe");
         gcframe->insertBefore(&*F->getEntryBlock().begin());
         // Zero out gcframe
         BitCastInst *tempSlot_i8 = new BitCastInst(gcframe, Type::getInt8PtrTy(F->getContext()), "");
@@ -2009,8 +1921,10 @@ bool LateLowerGCFrame::doInitialization(Module &M) {
         args.push_back(T_int32);
         pool_alloc_func = Function::Create(FunctionType::get(T_prjlvalue, args, false),
                                            Function::ExternalLinkage, "jl_gc_pool_alloc", &M);
-        addReturnAttr(pool_alloc_func, Attribute::NoAlias);
-        addReturnAttr(pool_alloc_func, Attribute::NonNull);
+        pool_alloc_func->setAttributes(AttributeList::get(M.getContext(),
+                    alloc_obj_func->getAttributes().getFnAttributes(),
+                    alloc_obj_func->getAttributes().getRetAttributes(),
+                    None));
     }
     if (!(big_alloc_func = M.getFunction("jl_gc_big_alloc"))) {
         std::vector<Type*> args(0);
@@ -2018,8 +1932,10 @@ bool LateLowerGCFrame::doInitialization(Module &M) {
         args.push_back(T_size);
         big_alloc_func = Function::Create(FunctionType::get(T_prjlvalue, args, false),
                                           Function::ExternalLinkage, "jl_gc_big_alloc", &M);
-        addReturnAttr(big_alloc_func, Attribute::NoAlias);
-        addReturnAttr(big_alloc_func, Attribute::NonNull);
+        big_alloc_func->setAttributes(AttributeList::get(M.getContext(),
+                    alloc_obj_func->getAttributes().getFnAttributes(),
+                    alloc_obj_func->getAttributes().getRetAttributes(),
+                    None));
     }
     auto T_jlvalue = cast<PointerType>(T_prjlvalue)->getElementType();
     T_pjlvalue = PointerType::get(T_jlvalue, 0);
@@ -2053,7 +1969,7 @@ bool LateLowerGCFrame::doInitialization(Module &M) {
         j++;
     }
     if (j != 0)
-        append_to_compiler_used(M, ArrayRef<GlobalValue*>(function_list, j));
+        appendToCompilerUsed(M, ArrayRef<GlobalValue*>(function_list, j));
     return true;
 }
 
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index a4457fbea5ec9..3df9989720bd5 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -814,6 +814,7 @@ void CloneCtx::fix_inst_uses()
                 std::tie(id, slot) = get_reloc_slot(orig_f);
                 Instruction *ptr = new LoadInst(T_pvoidfunc, slot, "", false, insert_before);
                 ptr->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
+                ptr->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(ctx, None));
                 ptr = new BitCastInst(ptr, F->getType(), "", insert_before);
                 use_i->setOperand(info.use->getOperandNo(),
                                   rewrite_inst_use(uses.get_stack(), ptr,
diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp
index 8e177bba05c3f..455a251d98ba6 100644
--- a/src/llvm-ptls.cpp
+++ b/src/llvm-ptls.cpp
@@ -63,7 +63,7 @@ struct LowerPTLS: public ModulePass {
 #else
     GlobalVariable *static_tls;
 #endif
-    void fix_ptls_use(CallInst *ptlsStates) const;
+    void fix_ptls_use(CallInst *ptlsStates);
     bool runOnModule(Module &M) override;
 };
 
@@ -181,7 +181,7 @@ inline T *LowerPTLS::add_comdat(T *G) const
 }
 #endif
 
-void LowerPTLS::fix_ptls_use(CallInst *ptlsStates) const
+void LowerPTLS::fix_ptls_use(CallInst *ptlsStates)
 {
     if (ptlsStates->use_empty()) {
         ptlsStates->eraseFromParent();
@@ -197,6 +197,7 @@ void LowerPTLS::fix_ptls_use(CallInst *ptlsStates) const
         // ptls = getter();
         auto offset = new LoadInst(T_size, ptls_offset, "", false, ptlsStates);
         offset->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
+        offset->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(*ctx, None));
         auto cmp = new ICmpInst(ptlsStates, CmpInst::ICMP_NE, offset,
                                 Constant::getNullValue(offset->getType()));
         MDBuilder MDB(*ctx);
@@ -212,6 +213,7 @@ void LowerPTLS::fix_ptls_use(CallInst *ptlsStates) const
         ptlsStates->moveBefore(slowTerm);
         auto getter = new LoadInst(T_ptls_getter, ptls_slot, "", false, ptlsStates);
         getter->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
+        getter->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(*ctx, None));
         ptlsStates->setCalledFunction(getter);
         set_ptls_attrs(ptlsStates);
 
@@ -226,6 +228,7 @@ void LowerPTLS::fix_ptls_use(CallInst *ptlsStates) const
         // since we may not know which getter function to use ahead of time.
        auto getter = new LoadInst(T_ptls_getter, ptls_slot, "", false, ptlsStates);
         getter->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
+        getter->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(*ctx, None));
         ptlsStates->setCalledFunction(getter);
         set_ptls_attrs(ptlsStates);
     }
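
Note (illustrative, not part of the patch): the decoration pattern applied above in
tbaa_decorate, llvm-ptls.cpp and llvm-multiversioning.cpp can be sketched in isolation.
The helper name mark_const_load and the tbaa_const_tag parameter below are assumptions
made for the example; only the two setMetadata calls mirror what the patch does.

    // Sketch: tag a load of immutable, always-dereferenceable Julia memory so that
    // LICM and other passes may hoist or reorder it freely.
    #include <llvm/ADT/None.h>
    #include <llvm/IR/Instructions.h>
    #include <llvm/IR/LLVMContext.h>
    #include <llvm/IR/Metadata.h>

    using namespace llvm;

    static LoadInst *mark_const_load(LoadInst *load, MDNode *tbaa_const_tag)
    {
        LLVMContext &ctx = load->getContext();
        // A const TBAA tag alone can be defeated by an opaque call (such as
        // julia.gc_alloc_obj) in the same loop; !invariant.load is stronger and only
        // requires the pointer to be dereferenceable at the hoisted location.
        load->setMetadata(LLVMContext::MD_tbaa, tbaa_const_tag);
        load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(ctx, None));
        return load;
    }

In the emitted IR such a load then carries `!invariant.load !N` where `!N = !{}` (an
empty metadata node), which is the form LLVM expects; the isLoadFromImmut change above
teaches the late GC lowering pass to recognize loads marked this way.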