[Matrix] Propagate shape information through cast insts #141869
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Jon Roelofs (jroelofs)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/141869.diff

3 Files Affected:

- (modified) llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
- (modified) llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
- (added) llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll
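As a rough illustration (hypothetical IR, not taken from the patch or its tests), the change lets a cast or abs/fabs sitting between shape-carrying matrix operations inherit the matrix shape, so it is lowered one column vector at a time instead of as a single flat vector operation:

; Hypothetical example: with this patch the zext inherits the 2x2 shape from
; the column-major load/store and is lowered as two <2 x i32> zexts, one per
; column, rather than as one flat <4 x i32> operation.
define void @zext_between_matrix_ops(ptr %in, ptr %out) {
  %a = call <4 x i16> @llvm.matrix.column.major.load.v4i16.i64(ptr %in, i64 2, i1 false, i32 2, i32 2)
  %z = zext <4 x i16> %a to <4 x i32>
  call void @llvm.matrix.column.major.store.v4i32.i64(<4 x i32> %z, ptr %out, i64 2, i1 false, i32 2, i32 2)
  ret void
}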
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 756a72e6d97bc..38e777601e31c 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -32,8 +32,10 @@
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MatrixBuilder.h"
@@ -229,9 +231,18 @@ static bool isUniformShape(Value *V) {
if (!I)
return true;
- if (I->isBinaryOp())
+ if (I->isBinaryOp() || I->isCast())
return true;
+ if (auto *II = dyn_cast<IntrinsicInst>(V))
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::abs:
+ case Intrinsic::fabs:
+ return true;
+ default:
+ return false;
+ }
+
switch (I->getOpcode()) {
case Instruction::FNeg:
return true;
@@ -621,7 +632,7 @@ class LowerMatrixIntrinsics {
case Intrinsic::matrix_column_major_store:
return true;
default:
- return false;
+ return isUniformShape(II);
}
return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
}
@@ -1066,9 +1077,11 @@ class LowerMatrixIntrinsics {
Value *Op2;
if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
Changed |= VisitBinaryOperator(BinOp);
- if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
+ else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
Changed |= VisitUnaryOperator(UnOp);
- if (match(Inst, m_Load(m_Value(Op1))))
+ else if (auto *Cast = dyn_cast<CastInst>(Inst))
+ Changed |= VisitCastInstruction(Cast);
+ else if (match(Inst, m_Load(m_Value(Op1))))
Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
@@ -1127,6 +1140,9 @@ class LowerMatrixIntrinsics {
case Intrinsic::matrix_column_major_store:
LowerColumnMajorStore(Inst);
break;
+ case Intrinsic::abs:
+ case Intrinsic::fabs:
+ return VisitUniformIntrinsic(cast<IntrinsicInst>(Inst));
default:
return false;
}
@@ -2198,6 +2214,80 @@ class LowerMatrixIntrinsics {
return true;
}
+ /// Lower cast instructions, if shape information is available.
+ bool VisitCastInstruction(CastInst *Inst) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ Value *Op = Inst->getOperand(0);
+
+ IRBuilder<> Builder(Inst);
+ ShapeInfo &Shape = I->second;
+
+ MatrixTy Result;
+ MatrixTy M = getMatrix(Op, Shape, Builder);
+
+ Builder.setFastMathFlags(getFastMathFlags(Inst));
+
+ for (unsigned I = 0; I < Shape.getNumVectors(); ++I) {
+ auto *OrigTy = cast<VectorType>(Inst->getType());
+ auto *NewTy = VectorType::get(OrigTy->getElementType(),
+ ElementCount::getFixed(M.getStride()));
+ Result.addVector(
+ Builder.CreateCast(Inst->getOpcode(), M.getVector(I), NewTy));
+ }
+
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors()),
+ Builder);
+ return true;
+ }
+
+ /// Lower uniform shape intrinsics, if shape information is available.
+ bool VisitUniformIntrinsic(IntrinsicInst *Inst) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ IRBuilder<> Builder(Inst);
+ ShapeInfo &Shape = I->second;
+
+ MatrixTy Result;
+
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::abs:
+ case Intrinsic::fabs: {
+ Value *Op = Inst->getOperand(0);
+
+ MatrixTy M = getMatrix(Op, Shape, Builder);
+
+ Builder.setFastMathFlags(getFastMathFlags(Inst));
+
+ for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::abs:
+ Result.addVector(Builder.CreateBinaryIntrinsic(
+ Intrinsic::abs, M.getVector(I), Inst->getOperand(1)));
+ break;
+ case Intrinsic::fabs:
+ Result.addVector(Builder.CreateUnaryIntrinsic(Inst->getIntrinsicID(),
+ M.getVector(I)));
+ break;
+ }
+
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors()),
+ Builder);
+ return true;
+ }
+ default:
+ llvm_unreachable("unexpected intrinsic");
+ }
+ }
+
/// Helper to linearize a matrix expression tree into a string. Currently
/// matrix expressions are linarized by starting at an expression leaf and
/// linearizing bottom up.
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
index 9160ced2715aa..48974a951c3bb 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
@@ -432,3 +432,46 @@ define void @xor_2x2(ptr %lhs, ptr %rhs, ptr %out) {
store <4 x i32> %optt, ptr %out
ret void
}
+
+define void @fabs_2x2f64(ptr %in, ptr %out) {
+; CHECK-LABEL: @fabs_2x2f64(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD]])
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD1]])
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT: ret void
+;
+ %load = load <4 x double>, ptr %in
+ %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %load)
+ %fabst = call <4 x double> @llvm.matrix.transpose(<4 x double> %fabs, i32 2, i32 2)
+ %fabstt = call <4 x double> @llvm.matrix.transpose(<4 x double> %fabst, i32 2, i32 2)
+ store <4 x double> %fabstt, ptr %out
+ ret void
+}
+
+define void @fabs_2x2i32(ptr %in, ptr %out) {
+; CHECK-LABEL: @fabs_2x2i32(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD]], i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD1]], i1 false)
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP1]], i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP2]], i1 true)
+; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %load = load <4 x i32>, ptr %in
+ %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %load, i1 false)
+ %abst = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %abs, i32 2, i32 2)
+ %abstt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %abst, i32 2, i32 2)
+ %absabstt = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abstt, i1 true)
+ store <4 x i32> %absabstt, ptr %out
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll
new file mode 100644
index 0000000000000..b73c1402044bc
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @fneg_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fneg_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[COL_LOAD]]
+; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x float> [[COL_LOAD1]]
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x float>, ptr %in
+ %op = fneg <4 x float> %inv
+ %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
+ %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
+ store <4 x float> %optt, ptr %out
+ ret void
+}
+
+define void @trunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @trunc_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x i64>, ptr %in
+ %op = trunc <4 x i64> %inv to <4 x i32>
+ %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+ %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+ store <4 x i32> %optt, ptr %out
+ ret void
+}
+
+define void @zext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @zext_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i16>, ptr [[IN:%.*]], align 8
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i16, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i16>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x i16>, ptr %in
+ %op = zext <4 x i16> %inv to <4 x i32>
+ %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+ %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+ store <4 x i32> %optt, ptr %out
+ ret void
+}
+
+define void @sext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sext_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i8>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i8>, ptr [[VEC_GEP]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i8> [[COL_LOAD]] to <2 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[COL_LOAD1]] to <2 x i16>
+; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i16, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x i8>, ptr %in
+ %op = sext <4 x i8> %inv to <4 x i16>
+ %opt = call <4 x i16> @llvm.matrix.transpose(<4 x i16> %op, i32 2, i32 2)
+ %optt = call <4 x i16> @llvm.matrix.transpose(<4 x i16> %opt, i32 2, i32 2)
+ store <4 x i16> %optt, ptr %out
+ ret void
+}
+
+define void @fptoui_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptoui_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = fptoui <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = fptoui <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x float>, ptr %in
+ %op = fptoui <4 x float> %inv to <4 x i32>
+ %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+ %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+ store <4 x i32> %optt, ptr %out
+ ret void
+}
+
+define void @fptosi_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptosi_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = fptosi <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x float>, ptr %in
+ %op = fptosi <4 x float> %inv to <4 x i32>
+ %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
+ %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
+ store <4 x i32> %optt, ptr %out
+ ret void
+}
+
+define void @uitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @uitofp_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = uitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x i64>, ptr %in
+ %op = uitofp <4 x i64> %inv to <4 x double>
+ %opt = call <4 x double> @llvm.matrix.transpose(<4 x double> %op, i32 2, i32 2)
+ %optt = call <4 x double> @llvm.matrix.transpose(<4 x double> %opt, i32 2, i32 2)
+ store <4 x double> %optt, ptr %out
+ ret void
+}
+
+define void @sitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sitofp_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x i64>, ptr %in
+ %op = sitofp <4 x i64> %inv to <4 x double>
+ %opt = call <4 x double> @llvm.matrix.transpose(<4 x double> %op, i32 2, i32 2)
+ %optt = call <4 x double> @llvm.matrix.transpose(<4 x double> %opt, i32 2, i32 2)
+ store <4 x double> %optt, ptr %out
+ ret void
+}
+
+define void @fptrunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptrunc_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD]] to <2 x float>
+; CHECK-NEXT: [[TMP2:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD1]] to <2 x float>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x double>, ptr %in
+ %op = fptrunc nnan <4 x double> %inv to <4 x float>
+ %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
+ %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
+ store <4 x float> %optt, ptr %out
+ ret void
+}
+
+define void @fpext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fpext_2x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = fpext <2 x float> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT: ret void
+;
+ %inv = load <4 x float>, ptr %in
+ %op = fpext <4 x float> %inv to <4 x double>
+ %opt = call <4 x double> @llvm.matrix.transpose(<4 x double> %op, i32 2, i32 2)
+ %optt = call <4 x double> @llvm.matrix.transpose(<4 x double> %opt, i32 2, i32 2)
+ store <4 x double> %optt, ptr %out
+ ret void
+}
edit: I've un-stacked them.
✅ With the latest revision this PR passed the C/C++ code formatter.
finalizeLowering(Inst,
                 Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                         Result.getNumVectors()),
                 Builder);
I think this is the same for most Visit*Instruction (same as setting up Result); should we refactor the code and hoist it into the caller?
let's do that in a separate PR after the rest of these have landed. I think we'll want the Visit* to return MatrixTy, and it'll be easier to do that refactor all at once, rather than add yet another conflict I'll have to resolve on each patch.
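For illustration only, here is a minimal sketch of the caller-side hoist being discussed, assuming each Visit* is changed to return an optional MatrixTy; the helper name VisitShapedInstruction and the adjusted Visit* signatures are hypothetical, and the bookkeeping calls reuse names already visible in the diff above:

// Hypothetical refactor sketch (not part of this PR): each Visit* returns
// std::nullopt when no shape info is available, and the shared bookkeeping
// (compute-op accounting + finalizeLowering) is hoisted into the caller.
bool VisitShapedInstruction(Instruction *Inst, IRBuilder<> &Builder) {
  std::optional<MatrixTy> Result;
  if (auto *Cast = dyn_cast<CastInst>(Inst))
    Result = VisitCastInstruction(Cast, Builder);
  else if (auto *II = dyn_cast<IntrinsicInst>(Inst))
    Result = VisitUniformIntrinsic(II, Builder);
  if (!Result)
    return false;

  // Done once here instead of repeated in every Visit* implementation.
  finalizeLowering(Inst,
                   Result->addNumComputeOps(getNumOps(Result->getVectorTy()) *
                                            Result->getNumVectors()),
                   Builder);
  return true;
}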