27 commits
- d6b7a8a  [CACHE] Use base64 for shorter cache directories (#4553)  (minjang, Aug 22, 2024)
- 661726c  [NFC] Simplify getThreadId function (#4554)  (linuxlonelyeagle, Aug 22, 2024)
- 2ea4890  Add mechanism for remapping device-specific module imports (#4539)  (int3, Aug 22, 2024)
- 8e63999  [CI][AMD] Re-enable MI200 CI (#4555)  (jungpark-mlir, Aug 22, 2024)
- f210090  [TEST] Use device fixture for test_math_extern (#4558)  (int3, Aug 23, 2024)
- 54801fa  [FRONTEND] `interleave` does not need to check shape (#4535)  (Mwsxy, Aug 23, 2024)
- d14f59a  [Proton] Add a better description when possibly importing incorrect h…  (CRobeck, Aug 23, 2024)
- 8c03e46  [BACKEND] Add a knob to fall back to the legacy mma layout conversion…  (Jokeren, Aug 23, 2024)
- 93cc5b4  [BE][PIPELINE] Handle the case when values from the peeled prologue m…  (pawelszczerbuk, Aug 23, 2024)
- 89ecee2  [FRONTEND] Print full file name when overriding kernel (#4566)  (htyu, Aug 23, 2024)
- 1b095f0  [Proton] Move additional hatchet import into try/except check (#4568)  (CRobeck, Aug 23, 2024)
- e0613c6  [AMD] Get libamdhip64.so loaded in process first in Python (#4255)  (xinyazhang, Aug 23, 2024)
- a78c9c4  [AMD] Support FP8E5M2 with MFMA FP16 instructions (#4259)  (binarman, Aug 24, 2024)
- 2d38ffa  [BACKEND] Continue the backward slice when finding free convert (#4571)  (Jokeren, Aug 26, 2024)
- 07671fd  [TEST] IEEE is common and TF32 is specific to CUDA (#4573)  (parsifal-47, Aug 26, 2024)
- 381ff67  [BACKEND] Fix the `divideRight` method in Linear Layout when eliminat…  (Jokeren, Aug 26, 2024)
- ff04671  [BE][PIPELINE] Add fix for the wgmma pipelining bug with subview dist…  (pawelszczerbuk, Aug 26, 2024)
- 78af5c9  [BACKEND] Optimize code generation for load with other arg (#4582)  (ThomasRaoux, Aug 27, 2024)
- fdc7718  [Readme] Fix command to get compile command (#4572)  (jayzhan211, Aug 27, 2024)
- 1827757  [BE] Enable verbose assembly with source code location (#4528)  (ravil-mobile, Aug 27, 2024)
- f48dbc1  [CODEGEN] Support CUDA 12.6 (#4588)  (Jokeren, Aug 27, 2024)
- e88a7fe  [AMD] Support emit indices logic WMMAv2 layout (#4518)  (joviliast, Aug 27, 2024)
- b2c5d36  [SWP] move schedulePrologueAndEpilogue to be before lowering (#4584)  (manman-ren, Aug 27, 2024)
- cf696d4  Emit remarks for SWP and vectorization failures (#4350)  (zengwu, Aug 27, 2024)
- 241e89c  [nvidia backend] Replace cvt instructions with bitwise operations in …  (chsigg, Aug 28, 2024)
- 5d6033c  Fix underflow in Triton's highestPowOf2Divisor function when the inpu…  (Moerafaat, Aug 23, 2024)
- 2c9072f  Merge branch 'main' into export_cl666302296  (Moerafaat, Aug 28, 2024)
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yml
@@ -106,7 +106,7 @@ jobs:
       run: |
         if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
           echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"]]'
-          echo '::set-output name=matrix-HIP::[["self-hosted", "gfx942"]]'
+          echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["self-hosted", "gfx942"]]'
           echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
         else
           echo '::set-output name=matrix-CUDA::["ubuntu-latest"]'
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yml.in
@@ -115,7 +115,7 @@ jobs:
       run: |
         if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
           echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"]]'
-          echo '::set-output name=matrix-HIP::[["self-hosted", "gfx942"]]'
+          echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["self-hosted", "gfx942"]]'
           echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
         else
           echo '::set-output name=matrix-CUDA::["ubuntu-latest"]'
5 changes: 3 additions & 2 deletions README.md
@@ -117,9 +117,10 @@ arbitrary LLVM version.
   (probably because, in our build, users don't invoke cmake directly, but
   instead use setup.py). Teach vscode how to compile Triton as follows.

-- Do a local build.
+- Do a local build. Run command `pip install -e python`
 - Get the full path to the `compile_commands.json` file produced by the build:
-  `find python/build -name 'compile_commands.json | xargs readlink -f'`
+  `find python/build -name 'compile_commands.json' | xargs readlink -f`.
+  You might get a full path similar to `/Users/{username}/triton/python/build/cmake.macosx-11.1-arm64-cpython-3.12/compile_commands.json`
 - In vscode, install the
   [C/C++
   extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools),
28 changes: 15 additions & 13 deletions include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -388,19 +388,12 @@ inline Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,

 /* ------------------------------------ */
 // Returns CTA level thread idx
-inline Value getThreadIdInCTA(RewriterBase &rewriter, Location loc) {
+inline Value getThreadId(RewriterBase &rewriter, Location loc) {
   Value tid =
       rewriter.create<::mlir::gpu::ThreadIdOp>(loc, ::mlir::gpu::Dimension::x);
   return rewriter.create<arith::IndexCastOp>(loc, i32_ty, tid);
 }
-
-// Returns CTA level thread idx.
-inline Value getThreadId(RewriterBase &rewriter, Location loc) {
-  Value tid = getThreadIdInCTA(rewriter, loc);
-  auto mod = rewriter.getBlock()->getParent()->getParentOfType<ModuleOp>();
-  return tid;
-}

 // -----------------------------------------------------------------------
 // Shared memory utilities
 // -----------------------------------------------------------------------
@@ -909,10 +902,12 @@ inline void emitWmmaOffsetForCTA(const AMDWmmaEncodingAttr &wmmaLayout,
   auto rank = shapePerCta.size();
   assert(rank == 2 || rank == 3);
   SmallVector<unsigned> elemOffset(rank, 0);
+  auto elemStride = wmmaLayout.getVersion() == 1 ? 2 : 1;
   if (rank == 3)
     elemOffset[0] = ctaBatchOffset;
   for (unsigned elem = 0; elem < elemsPerThreadPerGroup; elem++) {
-    elemOffset[rank - 2] = ctaOffsetX * shapePerCta[rank - 2] + 2 * elem;
+    elemOffset[rank - 2] =
+        ctaOffsetX * shapePerCta[rank - 2] + elemStride * elem;
     elemOffset[rank - 1] = ctaOffsetY * shapePerCta[rank - 1];
     offsets.push_back(elemOffset);
   }
@@ -958,8 +953,17 @@ emitBaseIndexForWmmaLayout(Location loc, RewriterBase &rewriter,

   SmallVector<Value> multiDimBase(rank);

-  multiDimBase[rank - 2] =
-      add(udiv(threadIdPerWarp, i32_val(mnkDim[2])), offWarp0);
+  auto ver = wmmaLayout.getVersion();
+  if (ver == 1) {
+    multiDimBase[rank - 2] =
+        add(udiv(threadIdPerWarp, i32_val(mnkDim[2])), offWarp0);
+  } else {
+    assert(ver == 2);
+    multiDimBase[rank - 2] =
+        add(mul(udiv(threadIdPerWarp, i32_val(mnkDim[2])),
+                i32_val(wmmaLayout.getSizePerThread()[rank - 2])),
+            offWarp0);
+  }
   multiDimBase[rank - 1] = add(laneId, offWarp1);

   // TODO: It is assumed when rank = 3, warpsPerCTA is set to
@@ -1109,8 +1113,6 @@ emitBaseIndexForLayoutImpl(Location loc, RewriterBase &rewriter,
   } else if (auto mfmaLayout = mlir::dyn_cast<AMDMfmaEncodingAttr>(layout)) {
     result = emitBaseIndexForMfmaLayout(loc, rewriter, mfmaLayout, type);
   } else if (auto wmmaLayout = mlir::dyn_cast<AMDWmmaEncodingAttr>(layout)) {
-    // TODO: support 2nd gen of WMMA
-    assert(wmmaLayout.getVersion() == 1);
     result = emitBaseIndexForWmmaLayout(loc, rewriter, wmmaLayout, type);
   } else if (auto sliceLayout = mlir::dyn_cast<SliceEncodingAttr>(layout)) {
     auto parentLayout = sliceLayout.getParent();
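For intuition about the WMMA v1 vs. v2 difference handled above, here is a small standalone sketch (my own illustration, not code from the PR). The 16x16 tile size and 8 elements per lane are assumptions matching the comments elsewhere in the change, and `wmmaRow` is a hypothetical helper that mirrors the base-row plus `elemStride` arithmetic:

```cpp
#include <cstdio>

// Illustrative constants for a 16x16 WMMA C tile (assumptions, not from the
// diff): 32 lanes per warp, 8 elements held per lane.
constexpr int kTileDim = 16;
constexpr int kElemsPerLane = 8;

// Row of the c-th element held by lane `lane`: v1 uses a stride-2 walk
// starting at row (lane / 16); v2 starts half the warp at row 0 and the
// other half at row 8 and walks contiguously.
int wmmaRow(int version, int lane, int c) {
  int half = lane / kTileDim;               // 0 for lanes 0-15, 1 for 16-31
  int elemStride = (version == 1) ? 2 : 1;  // matches emitWmmaOffsetForCTA
  int base = (version == 1) ? half : half * kElemsPerLane;
  return base + elemStride * c;
}

int main() {
  for (int version : {1, 2}) {
    std::printf("v%d: lane 0 rows:", version);
    for (int c = 0; c < kElemsPerLane; ++c)
      std::printf(" %d", wmmaRow(version, 0, c));
    std::printf(" | lane 16 rows:");
    for (int c = 0; c < kElemsPerLane; ++c)
      std::printf(" %d", wmmaRow(version, 16, c));
    std::printf("\n");
  }
  // v1: lane 0 rows: 0 2 4 ... 14 | lane 16 rows: 1 3 ... 15
  // v2: lane 0 rows: 0 1 2 ... 7  | lane 16 rows: 8 9 ... 15
  return 0;
}
```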
6 changes: 5 additions & 1 deletion include/triton/Dialect/Triton/IR/Utility.h
@@ -31,7 +31,11 @@ template <typename Int> Int ceil(Int m, Int n) { return (m + n - 1) / n; }

 /// Get the highest power of 2 divisor of an integer.
 template <typename T> T highestPowOf2Divisor(T n) {
-  if (n == 0) {
+  // When n is 0 or min, return the highest power of 2. The min case is handled
+  // separately to avoid underflow when T is a signed integer. Technically
+  // in that case the correct divisor is -n, but this value is outside the
+  // range of possible values, so we take the next best alternative.
+  if (n == 0 || n == std::numeric_limits<T>::min()) {
     return (static_cast<T>(1) << (sizeof(T) * 8 - 2));
   }
   return (n & (~(n - 1)));
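As a sanity check on the new guard, a minimal standalone copy of the helper (illustrative only) can be exercised on the edge cases; without the extra `std::numeric_limits<T>::min()` test, `n - 1` would overflow for the minimum signed value:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

// Copy of the fixed helper, for illustration only.
template <typename T> T highestPowOf2Divisor(T n) {
  // 0 and the minimum signed value would otherwise overflow in `n - 1`;
  // return the largest power of two representable instead.
  if (n == 0 || n == std::numeric_limits<T>::min())
    return (static_cast<T>(1) << (sizeof(T) * 8 - 2));
  return (n & (~(n - 1)));
}

int main() {
  assert(highestPowOf2Divisor<int32_t>(12) == 4);        // 12 = 4 * 3
  assert(highestPowOf2Divisor<int32_t>(80) == 16);       // 80 = 16 * 5
  assert(highestPowOf2Divisor<int32_t>(0) == (1 << 30)); // capped fallback
  // The case the fix addresses: INT32_MIN no longer reaches `n - 1`.
  assert(highestPowOf2Divisor<int32_t>(std::numeric_limits<int32_t>::min()) ==
         (1 << 30));
  return 0;
}
```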
18 changes: 17 additions & 1 deletion include/triton/Tools/LinearLayout.h
@@ -577,7 +577,23 @@ class LinearLayout {

   // divideLeft and divideRight are the inverses of operator*.
   //
-  // If c = a * b, then a = c.divideRight(b) and b = c.divideLeft(a).
+  // Consider `a = c.divideRight(b)`, where `a` is a linear layout with
+  // `in-dims(a) == in-dims(b)` and `out-dims(a) == out-dims(c)`. We may remove
+  // some empty dimensions from `a` to form `a'` and still have `a' * b == c`.
+  // Therefore, there are multiple possible values that we could return for
+  // `(a * b).divideRight(b)` which would satisfy
+  // `((a * b).divideRight(b)) * b == a * b`.
+  //
+  // In the following example, we have `a * b == a' * b` when "in1" is an empty
+  // dimension that maps everything to 0:
+  //
+  //   a  = L("in1", "in2") -> ("out1", "out2")
+  //   a' = L("in1") -> ("out1")
+  //   b  = L("in2") -> ("out2")
+  //
+  // divideLeft and divideRight resolve this ambiguity by always returning the
+  // "canonical" quotient, namely the one with the fewest possible size-zero
+  // input and output dimensions.
   //
   // TODO(jlebar): Implement divideLeft.
   // std::optional<LinearLayout> divideLeft(const LinearLayout &divisor);
2 changes: 2 additions & 0 deletions lib/Analysis/Utility.cpp
@@ -425,6 +425,7 @@ bool supportMFMATypes(Type a, Type b) {
   if (a.getIntOrFloatBitWidth() != b.getIntOrFloatBitWidth())
     return false;

+  auto F8E5M2 = TypeID::get<Float8E5M2Type>();
   auto F8E4M3FNUZ = TypeID::get<Float8E4M3FNUZType>();
   auto F8E5M2FNUZ = TypeID::get<Float8E5M2FNUZType>();
   auto F16 = TypeID::get<Float16Type>();
@@ -435,6 +436,7 @@
       {F32, F32},
       {F16, F16},
       {BF16, BF16},
+      {F8E5M2, F8E5M2},
       {F8E4M3FNUZ, F8E4M3FNUZ},
       {F8E4M3FNUZ, F8E5M2FNUZ},
       {F8E5M2FNUZ, F8E4M3FNUZ},
13 changes: 11 additions & 2 deletions lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -25,6 +25,10 @@ using ::mlir::LLVM::linearize;

 using namespace mlir::triton::gpu;

+// XXX(Keren): A temporary knob to control the use of legacy MMA conversion
+// because LinearLayout seems to have some performance issues.
+constexpr bool useLegacyMMAConversion = false;
+
 struct ConvertLayoutOpConversion
     : public ConvertOpToLLVMPattern<ConvertLayoutOp> {
 public:
@@ -341,8 +345,10 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
                              const LinearLayout &dstLayout,
                              OpAdaptor adaptor,
                              ConversionPatternRewriter &rewriter) const {
-    // TODO(jlebar): Implement me.
-    return failure();
+    // TODO(Keren): implement warp shuffle instead of using the general approach
+    // that uses shared memory
+    return transferWithinBlockOrGroup(op, srcLayout, dstLayout, adaptor,
+                                      rewriter);
   }

   LogicalResult
@@ -378,6 +384,9 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
                             /*accumNumReplicates=*/1)) {
       return false;
     }
+    if (useLegacyMMAConversion) {
+      return false;
+    }
     return true;
   }
   if (isa<BlockedEncodingAttr>(layout)) {
2 changes: 0 additions & 2 deletions lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -814,8 +814,6 @@ SmallVector<Value> getMultiDimOffset(Attribute layout, Location loc,
     emitMfmaOffsetForCTA(mfmaLayout, offsets, 0, multiDimCTAInRepId[0],
                          multiDimCTAInRepId[1]);
   } else if (auto wmmaLayout = dyn_cast<AMDWmmaEncodingAttr>(layout)) {
-    // TODO: support 2nd gen of WMMA
-    assert(wmmaLayout.getVersion() == 1);
     emitWmmaOffsetForCTA(wmmaLayout, offsets, 0, multiDimCTAInRepId[0],
                          multiDimCTAInRepId[1]);
   }
32 changes: 25 additions & 7 deletions lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -565,17 +565,35 @@ AMDWmmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {

   // For wmma with 16x16 output, each of the 32 threads holds 8 elements.
   //
-  // For the register (i.e., element) dimension, these 8 elements are along
-  // the matrix C's M dimension, with 1 consecutive elements spanning 1 row
-  // and then the next 1 row being a gap.
+  // The first version of WMMA layout has following specific:
+  // for the register (i.e., element) dimension, these 8 elements are
+  // along the matrix C's M dimension, with 1 consecutive elements
+  // spanning 1 row and then the next 1 row being a gap.
   //
   // For the lane (i.e., thread) dimension, these threads are along the
   // matrix C's N dimension, with 16 consecutive threads covering a whole
   // row and the next 16 threads start at the next row.
-  LinearLayout tileLayout(
-      {{kRegister, {/*gap*/ {0, 2}, {0, 4}, {0, 8}}},
-       {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 1}}}},
-      {outDimNames[order[0]], outDimNames[order[1]]});
+  //
+  // The second version of wmma layout is less tricky:
+  // for the register dimension 8 elements are along the matrix C's M
+  // dimension. First 16 lanes take 0-8 elems along M, second 16 take 8-15.
+  // We have 16 pair of threads in each warp, one pair covers the whole
+  // column.
+  //
+  // Please also check explaining comments in TritonGPUAttrDefs.td at the
+  // AMDWmmaEncodingAttr section.
+  unsigned ver = getVersion();
+  assert(ver == 1 || ver == 2);
+  LinearLayout tileLayout =
+      ver == 1
+          ? LinearLayout(
+                {{kRegister, {/*gap*/ {0, 2}, {0, 4}, {0, 8}}},
+                 {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 1}}}},
+                {outDimNames[order[0]], outDimNames[order[1]]})
+          : LinearLayout(
+                {{kRegister, {{0, 1}, {0, 2}, {0, 4}}},
+                 {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 8}}}},
+                {outDimNames[order[0]], outDimNames[order[1]]});

   if (hasBatchDim) {
     assert(order[2] == 0);
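The basis vectors above can be expanded by hand to see the two layouts. The sketch below is my own illustration and does not use the `LinearLayout` class; it only assumes the usual linear-layout rule that an input index maps to the XOR of the bases of its set bits, and it treats the second basis component as the C matrix's M coordinate, which in the real code depends on `order`:

```cpp
#include <cstdio>
#include <utility>

// One (first, second) output vector per input bit, copied from the layout
// above. Registers contribute 3 bits (8 elements), lanes contribute 5 bits.
struct Basis { int a, b; };

// Output coordinate = XOR of the bases whose input bit is set.
std::pair<int, int> apply(const Basis (&reg)[3], const Basis (&lane)[5],
                          int r, int l) {
  int a = 0, b = 0;
  for (int i = 0; i < 3; ++i)
    if (r & (1 << i)) { a ^= reg[i].a; b ^= reg[i].b; }
  for (int i = 0; i < 5; ++i)
    if (l & (1 << i)) { a ^= lane[i].a; b ^= lane[i].b; }
  return {a, b};
}

int main() {
  const Basis regV1[3] = {{0, 2}, {0, 4}, {0, 8}};
  const Basis laneV1[5] = {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {0, 1}};
  const Basis regV2[3] = {{0, 1}, {0, 2}, {0, 4}};
  const Basis laneV2[5] = {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {0, 8}};
  for (int lane : {0, 16}) {
    std::printf("v1 lane %2d M coords:", lane);
    for (int r = 0; r < 8; ++r)
      std::printf(" %2d", apply(regV1, laneV1, r, lane).second);
    std::printf("   v2 lane %2d M coords:", lane);
    for (int r = 0; r < 8; ++r)
      std::printf(" %2d", apply(regV2, laneV2, r, lane).second);
    std::printf("\n");
  }
  // v1: lane 0 -> 0 2 4 ... 14, lane 16 -> 1 3 ... 15 (the "1 row gap").
  // v2: lane 0 -> 0 1 ... 7,    lane 16 -> 8 9 ... 15 (contiguous halves).
  return 0;
}
```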
@@ -1070,6 +1070,13 @@ bool mlir::triton::preProcessLoopAndGetSchedule(
     coarseSchedule.dump();
   });

+  tt::CoarseSchedule::Cluster afterPrologue =
+      schedulePrologueAndEpilogue(forOp, coarseSchedule, rootUsers, numStages);
+  LLVM_DEBUG({
+    LDBG("Coarse schedule with prologue and epilogue:");
+    coarseSchedule.dump();
+  });
+
   SmallVector<Value> barriers;
   // Convert the loads into async loads and create the allocs.
   SmallVector<Value> allocs =
@@ -1080,13 +1087,6 @@
     coarseSchedule.dump();
   });

-  tt::CoarseSchedule::Cluster afterPrologue =
-      schedulePrologueAndEpilogue(forOp, coarseSchedule, rootUsers, numStages);
-  LLVM_DEBUG({
-    LDBG("Coarse schedule with prologue and epilogue:");
-    coarseSchedule.dump();
-  });
-
   scheduleDependencies(forOp, coarseSchedule, numStages);
   LLVM_DEBUG({
     LDBG("Coarse schedule with dependencies:");
@@ -1402,8 +1402,7 @@ static std::optional<int> dotCanBeProperlyAsync(ttng::WarpGroupDotOp dotOp,
       transitiveOperand =
           cast<scf::YieldOp>(blockArg.getOwner()->getTerminator())
               .getOperand(blockArg.getArgNumber() - 1);
-    }
-    if (Operation *def = transitiveOperand.getDefiningOp()) {
+    } else if (Operation *def = transitiveOperand.getDefiningOp()) {
       transitiveOperand = def->getOperand(0);
     }
   }
18 changes: 14 additions & 4 deletions lib/Dialect/TritonGPU/Transforms/Pipeliner/PipelineExpander.cpp
@@ -332,16 +332,26 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) {
     if (annotateFn)
       annotateFn(newOp, triton::PipeliningOption::PipelinerPart::Prologue, i);
     for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
-      setValueMapping(op->getResult(destId), newOp->getResult(destId),
-                      i - stages[op]);
+      Value source = newOp->getResult(destId);
       // If the value is a loop carried dependency update the loop argument
       // mapping.
       for (OpOperand &operand : yield->getOpOperands()) {
         if (operand.get() != op->getResult(destId))
           continue;
+        if (predicates[predicateIdx] &&
+            !forOp.getResult(operand.getOperandNumber()).use_empty()) {
+          // If the value is used outside the loop, we need to make sure we
+          // return the correct version of it.
+          Value prevValue = valueMapping
+              [forOp.getRegionIterArgs()[operand.getOperandNumber()]]
+              [i - stages[op]];
+          source = rewriter.create<arith::SelectOp>(
+              loc, predicates[predicateIdx], source, prevValue);
+        }
         setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
-                        newOp->getResult(destId), i - stages[op] + 1);
+                        source, i - stages[op] + 1);
       }
+      setValueMapping(op->getResult(destId), newOp->getResult(destId),
+                      i - stages[op]);
     }
   }
 }
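In scalar terms, the new mapping logic boils down to a guarded select. The sketch below is only a toy illustration of that idea (the names are made up), not the pipeliner's API:

```cpp
#include <cassert>

// Minimal scalar picture of the fix: when a prologue stage is predicated
// off, the loop-carried mapping must keep the previous iteration's value
// instead of the result of the masked-off op.
int carriedValue(bool stagePredicate, int newResult, int prevValue) {
  return stagePredicate ? newResult : prevValue;  // mirrors arith::SelectOp
}

int main() {
  assert(carriedValue(true, 42, 7) == 42);   // stage executed: take new value
  assert(carriedValue(false, 42, 7) == 7);   // stage masked off: keep old one
  return 0;
}
```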
7 changes: 5 additions & 2 deletions lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -734,8 +734,11 @@ getConvertBackwardSlice(Value root, SetVector<Value> &slice,
         continue;
       enqueue(result, encoding);
     }
-    if (!isFreeConvert(definingOp) &&
-        canFoldIntoConversion(definingOp, encoding))
+    if (isFreeConvert(definingOp)) {
+      enqueue(definingOp->getOperand(0), encoding);
+      continue;
+    }
+    if (canFoldIntoConversion(definingOp, encoding))
       continue;
     if (stopPropagation && stopPropagation(definingOp))
       continue;
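The shape of the new traversal is easier to see in isolation. The following toy worklist walker (simplified stand-ins for the MLIR types, illustrative only) forwards the walk through a free convert to its operand instead of stopping there:

```cpp
#include <unordered_set>
#include <vector>

// Toy stand-in for the real traversal: each node is either a "free" convert
// (forward the walk to its operand), foldable into the conversion (this
// branch of the walk ends), or an ordinary op (visit all operands).
struct Node {
  bool isFreeConvert = false;
  bool foldsIntoConversion = false;
  std::vector<Node *> operands;
};

void backwardSlice(Node *root, std::unordered_set<Node *> &slice) {
  std::vector<Node *> queue{root};
  while (!queue.empty()) {
    Node *n = queue.back();
    queue.pop_back();
    if (!slice.insert(n).second)
      continue;
    if (n->isFreeConvert) {  // keep walking through the free convert
      if (!n->operands.empty())
        queue.push_back(n->operands.front());
      continue;
    }
    if (n->foldsIntoConversion)  // folded ops terminate this branch
      continue;
    for (Node *op : n->operands)
      queue.push_back(op);
  }
}

int main() {
  Node a, freeCvt, b;
  freeCvt.isFreeConvert = true;
  freeCvt.operands = {&a};
  b.operands = {&freeCvt};
  std::unordered_set<Node *> slice;
  backwardSlice(&b, &slice);
  return slice.count(&a) == 1 ? 0 : 1;  // `a` is reached through the convert
}
```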