Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MLIR][XeGPU] Adding XeGPU 2d block operators #84692

Merged
merged 20 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
#define MLIR_DIALECT_XEGPU_IR_XEGPU_H

#include <mlir/IR/Dialect.h>
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Interfaces/ShapedOpInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"

namespace mlir {
namespace xegpu {
Expand Down
61 changes: 61 additions & 0 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,72 @@
#define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD

include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
include "mlir/IR/EnumAttr.td"

// Base class for all attributes in the XeGPU dialect. `name` becomes the
// generated C++ class name and `attrMnemonic` is the keyword used in the
// textual IR (e.g. #xegpu.<mnemonic>). Extra interface `traits` and an
// alternative `baseCppClass` may be supplied by subclasses.
class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
string baseCppClass = "::mlir::Attribute">
: AttrDef<XeGPU_Dialect, name, traits, baseCppClass> {
let mnemonic = attrMnemonic;
}

// Attribute attached to a TensorDesc type that encodes how the descriptor
// accesses memory. All parameters are optional with defaults:
//   * memory_scope    — address space of the underlying memory
//                       (defaults to global per the builder below)
//   * array_length    — number of contiguous blocks described (default 1)
//   * boundary_check  — whether out-of-bounds accesses are hardware-checked
//                       (default true)
def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
let parameters = (ins
OptionalParameter<"MemoryScopeAttr">: $memory_scope,
OptionalParameter<"IntegerAttr", "1">: $array_length,
OptionalParameter<"BoolAttr", "true">: $boundary_check
);

// Convenience builder taking plain C++ values instead of attributes.
let builders = [
AttrBuilder<(ins
CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
CArg<"int", "1">:$array_length,
CArg<"bool", "true">: $boundary_check
)>
];

// Printed as: #xegpu.tdesc_attr<memory_scope = ..., array_length = ..., ...>
let assemblyFormat = "`<` struct(params) `>`";
}

//===----------------------------------------------------------------------===//
// XeGPU Memory Scope Enums.
//===----------------------------------------------------------------------===//
// Address-space cases: global device memory vs. shared local memory (SLM).
def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
// NOTE: fixed typo in the user-visible description ("descritor" -> "descriptor").
def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
      "The address space of the memory the tensor descriptor is created for",
      [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
  // Do not generate the default enum attribute; the dialect-scoped
  // XeGPU_MemoryScopeAttr below wraps this enum instead.
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::xegpu";
}

// Dialect attribute wrapping the enum; printed as the bare keyword
// (e.g. `global` or `slm`).
def XeGPU_MemoryScopeAttr:
  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
    let assemblyFormat = "$value";
}

//===----------------------------------------------------------------------===//
// XeGPU Cache Enums.
//===----------------------------------------------------------------------===//
// Cache-control policy cases. The trailing notes record which memory
// direction each policy is legal for (enforced elsewhere, not by the enum).
def XeGPU_CachePolicyCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
def XeGPU_CachePolicyUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
def XeGPU_CachePolicyStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only

def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
  [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
   XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
   XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
// Suppress the default generated attribute; XeGPU_CacheHintAttr below
// is the dialect-scoped wrapper used on ops (l1_hint/l2_hint/l3_hint).
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}

// Printed as: #xegpu.cache_hint<cached>, #xegpu.cache_hint<write_back>, ...
def XeGPU_CacheHintAttr
  : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
    let assemblyFormat = "`<` $value `>`";
}



#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
4 changes: 2 additions & 2 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
the lower-level GPU compiler.
}];

// let useDefaultTypePrinterParser = true;
// let useDefaultAttributePrinterParser = true;
let useDefaultTypePrinterParser = true;
let useDefaultAttributePrinterParser = true;
}

#endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
228 changes: 227 additions & 1 deletion mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
#define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD

include "mlir/IR/AttrTypeBase.td"
include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"

include "mlir/Interfaces/ShapedOpInterfaces.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"

// Base class for dialect operations. This operation inherits from the base
// `Op` class in OpBase.td, and provides:
Expand All @@ -23,4 +26,227 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
Op<XeGPU_Dialect, mnemonic, traits>;


def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
                        AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {

  let summary = "Create nd-tensor descriptor operation";
  let description = [{
    The "create_nd_tdesc" operation creates a TensorDescType which represents
    a sub-view of a 2D memory region (It can be extended to support n-D memory
    region if needed in future). Elements in the subview are contiguous in each
    dimension. It encodes the following important information for supporting
    Intel hardware features:

    * source: an object representing (starting address/pointer of) a 2D memory region.
        It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
        For the latter case, the shape and layout information of the 2D memory region should
        be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters.
    * offsets: two index values representing offsets from the "source" at each dimension
        at which the subview of the target memory will be created. It is encoded via two
        variables, including "dynamic_offsets" and "static_offsets", such that it can
        accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
    * shape: the shape information of the memory region pointed by the "source". It is
        typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
        But if "source" is simply a pointer represented as uint64_t type, or a memref
        type without shape information e.g., memref<?x?xf16>, the shape information has
        to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
        only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
    * strides: the strides of the memory region pointed by the "source". Similar to shape,
        it is typically encoded via the MemRefType of the source too. But if "source" is
        simply a pointer represented as uint64_t type, or a memref type without shape
        information e.g., memref<?x?xf16>, the strides information has to be explicitly
        passed via the "dynamic_strides" argument. And it currently only accepts operands too.

    Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
    %0 = memref.alloc() : memref<1024x1024xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>

    Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
    %0 = memref.alloc(%h, %w) : memref<?x?xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>

    Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
    %0 = ... : ui64
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
  }];

  let arguments = (ins
    XeGPU_BaseAddrType: $source,
    Variadic<Index>: $offsets,
    Variadic<Index>: $shape,
    Variadic<Index>: $strides,
    DenseI64ArrayAttr: $static_offsets
  );
  let results = (outs XeGPU_TensorDesc: $TensorDesc);

  let assemblyFormat = [{
    $source ``
    custom<DynamicIndexList>($offsets, $static_offsets)
    (`,` `[` $shape^ `]` `,` `[` $strides `]`)?
    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
  }];

  let hasVerifier = 1;

  let builders = [
    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,

    OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
                   "llvm::ArrayRef<OpFoldResult>": $offsets,
                   "ValueRange": $shape, "ValueRange": $stride)>
  ];

  let extraClassDeclaration = [{
    /// Returns the type of the source memref operand.
    Type getSourceType() {
      return getSource().getType();
    }

    /// Returns the type of the result TensorDesc.
    xegpu::TensorDescType getType() {
      return getTensorDesc().getType();
    }

    /// Return the element type of the TensorDesc
    Type getElementType() {
      return getType().getElementType();
    }

    /// Return the shape of the TensorDesc
    llvm::ArrayRef<int64_t> getTensorDescShape() {
      return getType().getShape();
    }

    /// wrapper for matching with OffsetSizeAndStrideOpInterface
    OperandRange getSizes() {
      return getShape();
    }

    /// wrapper for matching with OffsetSizeAndStrideOpInterface
    /// If source is IntegerType and `shape` is filled, it will
    /// return an array of ShapedType::kDynamic representing dynamic
    /// shape encoded in the `shape` argument will be used. Presence
    /// of `shape` overrides static shape from source memref type.
    SmallVector<int64_t> getStaticSizes() {
      // Use free-function casts (the member isa/dyn_cast forms are
      // deprecated in MLIR).
      if (llvm::isa<IntegerType>(getSourceType()) || getShape().size()) {
        auto dims = getMixedOffsets().size();
        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
      }
      // Source is guaranteed to be a memref here (it is not an integer and
      // XeGPU_BaseAddrType only admits memref or integer), so cast asserts
      // instead of silently yielding null.
      auto memrefType = llvm::cast<MemRefType>(getSourceType());
      return SmallVector<int64_t>(memrefType.getShape());
    }

    /// wrapper for matching with OffsetSizeAndStrideOpInterface
    /// If source is IntegerType or `strides` is filled, it will
    /// return an array of ShapedType::kDynamic representing dynamic
    /// strides encoded in the `strides` argument will be used. Presence
    /// of `strides` overrides static strides from source memref type.
    SmallVector<int64_t> getStaticStrides() {
      if (llvm::isa<IntegerType>(getSourceType()) || getStrides().size()) {
        auto dims = getMixedOffsets().size();
        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
      }
      // Same reasoning as getStaticSizes(): the source must be a memref.
      auto memrefType = llvm::cast<MemRefType>(getSourceType());
      auto [strides, offset] = getStridesAndOffset(memrefType);
      return strides;
    }

    /// Return the expected rank of each of the `static_offsets`,
    /// `static_shape` and `static_strides` attributes.
    std::array<unsigned, 3> getArrayAttrMaxRanks() {
      unsigned rank;
      if (auto ty = llvm::dyn_cast<MemRefType>(getSourceType())) {
        rank = ty.getRank();
      } else {
        rank = (unsigned)getMixedOffsets().size();
      }
      return {rank, rank, rank};
    }

    /// Return the number of leading operands before the `offsets`,
    /// `shape` and `strides` operands.
    static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }

    /// ViewLikeOpInterface: the viewed source is the base address operand.
    mlir::Value getViewSource() { return getSource(); }
  }];
}

// Prefetch op: brings the block described by $TensorDesc into the caches
// selected by the optional per-level hints; produces no results.
// NOTE(review): this definition is interleaved with scraped review-comment
// text below; code lines are kept byte-identical.
def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
let summary = "prefetches a nD block to cache";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: unify "nD/n-D/N-D block" style

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@adam-smnk Thanks, I updated them

// One optional cache hint per cache level (L1/L2/L3); absent hints leave
// the hardware default policy in effect.
let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);

// Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
// l2_hint = #xegpu.cache_hint<cached>,
// l3_hint = #xegpu.cache_hint<cached>}
// : !xegpu.tensor_desc<8x16xf16>
let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you always split prop-dict out of attr-dict? We're trying to deprecate merging the two (it's a slow progress...)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @joker-eph, thanks for the feedback. This is my first time to hear about this, I find some simple examples but didn't find related document about how it works. Do you mind sharing some ideas or docs, if have, about it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@joker-eph never mind, I got some idea from your RFC https://discourse.llvm.org/t/rfc-introducing-mlir-operation-properties/67846, shared by my colleague

}


def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
  let summary = "loads a n-D block from memory (represented by TensorDesc)"
                "to registers (represented by vector)";
  // Description fixed for grammar/typos ("correspoding", sentence fragments);
  // semantics unchanged.
  let description = [{
    LoadNdOp essentially mimics the hardware block read instruction to read
    a block of data from memory to register. It takes a set of cache hints
    for each level of cache, L1, L2 and L3. If the hardware does not have a
    corresponding cache, the corresponding cache hint attribute will be masked.
    If both transpose and vnni_axis are present at the same time, it is assumed
    to perform the transpose first and then the vnni transform.
  }];

  // Optional layout transforms (vnni_axis, transpose) plus one optional
  // cache hint per cache level.
  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                       OptionalAttr<I64Attr>: $vnni_axis,
                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);

  let results = (outs XeGPU_ValueType: $value);

  let extraClassDeclaration = [{
    /// Result type as a VectorType (null if the result is not a vector).
    VectorType getType() {
      return llvm::dyn_cast<VectorType>(getValue().getType());
    }

    /// Type of the TensorDesc operand being loaded from.
    xegpu::TensorDescType getTensorDescType() {
      return getTensorDesc().getType();
    }
  }];

  // Format: xegpu.load_nd %1 {transpose = [1, 0],
  //                l1_hint = #xegpu.cache_hint<cached>,
  //                l2_hint = #xegpu.cache_hint<uncached>,
  //                l3_hint = #xegpu.cache_hint<streaming>}
  //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
  let hasVerifier = 1;
}

// Store op: writes the register-resident $value (a vector) back to the
// memory region described by $TensorDesc, with optional per-level cache
// hints. NOTE(review): this definition is interleaved with scraped
// review-comment text below; code lines are kept byte-identical.
def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
let summary = "stores a n-D block register region back to memory, currently only supports 2D";
let arguments = (ins XeGPU_ValueType: $value,
XeGPU_TensorDesc: $TensorDesc,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);

// Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
// l2_hint = #xegpu.cache_hint<write_back>,
// l3_hint = #xegpu.cache_hint<write_through>}
// : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This kind of format is better provided as markdown examples in the let description = field (so it shows up on the website as well).
(like you did for the XeGPU_TensorDesc type below)

let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
let hasVerifier = 1;
}

#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
Loading
Loading