[mlir][xegpu] Refine layout assignment in XeGPU SIMT distribution. #142687
Conversation
@Garra1980 @Jianhui-Li Can you please take a look?
    storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
  }
  // Propagate the value layout.
  propagateIfChanged(operands[0], operands[0]->meet(valueLayout));
  // Propagate the tensor descriptor layout.
  propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
  // Use default 1D layout for mask operand.
  LayoutInfo maskLayout = getDefaultLayoutInfo(1);
  propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
}

namespace {

//===----------------------------------------------------------------------===//
// RunLayoutInfoPropagation
//===----------------------------------------------------------------------===//

/// Driver class for running the LayoutInfoPropagation analysis.
class RunLayoutInfoPropagation {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)

  RunLayoutInfoPropagation(Operation *op) : target(op) {
    SymbolTableCollection symbolTable;
    solver.load<DeadCodeAnalysis>();
    solver.load<SparseConstantPropagation>();
    solver.load<LayoutInfoPropagation>(symbolTable);
    (void)solver.initializeAndRun(op);
  }

  LayoutInfo getLayoutInfo(Value val);

  void printAnalysisResult(llvm::raw_ostream &os);

private:
  DataFlowSolver solver;
  const Operation *target;
};
} // namespace

LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
  auto *state = solver.lookupState<LayoutInfoLattice>(val);
  if (!state)
    return {};
  return state->getValue();
}

// Print the analysis result for debugging purposes.
[[maybe_unused]] void
RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
  auto printFunctionResult = [&](FunctionOpInterface funcOp) {
    os << "function: " << funcOp.getName() << ":\n";
    // Function arguments
    for (BlockArgument arg : funcOp.getArguments()) {
      LayoutInfo layout = getLayoutInfo(arg);
      os << "argument: " << arg << "\n";
      os << "layout : ";
      layout.print(os);
      os << "\n";
    }
    // Function ops
    funcOp.walk([&](Operation *op) {
      // Skip ops that do not have results
      if (op->getResults().empty())
        return;
      os << "op : ";
      // For control-flow ops, print the op name only.
      if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
        os << op->getName();
      else
        op->print(os);
      os << "\n";
      // Print the layout for each result.
      for (auto [i, r] : llvm::enumerate(op->getResults())) {
        LayoutInfo layout = getLayoutInfo(r);
        os << "layout for result #" << i << ": ";
        layout.print(os);
        os << "\n";
      }
    });
  };

  SmallVector<FunctionOpInterface> funcOps;
  if (auto modOp = dyn_cast<ModuleOp>(target)) {
    for (auto funcOp : modOp.getOps<FunctionOpInterface>()) {
      funcOps.push_back(funcOp);
    }
    // Collect all GpuFuncOps in the module.
    for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
      for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>()) {
        funcOps.push_back(gpuFuncOp);
      }
    }
  }
  // Print the analysis result for each function.
  for (FunctionOpInterface funcOp : funcOps) {
    printFunctionResult(funcOp);
  }
}
This part of the code does not require review because it is a copy-paste into a new file.
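For readers skimming the diff, here is a minimal usage sketch of the driver above. The surrounding function is hypothetical; only RunLayoutInfoPropagation, getLayoutInfo, and printAnalysisResult come from the code in this file.

// Hypothetical caller: run the analysis over an op and query per-value layouts.
void inspectLayouts(Operation *op) {
  RunLayoutInfoPropagation analysis(op);
  // Dump the per-argument and per-result layouts for debugging.
  analysis.printAnalysisResult(llvm::dbgs());
  op->walk([&](Operation *nested) {
    for (Value result : nested->getResults()) {
      // A default-constructed LayoutInfo is returned when the solver has
      // no lattice state for this value.
      LayoutInfo layout = analysis.getLayoutInfo(result);
      (void)layout;
    }
  });
}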
// CHECK-LABEL: func.func @binary_op_multiple_uses(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
lane_data is [1, 1] for B as well?
Yes. store_nd sets the layout for %2, so it is set to lane_layout = [1, 16], lane_data = [1, 1] (because of the backward analysis). This example will need a convert_layout.
I get your point now. The current code does not handle layout conflicts; it naively attaches the layout assigned by the backward analysis.
This needs some improvement. Maybe in a next PR?
I am fine with a separate PR.
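To make the conflict-handling idea above concrete, here is a rough sketch of one possible direction. It is not part of this PR: LayoutInfo comes from the analysis above, but the helper name and the marker attribute are hypothetical, and the eventual fix would presumably materialize an xegpu.convert_layout rather than just tagging the site.

// Hypothetical sketch: detect a mismatch between the layout the backward
// analysis assigned to a producer and the layout a consumer requires, and
// mark the use so a later step can insert a layout conversion (e.g. an
// xegpu.convert_layout) instead of silently overwriting the layout.
static void markLayoutConflict(OpOperand &use, LayoutInfo producerLayout,
                               LayoutInfo requiredLayout) {
  // Assumes LayoutInfo can be compared for equality; otherwise compare the
  // underlying lane_layout / lane_data fields.
  if (producerLayout == requiredLayout)
    return;
  Operation *consumer = use.getOwner();
  // Illustrative attribute name only.
  consumer->setAttr("layout_conflict_operand_" +
                        std::to_string(use.getOperandNumber()),
                    UnitAttr::get(consumer->getContext()));
}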
// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
// CHECK-NEXT: scf.yield {layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>
// CHECK-NEXT: } {layout_operand_5 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
What are these layouts for? The loop outputs three variables, and they do not seem to match these layouts.
The 5th (0-indexed) operand of the loop is the C init value (the loop's first operands are its bounds and step, so operand 5 is the third iter_arg), and the 2nd (0-indexed) result is the C output value. These are what the layout_operand_5 and layout_result_2 attributes refer to.
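As a side note, layouts recorded this way can be read back with plain attribute lookups. A small sketch, assuming the attribute-name scheme visible in the CHECK lines above and that #xegpu.layout maps to xegpu::LayoutAttr; the helper itself is hypothetical.

// Hypothetical helper: fetch the layout attached to the i-th result of an
// op via the "layout_result_<i>" attributes shown in the CHECK lines.
// Returns a null attribute if the analysis assigned no layout.
static xegpu::LayoutAttr getAssignedResultLayout(Operation *op, unsigned i) {
  return op->getAttrOfType<xegpu::LayoutAttr>("layout_result_" +
                                              std::to_string(i));
}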
Changes: