// RUN: gc-opt %s --pass-pipeline='builtin.module(func.func(iterative-tiling-and-fusion{use-cost-model=0 default-tile-size=matmul:{16,16}}),eliminate-empty-tensors,empty-tensor-to-alloc-tensor,one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},drop-equivalent-buffer-results,func.func(finalizing-bufferize),canonicalize,cse,drop-equivalent-buffer-results,expand-realloc,canonicalize,ownership-based-buffer-deallocation,canonicalize,buffer-deallocation-simplification,bufferization-lower-deallocations,cse,canonicalize,convert-bufferization-to-memref,func.func(scf-forall-to-parallel),func.func(linalg-to-xegpu{stages=1 dpas-tile=8,16,16 k-tile=16}),xegpu-fold-alias-ops,func.func(convert-linalg-to-parallel-loops),func.func(gpu-map-parallel-loops),func.func(convert-parallel-loops-to-gpu),func.func(insert-gpu-allocs),gpu-kernel-outlining,canonicalize,set-spirv-capabilities{client-api=opencl},gpu.module(set-spirv-abi-attrs{client-api=opencl}),lower-affine,imex-vector-linearize,gpu.module(convert-xegpu-to-vc),reconcile-unrealized-casts,bf16-to-gpu,gpu.module(convert-func-to-spirv),gpu.module(convert-vector-to-spirv),imex-convert-gpu-to-spirv,spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),serialize-spirv,convert-vector-to-scf,convert-gpu-to-gpux,convert-scf-to-cf,convert-cf-to-llvm,convert-vector-to-llvm,convert-index-to-llvm,convert-arith-to-llvm,convert-func-to-llvm,convert-math-to-llvm,convert-gpux-to-llvm,convert-index-to-llvm,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
// RUN: | gc-cpu-runner -e main --entry-point-result=void \
// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s

module {
  // Two-layer MLP at f16 precision. Each layer computes
  //   out = relu(input x weight + bias)
  // where relu is expressed as an elementwise max with a zero tensor.
  // The first column (a rank-reduced 32x1 slice) of the final activation is
  // printed so FileCheck can verify the numeric result.
  //
  // Operands: %arg0 = layer-1 input, %arg1 = layer-1 weight, %arg2 = layer-1 bias,
  //           %arg3 = layer-2 weight, %arg4 = layer-2 bias.
  func.func @linalg_mlp(%arg0: tensor<32x4096xf16>, %arg1: tensor<4096x4096xf16>, %arg2 : tensor<32x4096xf16>,
                        %arg3: tensor<4096x4096xf16>, %arg4 : tensor<32x4096xf16>) {
    %cst = arith.constant 0.000000e+00 : f16

    // Layer 1: zero-filled accumulator, matmul, bias add, relu.
    %0 = tensor.empty() : tensor<32x4096xf16>
    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<32x4096xf16>, tensor<4096x4096xf16>)
                       outs(%1 : tensor<32x4096xf16>) -> (tensor<32x4096xf16>)
    %3 = tensor.empty() : tensor<32x4096xf16>
    %4 = linalg.add ins(%arg2, %2 : tensor<32x4096xf16>, tensor<32x4096xf16>)
                    outs(%3 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
    %5 = arith.constant dense<0.000000e+00> : tensor<32x4096xf16>
    %6 = tensor.empty() : tensor<32x4096xf16>
    %7 = linalg.max ins(%5, %4 : tensor<32x4096xf16>, tensor<32x4096xf16>)
                    outs(%6 : tensor<32x4096xf16>) -> tensor<32x4096xf16>

    // Layer 2: same structure, consuming the layer-1 activation %7.
    %8 = tensor.empty() : tensor<32x4096xf16>
    %9 = linalg.fill ins(%cst : f16) outs(%8 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
    %10 = linalg.matmul ins(%7, %arg3 : tensor<32x4096xf16>, tensor<4096x4096xf16>)
                        outs(%9 : tensor<32x4096xf16>) -> (tensor<32x4096xf16>)
    %11 = tensor.empty() : tensor<32x4096xf16>
    %12 = linalg.add ins(%arg4, %10 : tensor<32x4096xf16>, tensor<32x4096xf16>)
                     outs(%11 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
    %13 = arith.constant dense<0.000000e+00> : tensor<32x4096xf16>
    %14 = tensor.empty() : tensor<32x4096xf16>
    %15 = linalg.max ins(%13, %12 : tensor<32x4096xf16>, tensor<32x4096xf16>)
                     outs(%14 : tensor<32x4096xf16>) -> tensor<32x4096xf16>

    // Rank-reducing slice of column 0 (sizes [32, 1] -> tensor<32xf16>),
    // cast to unranked for the generic print helper.
    %slice = tensor.extract_slice %15[0, 0][32, 1][1, 1] : tensor<32x4096xf16> to tensor<32xf16>
    %cast = tensor.cast %slice : tensor<32xf16> to tensor<*xf16>
    call @printMemrefF16(%cast) : (tensor<*xf16>) -> ()

    return
  }

  // Entry point: drives @linalg_mlp with splat constants so the expected
  // output is uniform across the printed slice.
  func.func @main() {
    %0 = arith.constant dense<0.01> : tensor<32x4096xf16>
    %1 = arith.constant dense<0.01> : tensor<4096x4096xf16>
    %2 = arith.constant dense<0.02> : tensor<32x4096xf16>
    %3 = arith.constant dense<0.01> : tensor<4096x4096xf16>
    %4 = arith.constant dense<0.02> : tensor<32x4096xf16>

    func.call @linalg_mlp(%0, %1, %2, %3, %4) : (tensor<32x4096xf16>, tensor<4096x4096xf16>, tensor<32x4096xf16>,
                                                 tensor<4096x4096xf16>, tensor<32x4096xf16>) -> ()
    return
  }

  // Runner-provided unranked-memref print helper (resolved at link time via
  // the shared runner-utils libraries; C interface wrapper requested).
  func.func private @printMemrefF16(%ptr : tensor<*xf16>) attributes { llvm.emit_c_interface }
}

// CHECK: Unranked Memref base@{{(0x)?[-0-9a-fA-F]*}}
// CHECK-SAME: rank = 1 offset = 0 sizes = [32] strides = [4096] data =
// CHECK-NEXT: [17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625]