|
| 1 | +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0], loadType = 0, smeWarpsPerCTA = [0]}> |
| 2 | +#loc = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":28:0) |
| 3 | +module attributes {"triton_gpu.dot.num-stages" = 1 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:71", "triton_gpu.threads-per-warp" = 64 : i32} { |
| 4 | + tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":28:0), %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":28:0), %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":28:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":28:0)) attributes {noinline = false} { |
| 5 | + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) |
| 6 | + %0 = tt.get_program_id x : i32 loc(#loc2) |
| 7 | + %1 = arith.muli %0, %c1024_i32 : i32 loc(#loc3) |
| 8 | + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc4) |
| 9 | + %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> loc(#loc5) |
| 10 | + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> loc(#loc5) |
| 11 | + %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> loc(#loc6) |
| 12 | + %6 = arith.cmpi slt, %4, %5 : tensor<1024xi32, #blocked> loc(#loc6) |
| 13 | + %7 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc7) |
| 14 | + %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked> loc(#loc7) |
| 15 | + %9 = tt.load %8, %6 {boundaryCheck = array<i32>, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked> loc(#loc8) |
| 16 | + %10 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc9) |
| 17 | + %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked> loc(#loc9) |
| 18 | + %12 = tt.load %11, %6 {boundaryCheck = array<i32>, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked> loc(#loc10) |
| 19 | + %13 = arith.addf %9, %12 : tensor<1024xf32, #blocked> loc(#loc11) |
| 20 | + %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc12) |
| 21 | + %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked> loc(#loc12) |
| 22 | + tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc13) |
| 23 | + tt.return loc(#loc14) |
| 24 | + } loc(#loc) |
| 25 | +} loc(#loc) |
| 26 | +#loc1 = loc(unknown) |
| 27 | +#loc2 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":37:24) |
| 28 | +#loc3 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":42:24) |
| 29 | +#loc4 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":43:41) |
| 30 | +#loc5 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":43:28) |
| 31 | +#loc6 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":45:21) |
| 32 | +#loc7 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":48:24) |
| 33 | +#loc8 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":48:16) |
| 34 | +#loc9 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":49:24) |
| 35 | +#loc10 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":49:16) |
| 36 | +#loc11 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":50:17) |
| 37 | +#loc12 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":52:26) |
| 38 | +#loc13 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":52:35) |
| 39 | +#loc14 = loc("/usr/local/corex-4.3.0.20250707/flagtree/python/tutorials/01-vector-add.py":52:4) |
0 commit comments