facebookresearch · skimo-openhub · Mar 26, 2018 · Mar 23, 2018
diff --git a/src/core/polyhedral/codegen_cuda.cc b/src/core/polyhedral/codegen_cuda.cc
@@ -471,13 +471,11 @@ std::string toString(isl::aff subscript) {
 }
 
 std::string toString(isl::pw_aff subscript) {
-  isl::aff subscriptAff = isl::null<isl::aff>();
-  subscript.foreach_piece([&](isl::set domain, isl::aff aff) {
-    CHECK(!subscriptAff.get()) << "expected one piece";
-    subscriptAff = aff;
-  });
-
-  return toString(subscriptAff);
+  // Print the expression through a temporary isl::ast_build whose context
+  // is the domain of the subscript.  Ideally, this should instead use the
+  // build at the point where the user statement was created.
+  auto astBuild = isl::ast_build::from_context(subscript.domain());
+  return astBuild.expr_from(subscript).to_C_str();
 }
 
 isl::pw_aff makeAffFromMappedExpr(

diff --git a/test/test_mapper.cc b/test/test_mapper.cc
@@ -186,7 +186,7 @@ def fun(float(N, M) A, float(N, M) B) -> (C) {
   float32 (*B)[M] = reinterpret_cast<float32 (*)[M]>(pB);
   for (int c1 = 16 * b1; c1 < M; c1 += 4096) {
     if (M >= t1 + c1 + 1) {
-      C[t0 + 16*b0][t1 + c1] = (A[t0 + 16*b0][t1 + c1] + B[t0 + 16*b0][t1 + c1]);
+      C[t0 + 16 * b0][t1 + c1] = (A[t0 + 16 * b0][t1 + c1] + B[t0 + 16 * b0][t1 + c1]);
     }
   }
 }
@@ -442,7 +442,7 @@ TEST_F(PolyhedralMapperTest, Unroll1D) {
   auto mscop = MappedScop::makeWithOuterBlockInnerThreadStrategy(
       std::move(scop), mappingOptions);
   auto code = std::get<0>(mscop->codegen(specializedName));
-  std::string expected("C[64*b0 + c2][t0 + 64*b1]");
+  std::string expected("C[64 * b0 + c2][t0 + 64 * b1]");
   ASSERT_TRUE(code.find(expected) != std::string::npos) << code;
 }
 
@@ -461,7 +461,7 @@ TEST_F(PolyhedralMapperTest, Unroll2D) {
   auto mscop = MappedScop::makeWithOuterBlockInnerThreadStrategy(
       std::move(scop), mappingOptions);
   auto code = std::get<0>(mscop->codegen(specializedName));
-  std::string expected("C[32 + t1 + 64*b0][32 + t0 + 64*b1]");
+  std::string expected("C[t1 + 64 * b0 + 32][t0 + 64 * b1 + 32]");
   ASSERT_TRUE(code.find(expected) != std::string::npos);
 }
 

diff --git a/test/test_mapper_memory_promotion.cc b/test/test_mapper_memory_promotion.cc
@@ -118,13 +118,13 @@ TEST_F(Sum4D, CodeOuterBand) {
                        "__shared__ float32 _C_0[16][16][16][16];"};
 
   auto copyA =
-      "_A_0[c4][c5][c6][c7] = A[16*b0 + c4][16*b1 + c5][c2 + c6][c3 + c7];";
+      "_A_0[c4][c5][c6][c7] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7];";
   auto copyB =
-      "_B_0[c4][c5][c6][c7] = B[16*b0 + c4][16*b1 + c5][c2 + c6][c3 + c7];";
+      "_B_0[c4][c5][c6][c7] = B[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7];";
   auto compute =
       "_C_0[c4][c5][c6][t0] = (_A_0[c4][c5][c6][t0] + _B_0[c4][c5][c6][t0]);";
   auto copyC =
-      "C[16*b0 + c4][16*b1 + c5][c2 + c6][c3 + c7] = _C_0[c4][c5][c6][c7];";
+      "C[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7] = _C_0[c4][c5][c6][c7];";
   auto sync = "__syncthreads()";
 
   auto code = emitCode({256, 128, 192, 224}, {16, 16, 16, 16}, {0, 0, 0, 0});
@@ -160,13 +160,13 @@ TEST_F(Sum4D, CodeBeforeThreadMapping) {
                        "__shared__ float32 _B_0[16][16][16][1];",
                        "__shared__ float32 _C_0[16][16][16][1];"};
   auto copyA =
-      "_A_0[c4][c5][c6][0] = A[16*b0 + c4][16*b1 + c5][c2 + c6][t0 + c3];";
+      "_A_0[c4][c5][c6][0] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3];";
   auto copyB =
-      "_B_0[c4][c5][c6][0] = B[16*b0 + c4][16*b1 + c5][c2 + c6][t0 + c3];";
+      "_B_0[c4][c5][c6][0] = B[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3];";
   auto compute =
       "_C_0[c4][c5][c6][0] = (_A_0[c4][c5][c6][0] + _B_0[c4][c5][c6][0]);";
   auto copyC =
-      "C[16*b0 + c4][16*b1 + c5][c2 + c6][t0 + c3] = _C_0[c4][c5][c6][0];";
+      "C[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3] = _C_0[c4][c5][c6][0];";
   auto sync = "__syncthreads()";
 
   auto code =
@@ -204,12 +204,12 @@ TEST_F(Sum4D, CodeInnerBand) {
                        "__shared__ float32 _A_0[1][1][1][1];",
                        "__shared__ float32 _B_0[1][1][1][1];"};
   auto copyA =
-      "_A_0[0][0][0][0] = A[16*b0 + c4][16*b1 + c5][c2 + c6][t0 + c3];";
+      "_A_0[0][0][0][0] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3];";
   auto copyB =
-      "_B_0[0][0][0][0] = B[16*b0 + c4][16*b1 + c5][c2 + c6][t0 + c3];";
+      "_B_0[0][0][0][0] = B[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3];";
   auto compute = "_C_0[0][0][0][0] = (_A_0[0][0][0][0] + _B_0[0][0][0][0]);";
   auto copyC =
-      "C[16*b0 + c4][16*b1 + c5][c2 + c6][t0 + c3] = _C_0[0][0][0][0];";
+      "C[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3] = _C_0[0][0][0][0];";
   auto sync = "__syncthreads()";
 
   auto code =

diff --git a/test/test_tc_mapper_bugs.cc b/test/test_tc_mapper_bugs.cc
@@ -697,6 +697,47 @@ TEST(LayerNorm, ReferenceBelongsToTwoGroups) {
   atCompl.compile("layernorm", inputs, options);
 }
 
+// This case was observed when running the autotuner on example_MLP_model
+// (#200).  It calls code generation on a schedule tree containing a
+// disjunctive filter, which results in an expression with more than one
+// disjunct that was not handled properly.
+// TODO: the disjunctive filter in the schedule is unexpected and its origin
+// should be identified and explained.
+TEST(TMM_128_1024_1000, DisjunctiveFilter) {
+  at::Tensor I = at::CUDA(at::kFloat).rand({128, 1024});
+  at::Tensor W = at::CUDA(at::kFloat).rand({1000, 1024});
+  std::vector<at::Tensor> inputs = {I, W};
+  std::vector<at::Tensor> outputs;
+
+  auto TC = std::string(R"TC(
+def tmm_naive(float(B, X) I, float(Y, X) W) -> (O) {
+  O(b, y) +=! I(b, rx) * W(y, rx)
+}
+)TC");
+  auto options =
+      tc::MappingOptions::makeNaiveMappingOptions()
+          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
+          .outerScheduleAllowSkewing(false)
+          .outerSchedulePositiveOrthant(true)
+          .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
+          .intraTileScheduleAllowSkewing(false)
+          .intraTileSchedulePositiveOrthant(true)
+          .tile(1, 32, 63)
+          .mapToThreads(2, 32)
+          .mapToBlocks(64, 128, 1024)
+          .unroll(128)
+          .tileImperfectlyNested(false)
+          .useSharedMemory(false)
+          .usePrivateMemory(false)
+          .unrollCopyShared(false)
+          .matchLibraryCalls(true);
+
+  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
+  atCompl.define(TC);
+  // Expecting this to compile without dying.
+  atCompl.compile("tmm_naive", inputs, options);
+}
+
 TEST(Halide2Isl, MinInUpperBound) {
   at::Tensor mat1 = at::CUDA(at::kFloat).rand({1, 100, 184, 184});
   at::Tensor mat1_pad = at::CUDA(at::kFloat).rand({1, 100, 186, 186});