257 | 257 | "!mpirun -np 2 ./heat 256 256 16000"
258 | 258 | ]
259 | 259 | },
260 |     | - {
261 |     | - "cell_type": "markdown",
262 |     | - "metadata": {},
263 |     | - "source": [
264 |     | - "When using the NVIDIA C++ compiler, we currently need to workaround lack of proper support for `views::cartesian_product` in the parallel algorithms as follows:\n",
265 |     | - "\n",
266 |     | - "```c++\n",
267 |     | - " auto cp = std::views::cartesian_product(xs, ys);\n",
268 |     | - " auto is = std::views::iota((int)0, (int)std::size(cp)); // Create 1D range of ints\n",
269 |     | - " return std::transform_reduce(\n",
270 |     | - " std::execution::par, is.begin(), is.end(), \n",
271 |     | - " 0., std::plus{}, [u_new, u_old, p, ids = cp.begin()](auto i) {\n",
272 |     | - " auto [x, y] = ids[i]; // Use int to advance a cartesian_product Iterator.\n",
273 |     | - " return stencil(u_new, u_old, x, y, p);\n",
274 |     | - " });\n",
275 |     | - "```"
276 |     | - ]
277 |     | - },
278 | 260 | {
279 | 261 | "cell_type": "code",
280 | 262 | "execution_count": null,

284 | 266 | "!rm output || true\n",
285 | 267 | "!rm heat || true\n",
286 | 268 | "!OMPI_CXX=nvc++ mpicxx -std=c++20 -stdpar=multicore -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o heat solutions/exercise1.cpp\n",
287 |     | - "!mpirun -np 2 ./heat 256 256 16000"
    | 269 | + "!mpirun -np 2 ./heat 256 256 16000\n",
    | 270 | + "visualize()"
288 | 271 | ]
289 | 272 | },
290 | 273 | {

305 | 288 | "!rm output || true\n",
306 | 289 | "!rm heat || true\n",
307 | 290 | "!OMPI_CXX=nvc++ mpicxx -std=c++20 -stdpar=gpu -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o heat solutions/exercise1.cpp\n",
308 |     | - "!UCX_RNDV_FRAG_MEM_TYPE=cuda mpirun -np 2 ./heat 256 256 16000"
    | 291 | + "!UCX_RNDV_FRAG_MEM_TYPE=cuda mpirun -np 2 ./heat 256 256 16000\n",
    | 292 | + "visualize()"
309 | 293 | ]
310 | 294 | },
311 | 295 | {

508 | 492 | "!mpirun -np 2 ./heat 256 256 16000\n",
509 | 493 | "visualize()"
510 | 494 | ]
    | 495 | + },
    | 496 | + {
    | 497 | + "cell_type": "markdown",
    | 498 | + "metadata": {},
    | 499 | + "source": [
    | 500 | + "## Exercise 3: Senders & Receivers\n",
    | 501 | + "\n",
    | 502 | + "The goal of this exercise is to simplify the implementation of Exercise 2 - Overlap Communication and Computation - by using Senders & Receivers with a `static_thread_pool` to manage the host threads, while combining this with the C++ parallel algorithms.\n",
    | 503 | + "\n",
    | 504 | + "The implementation of Exercise 2 is quite complex. It requires:\n",
    | 505 | + "\n",
    | 506 | + "```c++\n",
    | 507 | + "// A shared atomic variable to accumulate the energy:\n",
    | 508 | + "std::atomic<double> energy = 0.;\n",
    | 509 | + "\n",
    | 510 | + "// A shared barrier for synchronizing threads:\n",
    | 511 | + "std::barrier bar(3);\n",
    | 512 | + "\n",
    | 513 | + "// User must manually create and start threads:\n",
    | 514 | + "std::thread thread_inner([&] {\n",
    | 515 | + " energy += computation(...);\n",
    | 516 | + " bar.arrive_and_wait();\n",
    | 517 | + " // User must manually create a critical section for MPI rank reduction:\n",
    | 518 | + " MPI_Reduce(...);\n",
    | 519 | + " // User must manually reset the shared state on each iteration:\n",
    | 520 | + " energy = 0;\n",
    | 521 | + " bar.arrive_and_wait();\n",
    | 522 | + " });\n",
    | 523 | + "\n",
    | 524 | + "std::thread thread_prev(...);\n",
    | 525 | + "std::thread thread_next(...);\n",
    | 526 | + "\n",
    | 527 | + "// User must manually join all threads before doing File I/O\n",
    | 528 | + "thread_prev.join();\n",
    | 529 | + "thread_next.join();\n",
    | 530 | + "thread_inner.join();\n",
    | 531 | + "\n",
    | 532 | + "// File I/O\n",
    | 533 | + "```\n",
    | 534 | + "\n",
    | 535 | + "In this exercise, we'll use Senders & Receivers instead to create a graph representing the computation:\n",
    | 536 | + "\n",
    | 537 | + "```c++\n",
    | 538 | + "stde::sender iteration_step(stde::scheduler sch, parameters p, long it,\n",
    | 539 | + " std::vector<double>& u_new, std::vector<double>& u_old) {\n",
    | 540 | + " // TODO: use Senders & Receivers to create a graph representing the computation of a single iteration\n",
    | 541 | + "}\n",
    | 542 | + "```\n",
    | 543 | + "\n",
    | 544 | + "and will then dispatch it to an execution context:\n",
    | 545 | + "\n",
    | 546 | + "```c++\n",
    | 547 | + "stde::static_thread_pool ctx{3}; // Thread Pool with 3 threads\n",
    | 548 | + "stde::scheduler auto sch = ctx.get_scheduler();\n",
    | 549 | + "\n",
    | 550 | + "for (long it = 0; it < p.nit(); ++it) {\n",
    | 551 | + " stde::this_thread::sync_wait(iteration_step(sch, p, it, u_new, u_old));\n",
    | 552 | + "}\n",
    | 553 | + "```\n",
    | 554 | + "\n",
    | 555 | + "### Compilation and run commands\n",
    | 556 | + "\n",
    | 557 | + "[exercise3.cpp]: ./exercise3.cpp\n",
    | 558 | + "\n",
    | 559 | + "The template [exercise3.cpp] compiles and runs as provided, but produces incorrect results due to the incomplete `iteration_step` implementation.\n",
    | 560 | + "\n",
    | 561 | + "After completing it, the following blocks should compile and run correctly:"
    | 562 | + ]
    | 563 | + },
    | 564 | + {
    | 565 | + "cell_type": "markdown",
    | 566 | + "metadata": {},
    | 567 | + "source": [
    | 568 | + "### Solutions Exercise 3\n",
    | 569 | + "\n",
    | 570 | + "The solution is available in the `solutions/` sub-directory as [`solutions/exercise3.cpp`].\n",
    | 571 | + "\n",
    | 572 | + "[`solutions/exercise3.cpp`]: ./solutions/exercise3.cpp\n",
    | 573 | + "\n",
    | 574 | + "The following blocks compile and run the solution for Exercise 3 using different compilers and C++ standard versions.\n",
    | 575 | + "By default, the [`static_thread_pool`] scheduler is used.\n",
    | 576 | + "\n",
    | 577 | + "[`static_thread_pool`]: https://github.com/NVIDIA/stdexec/blob/main/include/exec/static_thread_pool.hpp"
    | 578 | + ]
    | 579 | + },
    | 580 | + {
    | 581 | + "cell_type": "code",
    | 582 | + "execution_count": null,
    | 583 | + "metadata": {},
    | 584 | + "outputs": [],
    | 585 | + "source": [
    | 586 | + "!rm output || true\n",
    | 587 | + "!rm heat || true\n",
    | 588 | + "!OMPI_CXX=g++ mpicxx -std=c++20 -Ofast -march=native -DNDEBUG -o heat solutions/exercise3.cpp -ltbb\n",
    | 589 | + "!mpirun -np 2 ./heat 256 256 16000\n",
    | 590 | + "visualize()"
    | 591 | + ]
    | 592 | + },
    | 593 | + {
    | 594 | + "cell_type": "code",
    | 595 | + "execution_count": null,
    | 596 | + "metadata": {},
    | 597 | + "outputs": [],
    | 598 | + "source": [
    | 599 | + "!rm output || true\n",
    | 600 | + "!rm heat || true\n",
    | 601 | + "!OMPI_CXX=clang++ mpicxx -std=c++20 -Ofast -march=native -DNDEBUG -o heat solutions/exercise3.cpp -ltbb\n",
    | 602 | + "!mpirun -np 2 ./heat 256 256 16000\n",
    | 603 | + "visualize()"
    | 604 | + ]
    | 605 | + },
    | 606 | + {
    | 607 | + "cell_type": "code",
    | 608 | + "execution_count": null,
    | 609 | + "metadata": {},
    | 610 | + "outputs": [],
    | 611 | + "source": [
    | 612 | + "!rm output || true\n",
    | 613 | + "!rm heat || true\n",
    | 614 | + "!OMPI_CXX=nvc++ mpicxx -std=c++20 -stdpar=gpu -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o heat solutions/exercise3.cpp\n",
    | 615 | + "!mpirun -np 2 ./heat 256 256 16000\n",
    | 616 | + "visualize()"
    | 617 | + ]
511 | 618 | }
512 | 619 | ],
513 | 620 | "metadata": {