Skip to content

Commit bf1d8ae

Browse files
committed
Add senders and receivers example and fix cartesian_product_view distance bug
1 parent 00bc0fa commit bf1d8ae

File tree

6 files changed

+393
-61
lines changed

6 files changed

+393
-61
lines changed

include/cartesian_product.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,7 +1501,7 @@ namespace tl {
15011501
constexpr auto distance_to(cursor const& other) const {
15021502
auto idx = linear();
15031503
auto oidx = other.linear();
1504-
return static_cast<difference_type>(idx) - static_cast<difference_type>(oidx);
1504+
return static_cast<difference_type>(oidx) - static_cast<difference_type>(idx);
15051505
}
15061506

15071507
friend class cursor<!Const>;
@@ -1648,7 +1648,7 @@ namespace tl {
16481648
constexpr auto distance_to(cursor const& other) const {
16491649
auto idx = linear();
16501650
auto oidx = other.linear();
1651-
return static_cast<difference_type>(idx) - static_cast<difference_type>(oidx);
1651+
return static_cast<difference_type>(oidx) - static_cast<difference_type>(idx);
16521652
}
16531653

16541654
friend class cursor<!Const>;

labs/lab2_heat/exercise2.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -245,25 +245,12 @@ double apply_stencil(double* u_new, double* u_old, grid g, parameters p) {
245245
auto xs = std::views::iota(g.x_begin, g.x_end);
246246
auto ys = std::views::iota(g.y_begin, g.y_end);
247247
auto ids = std::views::common(std::views::cartesian_product(xs, ys));
248-
249-
#if !defined(__NVCOMPILER)
250248
return std::transform_reduce(
251249
std::execution::par, ids.begin(), ids.end(),
252250
0., std::plus{}, [u_new, u_old, p](auto idx) {
253251
auto [x, y] = idx;
254252
return stencil(u_new, u_old, x, y, p);
255253
});
256-
#else
257-
// Workaround for NVIDIA C++ Compiler
258-
auto is = std::views::iota((int)0, (int)std::size(ids));
259-
auto cp = std::views::cartesian_product(xs, ys);
260-
return std::transform_reduce(
261-
std::execution::par, is.begin(), is.end(),
262-
0., std::plus{}, [u_new, u_old, p, ids = cp.begin()](auto i) {
263-
auto [x, y] = ids[i];
264-
return stencil(u_new, u_old, x, y, p);
265-
});
266-
#endif
267254
}
268255

269256
// Initial condition

labs/lab2_heat/heat.ipynb

Lines changed: 127 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -257,24 +257,6 @@
257257
"!mpirun -np 2 ./heat 256 256 16000"
258258
]
259259
},
260-
{
261-
"cell_type": "markdown",
262-
"metadata": {},
263-
"source": [
264-
"When using the NVIDIA C++ compiler, we currently need to workaround lack of proper support for `views::cartesian_product` in the parallel algorithms as follows:\n",
265-
"\n",
266-
"```c++\n",
267-
" auto cp = std::views::cartesian_product(xs, ys);\n",
268-
" auto is = std::views::iota((int)0, (int)std::size(cp)); // Create 1D range of ints\n",
269-
" return std::transform_reduce(\n",
270-
" std::execution::par, is.begin(), is.end(), \n",
271-
" 0., std::plus{}, [u_new, u_old, p, ids = cp.begin()](auto i) {\n",
272-
" auto [x, y] = ids[i]; // Use int to advance a cartesian_product Iterator.\n",
273-
" return stencil(u_new, u_old, x, y, p);\n",
274-
" });\n",
275-
"```"
276-
]
277-
},
278260
{
279261
"cell_type": "code",
280262
"execution_count": null,
@@ -284,7 +266,8 @@
284266
"!rm output || true\n",
285267
"!rm heat || true\n",
286268
"!OMPI_CXX=nvc++ mpicxx -std=c++20 -stdpar=multicore -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o heat solutions/exercise1.cpp\n",
287-
"!mpirun -np 2 ./heat 256 256 16000"
269+
"!mpirun -np 2 ./heat 256 256 16000\n",
270+
"visualize()"
288271
]
289272
},
290273
{
@@ -305,7 +288,8 @@
305288
"!rm output || true\n",
306289
"!rm heat || true\n",
307290
"!OMPI_CXX=nvc++ mpicxx -std=c++20 -stdpar=gpu -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o heat solutions/exercise1.cpp\n",
308-
"!UCX_RNDV_FRAG_MEM_TYPE=cuda mpirun -np 2 ./heat 256 256 16000"
291+
"!UCX_RNDV_FRAG_MEM_TYPE=cuda mpirun -np 2 ./heat 256 256 16000\n",
292+
"visualize()"
309293
]
310294
},
311295
{
@@ -508,6 +492,129 @@
508492
"!mpirun -np 2 ./heat 256 256 16000\n",
509493
"visualize()"
510494
]
495+
},
496+
{
497+
"cell_type": "markdown",
498+
"metadata": {},
499+
"source": [
500+
"## Exercise 3: Senders & Receivers\n",
501+
"\n",
502+
"The goal of this exercise is to simplify the implementation of Exercise 2 - Overlap Communication and Computation - by using Senders & Receivers with a `static_thread_pool` to manage the host threads, while combining this with the C++ parallel algorithms.\n",
503+
"\n",
504+
"The implementation of Exercise 2 is quite complex. It requires:\n",
505+
"\n",
506+
"```c++\n",
507+
"// A shared atomic variable to accumulate the energy:\n",
508+
"std::atomic<double> energy = 0.;\n",
509+
"\n",
510+
"// A shared barrier for synchronizing threads:\n",
511+
"std::barrier bar(3);\n",
512+
"\n",
513+
"// User must manually create and start threads:\n",
514+
"std::thread thread_inner(..[&] {\n",
515+
" energy += computation(...);\n",
516+
" bar.arrive_and_wait();\n",
517+
" // User must manually create a critical section for MPI rank reduction: \n",
518+
" MPI_Reduce(...);\n",
519+
" // User must manually reset the shared state on each iteration:\n",
520+
" energy = 0;\n",
521+
" bar.arrive_and_wait();\n",
522+
" });\n",
523+
"\n",
524+
"std::thread thread_prev(...);\n",
525+
"std::thread thread_next(...);\n",
526+
"\n",
527+
"// User must manually join all threads before doing File I/O\n",
528+
"thread_prev.join();\n",
529+
"thread_next.join();\n",
530+
"thread_inner.join();\n",
531+
"\n",
532+
"// File I/O\n",
533+
"```\n",
534+
"\n",
535+
"In this exercise, we'll use Senders & Receivers instead to create a graph representing the computation:\n",
536+
"\n",
537+
"```c++\n",
538+
"stde::sender iteration_step(stde::scheduler sch, parameters p, long it,\n",
539+
" std::vector<double>& u_new, std::vector<double>& u_old) {\n",
540+
" // TODO: use Senders & Receivers to create a graph representing the computation of a single iteration \n",
541+
"}\n",
542+
"```\n",
543+
"\n",
544+
"and will then dispatch it to an execution context:\n",
545+
"\n",
546+
"```c++\n",
547+
"stde::static_thread_pool ctx{3}; // Thread Pool with 3 threads\n",
548+
"stde::scheduler auto sch = ctx.get_scheduler();\n",
549+
"\n",
550+
"for (long it = 0; it < p.nit(); ++it) {\n",
551+
" stde::this_thread::sync_wait(iteration_step(sch));\n",
552+
"}\n",
553+
"```\n",
554+
"\n",
555+
"### Compilation and run commands\n",
556+
"\n",
557+
"[exercise3.cpp]: ./exercise3.cpp\n",
558+
"\n",
559+
"The template [exercise3.cpp] compiles and runs as provided, but produces incorrect results due to the incomplete `iteration_step` implementation.\n",
560+
"\n",
561+
"After completing it the following blocks should compile and run correctly:"
562+
]
563+
},
564+
{
565+
"cell_type": "markdown",
566+
"metadata": {},
567+
"source": [
568+
"### Solutions Exercise 3\n",
569+
"\n",
570+
"The solutions for each example are available in the [`solutions/exercise3.cpp`] sub-directory.\n",
571+
"\n",
572+
"[`solutions/exercise3.cpp`]: ./solutions/exercise3.cpp\n",
573+
"\n",
574+
"The following blocks compiles and runs the solutions for Exercise 3 using different compilers and C++ standard versions.\n",
575+
"By default, the [`static_thread_pool`] scheduler is used.\n",
576+
"\n",
577+
"[`static_thread_pool`]: https://github.com/NVIDIA/stdexec/blob/main/include/exec/static_thread_pool.hpp"
578+
]
579+
},
580+
{
581+
"cell_type": "code",
582+
"execution_count": null,
583+
"metadata": {},
584+
"outputs": [],
585+
"source": [
586+
"!rm output || true\n",
587+
"!rm heat || true\n",
588+
"!OMPI_CXX=g++ mpicxx -std=c++20 -Ofast -march=native -DNDEBUG -o heat solutions/exercise3.cpp -ltbb\n",
589+
"!mpirun -np 2 ./heat 256 256 16000\n",
590+
"visualize()"
591+
]
592+
},
593+
{
594+
"cell_type": "code",
595+
"execution_count": null,
596+
"metadata": {},
597+
"outputs": [],
598+
"source": [
599+
"!rm output || true\n",
600+
"!rm heat || true\n",
601+
"!OMPI_CXX=clang++ mpicxx -std=c++20 -Ofast -march=native -DNDEBUG -o heat solutions/exercise3.cpp -ltbb\n",
602+
"!mpirun -np 2 ./heat 256 256 16000\n",
603+
"visualize()"
604+
]
605+
},
606+
{
607+
"cell_type": "code",
608+
"execution_count": null,
609+
"metadata": {},
610+
"outputs": [],
611+
"source": [
612+
"!rm output || true\n",
613+
"!rm heat || true\n",
614+
"!OMPI_CXX=nvc++ mpicxx -std=c++20 -stdpar=gpu -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o heat solutions/exercise3.cpp\n",
615+
"!mpirun -np 2 ./heat 256 256 16000\n",
616+
"visualize()"
617+
]
511618
}
512619
],
513620
"metadata": {

labs/lab2_heat/solutions/exercise1.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,25 +64,12 @@ double apply_stencil(double* u_new, double* u_old, grid g, parameters p) {
6464
auto xs = std::views::iota(g.x_begin, g.x_end);
6565
auto ys = std::views::iota(g.y_begin, g.y_end);
6666
auto ids = std::views::common(std::views::cartesian_product(xs, ys));
67-
68-
#if !defined(__NVCOMPILER)
6967
return std::transform_reduce(
7068
std::execution::par, ids.begin(), ids.end(),
7169
0., std::plus{}, [u_new, u_old, p](auto idx) {
7270
auto [x, y] = idx;
7371
return stencil(u_new, u_old, x, y, p);
7472
});
75-
#else
76-
// Workaround for NVIDIA C++ Compiler
77-
auto is = std::views::iota((int)0, (int)std::size(ids));
78-
auto cp = std::views::cartesian_product(xs, ys);
79-
return std::transform_reduce(
80-
std::execution::par, is.begin(), is.end(),
81-
0., std::plus{}, [u_new, u_old, p, ids = cp.begin()](auto i) {
82-
auto [x, y] = ids[i];
83-
return stencil(u_new, u_old, x, y, p);
84-
});
85-
#endif
8673
}
8774

8875
// Initial condition

labs/lab2_heat/solutions/exercise2.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -242,25 +242,12 @@ double apply_stencil(double* u_new, double* u_old, grid g, parameters p) {
242242
auto xs = std::views::iota(g.x_begin, g.x_end);
243243
auto ys = std::views::iota(g.y_begin, g.y_end);
244244
auto ids = std::views::common(std::views::cartesian_product(xs, ys));
245-
246-
#if !defined(__NVCOMPILER)
247245
return std::transform_reduce(
248246
std::execution::par, ids.begin(), ids.end(),
249247
0., std::plus{}, [u_new, u_old, p](auto idx) {
250248
auto [x, y] = idx;
251249
return stencil(u_new, u_old, x, y, p);
252250
});
253-
#else
254-
// Workaround for NVIDIA C++ Compiler
255-
auto is = std::views::iota((int)0, (int)std::size(ids));
256-
auto cp = std::views::cartesian_product(xs, ys);
257-
return std::transform_reduce(
258-
std::execution::par, is.begin(), is.end(),
259-
0., std::plus{}, [u_new, u_old, p, ids = cp.begin()](auto i) {
260-
auto [x, y] = ids[i];
261-
return stencil(u_new, u_old, x, y, p);
262-
});
263-
#endif
264251
}
265252

266253
// Initial condition

0 commit comments

Comments
 (0)