1
+ /*
2
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: MIT
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person obtaining a
6
+ * copy of this software and associated documentation files (the "Software"),
7
+ * to deal in the Software without restriction, including without limitation
8
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
+ * and/or sell copies of the Software, and to permit persons to whom the
10
+ * Software is furnished to do so, subject to the following conditions:
11
+ *
12
+ * The above copyright notice and this permission notice shall be included in
13
+ * all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
+ * DEALINGS IN THE SOFTWARE.
22
+ */
23
+
24
+ // ! Solves heat equation in 2D, see the README.
25
+
26
+ #include < cassert>
27
+ #include < chrono>
28
+ #include < fstream>
29
+ #include < iostream>
30
+ #include < mpi.h>
31
+ #include < vector>
32
+ #include < cartesian_product.hpp> // Brings C++23 std::views::cartesian_product to C++20
33
+ #include < algorithm> // For std::fill_n
34
+ #include < numeric> // For std::transform_reduce
35
+ #include < execution> // For std::execution::par
36
+ // TODO: add C++ standard library includes as necessary
37
+ #include < exec/static_thread_pool.hpp>
38
+ #include < exec/on.hpp>
39
+
40
+ // TODO: add stde namespace alias for stdexec
41
+ namespace stde = ::stdexec;
42
+
43
+ // Problem parameters
44
+ struct parameters {
45
+ double dx, dt;
46
+ long nx, ny, ni;
47
+ int rank = 0 , nranks = 1 ;
48
+
49
+ static constexpr double alpha () { return 1.0 ; } // Thermal diffusivity
50
+
51
+ parameters (int argc, char *argv[]);
52
+
53
+ long nit () { return ni; }
54
+ long nout () { return 1000 ; }
55
+ long nx_global () { return nx * nranks; }
56
+ long ny_global () { return ny; }
57
+ double gamma () { return alpha () * dt / (dx * dx); }
58
+ long n () { return ny * (nx + 2 /* 2 halo layers */ ); }
59
+ };
60
+
61
+ // These evolve the solution of different parts of the local domain.
62
+ double inner (double * u_new, double * u_old, parameters p);
63
+ double prev (double * u_new, double * u_old, parameters p);
64
+ double next (double * u_new, double * u_old, parameters p);
65
+
66
+ stde::sender auto iteration_step (stde::scheduler auto && sch, parameters& p, long & it, std::vector<double >& u_new, std::vector<double >& u_old) {
67
+ // TODO: create task for prev, next, inner
68
+ // TODO: use "when_all" to wait on prev, next, and inner
69
+ // TODO: use "then" to perform the MPI reduction
70
+ }
71
+
72
+ void initial_condition (double * u_new, double * u_old, long n);
73
+
74
+ int main (int argc, char *argv[]) {
75
+ // Parse CLI parameters
76
+ parameters p (argc, argv);
77
+
78
+ // Initialize MPI with multi-threading support
79
+ int mt;
80
+ MPI_Init_thread (&argc, &argv, MPI_THREAD_MULTIPLE, &mt);
81
+ if (mt != MPI_THREAD_MULTIPLE) {
82
+ std::cerr << " MPI cannot be called from multiple host threads" << std::endl;
83
+ std::terminate ();
84
+ }
85
+ MPI_Comm_size (MPI_COMM_WORLD, &p.nranks );
86
+ MPI_Comm_rank (MPI_COMM_WORLD, &p.rank );
87
+
88
+ // Allocate memory
89
+ std::vector<double > u_new (p.n ()), u_old (p.n ());
90
+
91
+ // Initial condition
92
+ initial_condition (u_new.data (), u_old.data (), p.n ());
93
+
94
+ // TODO: Initialize a exec::static_thread_pool context with 3 threads
95
+ // auto ctx = ...;
96
+
97
+ // Time loop
98
+ using clk_t = std::chrono::steady_clock;
99
+ auto start = clk_t::now ();
100
+
101
+ // TODO: get an scheduler from the context
102
+ // auto sch = ...;
103
+
104
+ long it = 0 ;
105
+ // TODO:
106
+ // auto step = iteration_step(sch, p, it, u_new, u_old);
107
+ for (; it < p.nit (); ++it) {
108
+ // TODO: block calling thread on executing a step
109
+ }
110
+
111
+ auto time = std::chrono::duration<double >(clk_t::now () - start).count ();
112
+ auto grid_size = static_cast <double >(p.nx * p.ny * sizeof (double ) * 2 ) * 1e-9 ; // GB
113
+ auto memory_bw = grid_size * static_cast <double >(p.nit ()) / time; // GB/s
114
+ if (p.rank == 0 ) {
115
+ std::cerr << " Rank " << p.rank << " : local domain " << p.nx << " x" << p.ny << " (" << grid_size << " GB): "
116
+ << memory_bw << " GB/s" << std::endl;
117
+ std::cerr << " All ranks: global domain " << p.nx_global () << " x" << p.ny_global () << " (" << (grid_size * p.nranks ) << " GB): "
118
+ << memory_bw * p.nranks << " GB/s" << std::endl;
119
+ }
120
+
121
+ // Write output to file
122
+ MPI_File f;
123
+ MPI_File_open (MPI_COMM_WORLD, " output" , MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &f);
124
+ auto header_bytes = 2 * sizeof (long ) + sizeof (double );
125
+ auto values_per_rank = p.nx * p.ny ;
126
+ auto values_bytes_per_rank = values_per_rank * sizeof (double );
127
+ MPI_File_set_size (f, header_bytes + values_bytes_per_rank * p.nranks );
128
+ MPI_Request req[3 ] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL};
129
+ if (p.rank == 0 ) {
130
+ long total[2 ] = {p.nx * p.nranks , p.ny };
131
+ double time = p.nit () * p.dt ;
132
+ MPI_File_iwrite_at (f, 0 , total, 2 , MPI_UINT64_T, &req[1 ]);
133
+ MPI_File_iwrite_at (f, 2 * sizeof (long ), &time, 1 , MPI_DOUBLE, &req[2 ]);
134
+ }
135
+ auto values_offset = header_bytes + p.rank * values_bytes_per_rank;
136
+ MPI_File_iwrite_at (f, values_offset, u_old.data () + p.ny , values_per_rank, MPI_DOUBLE, &req[0 ]);
137
+ MPI_Waitall (p.rank == 0 ? 3 : 1 , req, MPI_STATUSES_IGNORE);
138
+ MPI_File_close (&f);
139
+
140
+ MPI_Finalize ();
141
+ return 0 ;
142
+ }
143
+
144
+ // 2D grid of indicies
145
+ struct grid {
146
+ long x_begin, x_end, y_begin, y_end;
147
+ };
148
+
149
+ double apply_stencil (double * u_new, double * u_old, grid g, parameters p);
150
+
151
+ // Reads command line arguments to initialize problem size
152
+ parameters::parameters (int argc, char *argv[]) {
153
+ if (argc != 4 ) {
154
+ std::cerr << " ERROR: incorrect arguments" << std::endl;
155
+ std::cerr << " " << argv[0 ] << " <nx> <ny> <ni>" << std::endl;
156
+ std::terminate ();
157
+ }
158
+ nx = std::stoll (argv[1 ]);
159
+ ny = std::stoll (argv[2 ]);
160
+ ni = std::stoll (argv[3 ]);
161
+ dx = 1.0 / nx;
162
+ dt = dx * dx / (5 . * alpha ());
163
+ }
164
+
165
+ // Finite-difference stencil
166
+ double stencil (double *u_new, double *u_old, long x, long y, parameters p) {
167
+ auto idx = [=](auto x, auto y) {
168
+ // Index into the memory using row-major order:
169
+ assert (x >= 0 && x < 2 * p.nx );
170
+ assert (y >= 0 && y < p.ny );
171
+ return x * p.ny + y;
172
+ };
173
+ // Apply boundary conditions:
174
+ if (y == 1 ) {
175
+ u_old[idx (x, y - 1 )] = 0 ;
176
+ }
177
+ if (y == (p.ny - 2 )) {
178
+ u_old[idx (x, y + 1 )] = 0 ;
179
+ }
180
+ // These boundary conditions are only impossed by the ranks at the end of the domain:
181
+ if (p.rank == 0 && x == 1 ) {
182
+ u_old[idx (x - 1 , y)] = 1 ;
183
+ }
184
+ if (p.rank == (p.nranks - 1 ) && x == p.nx ) {
185
+ u_old[idx (x + 1 , y)] = 0 ;
186
+ }
187
+
188
+ u_new[idx (x, y)] = (1 . - 4 . * p.gamma ()) * u_old[idx (x, y)] +
189
+ p.gamma () * (u_old[idx (x + 1 , y)] + u_old[idx (x - 1 , y)] +
190
+ u_old[idx (x, y + 1 )] + u_old[idx (x, y - 1 )]);
191
+
192
+ return u_new[idx (x, y)] * p.dx * p.dx ;
193
+ }
194
+
195
+ double apply_stencil (double * u_new, double * u_old, grid g, parameters p) {
196
+ auto xs = std::views::iota (g.x_begin , g.x_end );
197
+ auto ys = std::views::iota (g.y_begin , g.y_end );
198
+ auto ids = std::views::cartesian_product (xs, ys);
199
+ return std::transform_reduce (
200
+ std::execution::par, ids.begin (), ids.end (),
201
+ 0 ., std::plus{}, [u_new, u_old, p](auto idx) {
202
+ auto [x, y] = idx;
203
+ return stencil (u_new, u_old, x, y, p);
204
+ });
205
+ }
206
+
207
+ // Initial condition
208
+ void initial_condition (double * u_new, double * u_old, long n) {
209
+ std::fill_n (std::execution::par, u_old, n, 0.0 );
210
+ std::fill_n (std::execution::par, u_new, n, 0.0 );
211
+ }
212
+
213
+ // Evolve the solution of the interior part of the domain
214
+ // which does not depend on data from neighboring ranks
215
+ double inner (double *u_new, double *u_old, parameters p) {
216
+ grid g{.x_begin = 2 , .x_end = p.nx , .y_begin = 1 , .y_end = p.ny - 1 };
217
+ return apply_stencil (u_new, u_old, g, p);
218
+ }
219
+
220
+ // Evolve the solution of the part of the domain that
221
+ // depends on data from the previous MPI rank (rank - 1)
222
+ double prev (double *u_new, double *u_old, parameters p) {
223
+ // Send window cells, receive halo cells
224
+ if (p.rank > 0 ) {
225
+ // Send bottom boundary to bottom rank
226
+ MPI_Send (u_old + p.ny , p.ny , MPI_DOUBLE, p.rank - 1 , 0 , MPI_COMM_WORLD);
227
+ // Receive top boundary from bottom rank
228
+ MPI_Recv (u_old + 0 , p.ny , MPI_DOUBLE, p.rank - 1 , 1 , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
229
+ }
230
+ // Compute prev boundary
231
+ grid g{.x_begin = 1 , .x_end = 2 , .y_begin = 1 , .y_end = p.ny - 1 };
232
+ return apply_stencil (u_new, u_old, g, p);
233
+ }
234
+
235
+ // Evolve the solution of the part of the domain that
236
+ // depends on data from the next MPI rank (rank + 1)
237
+ double next (double *u_new, double *u_old, parameters p) {
238
+ if (p.rank < p.nranks - 1 ) {
239
+ // Receive bottom boundary from top rank
240
+ MPI_Recv (u_old + (p.nx + 1 ) * p.ny , p.ny , MPI_DOUBLE, p.rank + 1 , 0 , MPI_COMM_WORLD,
241
+ MPI_STATUS_IGNORE);
242
+ // Send top boundary to top rank, and
243
+ MPI_Send (u_old + p.nx * p.ny , p.ny , MPI_DOUBLE, p.rank + 1 , 1 , MPI_COMM_WORLD);
244
+ }
245
+ // Compute next boundary
246
+ grid g{.x_begin = p.nx , .x_end = p.nx + 1 , .y_begin = 1 , .y_end = p.ny - 1 };
247
+ return apply_stencil (u_new, u_old, g, p);
248
+ }
0 commit comments