Skip to content

Commit 27f1278

Browse files
committed
Update lab2 and separate mdspan lab
1 parent 2f3e1bb commit 27f1278

File tree

10 files changed

+1454
-141
lines changed

10 files changed

+1454
-141
lines changed

labs/lab2_heat/exercise2.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,16 +110,16 @@ int main(int argc, char *argv[]) {
110110
std::swap(u_new, u_old);
111111
}
112112
*/
113-
113+
114+
// TODO: remove this #if - endif par to start working on the exerise
115+
#if 0
114116
// TODO: Use an atomic shared variable for the energy that can be safely modified from
115117
// multiple threads:
116118
double energy = 0.;
117119

118120
// TODO: Use a shared barrier for synchronizing three threads:
119121
// ... bar(...);
120122

121-
// TODO: remove this #if - endif par to start working on the exerise
122-
#if 0
123123
// TODO: Create three threads each running either "prev", "next", or "inner".
124124
// This demonstrates it for "prev":
125125
std::thread thread_prev([p, u_new = u_new.data(), u_old = u_old.data(),

labs/lab2_heat/exercise3.cpp

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: MIT
4+
*
5+
* Permission is hereby granted, free of charge, to any person obtaining a
6+
* copy of this software and associated documentation files (the "Software"),
7+
* to deal in the Software without restriction, including without limitation
8+
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9+
* and/or sell copies of the Software, and to permit persons to whom the
10+
* Software is furnished to do so, subject to the following conditions:
11+
*
12+
* The above copyright notice and this permission notice shall be included in
13+
* all copies or substantial portions of the Software.
14+
*
15+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18+
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21+
* DEALINGS IN THE SOFTWARE.
22+
*/
23+
24+
//! Solves heat equation in 2D, see the README.
25+
26+
#include <cassert>
27+
#include <chrono>
28+
#include <fstream>
29+
#include <iostream>
30+
#include <mpi.h>
31+
#include <vector>
32+
#include <cartesian_product.hpp> // Brings C++23 std::views::cartesian_product to C++20
33+
#include <algorithm> // For std::fill_n
34+
#include <numeric> // For std::transform_reduce
35+
#include <execution> // For std::execution::par
36+
// TODO: add C++ standard library includes as necessary
37+
#include <exec/static_thread_pool.hpp>
38+
#include <exec/on.hpp>
39+
40+
// TODO: add stde namespace alias for stdexec
41+
namespace stde = ::stdexec;
42+
43+
// Problem parameters
44+
struct parameters {
45+
double dx, dt;
46+
long nx, ny, ni;
47+
int rank = 0, nranks = 1;
48+
49+
static constexpr double alpha() { return 1.0; } // Thermal diffusivity
50+
51+
parameters(int argc, char *argv[]);
52+
53+
long nit() { return ni; }
54+
long nout() { return 1000; }
55+
long nx_global() { return nx * nranks; }
56+
long ny_global() { return ny; }
57+
double gamma() { return alpha() * dt / (dx * dx); }
58+
long n() { return ny * (nx + 2 /* 2 halo layers */); }
59+
};
60+
61+
// These evolve the solution of different parts of the local domain.
62+
double inner(double* u_new, double* u_old, parameters p);
63+
double prev (double* u_new, double* u_old, parameters p);
64+
double next (double* u_new, double* u_old, parameters p);
65+
66+
stde::sender auto iteration_step(stde::scheduler auto&& sch, parameters& p, long& it, std::vector<double>& u_new, std::vector<double>& u_old) {
67+
// TODO: create task for prev, next, inner
68+
// TODO: use "when_all" to wait on prev, next, and inner
69+
// TODO: use "then" to perform the MPI reduction
70+
}
71+
72+
void initial_condition(double* u_new, double* u_old, long n);
73+
74+
int main(int argc, char *argv[]) {
75+
// Parse CLI parameters
76+
parameters p(argc, argv);
77+
78+
// Initialize MPI with multi-threading support
79+
int mt;
80+
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mt);
81+
if (mt != MPI_THREAD_MULTIPLE) {
82+
std::cerr << "MPI cannot be called from multiple host threads" << std::endl;
83+
std::terminate();
84+
}
85+
MPI_Comm_size(MPI_COMM_WORLD, &p.nranks);
86+
MPI_Comm_rank(MPI_COMM_WORLD, &p.rank);
87+
88+
// Allocate memory
89+
std::vector<double> u_new(p.n()), u_old(p.n());
90+
91+
// Initial condition
92+
initial_condition(u_new.data(), u_old.data(), p.n());
93+
94+
// TODO: Initialize a exec::static_thread_pool context with 3 threads
95+
// auto ctx = ...;
96+
97+
// Time loop
98+
using clk_t = std::chrono::steady_clock;
99+
auto start = clk_t::now();
100+
101+
// TODO: get an scheduler from the context
102+
// auto sch = ...;
103+
104+
long it = 0;
105+
// TODO:
106+
// auto step = iteration_step(sch, p, it, u_new, u_old);
107+
for (; it < p.nit(); ++it) {
108+
// TODO: block calling thread on executing a step
109+
}
110+
111+
auto time = std::chrono::duration<double>(clk_t::now() - start).count();
112+
auto grid_size = static_cast<double>(p.nx * p.ny * sizeof(double) * 2) * 1e-9; // GB
113+
auto memory_bw = grid_size * static_cast<double>(p.nit()) / time; // GB/s
114+
if (p.rank == 0) {
115+
std::cerr << "Rank " << p.rank << ": local domain " << p.nx << "x" << p.ny << " (" << grid_size << " GB): "
116+
<< memory_bw << " GB/s" << std::endl;
117+
std::cerr << "All ranks: global domain " << p.nx_global() << "x" << p.ny_global() << " (" << (grid_size * p.nranks) << " GB): "
118+
<< memory_bw * p.nranks << " GB/s" << std::endl;
119+
}
120+
121+
// Write output to file
122+
MPI_File f;
123+
MPI_File_open(MPI_COMM_WORLD, "output", MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &f);
124+
auto header_bytes = 2 * sizeof(long) + sizeof(double);
125+
auto values_per_rank = p.nx * p.ny;
126+
auto values_bytes_per_rank = values_per_rank * sizeof(double);
127+
MPI_File_set_size(f, header_bytes + values_bytes_per_rank * p.nranks);
128+
MPI_Request req[3] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL};
129+
if (p.rank == 0) {
130+
long total[2] = {p.nx * p.nranks, p.ny};
131+
double time = p.nit() * p.dt;
132+
MPI_File_iwrite_at(f, 0, total, 2, MPI_UINT64_T, &req[1]);
133+
MPI_File_iwrite_at(f, 2 * sizeof(long), &time, 1, MPI_DOUBLE, &req[2]);
134+
}
135+
auto values_offset = header_bytes + p.rank * values_bytes_per_rank;
136+
MPI_File_iwrite_at(f, values_offset, u_old.data() + p.ny, values_per_rank, MPI_DOUBLE, &req[0]);
137+
MPI_Waitall(p.rank == 0 ? 3 : 1, req, MPI_STATUSES_IGNORE);
138+
MPI_File_close(&f);
139+
140+
MPI_Finalize();
141+
return 0;
142+
}
143+
144+
// 2D grid of indicies
145+
struct grid {
146+
long x_begin, x_end, y_begin, y_end;
147+
};
148+
149+
double apply_stencil(double* u_new, double* u_old, grid g, parameters p);
150+
151+
// Reads command line arguments to initialize problem size
152+
parameters::parameters(int argc, char *argv[]) {
153+
if (argc != 4) {
154+
std::cerr << "ERROR: incorrect arguments" << std::endl;
155+
std::cerr << " " << argv[0] << " <nx> <ny> <ni>" << std::endl;
156+
std::terminate();
157+
}
158+
nx = std::stoll(argv[1]);
159+
ny = std::stoll(argv[2]);
160+
ni = std::stoll(argv[3]);
161+
dx = 1.0 / nx;
162+
dt = dx * dx / (5. * alpha());
163+
}
164+
165+
// Finite-difference stencil
166+
double stencil(double *u_new, double *u_old, long x, long y, parameters p) {
167+
auto idx = [=](auto x, auto y) {
168+
// Index into the memory using row-major order:
169+
assert(x >= 0 && x < 2 * p.nx);
170+
assert(y >= 0 && y < p.ny);
171+
return x * p.ny + y;
172+
};
173+
// Apply boundary conditions:
174+
if (y == 1) {
175+
u_old[idx(x, y - 1)] = 0;
176+
}
177+
if (y == (p.ny - 2)) {
178+
u_old[idx(x, y + 1)] = 0;
179+
}
180+
// These boundary conditions are only impossed by the ranks at the end of the domain:
181+
if (p.rank == 0 && x == 1) {
182+
u_old[idx(x - 1, y)] = 1;
183+
}
184+
if (p.rank == (p.nranks - 1) && x == p.nx) {
185+
u_old[idx(x + 1, y)] = 0;
186+
}
187+
188+
u_new[idx(x, y)] = (1. - 4. * p.gamma()) * u_old[idx(x, y)] +
189+
p.gamma() * (u_old[idx(x + 1, y)] + u_old[idx(x - 1, y)] +
190+
u_old[idx(x, y + 1)] + u_old[idx(x, y - 1)]);
191+
192+
return u_new[idx(x, y)] * p.dx * p.dx;
193+
}
194+
195+
double apply_stencil(double* u_new, double* u_old, grid g, parameters p) {
196+
auto xs = std::views::iota(g.x_begin, g.x_end);
197+
auto ys = std::views::iota(g.y_begin, g.y_end);
198+
auto ids = std::views::cartesian_product(xs, ys);
199+
return std::transform_reduce(
200+
std::execution::par, ids.begin(), ids.end(),
201+
0., std::plus{}, [u_new, u_old, p](auto idx) {
202+
auto [x, y] = idx;
203+
return stencil(u_new, u_old, x, y, p);
204+
});
205+
}
206+
207+
// Initial condition
208+
void initial_condition(double* u_new, double* u_old, long n) {
209+
std::fill_n(std::execution::par, u_old, n, 0.0);
210+
std::fill_n(std::execution::par, u_new, n, 0.0);
211+
}
212+
213+
// Evolve the solution of the interior part of the domain
214+
// which does not depend on data from neighboring ranks
215+
double inner(double *u_new, double *u_old, parameters p) {
216+
grid g{.x_begin = 2, .x_end = p.nx, .y_begin = 1, .y_end = p.ny - 1};
217+
return apply_stencil(u_new, u_old, g, p);
218+
}
219+
220+
// Evolve the solution of the part of the domain that
221+
// depends on data from the previous MPI rank (rank - 1)
222+
double prev(double *u_new, double *u_old, parameters p) {
223+
// Send window cells, receive halo cells
224+
if (p.rank > 0) {
225+
// Send bottom boundary to bottom rank
226+
MPI_Send(u_old + p.ny, p.ny, MPI_DOUBLE, p.rank - 1, 0, MPI_COMM_WORLD);
227+
// Receive top boundary from bottom rank
228+
MPI_Recv(u_old + 0, p.ny, MPI_DOUBLE, p.rank - 1, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
229+
}
230+
// Compute prev boundary
231+
grid g{.x_begin = 1, .x_end = 2, .y_begin= 1, .y_end = p.ny - 1};
232+
return apply_stencil(u_new, u_old, g, p);
233+
}
234+
235+
// Evolve the solution of the part of the domain that
236+
// depends on data from the next MPI rank (rank + 1)
237+
double next(double *u_new, double *u_old, parameters p) {
238+
if (p.rank < p.nranks - 1) {
239+
// Receive bottom boundary from top rank
240+
MPI_Recv(u_old + (p.nx + 1) * p.ny, p.ny, MPI_DOUBLE, p.rank + 1, 0, MPI_COMM_WORLD,
241+
MPI_STATUS_IGNORE);
242+
// Send top boundary to top rank, and
243+
MPI_Send(u_old + p.nx * p.ny, p.ny, MPI_DOUBLE, p.rank + 1, 1, MPI_COMM_WORLD);
244+
}
245+
// Compute next boundary
246+
grid g{.x_begin = p.nx, .x_end = p.nx + 1, .y_begin = 1, .y_end = p.ny - 1};
247+
return apply_stencil(u_new, u_old, g, p);
248+
}

0 commit comments

Comments
 (0)