Commit 00bc0fa: Update Lab3 Trie

1 parent: 26a1a59

5 files changed: 120 additions, 146 deletions

labs/lab3_tree/exercise0.cpp renamed to labs/lab3_tree/exercise1.cpp

Lines changed: 4 additions & 13 deletions
@@ -34,17 +34,9 @@
 #include <thread>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
-
-// TODO: need to add some headers
+// TODO: add C++ standard library includes as necessary
+// #include <...>
 
 /// Builds a trie in parallel by splitting the input into chunks
 void do_trie(std::vector<char> const &input, int domains);
@@ -121,9 +113,8 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-  auto it = views::iota(0).begin();
-  // TODO: process all domains in parallel
-  std::for_each_n(it, domains,
+  // TODO: Use a parallel algorithm to process all domains in parallel
+  std::for_each_n(std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });
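For context, the following is a minimal, self-contained sketch of the pattern the TODO above asks for: iterating over integer indices with std::for_each_n over std::views::iota(0).begin() plus an execution policy. It is not the lab solution; the visited counter and the build command are assumptions for illustration, and the lab's per-domain work is make_trie.

// Minimal sketch (not the lab solution): iterate over integer indices in
// parallel with std::for_each_n over a counting iterator.
// Assumed build: g++ -std=c++20 sketch.cpp -ltbb (libstdc++'s parallel
// algorithms dispatch to TBB).
#include <algorithm>
#include <atomic>
#include <cstdio>
#include <execution>
#include <ranges>

int main() {
  constexpr int domains = 16;
  std::atomic<int> visited{0};

  // std::views::iota(0).begin() behaves like a counting iterator over 0, 1, 2, ...
  std::for_each_n(std::execution::par, std::views::iota(0).begin(), domains,
                  [&visited](int domain) {
                    // Stand-in for the per-domain work (the lab calls make_trie here).
                    visited.fetch_add(1, std::memory_order_relaxed);
                  });

  std::printf("processed %d domains\n", visited.load());
}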

labs/lab3_tree/solutions/exercise0.cpp renamed to labs/lab3_tree/solutions/exercise1.cpp

Lines changed: 2 additions & 12 deletions
@@ -34,17 +34,8 @@
 #include <thread>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
-
-// DONE: need to add some headers
+// DONE: add C++ standard library includes as necessary
 #include <atomic>
 
 /// Builds a trie in parallel by splitting the input into chunks
@@ -122,10 +113,9 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-  auto it = views::iota(0).begin();
   // DONE: process all domains in parallel
   // NOTE: we cannot use "par_unseq" here because the algorithm is starvation free.
-  std::for_each_n(std::execution::par, it, domains,
+  std::for_each_n(std::execution::par, std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });

labs/lab3_tree/solutions/exercise0_gpu.cpp renamed to labs/lab3_tree/solutions/exercise1_gpu.cpp

Lines changed: 6 additions & 25 deletions
@@ -34,36 +34,22 @@
 #include <thread>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#elif defined(__NVCOMPILER)
-// Workaround: cuda::atomic requires C++17 so no ranges; fixed in next release.
-#include <thrust/iterator/counting_iterator.h>
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
-
-// DONE: need to add some headers
-#include <atomic>
-
+// DONE: add C++ standard library includes as necessary
+// NOTE: We include std::atomic except when -stdpar=gpu, in which case we include cuda::atomic
 #if defined(_NVHPC_STDPAR_GPU)
 #include <cuda/atomic>
-#include <cuda/std/atomic>
 template <typename T> using atomic = cuda::atomic<T, cuda::thread_scope_device>;
 constexpr auto memory_order_relaxed = cuda::memory_order_relaxed;
 constexpr auto memory_order_acquire = cuda::memory_order_acquire;
 constexpr auto memory_order_release = cuda::memory_order_release;
-#else
+#else // _NVHPC_STDPAR_GPU
 #include <atomic>
 template <typename T> using atomic = std::atomic<T>;
 constexpr auto memory_order_relaxed = std::memory_order_relaxed;
 constexpr auto memory_order_acquire = std::memory_order_acquire;
 constexpr auto memory_order_release = std::memory_order_release;
-#endif
+#endif // _NVHPC_STDPAR_GPU
 
 /// Builds a trie in parallel by splitting the input into chunks
 void do_trie(std::vector<char> const &input, int domains);
@@ -140,14 +126,9 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-#if defined(__NVCOMPILER)
-  auto it = thrust::counting_iterator<int>(0);
-#else
-  auto it = views::iota(0).begin();
-#endif
   // DONE: process all domains in parallel
   // NOTE: we cannot use "par_unseq" here because the algorithm is starvation free.
-  std::for_each_n(std::execution::par, it, domains,
+  std::for_each_n(std::execution::par, std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });
@@ -224,4 +205,4 @@ void make_trie(trie &root, atomic<trie *> &bump, const char *begin, const char *
     // And we traverse to it
     n = n->children[index].ptr.load(memory_order_relaxed);
   }
-}
+}
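To make the role of the atomic alias above concrete, here is a small illustrative sketch (not taken from the lab): the same source uses std::atomic when built for the CPU and cuda::atomic with device scope when built with nvc++ -stdpar=gpu. The counter, the heap allocation, and the build commands are assumptions for illustration only.

// Illustrative sketch only: the alias selects std::atomic on CPU builds and
// cuda::atomic<T, cuda::thread_scope_device> under nvc++ -stdpar=gpu.
// Assumed builds: g++ -std=c++20 sketch.cpp -ltbb
//                 nvc++ -std=c++20 -stdpar=gpu sketch.cpp
#if defined(_NVHPC_STDPAR_GPU)
#include <cuda/atomic>
template <typename T> using atomic = cuda::atomic<T, cuda::thread_scope_device>;
constexpr auto memory_order_relaxed = cuda::memory_order_relaxed;
#else
#include <atomic>
template <typename T> using atomic = std::atomic<T>;
constexpr auto memory_order_relaxed = std::memory_order_relaxed;
#endif

#include <algorithm>
#include <cstdio>
#include <execution>
#include <ranges>

int main() {
  // Heap allocation: with -stdpar=gpu, dynamic allocations come from CUDA
  // managed memory, so the atomic is reachable from device code; a stack
  // variable would not be.
  auto *counter = new atomic<int>(0);

  std::for_each_n(std::execution::par, std::views::iota(0).begin(), 1000,
                  [counter](int) {
                    // fetch_add has the same interface in both libraries.
                    counter->fetch_add(1, memory_order_relaxed);
                  });

  std::printf("counter = %d\n", counter->load(memory_order_relaxed));
  delete counter;
}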

labs/lab3_tree/starting_point.cpp

Lines changed: 1 addition & 10 deletions
@@ -33,15 +33,7 @@
 #include <string>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
 
 /// Builds a trie in parallel by splitting the input into chunks
 void do_trie(std::vector<char> const &input, int domains);
@@ -108,8 +100,7 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-  auto it = views::iota(0).begin();
-  std::for_each_n(it, domains,
+  std::for_each_n(std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });
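The files above all describe do_trie as building the trie "by splitting the input into chunks", with each domain index handling one chunk. The even split below is a hypothetical illustration of that idea, not the lab's actual make_trie logic; the helper name chunk_bounds is invented here, and the input size is the one reported by the notebook.

// Hypothetical illustration only: how an input of `size` bytes could be split
// evenly into `domains` chunks, with chunk `domain` covering [first, last).
// The lab's make_trie computes its own chunk internally; this is not its code.
#include <cstddef>
#include <cstdio>
#include <utility>

constexpr std::pair<std::size_t, std::size_t>
chunk_bounds(std::size_t size, int domain, int domains) {
  std::size_t first = size * static_cast<std::size_t>(domain) / domains;
  std::size_t last = size * static_cast<std::size_t>(domain + 1) / domains;
  return {first, last};
}

int main() {
  // 11451743 is the input size the notebook reports for the downloaded books.
  auto [first, last] = chunk_bounds(11451743, /*domain=*/3, /*domains=*/16);
  std::printf("domain 3 of 16 covers bytes [%zu, %zu)\n", first, last);
}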

labs/lab3_tree/tree.ipynb

Lines changed: 107 additions & 86 deletions
@@ -21,38 +21,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "+ curl -O -# https://www.gutenberg.org/files/2600/2600-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/8800/8800.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/84/84-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/2701/2701-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/35/35-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/1342/1342-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/3825/3825-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/996/996-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/55/55-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/6130/6130-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/1727/1727-0.txt\n",
-      "######################################################################### 100.0%\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "!./books.sh"
    ]
@@ -68,20 +39,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Input size 11451743 chars.\n",
-      "Assembled 99632 nodes on 1 domains in 267ms.\n"
-     ]
-    }
-   ],
-   "source": [
-    "!g++ -std=c++20 -o tree starting_point.cpp -ltbb\n",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -o tree starting_point.cpp -ltbb\n",
     "!./tree"
    ]
   },
@@ -95,7 +57,44 @@
    "\n",
    "## Exercise 1: process the input in parallel\n",
    "\n",
-    "The goal of this exercise is to process the input in parallel using multiple domains."
+    "The goal of this exercise is to process the input in parallel using multiple domains.\n",
+    "\n",
+    "A template for the solution is provided in [exercise1.cpp]. The `TODO`s indicate the parts of the template that must be completed.\n",
+    "\n",
+    "[exercise1.cpp]: ./exercise1.cpp\n",
+    "\n",
+    "The example compiles and runs serially as provided.\n",
+    "Once you parallelize it, the following blocks should compile and run correctly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -o tree exercise1.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!clang++ -std=c++20 -Ofast -march=native -o tree exercise1.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvc++ -std=c++20 -stdpar=multicore -O4 -fast -march=native -Mllvm-fast -o tree exercise1.cpp\n",
+    "!./tree"
    ]
   },
   {
@@ -106,53 +105,75 @@
    "\n",
    "The solutions for each example are available in the `solutions/` sub-directory.\n",
    "\n",
-    "The following compiles and runs the solutions for Exercise 0 using different compilers."
+    "The following compiles and runs the solutions for Exercise 1 using different compilers."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Input size 11451743 chars.\n",
-      "Assembled 99632 nodes on 1 domains in 232ms.\n",
-      "Assembled 99632 nodes on 16 domains in 201ms.\n",
-      "Assembled 99632 nodes on 100000 domains in 166ms.\n"
-     ]
-    }
-   ],
-   "source": [
-    "!g++ -std=c++20 -Ofast -DNDEBUG -o tree solutions/exercise0.cpp\n",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1.cpp -ltbb\n",
    "!./tree"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "nvvmCompileProgram error 9: NVVM_ERROR_COMPILATION.\n",
-      "Error: /tmp/pgacc3rBLuXtV9Je.gpu (1281, 38): parse atomicrmw value and pointer type do not match\n",
-      "NVC++-F-0155-Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code (solutions/exercise0.cpp: 1)\n",
-      "NVC++/x86-64 Linux 22.5-0: compilation aborted\n",
-      "Input size 11451743 chars.\n",
-      "Assembled 99632 nodes on 1 domains in 6244ms.\n",
-      "Assembled 99632 nodes on 16 domains in 726ms.\n",
-      "Assembled 99632 nodes on 100000 domains in 22ms.\n"
-     ]
-    }
-   ],
-   "source": [
-    "# A GPU version using cuda::std::atomic is available:\n",
-    "!nvc++ -std=c++20 -stdpar=gpu -gpu=cc80 -fast -DNDEBUG -o tree solutions/exercise0.cpp\n",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!clang++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvc++ -std=c++20 -stdpar=multicore -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o tree solutions/exercise1.cpp\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Currently, not all `std::atomic` operations are supported on GPUs.\n",
+    "The CUDA Toolkit is included with the HPC SDK and includes [libcudacxx](https://github.com/NVIDIA/libcudacxx), the CUDA C++ standard library.\n",
+    "This library provides the `cuda::atomic` and similar types in the `#include <cuda/atomic>` header and those can be used on GPUs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1_gpu.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!clang++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1_gpu.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvc++ -std=c++20 -stdpar=gpu -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o tree solutions/exercise1_gpu.cpp\n",
     "!./tree"
    ]
   }
@@ -173,7 +194,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.6"
   }
  },
 "nbformat": 4,

0 commit comments