Commit 00bc0fa: Update Lab3 Trie

1 parent: 26a1a59

5 files changed: 120 additions, 146 deletions

labs/lab3_tree/exercise0.cpp renamed to labs/lab3_tree/exercise1.cpp

Lines changed: 4 additions & 13 deletions
@@ -34,17 +34,9 @@
 #include <thread>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
-
-// TODO: need to add some headers
+// TODO: add C++ standard library includes as necessary
+// #include <...>
 
 /// Builds a trie in parallel by splitting the input into chunks
 void do_trie(std::vector<char> const &input, int domains);
@@ -121,9 +113,8 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-  auto it = views::iota(0).begin();
-  // TODO: process all domains in parallel
-  std::for_each_n(it, domains,
+  // TODO: Use a parallel algorithm to process all domains in parallel
+  std::for_each_n(std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });
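For context, the following is a minimal, self-contained sketch of the pattern the TODO above asks for: iterating over integer indices with std::for_each_n over std::views::iota(0).begin() plus an execution policy. It is not the lab solution; the visited counter and the build command are assumptions for illustration, and the lab's per-domain work is make_trie.

// Minimal sketch (not the lab solution): iterate over integer indices in
// parallel with std::for_each_n over a counting iterator.
// Assumed build: g++ -std=c++20 sketch.cpp -ltbb (libstdc++'s parallel
// algorithms dispatch to TBB).
#include <algorithm>
#include <atomic>
#include <cstdio>
#include <execution>
#include <ranges>

int main() {
  constexpr int domains = 16;
  std::atomic<int> visited{0};

  // std::views::iota(0).begin() behaves like a counting iterator over 0, 1, 2, ...
  std::for_each_n(std::execution::par, std::views::iota(0).begin(), domains,
                  [&visited](int domain) {
                    // Stand-in for the per-domain work (the lab calls make_trie here).
                    visited.fetch_add(1, std::memory_order_relaxed);
                  });

  std::printf("processed %d domains\n", visited.load());
}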

labs/lab3_tree/solutions/exercise0.cpp renamed to labs/lab3_tree/solutions/exercise1.cpp

Lines changed: 2 additions & 12 deletions
@@ -34,17 +34,8 @@
 #include <thread>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
-
-// DONE: need to add some headers
+// DONE: add C++ standard library includes as necessary
 #include <atomic>
 
 /// Builds a trie in parallel by splitting the input into chunks
@@ -122,10 +113,9 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-  auto it = views::iota(0).begin();
   // DONE: process all domains in parallel
   // NOTE: we cannot use "par_unseq" here because the algorithm is starvation free.
-  std::for_each_n(std::execution::par, it, domains,
+  std::for_each_n(std::execution::par, std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });

labs/lab3_tree/solutions/exercise0_gpu.cpp renamed to labs/lab3_tree/solutions/exercise1_gpu.cpp

Lines changed: 6 additions & 25 deletions
@@ -34,36 +34,22 @@
 #include <thread>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#elif defined(__NVCOMPILER)
-// Workaround: cuda::atomic requires C++17 so no ranges; fixed in next release.
-#include <thrust/iterator/counting_iterator.h>
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
-
-// DONE: need to add some headers
-#include <atomic>
-
+// DONE: add C++ standard library includes as necessary
+// NOTE: We include std::atomic except when -stdpar=gpu, in which case we include cuda::atomic
 #if defined(_NVHPC_STDPAR_GPU)
 #include <cuda/atomic>
-#include <cuda/std/atomic>
 template <typename T> using atomic = cuda::atomic<T, cuda::thread_scope_device>;
 constexpr auto memory_order_relaxed = cuda::memory_order_relaxed;
 constexpr auto memory_order_acquire = cuda::memory_order_acquire;
 constexpr auto memory_order_release = cuda::memory_order_release;
-#else
+#else // _NVHPC_STDPAR_GPU
 #include <atomic>
 template <typename T> using atomic = std::atomic<T>;
 constexpr auto memory_order_relaxed = std::memory_order_relaxed;
 constexpr auto memory_order_acquire = std::memory_order_acquire;
 constexpr auto memory_order_release = std::memory_order_release;
-#endif
+#endif // _NVHPC_STDPAR_GPU
 
 /// Builds a trie in parallel by splitting the input into chunks
 void do_trie(std::vector<char> const &input, int domains);
@@ -140,14 +126,9 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-#if defined(__NVCOMPILER)
-  auto it = thrust::counting_iterator<int>(0);
-#else
-  auto it = views::iota(0).begin();
-#endif
   // DONE: process all domains in parallel
   // NOTE: we cannot use "par_unseq" here because the algorithm is starvation free.
-  std::for_each_n(std::execution::par, it, domains,
+  std::for_each_n(std::execution::par, std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });
@@ -224,4 +205,4 @@ void make_trie(trie &root, atomic<trie *> &bump, const char *begin, const char *
     // And we traverse to it
     n = n->children[index].ptr.load(memory_order_relaxed);
   }
-}
+}
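To make the role of the atomic alias above concrete, here is a small illustrative sketch (not taken from the lab): the same source uses std::atomic when built for the CPU and cuda::atomic with device scope when built with nvc++ -stdpar=gpu. The counter, the heap allocation, and the build commands are assumptions for illustration only.

// Illustrative sketch only: the alias selects std::atomic on CPU builds and
// cuda::atomic<T, cuda::thread_scope_device> under nvc++ -stdpar=gpu.
// Assumed builds: g++ -std=c++20 sketch.cpp -ltbb
//                 nvc++ -std=c++20 -stdpar=gpu sketch.cpp
#if defined(_NVHPC_STDPAR_GPU)
#include <cuda/atomic>
template <typename T> using atomic = cuda::atomic<T, cuda::thread_scope_device>;
constexpr auto memory_order_relaxed = cuda::memory_order_relaxed;
#else
#include <atomic>
template <typename T> using atomic = std::atomic<T>;
constexpr auto memory_order_relaxed = std::memory_order_relaxed;
#endif

#include <algorithm>
#include <cstdio>
#include <execution>
#include <ranges>

int main() {
  // Heap allocation: with -stdpar=gpu, dynamic allocations come from CUDA
  // managed memory, so the atomic is reachable from device code; a stack
  // variable would not be.
  auto *counter = new atomic<int>(0);

  std::for_each_n(std::execution::par, std::views::iota(0).begin(), 1000,
                  [counter](int) {
                    // fetch_add has the same interface in both libraries.
                    counter->fetch_add(1, memory_order_relaxed);
                  });

  std::printf("counter = %d\n", counter->load(memory_order_relaxed));
  delete counter;
}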

labs/lab3_tree/starting_point.cpp

Lines changed: 1 addition & 10 deletions
@@ -33,15 +33,7 @@
 #include <string>
 #include <utility>
 #include <vector>
-
-#if defined(__clang__)
-// clang does not support libstdc++ ranges
-#include <range/v3/all.hpp>
-namespace views = ranges::views;
-#else
 #include <ranges>
-namespace views = std::views;
-#endif
 
 /// Builds a trie in parallel by splitting the input into chunks
 void do_trie(std::vector<char> const &input, int domains);
@@ -108,8 +100,7 @@ void do_trie(std::vector<char> const &input, int domains) {
   using clk_t = std::chrono::steady_clock;
   auto const begin = clk_t::now();
 
-  auto it = views::iota(0).begin();
-  std::for_each_n(it, domains,
+  std::for_each_n(std::views::iota(0).begin(), domains,
     [t, b, domains, input = input.data(), size = input.size()](auto domain) {
       make_trie(*t, *b, input, input + size, domain, domains);
     });
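The files above all describe do_trie as building the trie "by splitting the input into chunks", with each domain index handling one chunk. The even split below is a hypothetical illustration of that idea, not the lab's actual make_trie logic; the helper name chunk_bounds is invented here, and the input size is the one reported by the notebook.

// Hypothetical illustration only: how an input of `size` bytes could be split
// evenly into `domains` chunks, with chunk `domain` covering [first, last).
// The lab's make_trie computes its own chunk internally; this is not its code.
#include <cstddef>
#include <cstdio>
#include <utility>

constexpr std::pair<std::size_t, std::size_t>
chunk_bounds(std::size_t size, int domain, int domains) {
  std::size_t first = size * static_cast<std::size_t>(domain) / domains;
  std::size_t last = size * static_cast<std::size_t>(domain + 1) / domains;
  return {first, last};
}

int main() {
  // 11451743 is the input size the notebook reports for the downloaded books.
  auto [first, last] = chunk_bounds(11451743, /*domain=*/3, /*domains=*/16);
  std::printf("domain 3 of 16 covers bytes [%zu, %zu)\n", first, last);
}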

labs/lab3_tree/tree.ipynb

Lines changed: 107 additions & 86 deletions
@@ -21,38 +21,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "+ curl -O -# https://www.gutenberg.org/files/2600/2600-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/8800/8800.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/84/84-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/2701/2701-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/35/35-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/1342/1342-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/3825/3825-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/996/996-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/55/55-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/6130/6130-0.txt\n",
-      "######################################################################### 100.0%\n",
-      "+ curl -O -# https://www.gutenberg.org/files/1727/1727-0.txt\n",
-      "######################################################################### 100.0%\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "!./books.sh"
    ]
@@ -68,20 +39,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Input size 11451743 chars.\n",
-      "Assembled 99632 nodes on 1 domains in 267ms.\n"
-     ]
-    }
-   ],
-   "source": [
-    "!g++ -std=c++20 -o tree starting_point.cpp -ltbb\n",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -o tree starting_point.cpp -ltbb\n",
     "!./tree"
    ]
   },
@@ -95,7 +57,44 @@
    "\n",
    "## Exercise 1: process the input in parallel\n",
    "\n",
-    "The goal of this exercise is to process the input in parallel using multiple domains."
+    "The goal of this exercise is to process the input in parallel using multiple domains.\n",
+    "\n",
+    "A template for the solution is provided in [exercise1.cpp]. The `TODO`s indicate the parts of the template that must be completed.\n",
+    "\n",
+    "[exercise1.cpp]: ./exercise1.cpp\n",
+    "\n",
+    "The example compiles and runs serially as provided.\n",
+    "Once you parallelize it, the following blocks should compile and run correctly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -o tree exercise1.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!clang++ -std=c++20 -Ofast -march=native -o tree exercise1.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvc++ -std=c++20 -stdpar=multicore -O4 -fast -march=native -Mllvm-fast -o tree exercise1.cpp\n",
+    "!./tree"
    ]
   },
   {
@@ -106,53 +105,75 @@
    "\n",
    "The solutions for each example are available in the `solutions/` sub-directory.\n",
    "\n",
-    "The following compiles and runs the solutions for Exercise 0 using different compilers."
+    "The following compiles and runs the solutions for Exercise 1 using different compilers."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Input size 11451743 chars.\n",
-      "Assembled 99632 nodes on 1 domains in 232ms.\n",
-      "Assembled 99632 nodes on 16 domains in 201ms.\n",
-      "Assembled 99632 nodes on 100000 domains in 166ms.\n"
-     ]
-    }
-   ],
-   "source": [
-    "!g++ -std=c++20 -Ofast -DNDEBUG -o tree solutions/exercise0.cpp\n",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1.cpp -ltbb\n",
    "!./tree"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "nvvmCompileProgram error 9: NVVM_ERROR_COMPILATION.\n",
-      "Error: /tmp/pgacc3rBLuXtV9Je.gpu (1281, 38): parse atomicrmw value and pointer type do not match\n",
-      "NVC++-F-0155-Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code (solutions/exercise0.cpp: 1)\n",
-      "NVC++/x86-64 Linux 22.5-0: compilation aborted\n",
-      "Input size 11451743 chars.\n",
-      "Assembled 99632 nodes on 1 domains in 6244ms.\n",
-      "Assembled 99632 nodes on 16 domains in 726ms.\n",
-      "Assembled 99632 nodes on 100000 domains in 22ms.\n"
-     ]
-    }
-   ],
-   "source": [
-    "# A GPU version using cuda::std::atomic is available:\n",
-    "!nvc++ -std=c++20 -stdpar=gpu -gpu=cc80 -fast -DNDEBUG -o tree solutions/exercise0.cpp\n",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!clang++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvc++ -std=c++20 -stdpar=multicore -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o tree solutions/exercise1.cpp\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Currently, not all `std::atomic` operations are supported on GPUs.\n",
+    "The CUDA Toolkit is included with the HPC SDK and includes [libcudacxx](https://github.com/NVIDIA/libcudacxx), the CUDA C++ standard library.\n",
+    "This library provides the `cuda::atomic` and similar types in the `#include <cuda/atomic>` header and those can be used on GPUs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!g++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1_gpu.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!clang++ -std=c++20 -Ofast -march=native -DNDEBUG -o tree solutions/exercise1_gpu.cpp -ltbb\n",
+    "!./tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvc++ -std=c++20 -stdpar=gpu -O4 -fast -march=native -Mllvm-fast -DNDEBUG -o tree solutions/exercise1_gpu.cpp\n",
     "!./tree"
    ]
   }
@@ -173,7 +194,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.6"
   }
  },
 "nbformat": 4,

0 commit comments