Optimizing GEMM tutorial approximately complete.
devinamatthews committed Jul 20, 2017
Commit 6fbab63 (initial commit, no parents)
Showing 15 changed files with 1,669 additions and 0 deletions.
28 changes: 28 additions & 0 deletions optimizing_gemm/LICENSE
@@ -0,0 +1,28 @@
Copyright (c) 2017, Devin Matthews
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of Devin Matthews nor the names of any
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 changes: 29 additions & 0 deletions optimizing_gemm/Makefile
@@ -0,0 +1,29 @@
SHELL=/bin/bash

CXX=g++
CXXFLAGS=-O3 -march=native -fopenmp
LDFLAGS=-fopenmp -lopenblas
CXXFLAGS+=-I/Users/dmatthews/src

all: run

%.o: %.cxx
$(CXX) -c -o $@ $^ $(CXXFLAGS)

blas.x: driver.o blas_dgemm.o
$(CXX) -o $@ $^ $(LDFLAGS)

driver.x: driver.o my_dgemm_$(STEP).o
$(CXX) -o $@ $^ $(LDFLAGS)

run: blas.x driver.x
@echo "Running BLAS DGEMM"
@./blas.x | tee out_blas
@echo "Running DGEMM from Step $(STEP)"
@./driver.x | tee out_$(STEP)
@if [ $(STEP) = 0 ]; then \
./plot.sh $(STEP); \
else \
./plot.sh $$((STEP-1)) $(STEP); \
fi
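
# Typical usage (assuming GNU make): "make STEP=1" builds blas.x and driver.x
# (the latter from my_dgemm_1.cxx), runs both, saves their output to out_blas
# and out_1, and then calls plot.sh with the previous and current step numbers,
# presumably to plot the two steps against each other. "make STEP=0" plots
# step 0 alone.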

15 changes: 15 additions & 0 deletions optimizing_gemm/blas_dgemm.cxx
@@ -0,0 +1,15 @@
#include "common.hpp"

#include "lawrap/blas.h"
using namespace LAWrap;

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
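    /*
     * Assumption: LAWrap's gemm forwards to a column-major (Fortran-style)
     * BLAS dgemm. Since our matrices are row-major, we pass B and A in
     * swapped order with swapped dimensions, which computes
     * C^T := B^T*A^T + C^T in column-major terms -- exactly C += A*B on
     * the row-major data.
     */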
gemm('N', 'N', n, m, k,
1.0, B.data(), n,
A.data(), k,
1.0, C.data(), n);
}
19 changes: 19 additions & 0 deletions optimizing_gemm/common.hpp
@@ -0,0 +1,19 @@
#ifndef COMMON_HPP_
#define COMMON_HPP_

#include <cstdio>
#include <limits>

#include <omp.h>

#define EIGEN_NO_DEBUG
#include <Eigen/Dense>

using matrix = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
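
/*
 * Note that RowMajor storage means each row of a matrix is contiguous in
 * memory; the cache and loop-ordering discussion in the later steps assumes
 * this layout.
 */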

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C);

#endif
55 changes: 55 additions & 0 deletions optimizing_gemm/driver.cxx
@@ -0,0 +1,55 @@
#include "common.hpp"

template <typename Experiment>
double benchmark(Experiment&& exp, int num_repeat=1)
{
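    // Keep the fastest of num_repeat runs: the minimum is less sensitive to
    // one-off noise (cold caches, OS jitter) than the mean.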
double min_time = std::numeric_limits<double>::max();

for (int i = 0;i < num_repeat;i++)
{
double t0 = omp_get_wtime();
exp();
double t1 = omp_get_wtime();
min_time = std::min(min_time, t1-t0);
}

return min_time;
}

int main(int argc, char** argv)
{
for (int n = 24;n <= 1008;n += 24)
{
matrix A(n,n), B(n,n), C(n,n);

// time DGEMM on matrices of zeros (doesn't affect speed)

double elapsed = benchmark([&] { my_dgemm(n, n, n, A, B, C); }, 5);
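        // 2*n^3 flops: each of the n^2 elements of C needs n multiplies and n adds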
double flops = 2.0*n*n*n;
double gflops = flops/elapsed/1e9;

printf("%d %f\n", n, gflops);
fflush(stdout);

// check accuracy for random matrices

A.setRandom();
B.setRandom();
C.setRandom();

matrix ABplusC = C;
my_dgemm(n, n, n, A, B, ABplusC);

ABplusC -= (A*B+C);
double error = ABplusC.norm();
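        // loose roundoff tolerance that grows with machine epsilon and the problem size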
double error_limit = 2*std::numeric_limits<double>::epsilon()*n*n;

if (error > error_limit)
{
fprintf(stderr, "error limit exceeded for n=%d: %g > %g\n",
n, error, error_limit);
}
}

return 0;
}
28 changes: 28 additions & 0 deletions optimizing_gemm/my_dgemm_0.cxx
@@ -0,0 +1,28 @@
#include "common.hpp"

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
/*
* Step 0:
*
* Simple triple-loop matrix-matrix product.
*
* This ordering of the loops is called the "dot-product" algorithm.
*/
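    /*
     * With these row-major matrices, the innermost p loop reads A(i,p)
     * contiguously along row i, but B(p,j) walks down a column with stride n,
     * so this ordering wastes much of each cache line loaded from B.
     */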
for (int i = 0;i < m;i++)
{
for (int j = 0;j < n;j++)
{
/*
* ...because this loop is a dot product (duh).
*/
for (int p = 0;p < k;p++)
{
C(i,j) += A(i,p) * B(p,j);
}
}
}
}
44 changes: 44 additions & 0 deletions optimizing_gemm/my_dgemm_1.cxx
@@ -0,0 +1,44 @@
#include "common.hpp"

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
/*
* Step 1:
*
* Variant triple-loop matrix-matrix product.
*
 * This ordering of the loops is called the "outer-product" algorithm.
*/
for (int p = 0;p < k;p++)
{
/*
* ...because these two loops perform an outer product (rank-1 update).
*/
for (int j = 0;j < n;j++)
{
/*
* However, we've put the "i" loop innermost. When the data doesn't
* fit in cache, we waste bandwidth by moving more data than we
* need to.
*
* This is because elements of A and C in the m dimension aren't
* contiguous, and so don't fill up cache lines nicely.
*
* Note that different loop orderings can have a large effect on
* performance, and may hurt or help depending on the problem size.
* Generally, you should prefer loop orderings that have contiguous
* access in the innermost loop.
*
* Exercise: what is the maximum reduction in bandwidth when we
* reorder the loops to ensure contiguous access?
*/
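        /*
         * A sketch (not part of this step) of a contiguous-access reordering
         * for these row-major matrices: with j innermost, both B(p,j) and
         * C(i,j) walk along rows, so full cache lines are used.
         *
         *     for (int p = 0;p < k;p++)
         *         for (int i = 0;i < m;i++)
         *             for (int j = 0;j < n;j++)
         *                 C(i,j) += A(i,p) * B(p,j);
         */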
for (int i = 0;i < m;i++)
{
C(i,j) += A(i,p) * B(p,j);
}
}
}
}
124 changes: 124 additions & 0 deletions optimizing_gemm/my_dgemm_2.cxx
@@ -0,0 +1,124 @@
#include "common.hpp"

#define M_UNROLL 6
#define N_UNROLL 8

/*
 * Compute C += A*B for subblocks of A, B, and C
*/
template <typename MatrixA, typename MatrixB, typename MatrixC>
void my_dgemm_kernel(int k, const MatrixA& A, const MatrixB& B, MatrixC& C)
{
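    /*
     * This kernel updates a fixed M_UNROLL x N_UNROLL (6x8) block of C:
     * each iteration of the p loop accumulates the rank-1 update of the
     * column A(:,p) with the row B(p,:) into that block.
     */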
for (int p = 0;p < k;p++)
{
C(0,0) += A(0,p) * B(p,0); C(0,1) += A(0,p) * B(p,1);
C(0,2) += A(0,p) * B(p,2); C(0,3) += A(0,p) * B(p,3);
C(0,4) += A(0,p) * B(p,4); C(0,5) += A(0,p) * B(p,5);
C(0,6) += A(0,p) * B(p,6); C(0,7) += A(0,p) * B(p,7);

C(1,0) += A(1,p) * B(p,0); C(1,1) += A(1,p) * B(p,1);
C(1,2) += A(1,p) * B(p,2); C(1,3) += A(1,p) * B(p,3);
C(1,4) += A(1,p) * B(p,4); C(1,5) += A(1,p) * B(p,5);
C(1,6) += A(1,p) * B(p,6); C(1,7) += A(1,p) * B(p,7);

C(2,0) += A(2,p) * B(p,0); C(2,1) += A(2,p) * B(p,1);
C(2,2) += A(2,p) * B(p,2); C(2,3) += A(2,p) * B(p,3);
C(2,4) += A(2,p) * B(p,4); C(2,5) += A(2,p) * B(p,5);
C(2,6) += A(2,p) * B(p,6); C(2,7) += A(2,p) * B(p,7);

C(3,0) += A(3,p) * B(p,0); C(3,1) += A(3,p) * B(p,1);
C(3,2) += A(3,p) * B(p,2); C(3,3) += A(3,p) * B(p,3);
C(3,4) += A(3,p) * B(p,4); C(3,5) += A(3,p) * B(p,5);
C(3,6) += A(3,p) * B(p,6); C(3,7) += A(3,p) * B(p,7);

C(4,0) += A(4,p) * B(p,0); C(4,1) += A(4,p) * B(p,1);
C(4,2) += A(4,p) * B(p,2); C(4,3) += A(4,p) * B(p,3);
C(4,4) += A(4,p) * B(p,4); C(4,5) += A(4,p) * B(p,5);
C(4,6) += A(4,p) * B(p,6); C(4,7) += A(4,p) * B(p,7);

C(5,0) += A(5,p) * B(p,0); C(5,1) += A(5,p) * B(p,1);
C(5,2) += A(5,p) * B(p,2); C(5,3) += A(5,p) * B(p,3);
C(5,4) += A(5,p) * B(p,4); C(5,5) += A(5,p) * B(p,5);
C(5,6) += A(5,p) * B(p,6); C(5,7) += A(5,p) * B(p,7);
}
}

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
/*
* Step 2:
*
* The first real optimization is loop unrolling.
*
* We apply this optimization in two steps:
*
* First: make a copy of each loop to unroll, where the inner loop has
* a fixed count N, and the outer loop increments in steps of N.
for (int p = 0;p < k;p++)
{
for (int j = 0;j < n;j++)
{
for (int i = 0;i < m;i++)
{
//loop body
}
}
}
* becomes
for (int p = 0;p < k;p++)
{
for (int j = 0;j < n;j += N_UNROLL)
{
for (int i = 0;i < m;i += M_UNROLL)
{
for (int j = 0;j < N_UNROLL;j++)
{
for (int i = 0;i < M_UNROLL;i++)
{
//loop body
}
}
}
}
}
* Second: the inner loops with a fixed count are fully expanded. In this
 * example, we've also changed the order of the loops slightly and
 * moved the remaining inner loop (over p) into a kernel.
for (int p = 0;p < k;p++)
{
for (int j = 0;j < n;j += N_UNROLL)
{
for (int i = 0;i < m;i += M_UNROLL)
{
//loop body M_UNROLL*N_UNROLL times
}
}
}
 * Sometimes the compiler can do this optimization automatically, or
* with the help of hints such as #pragma unroll (icc/icpc) or
* -funroll-loops (gcc/g++).
*
* Exercise: what happens when e.g. m%M_UNROLL != 0? How should the code
* be amended to address this case?
*/
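    /*
     * (In this tutorial the driver only uses sizes that are multiples of 24,
     *  so m and n are always divisible by M_UNROLL = 6 and N_UNROLL = 8 and
     *  no fringe/cleanup loops are needed here.)
     */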
for (int j = 0;j < n;j += N_UNROLL)
{
for (int i = 0;i < m;i += M_UNROLL)
{
auto A_sub = A.block(i, 0, M_UNROLL, k);
auto B_sub = B.block(0, j, k, N_UNROLL);
auto C_sub = C.block(i, j, M_UNROLL, N_UNROLL);

my_dgemm_kernel(k, A_sub, B_sub, C_sub);
}
}
}