Optimizing GEMM tutorial approximately complete.
devinamatthews committed Jul 20, 2017
Commit 6fbab63 (initial commit, no parents)
Showing 15 changed files with 1,669 additions and 0 deletions.
28 changes: 28 additions & 0 deletions optimizing_gemm/LICENSE
@@ -0,0 +1,28 @@
Copyright (c) 2017, Devin Matthews
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of Devin Matthews nor the names of any
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 changes: 29 additions & 0 deletions optimizing_gemm/Makefile
@@ -0,0 +1,29 @@
SHELL=/bin/bash

CXX=g++
CXXFLAGS=-O3 -march=native -fopenmp
LDFLAGS=-fopenmp -lopenblas
CXXFLAGS+=-I/Users/dmatthews/src

all: run

%.o: %.cxx
$(CXX) -c -o $@ $^ $(CXXFLAGS)

blas.x: driver.o blas_dgemm.o
$(CXX) -o $@ $^ $(LDFLAGS)

driver.x: driver.o my_dgemm_$(STEP).o
$(CXX) -o $@ $^ $(LDFLAGS)

run: blas.x driver.x
@echo "Running BLAS DGEMM"
@./blas.x | tee out_blas
@echo "Running DGEMM from Step $(STEP)"
@./driver.x | tee out_$(STEP)
@if [ $(STEP) = 0 ]; then \
./plot.sh $(STEP); \
else \
./plot.sh $$((STEP-1)) $(STEP); \
fi
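
# Typical usage (assuming GNU make): "make STEP=1" builds blas.x and driver.x
# (the latter from my_dgemm_1.cxx), runs both, saves their output to out_blas
# and out_1, and then calls plot.sh with the previous and current step numbers,
# presumably to plot the two steps against each other. "make STEP=0" plots
# step 0 alone.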

15 changes: 15 additions & 0 deletions optimizing_gemm/blas_dgemm.cxx
@@ -0,0 +1,15 @@
#include "common.hpp"

#include "lawrap/blas.h"
using namespace LAWrap;

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
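    /*
     * Assumption: LAWrap's gemm forwards to a column-major (Fortran-style)
     * BLAS dgemm. Since our matrices are row-major, we pass B and A in
     * swapped order with swapped dimensions, which computes
     * C^T := B^T*A^T + C^T in column-major terms -- exactly C += A*B on
     * the row-major data.
     */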
gemm('N', 'N', n, m, k,
1.0, B.data(), n,
A.data(), k,
1.0, C.data(), n);
}
19 changes: 19 additions & 0 deletions optimizing_gemm/common.hpp
@@ -0,0 +1,19 @@
#ifndef COMMON_HPP_
#define COMMON_HPP_

#include <cstdio>
#include <limits>

#include <omp.h>

#define EIGEN_NO_DEBUG
#include <Eigen/Dense>

using matrix = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
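
/*
 * Note that RowMajor storage means each row of a matrix is contiguous in
 * memory; the cache and loop-ordering discussion in the later steps assumes
 * this layout.
 */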

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C);

#endif
55 changes: 55 additions & 0 deletions optimizing_gemm/driver.cxx
@@ -0,0 +1,55 @@
#include "common.hpp"

template <typename Experiment>
double benchmark(Experiment&& exp, int num_repeat=1)
{
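    // Keep the fastest of num_repeat runs: the minimum is less sensitive to
    // one-off noise (cold caches, OS jitter) than the mean.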
double min_time = std::numeric_limits<double>::max();

for (int i = 0;i < num_repeat;i++)
{
double t0 = omp_get_wtime();
exp();
double t1 = omp_get_wtime();
min_time = std::min(min_time, t1-t0);
}

return min_time;
}

int main(int argc, char** argv)
{
for (int n = 24;n <= 1008;n += 24)
{
matrix A(n,n), B(n,n), C(n,n);

// time DGEMM on matrices of zeros (doesn't affect speed)

double elapsed = benchmark([&] { my_dgemm(n, n, n, A, B, C); }, 5);
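        // 2*n^3 flops: each of the n^2 elements of C needs n multiplies and n adds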
double flops = 2.0*n*n*n;
double gflops = flops/elapsed/1e9;

printf("%d %f\n", n, gflops);
fflush(stdout);

// check accuracy for random matrices

A.setRandom();
B.setRandom();
C.setRandom();

matrix ABplusC = C;
my_dgemm(n, n, n, A, B, ABplusC);

ABplusC -= (A*B+C);
double error = ABplusC.norm();
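        // loose roundoff tolerance that grows with machine epsilon and the problem size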
double error_limit = 2*std::numeric_limits<double>::epsilon()*n*n;

if (error > error_limit)
{
fprintf(stderr, "error limit exceeded for n=%d: %g > %g\n",
n, error, error_limit);
}
}

return 0;
}
28 changes: 28 additions & 0 deletions optimizing_gemm/my_dgemm_0.cxx
@@ -0,0 +1,28 @@
#include "common.hpp"

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
/*
* Step 0:
*
* Simple triple-loop matrix-matrix product.
*
* This ordering of the loops is called the "dot-product" algorithm.
*/
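    /*
     * With these row-major matrices, the innermost p loop reads A(i,p)
     * contiguously along row i, but B(p,j) walks down a column with stride n,
     * so this ordering wastes much of each cache line loaded from B.
     */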
for (int i = 0;i < m;i++)
{
for (int j = 0;j < n;j++)
{
/*
* ...because this loop is a dot product (duh).
*/
for (int p = 0;p < k;p++)
{
C(i,j) += A(i,p) * B(p,j);
}
}
}
}
44 changes: 44 additions & 0 deletions optimizing_gemm/my_dgemm_1.cxx
@@ -0,0 +1,44 @@
#include "common.hpp"

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
/*
* Step 1:
*
* Variant triple-loop matrix-matrix product.
*
 * This ordering of the loops is called the "outer-product" algorithm.
*/
for (int p = 0;p < k;p++)
{
/*
* ...because these two loops perform an outer product (rank-1 update).
*/
for (int j = 0;j < n;j++)
{
/*
* However, we've put the "i" loop innermost. When the data doesn't
* fit in cache, we waste bandwidth by moving more data than we
* need to.
*
* This is because elements of A and C in the m dimension aren't
* contiguous, and so don't fill up cache lines nicely.
*
* Note that different loop orderings can have a large effect on
* performance, and may hurt or help depending on the problem size.
* Generally, you should prefer loop orderings that have contiguous
* access in the innermost loop.
*
* Exercise: what is the maximum reduction in bandwidth when we
* reorder the loops to ensure contiguous access?
*/
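        /*
         * A sketch (not part of this step) of a contiguous-access reordering
         * for these row-major matrices: with j innermost, both B(p,j) and
         * C(i,j) walk along rows, so full cache lines are used.
         *
         *     for (int p = 0;p < k;p++)
         *         for (int i = 0;i < m;i++)
         *             for (int j = 0;j < n;j++)
         *                 C(i,j) += A(i,p) * B(p,j);
         */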
for (int i = 0;i < m;i++)
{
C(i,j) += A(i,p) * B(p,j);
}
}
}
}
124 changes: 124 additions & 0 deletions optimizing_gemm/my_dgemm_2.cxx
@@ -0,0 +1,124 @@
#include "common.hpp"

#define M_UNROLL 6
#define N_UNROLL 8

/*
 * Compute C += A*B for subblocks of A, B, and C
*/
template <typename MatrixA, typename MatrixB, typename MatrixC>
void my_dgemm_kernel(int k, const MatrixA& A, const MatrixB& B, MatrixC& C)
{
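    /*
     * This kernel updates a fixed M_UNROLL x N_UNROLL (6x8) block of C:
     * each iteration of the p loop accumulates the rank-1 update of the
     * column A(:,p) with the row B(p,:) into that block.
     */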
for (int p = 0;p < k;p++)
{
C(0,0) += A(0,p) * B(p,0); C(0,1) += A(0,p) * B(p,1);
C(0,2) += A(0,p) * B(p,2); C(0,3) += A(0,p) * B(p,3);
C(0,4) += A(0,p) * B(p,4); C(0,5) += A(0,p) * B(p,5);
C(0,6) += A(0,p) * B(p,6); C(0,7) += A(0,p) * B(p,7);

C(1,0) += A(1,p) * B(p,0); C(1,1) += A(1,p) * B(p,1);
C(1,2) += A(1,p) * B(p,2); C(1,3) += A(1,p) * B(p,3);
C(1,4) += A(1,p) * B(p,4); C(1,5) += A(1,p) * B(p,5);
C(1,6) += A(1,p) * B(p,6); C(1,7) += A(1,p) * B(p,7);

C(2,0) += A(2,p) * B(p,0); C(2,1) += A(2,p) * B(p,1);
C(2,2) += A(2,p) * B(p,2); C(2,3) += A(2,p) * B(p,3);
C(2,4) += A(2,p) * B(p,4); C(2,5) += A(2,p) * B(p,5);
C(2,6) += A(2,p) * B(p,6); C(2,7) += A(2,p) * B(p,7);

C(3,0) += A(3,p) * B(p,0); C(3,1) += A(3,p) * B(p,1);
C(3,2) += A(3,p) * B(p,2); C(3,3) += A(3,p) * B(p,3);
C(3,4) += A(3,p) * B(p,4); C(3,5) += A(3,p) * B(p,5);
C(3,6) += A(3,p) * B(p,6); C(3,7) += A(3,p) * B(p,7);

C(4,0) += A(4,p) * B(p,0); C(4,1) += A(4,p) * B(p,1);
C(4,2) += A(4,p) * B(p,2); C(4,3) += A(4,p) * B(p,3);
C(4,4) += A(4,p) * B(p,4); C(4,5) += A(4,p) * B(p,5);
C(4,6) += A(4,p) * B(p,6); C(4,7) += A(4,p) * B(p,7);

C(5,0) += A(5,p) * B(p,0); C(5,1) += A(5,p) * B(p,1);
C(5,2) += A(5,p) * B(p,2); C(5,3) += A(5,p) * B(p,3);
C(5,4) += A(5,p) * B(p,4); C(5,5) += A(5,p) * B(p,5);
C(5,6) += A(5,p) * B(p,6); C(5,7) += A(5,p) * B(p,7);
}
}

/*
 * Compute C += A*B
*/
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C)
{
/*
* Step 2:
*
* The first real optimization is loop unrolling.
*
* We apply this optimization in two steps:
*
* First: make a copy of each loop to unroll, where the inner loop has
* a fixed count N, and the outer loop increments in steps of N.
for (int p = 0;p < k;p++)
{
for (int j = 0;j < n;j++)
{
for (int i = 0;i < m;i++)
{
//loop body
}
}
}
* becomes
for (int p = 0;p < k;p++)
{
for (int j = 0;j < n;j += N_UNROLL)
{
for (int i = 0;i < m;i += M_UNROLL)
{
for (int j = 0;j < N_UNROLL;j++)
{
for (int i = 0;i < M_UNROLL;i++)
{
//loop body
}
}
}
}
}
* Second: the inner loops with a fixed count are fully expanded. In this
 * example, we've also changed the order of the loops slightly and
 * moved the remaining inner loop (over p) into a kernel.
for (int p = 0;p < k;p++)
{
for (int j = 0;j < n;j += N_UNROLL)
{
for (int i = 0;i < m;i += M_UNROLL)
{
//loop body M_UNROLL*N_UNROLL times
}
}
}
 * Sometimes the compiler can do this optimization automatically, or
* with the help of hints such as #pragma unroll (icc/icpc) or
* -funroll-loops (gcc/g++).
*
* Exercise: what happens when e.g. m%M_UNROLL != 0? How should the code
* be amended to address this case?
*/
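    /*
     * (In this tutorial the driver only uses sizes that are multiples of 24,
     *  so m and n are always divisible by M_UNROLL = 6 and N_UNROLL = 8 and
     *  no fringe/cleanup loops are needed here.)
     */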
for (int j = 0;j < n;j += N_UNROLL)
{
for (int i = 0;i < m;i += M_UNROLL)
{
auto A_sub = A.block(i, 0, M_UNROLL, k);
auto B_sub = B.block(0, j, k, N_UNROLL);
auto C_sub = C.block(i, j, M_UNROLL, N_UNROLL);

my_dgemm_kernel(k, A_sub, B_sub, C_sub);
}
}
}