-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimizing GEMM tutorial approximately complete.
- Loading branch information
0 parents
commit 6fbab63
Showing
15 changed files
with
1,669 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
Copyright (c) 2017, Devin Matthews | ||
All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are met: | ||
|
||
* Redistributions of source code must retain the above copyright notice, this | ||
list of conditions and the following disclaimer. | ||
|
||
* Redistributions in binary form must reproduce the above copyright notice, | ||
this list of conditions and the following disclaimer in the documentation | ||
and/or other materials provided with the distribution. | ||
|
||
* Neither the name of Devin Matthews nor the names of any | ||
contributors may be used to endorse or promote products derived from | ||
this software without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
SHELL=/bin/bash | ||
|
||
CXX=g++ | ||
CXXFLAGS=-O3 -march=native -fopenmp | ||
LDFLAGS=-fopenmp -lopenblas | ||
CXXFLAGS+=-I/Users/dmatthews/src | ||
|
||
all: run | ||
|
||
%.o: %.cxx | ||
$(CXX) -c -o $@ $^ $(CXXFLAGS) | ||
|
||
blas.x: driver.o blas_dgemm.o | ||
$(CXX) -o $@ $^ $(LDFLAGS) | ||
|
||
driver.x: driver.o my_dgemm_$(STEP).o | ||
$(CXX) -o $@ $^ $(LDFLAGS) | ||
|
||
run: blas.x driver.x | ||
@echo "Running BLAS DGEMM" | ||
@./blas.x | tee out_blas | ||
@echo "Running DGEMM from Step $(STEP)" | ||
@./driver.x | tee out_$(STEP) | ||
@if [ $(STEP) = 0 ]; then \ | ||
./plot.sh $(STEP); \ | ||
else \ | ||
./plot.sh $$((STEP-1)) $(STEP); \ | ||
fi | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#include "common.hpp" | ||
|
||
#include "lawrap/blas.h" | ||
using namespace LAWrap; | ||
|
||
/* | ||
* Compute C = A*B | ||
*/ | ||
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C) | ||
{ | ||
gemm('N', 'N', n, m, k, | ||
1.0, B.data(), n, | ||
A.data(), k, | ||
1.0, C.data(), n); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#ifndef COMMON_HPP_ | ||
#define COMMON_HPP_ | ||
|
||
#include <cstdio> | ||
#include <limits> | ||
|
||
#include <omp.h> | ||
|
||
#define EIGEN_NO_DEBUG | ||
#include <Eigen/Dense> | ||
|
||
using matrix = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; | ||
|
||
/* | ||
* Compute C = A*B | ||
*/ | ||
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C); | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#include "common.hpp" | ||
|
||
template <typename Experiment> | ||
double benchmark(Experiment&& exp, int num_repeat=1) | ||
{ | ||
double min_time = std::numeric_limits<double>::max(); | ||
|
||
for (int i = 0;i < num_repeat;i++) | ||
{ | ||
double t0 = omp_get_wtime(); | ||
exp(); | ||
double t1 = omp_get_wtime(); | ||
min_time = std::min(min_time, t1-t0); | ||
} | ||
|
||
return min_time; | ||
} | ||
|
||
int main(int argc, char** argv) | ||
{ | ||
for (int n = 24;n <= 1008;n += 24) | ||
{ | ||
matrix A(n,n), B(n,n), C(n,n); | ||
|
||
// time DGEMM on matrices of zeros (doesn't affect speed) | ||
|
||
double elapsed = benchmark([&] { my_dgemm(n, n, n, A, B, C); }, 5); | ||
double flops = 2.0*n*n*n; | ||
double gflops = flops/elapsed/1e9; | ||
|
||
printf("%d %f\n", n, gflops); | ||
fflush(stdout); | ||
|
||
// check accuracy for random matrices | ||
|
||
A.setRandom(); | ||
B.setRandom(); | ||
C.setRandom(); | ||
|
||
matrix ABplusC = C; | ||
my_dgemm(n, n, n, A, B, ABplusC); | ||
|
||
ABplusC -= (A*B+C); | ||
double error = ABplusC.norm(); | ||
double error_limit = 2*std::numeric_limits<double>::epsilon()*n*n; | ||
|
||
if (error > error_limit) | ||
{ | ||
fprintf(stderr, "error limit exceeded for n=%d: %g > %g\n", | ||
n, error, error_limit); | ||
} | ||
} | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#include "common.hpp" | ||
|
||
/* | ||
* Compute C = A*B | ||
*/ | ||
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C) | ||
{ | ||
/* | ||
* Step 0: | ||
* | ||
* Simple triple-loop matrix-matrix product. | ||
* | ||
* This ordering of the loops is called the "dot-product" algorithm. | ||
*/ | ||
for (int i = 0;i < m;i++) | ||
{ | ||
for (int j = 0;j < n;j++) | ||
{ | ||
/* | ||
* ...because this loop is a dot product (duh). | ||
*/ | ||
for (int p = 0;p < k;p++) | ||
{ | ||
C(i,j) += A(i,p) * B(p,j); | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#include "common.hpp" | ||
|
||
/* | ||
* Compute C = A*B | ||
*/ | ||
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C) | ||
{ | ||
/* | ||
* Step 1: | ||
* | ||
* Variant triple-loop matrix-matrix product. | ||
* | ||
* This ordering of is the "outer-product" algorithm. | ||
*/ | ||
for (int p = 0;p < k;p++) | ||
{ | ||
/* | ||
* ...because these two loops perform an outer product (rank-1 update). | ||
*/ | ||
for (int j = 0;j < n;j++) | ||
{ | ||
/* | ||
* However, we've put the "i" loop innermost. When the data doesn't | ||
* fit in cache, we waste bandwidth by moving more data than we | ||
* need to. | ||
* | ||
* This is because elements of A and C in the m dimension aren't | ||
* contiguous, and so don't fill up cache lines nicely. | ||
* | ||
* Note that different loop orderings can have a large effect on | ||
* performance, and may hurt or help depending on the problem size. | ||
* Generally, you should prefer loop orderings that have contiguous | ||
* access in the innermost loop. | ||
* | ||
* Exercise: what is the maximum reduction in bandwidth when we | ||
* reorder the loops to ensure contiguous access? | ||
*/ | ||
for (int i = 0;i < m;i++) | ||
{ | ||
C(i,j) += A(i,p) * B(p,j); | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#include "common.hpp" | ||
|
||
#define M_UNROLL 6 | ||
#define N_UNROLL 8 | ||
|
||
/* | ||
* Compute C = A*B for some subblocks of A, B, and C | ||
*/ | ||
template <typename MatrixA, typename MatrixB, typename MatrixC> | ||
void my_dgemm_kernel(int k, const MatrixA& A, const MatrixB& B, MatrixC& C) | ||
{ | ||
for (int p = 0;p < k;p++) | ||
{ | ||
C(0,0) += A(0,p) * B(p,0); C(0,1) += A(0,p) * B(p,1); | ||
C(0,2) += A(0,p) * B(p,2); C(0,3) += A(0,p) * B(p,3); | ||
C(0,4) += A(0,p) * B(p,4); C(0,5) += A(0,p) * B(p,5); | ||
C(0,6) += A(0,p) * B(p,6); C(0,7) += A(0,p) * B(p,7); | ||
|
||
C(1,0) += A(1,p) * B(p,0); C(1,1) += A(1,p) * B(p,1); | ||
C(1,2) += A(1,p) * B(p,2); C(1,3) += A(1,p) * B(p,3); | ||
C(1,4) += A(1,p) * B(p,4); C(1,5) += A(1,p) * B(p,5); | ||
C(1,6) += A(1,p) * B(p,6); C(1,7) += A(1,p) * B(p,7); | ||
|
||
C(2,0) += A(2,p) * B(p,0); C(2,1) += A(2,p) * B(p,1); | ||
C(2,2) += A(2,p) * B(p,2); C(2,3) += A(2,p) * B(p,3); | ||
C(2,4) += A(2,p) * B(p,4); C(2,5) += A(2,p) * B(p,5); | ||
C(2,6) += A(2,p) * B(p,6); C(2,7) += A(2,p) * B(p,7); | ||
|
||
C(3,0) += A(3,p) * B(p,0); C(3,1) += A(3,p) * B(p,1); | ||
C(3,2) += A(3,p) * B(p,2); C(3,3) += A(3,p) * B(p,3); | ||
C(3,4) += A(3,p) * B(p,4); C(3,5) += A(3,p) * B(p,5); | ||
C(3,6) += A(3,p) * B(p,6); C(3,7) += A(3,p) * B(p,7); | ||
|
||
C(4,0) += A(4,p) * B(p,0); C(4,1) += A(4,p) * B(p,1); | ||
C(4,2) += A(4,p) * B(p,2); C(4,3) += A(4,p) * B(p,3); | ||
C(4,4) += A(4,p) * B(p,4); C(4,5) += A(4,p) * B(p,5); | ||
C(4,6) += A(4,p) * B(p,6); C(4,7) += A(4,p) * B(p,7); | ||
|
||
C(5,0) += A(5,p) * B(p,0); C(5,1) += A(5,p) * B(p,1); | ||
C(5,2) += A(5,p) * B(p,2); C(5,3) += A(5,p) * B(p,3); | ||
C(5,4) += A(5,p) * B(p,4); C(5,5) += A(5,p) * B(p,5); | ||
C(5,6) += A(5,p) * B(p,6); C(5,7) += A(5,p) * B(p,7); | ||
} | ||
} | ||
|
||
/* | ||
* Compute C = A*B | ||
*/ | ||
void my_dgemm(int m, int n, int k, const matrix& A, const matrix& B, matrix& C) | ||
{ | ||
/* | ||
* Step 2: | ||
* | ||
* The first real optimization is loop unrolling. | ||
* | ||
* We apply this optimization in two steps: | ||
* | ||
* First: make a copy of each loop to unroll, where the inner loop has | ||
* a fixed count N, and the outer loop increments in steps of N. | ||
for (int p = 0;p < k;p++) | ||
{ | ||
for (int j = 0;j < n;j++) | ||
{ | ||
for (int i = 0;i < m;i++) | ||
{ | ||
//loop body | ||
} | ||
} | ||
} | ||
* becomes | ||
for (int p = 0;p < k;p++) | ||
{ | ||
for (int j = 0;j < n;j += N_UNROLL) | ||
{ | ||
for (int i = 0;i < m;i += M_UNROLL) | ||
{ | ||
for (int j = 0;j < N_UNROLL;j++) | ||
{ | ||
for (int i = 0;i < M_UNROLL;i++) | ||
{ | ||
//loop body | ||
} | ||
} | ||
} | ||
} | ||
} | ||
* Second: the inner loops with a fixed count are fully expanded. In this | ||
* example, we've also changed the order of the loops slight and | ||
* moved the inner remaining loop (over p) into a kernel. | ||
for (int p = 0;p < k;p++) | ||
{ | ||
for (int j = 0;j < n;j += N_UNROLL) | ||
{ | ||
for (int i = 0;i < m;i += M_UNROLL) | ||
{ | ||
//loop body M_UNROLL*N_UNROLL times | ||
} | ||
} | ||
} | ||
* Some times the compiler can do this optimization automatically, or | ||
* with the help of hints such as #pragma unroll (icc/icpc) or | ||
* -funroll-loops (gcc/g++). | ||
* | ||
* Exercise: what happens when e.g. m%M_UNROLL != 0? How should the code | ||
* be amended to address this case? | ||
*/ | ||
for (int j = 0;j < n;j += N_UNROLL) | ||
{ | ||
for (int i = 0;i < m;i += M_UNROLL) | ||
{ | ||
auto A_sub = A.block(i, 0, M_UNROLL, k); | ||
auto B_sub = B.block(0, j, k, N_UNROLL); | ||
auto C_sub = C.block(i, j, M_UNROLL, N_UNROLL); | ||
|
||
my_dgemm_kernel(k, A_sub, B_sub, C_sub); | ||
} | ||
} | ||
} |
Oops, something went wrong.