Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
cwpearson committed Apr 8, 2020
1 parent 882b30e commit 5fb1eaf
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 9 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ Then open the report in the NVIDIA Nsight Compute GUI:

File > Open File > `report.nsight-cuprof-report`


**Only certain kernels:**

The `--kernel-id` flag takes a string like `context-id:stream-id:[name-operator:]kernel-name:invocation-nr`.
Expand Down Expand Up @@ -121,4 +122,6 @@ See [DEVELOPING.md](DEVELOPING.md)
* [Nvidia Nsight Systems Docs](https://docs.nvidia.com/nsight-systems/)
* [Nvidia Nsight Compute Docs](https://docs.nvidia.com/nsight-compute/)
* [Using Nvidia Nsight Systems in Containers and the Cloud](https://devblogs.nvidia.com/nvidia-nsight-systems-containers-cloud/)
* [Using Nsight Compute to Inspect your Kernels](https://devblogs.nvidia.com/using-nsight-compute-to-inspect-your-kernels/)
* [Using Nsight Compute to Inspect your Kernels](https://devblogs.nvidia.com/using-nsight-compute-to-inspect-your-kernels/)
* `docs/GEMM-joint-tiling.ppt`: Joint-tiling slide deck from ECE 508 Spring 2017
* [Nsight Graphics Stall Reasons](https://docs.nvidia.com/drive/drive_os_5.1.12.0L/nsight-graphics/activities/#shaderprofiler_stallreasons)
4 changes: 2 additions & 2 deletions sgemm/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

/* NOTE: A and C are column major, B is row major
*/
__global__ void mygemm(float *c, //<! [out] and MxN matrix
__global__ void mygemm(float * __restrict__ c, //<! [out] an MxN matrix
const float *a, //<! [in] an MxK matrix
const float *b, //<! [in] a KxN matrix
const int M, const int N, const int K) {
Expand Down Expand Up @@ -87,7 +87,7 @@ int main(int argc, char **argv) {
CUDA_RUNTIME(cudaEventCreate(&stop));

// GPU kernel launch parameters
dim3 dimBlock(32, 8);
dim3 dimBlock(32, 32);
dim3 dimGrid;
dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (m + dimBlock.y - 1) / dimBlock.y;
Expand Down
8 changes: 4 additions & 4 deletions sgemm/regtiled_coarsened.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

#include "common.hpp"

#define TILE_SZ_A 128
#define TILE_SZ_A 64
#define TILE_SZ_B 16
#define TILE_SZ_RATIO (TILE_SZ_A / TILE_SZ_B)

/* NOTE: A and C are column major, B is row major
*/
__global__ void mygemm(float *c, //<! [out] and MxN matrix
__global__ void mygemm(float * __restrict__ c, //<! [out] an MxN matrix
const float *a, //<! [in] an MxK matrix
const float *b, //<! [in] a KxN matrix
const int M, const int N, const int K) {
Expand All @@ -24,7 +24,7 @@ __global__ void mygemm(float *c, //<! [out] and MxN matrix
__shared__ float B_s[TILE_SZ_RATIO][TILE_SZ_B];

// Index variables
const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int col = blockIdx.y * TILE_SZ_B;

// Privatization of output variables
Expand Down Expand Up @@ -127,7 +127,7 @@ int main(int argc, char **argv) {
CUDA_RUNTIME(cudaEventCreate(&stop));

// GPU kernel launch parameters
dim3 dimGrid((m - 1) / TILE_SZ_A + 1, (n - 1) / TILE_SZ_B + 1);
dim3 dimGrid((m + TILE_SZ_A - 1) / TILE_SZ_A, (n +TILE_SZ_B - 1) / TILE_SZ_B);
dim3 dimBlock(TILE_SZ_A, 1);

// total elapsed time
Expand Down
4 changes: 2 additions & 2 deletions sgemm/tiled.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

#include "common.hpp"

#define TILE_WIDTH 16
#define TILE_WIDTH 32

/* NOTE: A and C are column major, B is row major
*/
__global__ void mygemm(float *c, //<! [out] and MxN matrix
__global__ void mygemm(float * __restrict__ c, //<! [out] an MxN matrix
const float *a, //<! [in] an MxK matrix
const float *b, //<! [in] a KxN matrix
const int M, const int N, const int K) {
Expand Down

0 comments on commit 5fb1eaf

Please sign in to comment.