forked from NervanaSystems/maxas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsgemm.cu
105 lines (76 loc) · 4.09 KB
/
sgemm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// Note this file isn't configured to automatically compile.
// Here's how:
// If you want to look at the ptx first:
// nvcc -arch sm_50 -m 32 -ptx sgemm.cu
// Manually compile your kernel to a cubin.
// You should only have to do this once, unless you change params or shared size or globals:
// nvcc -arch sm_50 -m 32 -cubin sgemm.cu
// If tweaking a kernel or writing a new one based on this shell code you would then do this:
// maxas.pl -e kernel.cubin kernel.sass
// I've already included a modified kernel (sgemm.sass) so the next step is..
// Splice the manually assembled code back into the cubin:
// maxas.pl -i sgemm.sass sgemm.cubin
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <cuda_texture_types.h>
#include <texture_fetch_functions.h>
typedef texture<float4, cudaTextureType1D, cudaReadModeElementType> floatTex;
floatTex texA(0, cudaFilterModePoint, cudaAddressModeBorder);
floatTex texB(0, cudaFilterModePoint, cudaAddressModeBorder);
// Use extern C so C++ doesn't mangle our kernel name
extern "C"
// This kernel requires 256x1x1 threads per block
__global__ void __launch_bounds__(256) sgemm_kernel_128(
float *C,
const int m, const int n, const int k,
const int lda, const int ldb, const int ldc,
float alpha, int *D)
{
// Declare any shared memory your kernel requires
// Or you could just pass the amount in as a param to cuLaunchKernel
__shared__ float4 share[1024];
int tid = threadIdx.x;
// If you use indirect texture references, they will be passed as params at the end of the param list
// So set that up here to make sure they're available in your kernel
floatTex tex = tid > 127 ? texB : texA;
// Make use of shared and your textures so it doesn't get optimized away
share[tid] = tex1Dfetch(tex, tid);
__syncthreads();
// output something so your setup isn't optimized away.
C[tid] = share[255-tid].x;
}
extern "C"
__global__ void __launch_bounds__(64) sgemm_kernel_64(
float *C,
const int m, const int n, const int k,
const int lda, const int ldb, const int ldc,
float alpha, int *D)
{
__shared__ float4 share[512];
int tid = threadIdx.x;
floatTex tex = tid > 127 ? texB : texA;
share[tid] = tex1Dfetch(tex, tid);
__syncthreads();
C[tid] = share[255-tid].x;
}
// A note about using the Cuda Runtime.
// If that's your preference over the driver API then here's what you'd do:
// In your project properties in the Cuda C/C++ panel:
// -Set the "Keep Processed Files" (-keep) option
// -Add a -v manually to the command line
// If compiling on command line just add -keep -v options to nvcc.
// Rebuild your solution and look in the log for these lines that follow the ptxas step:
// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
// You just need to manually run these 3 commands (or add them to a build script)
// after you've modified the cubin generated from the preceeding ptxas command.
// That will give you a new .cu.obj file which will automatically be linked in for you next time you
// build your project (or you could manually run the linker step as well).
// Having done that you can call your kernel normally using the <<< >>> syntax.
// Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
// With fatbin you can also keep non-maxwell optimized versions of your code.
// I just discovered this also works as a shortcut to the above:
// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu
// The cu kernel definitions above need to have empty bodies.
// And, the cu file must be compiled to a lib seperately before linking.