-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoffsetNstride.cuf
102 lines (87 loc) · 2.74 KB
/
offsetNstride.cuf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
!
! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
!
! NVIDIA CORPORATION and its licensors retain all intellectual property
! and proprietary rights in and to this software, related documentation
! and any modifications thereto.
!
!
! These example codes are a portion of the code samples from the companion
! website to the book "CUDA Fortran for Scientists and Engineers":
!
! http://store.elsevier.com/product.jsp?isbn=9780124169708
!
!> Device kernels used to measure how misaligned (offset) and strided
!> global-memory access patterns affect effective bandwidth.
!>
!> Neither kernel performs a bounds check: the caller must size the array
!> so that every index produced by the launch configuration and `s` is valid.
module kernels_m
  use precision_m
  implicit none
contains

  !> Add 1 to one array element per thread, with the thread's 1-based
  !> global index shifted by `s` elements.
  !>
  !> a : array updated in place (assumed size, no bounds information)
  !> s : element offset added to every thread's index (passed by value)
  attributes(global) subroutine offset(a, s)
    real(fp_kind) :: a(*)
    integer, value :: s
    integer :: i

    ! 1-based global thread index, displaced by s elements.
    i = blockDim%x*(blockIdx%x-1) + threadIdx%x + s
    a(i) = a(i) + 1
  end subroutine offset

  !> Add 1 to one array element per thread, with the thread's 1-based
  !> global index multiplied by `s`, so consecutive threads touch
  !> elements that are `s` apart.
  !>
  !> a : array updated in place (assumed size, no bounds information)
  !> s : stride, in elements, between the elements of consecutive threads
  attributes(global) subroutine stride(a, s)
    real(fp_kind) :: a(*)
    integer, value :: s
    integer :: i

    ! 1-based global thread index scaled by the stride.
    i = (blockDim%x*(blockIdx%x-1) + threadIdx%x) * s
    a(i) = a(i) + 1
  end subroutine stride

end module kernels_m
!> Measure effective global-memory bandwidth of the `offset` and `stride`
!> kernels for offsets 0..32 and strides 1..32, printing one line per case.
!>
!> Every CUDA runtime status and every kernel launch is checked; the final
!> "Test Passed"/"Test Failed" verdict reflects all of them rather than
!> only the status of the last call.
program offsetNStride
  use cudafor
  use kernels_m
  implicit none

  integer, parameter :: nMB = 4   ! transfer size in MB
  ! Element count for nMB megabytes; relies on fp_kind being the element
  ! size in bytes.
  integer, parameter :: n = nMB*1024*1024/fp_kind
  integer, parameter :: blockSize = 256
  integer, parameter :: maxShift = 32   ! largest offset and largest stride
  ! array dimensions are 33*n for stride cases
  real(fp_kind), device :: a_d((maxShift+1)*n), b_d((maxShift+1)*n)
  type(cudaEvent) :: startEvent, stopEvent
  type(cudaDeviceProp) :: prop
  integer :: i, istat, recStart, recStop
  integer :: nerr   ! number of failed CUDA calls / kernel launches

  nerr = 0

  istat = cudaGetDeviceProperties(prop, 0)
  call check(istat, 'cudaGetDeviceProperties')
  write(*,'(/,"Device: ",a)') trim(prop%name)
  write(*,'("Transfer size (MB): ",i0)') nMB

  if (kind(a_d) == singlePrecision) then
    write(*,'(a,/)') 'Single Precision'
  else
    write(*,'(a,/)') 'Double Precision'
  end if

  istat = cudaEventCreate(startEvent)
  call check(istat, 'cudaEventCreate(start)')
  istat = cudaEventCreate(stopEvent)
  call check(istat, 'cudaEventCreate(stop)')

  write(*,*) 'Offset, Bandwidth (GB/s):'
  ! Untimed warm-up launch on the scratch array.
  call offset<<<n/blockSize,blockSize>>>(b_d, 0)
  istat = cudaGetLastError()
  call check(istat, 'offset warm-up launch')
  do i = 0, maxShift
    a_d = 0.0_fp_kind
    ! Statuses are only stored here and checked after synchronization so
    ! that no extra host work is inserted into the timed region.
    recStart = cudaEventRecord(startEvent,0)
    call offset<<<n/blockSize,blockSize>>>(a_d, i)
    recStop = cudaEventRecord(stopEvent,0)
    call report(i, recStart, recStop, 'offset kernel')
  end do

  write(*,*)
  write(*,*) 'Stride, Bandwidth (GB/s):'
  ! Untimed warm-up launch on the scratch array.
  call stride<<<n/blockSize,blockSize>>>(b_d, 1)
  istat = cudaGetLastError()
  call check(istat, 'stride warm-up launch')
  do i = 1, maxShift
    a_d = 0.0_fp_kind
    recStart = cudaEventRecord(startEvent,0)
    call stride<<<n/blockSize,blockSize>>>(a_d, i)
    recStop = cudaEventRecord(stopEvent,0)
    call report(i, recStart, recStop, 'stride kernel')
  end do

  istat = cudaEventDestroy(startEvent)
  call check(istat, 'cudaEventDestroy(start)')
  istat = cudaEventDestroy(stopEvent)
  call check(istat, 'cudaEventDestroy(stop)')

  if (nerr /= 0) then
    write(*,*) "Test Failed"
  else
    write(*,*) "Test Passed"
  end if

contains

  !> Count and report a CUDA status code that is not cudaSuccess.
  !>
  !> status : value returned by a CUDA runtime function
  !> what   : short label of the call, used only in the diagnostic line
  subroutine check(status, what)
    integer, intent(in) :: status
    character(len=*), intent(in) :: what

    if (status /= cudaSuccess) then
      nerr = nerr + 1
      write(*,*) 'CUDA error in ', what, ': ', &
           trim(cudaGetErrorString(status))
    end if
  end subroutine check

  !> Finish one timed case: wait for the stop event, check all statuses
  !> (including the kernel itself), and print the case parameter together
  !> with the measured bandwidth.
  !>
  !> param    : offset or stride value printed in the first column
  !> recStart : status returned when the start event was recorded
  !> recStop  : status returned when the stop event was recorded
  !> label    : kernel label used in diagnostics
  subroutine report(param, recStart, recStop, label)
    integer, intent(in) :: param, recStart, recStop
    character(len=*), intent(in) :: label
    integer :: stat
    real(4) :: time

    call check(recStart, 'cudaEventRecord(start)')
    call check(recStop, 'cudaEventRecord(stop)')
    stat = cudaEventSynchronize(stopEvent)
    call check(stat, 'cudaEventSynchronize')
    ! Picks up launch/execution errors from the kernel that was just timed.
    stat = cudaGetLastError()
    call check(stat, label)
    stat = cudaEventElapsedTime(time, startEvent, stopEvent)
    call check(stat, 'cudaEventElapsedTime')
    ! One read plus one write of n elements (2*n*fp_kind bytes); the
    ! elapsed time is in milliseconds, so bytes/ms * 1e-6 gives GB/s.
    write(*,*) param, 2*n*fp_kind/time*1.e-6
  end subroutine report

end program offsetNStride