@@ -90,22 +90,29 @@ using min_test_types =
90
90
91
91
template <class T >
92
92
void test_atomic_min () {
93
+ // atomicMin/Max on ROCm seem to be bugged when managed memory is used.
94
+ // Use device memory instead.
93
95
T* data;
94
96
const size_t problem_size = 16 ;
95
97
const T offset = T (1024 *1024 );
96
- BOOST_CHECK (pcudaMallocManaged (&data, (problem_size + 1 ) * sizeof (T)) ==
98
+ BOOST_CHECK (pcudaMalloc (&data, (problem_size + 1 ) * sizeof (T)) ==
97
99
pcudaSuccess);
100
+ std::vector<T> input (problem_size+1 );
98
101
for (int i = 0 ; i < problem_size+1 ; ++i)
99
- data[i] = T (i);
100
- data[0 ] = offset;
102
+ input[i] = T (i);
103
+ input[0 ] = offset;
104
+ BOOST_CHECK (pcudaMemcpy (data, input.data (), input.size () * sizeof (T),
105
+ pcudaMemcpyDefault) == pcudaSuccess);
101
106
102
107
pcudaParallelFor (1 , problem_size, [=](){
103
108
int gid = threadIdx.x ;
104
109
atomicMin (data, data[gid+1 ]);
105
110
});
106
111
BOOST_CHECK (pcudaDeviceSynchronize () == pcudaSuccess);
107
- std::cout << *data << std::endl;
108
- BOOST_CHECK (*data == 1 );
112
+ BOOST_CHECK (pcudaMemcpy (input.data (), data, input.size () * sizeof (T),
113
+ pcudaMemcpyDefault) == pcudaSuccess);
114
+
115
+ BOOST_CHECK (input[0 ] == 1 );
109
116
110
117
BOOST_CHECK (pcudaFree (data) == pcudaSuccess);
111
118
}
@@ -120,21 +127,28 @@ using max_test_types =
120
127
121
128
template <class T >
122
129
void test_atomic_max () {
130
+ // atomicMin/Max on ROCm seem to be bugged when managed memory is used.
131
+ // Use device memory instead.
123
132
T* data;
124
133
const size_t problem_size = 16 ;
125
134
const T offset = T (2 );
126
- BOOST_CHECK (pcudaMallocManaged (&data, (problem_size + 1 ) * sizeof (T)) ==
135
+ BOOST_CHECK (pcudaMalloc (&data, (problem_size + 1 ) * sizeof (T)) ==
127
136
pcudaSuccess);
137
+ std::vector<T> input (problem_size+1 );
128
138
for (int i = 0 ; i < problem_size+1 ; ++i)
129
- data[i] = T (i);
130
- data[0 ] = offset;
139
+ input[i] = T (i);
140
+ input[0 ] = offset;
141
+ BOOST_CHECK (pcudaMemcpy (data, input.data (), input.size () * sizeof (T),
142
+ pcudaMemcpyDefault) == pcudaSuccess);
131
143
132
144
pcudaParallelFor (1 , problem_size, [=](){
133
145
int gid = threadIdx.x ;
134
146
atomicMax (data, data[gid+1 ]);
135
147
});
136
148
BOOST_CHECK (pcudaDeviceSynchronize () == pcudaSuccess);
137
- BOOST_CHECK (*data == problem_size);
149
+ BOOST_CHECK (pcudaMemcpy (input.data (), data, input.size () * sizeof (T),
150
+ pcudaMemcpyDefault) == pcudaSuccess);
151
+ BOOST_CHECK (input[0 ] == problem_size);
138
152
139
153
BOOST_CHECK (pcudaFree (data) == pcudaSuccess);
140
154
}
0 commit comments