This repository has been archived by the owner on Jun 20, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalc_dt_kernel_cuda.cu
executable file
·214 lines (179 loc) · 6.98 KB
/
calc_dt_kernel_cuda.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/*Crown Copyright 2012 AWE.
*
* This file is part of CloverLeaf.
*
* CloverLeaf is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the
* Free Software Foundation, either version 3 of the License, or (at your option)
* any later version.
*
* CloverLeaf is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* CloverLeaf. If not, see http://www.gnu.org/licenses/.
*/
/*
* @brief CUDA timestep kernel
* @author Michael Boulton NVIDIA Corporation
* @details Calculates the minimum timestep on the mesh chunk based on the CFL
* condition, the velocity gradient and the velocity divergence. A safety
* factor is used to ensure numerical stability.
*/
#include <iostream>
#include "cuda_common.cu"
#include "ftocmacros.h"
#include <algorithm>
#include "chunk_cuda.cu"
extern CloverleafCudaChunk chunk;
__global__ void device_calc_dt_kernel_cuda
(int x_min, int x_max, int y_min, int y_max,
const double g_small,
const double g_big,
const double dtmin,
const double dtc_safe,
const double dtu_safe,
const double dtv_safe,
const double dtdiv_safe,
const double* __restrict const xarea,
const double* __restrict const yarea,
const double* __restrict const celldx,
const double* __restrict const celldy,
const double* __restrict const volume,
const double* __restrict const density0,
const double* __restrict const viscosity,
const double* __restrict const soundspeed,
const double* __restrict const xvel0,
const double* __restrict const yvel0,
double* __restrict const jk_ctrl_out,
double* __restrict const dt_min_out)
{
__kernel_indexes;
double dsx, dsy;
double cc;
double dtct;
double div;
double dv1;
double dv2;
double dtut;
double dtvt;
double dtdivt;
double dt_min_val = g_big;
double jk_control = 0.0;
__shared__ double dt_min_shared[BLOCK_SZ];
__shared__ double jk_ctrl_shared[BLOCK_SZ];
dt_min_shared[threadIdx.x] = dt_min_val;
jk_ctrl_shared[threadIdx.x] = jk_control;
if (row >= (y_min + 1) && row <= (y_max + 1)
&& column >= (x_min + 1) && column <= (x_max + 1))
{
dsx = celldx[column];
dsy = celldy[row];
cc = soundspeed[THARR2D(0, 0, 0)] * soundspeed[THARR2D(0, 0, 0)];
cc += 2.0 * viscosity[THARR2D(0, 0, 0)] / density0[THARR2D(0, 0, 0)];
cc = MAX(sqrt(cc), g_small);
dtct = dtc_safe * MIN(dsx, dsy)/cc;
div = 0.0;
// x
dv1 = (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)])
* xarea[THARR2D(0, 0, 1)];
dv2 = (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)])
* xarea[THARR2D(1, 0, 1)];
div += dv2 - dv1;
dtut = dtu_safe * 2.0 * volume[THARR2D(0, 0, 0)]
/ MAX(g_small*volume[THARR2D(0, 0, 0)],
MAX(fabs(dv1), fabs(dv2)));
// y
dv1 = (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)])
* yarea[THARR2D(0, 0, 0)];
dv2 = (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)])
* yarea[THARR2D(0, 1, 0)];
div += dv2 - dv1;
dtvt = dtv_safe * 2.0 * volume[THARR2D(0, 0, 0)]
/ MAX(g_small*volume[THARR2D(0, 0, 0)],
MAX(fabs(dv1), fabs(dv2)));
//
div /= (2.0 * volume[THARR2D(0, 0, 0)]);
dtdivt = (div < (-g_small)) ? dtdiv_safe * (-1.0/div) : g_big;
dt_min_shared[threadIdx.x] = MIN(dtdivt, MIN(dtvt, MIN(dtct, dtut)));
jk_ctrl_shared[threadIdx.x] = (column + (x_max * (row - 1))) + 0.4;
}
Reduce< BLOCK_SZ/2 >::run(dt_min_shared, dt_min_out, min_func);
Reduce< BLOCK_SZ/2 >::run(jk_ctrl_shared, jk_ctrl_out, max_func);
}
extern "C" void calc_dt_kernel_cuda_
(double* g_small,
double* g_big,
double* dtmin,
double* dtc_safe,
double* dtu_safe,
double* dtv_safe,
double* dtdiv_safe,
double* dt_min_val,
int* dtl_control,
double* xl_pos,
double* yl_pos,
int* jldt,
int* kldt,
int* small)
{
chunk.calc_dt_kernel(*g_small, *g_big, *dtmin, *dtc_safe, *dtu_safe,
*dtv_safe, *dtdiv_safe, dt_min_val, dtl_control, xl_pos, yl_pos,
jldt, kldt, small);
}
void CloverleafCudaChunk::calc_dt_kernel
(double g_small, double g_big, double dtmin,
double dtc_safe, double dtu_safe, double dtv_safe, double dtdiv_safe,
double* dt_min_val,
int* dtl_control,
double* xl_pos,
double* yl_pos,
int* jldt,
int* kldt,
int* small)
{
CUDA_BEGIN_PROFILE;
device_calc_dt_kernel_cuda<<< num_blocks, BLOCK_SZ >>>
(x_min, x_max, y_min, y_max, g_small, g_big, dtmin, dtc_safe,
dtu_safe, dtv_safe, dtdiv_safe, xarea, yarea, celldx, celldy,
volume, density0, viscosity, soundspeed, xvel0, yvel0,
work_array_1, work_array_2);
CUDA_ERR_CHECK;
// reduce_ptr 2 is a thrust wrapper around work_array_2
*dt_min_val = *thrust::min_element(reduce_ptr_2,
reduce_ptr_2 + num_blocks);
// ditto on reduce ptr 1
double jk_control = *thrust::max_element(reduce_ptr_1,
reduce_ptr_1 + num_blocks);
CUDA_END_PROFILE;
*dtl_control = 10.01 * (jk_control - static_cast<int>(jk_control));
jk_control = jk_control - (jk_control - static_cast<int>(jk_control));
int tmp_jldt = *jldt = (static_cast<int>(jk_control)) % x_max;
int tmp_kldt = *kldt = 1 + (jk_control/x_max);
*xl_pos = thr_cellx[tmp_jldt];
*yl_pos = thr_celly[tmp_kldt];
*small = (*dt_min_val < dtmin) ? 1 : 0;
if (*small != 0)
{
std::cerr << "Timestep information:" << std::endl;
std::cerr << "j, k : " << tmp_jldt << " " << tmp_kldt << std::endl;
std::cerr << "x, y : " << thr_cellx[tmp_jldt] << " " << thr_celly[tmp_kldt] << std::endl;
std::cerr << "timestep : " << *dt_min_val << std::endl;
std::cerr << "Cell velocities;" << std::endl;
std::cerr << thr_xvel0[tmp_jldt +(x_max+5)*tmp_kldt ] << "\t";
std::cerr << thr_yvel0[tmp_jldt +(x_max+5)*tmp_kldt ] << std::endl;
std::cerr << thr_xvel0[tmp_jldt+1+(x_max+5)*tmp_kldt ] << "\t";
std::cerr << thr_yvel0[tmp_jldt+1+(x_max+5)*tmp_kldt ] << std::endl;
std::cerr << thr_xvel0[tmp_jldt+1+(x_max+5)*(tmp_kldt+1)] << "\t";
std::cerr << thr_yvel0[tmp_jldt+1+(x_max+5)*(tmp_kldt+1)] << std::endl;
std::cerr << thr_xvel0[tmp_jldt +(x_max+5)*(tmp_kldt+1)] << "\t";
std::cerr << thr_yvel0[tmp_jldt +(x_max+5)*(tmp_kldt+1)] << std::endl;
std::cerr << "density, energy, pressure, soundspeed " << std::endl;
std::cerr << thr_density0[tmp_jldt+(x_max+5)*tmp_kldt] << "\t";
std::cerr << thr_energy0[tmp_jldt+(x_max+5)*tmp_kldt] << "\t";
std::cerr << thr_pressure[tmp_jldt+(x_max+5)*tmp_kldt] << "\t";
std::cerr << thr_soundspeed[tmp_jldt+(x_max+5)*tmp_kldt] << std::endl;
}
}