forked from pmittaldev/john-the-ripper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon_opencl_pbkdf2.c
339 lines (203 loc) · 9.94 KB
/
common_opencl_pbkdf2.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
/* This software is Copyright (c) 2012 Sayantan Datta <std2048 at gmail dot com>
* and it is hereby released to the general public under the following terms:
* Redistribution and use in source and binary forms, with or without modification, are permitted.
* Based on S3nf implementation http://openwall.info/wiki/john/MSCash2
* This format supports salts upto 19 characters. Origial S3nf implementation supports only upto 8 charcters.
*/
#include "common_opencl_pbkdf2.h"
#include <string.h>
#include <math.h>
static cl_platform_id pltfrmid[MAX_PLATFORMS];
static cl_device_id devid[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM];
static cl_context cntxt[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM];
static cl_command_queue cmdq[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM];
static cl_kernel krnl[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM];
static cl_program prg[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM];
static cl_int err;
static cl_event events[MAX_PLATFORMS*MAX_DEVICES_PER_PLATFORM];
static size_t lws[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM];
static long double exec_time_inv[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM]={{-1.000}};
static int event_ctr=0;
static cl_ulong kernelExecTimeNs = CL_ULONG_MAX;
static char PROFILE = 0 ;
static gpu_mem_buffer gpu_buffer[MAX_PLATFORMS][MAX_DEVICES_PER_PLATFORM];
static unsigned int active_dev_ctr=0;
static int store_platform_no[MAX_PLATFORMS*MAX_DEVICES_PER_PLATFORM]={-1};
static int store_dev_no[MAX_PLATFORMS*MAX_DEVICES_PER_PLATFORM]={-1};
static gpu_mem_buffer exec_pbkdf2(cl_uint *,cl_uint *,cl_uint ,cl_uint *,cl_uint ,int ,int ) ;
static void clean_gpu_buffer(gpu_mem_buffer *pThis)
{
HANDLE_CLERROR(clReleaseMemObject(pThis->pass_gpu),"Release Memory Object FAILED.");
HANDLE_CLERROR(clReleaseMemObject(pThis->hash_out_gpu),"Release Memory Object FAILED.");
HANDLE_CLERROR(clReleaseMemObject(pThis->salt_gpu),"Release Memory Object FAILED.");
}
void clean_all_buffer()
{ int i;
for(i=0;i<active_dev_ctr;i++)
clean_gpu_buffer(&gpu_buffer[store_platform_no[i]][store_dev_no[i]]);
}
static void find_best_workgroup(int pltform_no,int dev_no)
{
size_t _lws=0;
cl_device_type dTyp;
cl_uint *dcc_hash_host=(cl_uint*)malloc(4*sizeof(cl_uint)*64000);
cl_uint *dcc2_hash_host=(cl_uint*)malloc(4*sizeof(cl_uint)*64000);
cl_uint salt_api[9],length=10;
event_ctr=0;
HANDLE_CLERROR(clGetDeviceInfo(devid[pltform_no][dev_no],CL_DEVICE_TYPE,sizeof(cl_device_type),&dTyp,NULL),"Failed Device Info");
///Set Dummy DCC hash , unicode salt and ascii salt(username) length
memset(dcc_hash_host,0xb5,4*sizeof(cl_uint)*64000);
memset(salt_api,0xfe,9*sizeof(cl_uint));
cmdq[pltform_no][dev_no] = clCreateCommandQueue(cntxt[pltform_no][dev_no], devid[pltform_no][dev_no], CL_QUEUE_PROFILING_ENABLE,&err);
HANDLE_CLERROR(err, "Error creating command queue");
PROFILE=1;
kernelExecTimeNs = CL_ULONG_MAX;
///Find best local work size
while(1){
_lws=lws[pltform_no][dev_no];
if(dTyp==CL_DEVICE_TYPE_CPU){
exec_pbkdf2(dcc_hash_host,salt_api,length,dcc2_hash_host,4000,pltform_no,dev_no );
exec_time_inv[pltform_no][dev_no]=exec_time_inv[pltform_no][dev_no]/16;
}
else
exec_pbkdf2(dcc_hash_host,salt_api,length,dcc2_hash_host,64000,pltform_no,dev_no );
if(lws[pltform_no][dev_no]<=_lws) break;
}
PROFILE=0;
printf("Optimal Work Group Size:%d\n",(int)lws[pltform_no][dev_no]);
printf("Kernel Execution Speed (Higher is better):%Lf\n",exec_time_inv[pltform_no][dev_no]);
free(dcc_hash_host);
free(dcc2_hash_host);
}
size_t select_device(int platform_no,int dev_no)
{
lws[platform_no][dev_no]= 16;
opencl_init("$JOHN/pbkdf2_kernel.cl", dev_no, platform_no);
pltfrmid[platform_no]=platform[platform_no];
devid[platform_no][dev_no]=devices[dev_no];
cntxt[platform_no][dev_no]=context[dev_no];
prg[platform_no][dev_no]=program[dev_no];
krnl[platform_no][dev_no]=clCreateKernel(prg[platform_no][dev_no],"PBKDF2",&err) ;
if(err) {printf("Create Kernel PBKDF2 FAILED\n"); return 0;}
gpu_buffer[platform_no][dev_no].pass_gpu=clCreateBuffer(cntxt[platform_no][dev_no],CL_MEM_READ_ONLY,4*MAX_KEYS_PER_CRYPT*sizeof(cl_uint),NULL,&err);
if((gpu_buffer[platform_no][dev_no].pass_gpu==(cl_mem)0)) { HANDLE_CLERROR(err, "Create Buffer FAILED"); }
gpu_buffer[platform_no][dev_no].salt_gpu=clCreateBuffer(cntxt[platform_no][dev_no],CL_MEM_READ_ONLY,(MAX_SALT_LENGTH/2 +1)*sizeof(cl_uint),NULL,&err);
if((gpu_buffer[platform_no][dev_no].salt_gpu==(cl_mem)0)) { HANDLE_CLERROR(err, "Create Buffer FAILED"); }
gpu_buffer[platform_no][dev_no].hash_out_gpu=clCreateBuffer(cntxt[platform_no][dev_no],CL_MEM_WRITE_ONLY,4*MAX_KEYS_PER_CRYPT*sizeof(cl_uint),NULL,&err);
if((gpu_buffer[platform_no][dev_no].hash_out_gpu==(cl_mem)0)) {HANDLE_CLERROR(err, "Create Buffer FAILED"); }
HANDLE_CLERROR(clSetKernelArg(krnl[platform_no][dev_no],0,sizeof(cl_mem),&gpu_buffer[platform_no][dev_no].pass_gpu),"Set Kernel Arg FAILED arg0");
HANDLE_CLERROR(clSetKernelArg(krnl[platform_no][dev_no],1,sizeof(cl_mem),&gpu_buffer[platform_no][dev_no].salt_gpu),"Set Kernel Arg FAILED arg1");
HANDLE_CLERROR(clSetKernelArg(krnl[platform_no][dev_no],4,sizeof(cl_mem),&gpu_buffer[platform_no][dev_no].hash_out_gpu),"Set Kernel Arg FAILED arg4");
find_best_workgroup(platform_no,dev_no);
cmdq[platform_no][dev_no]=queue[dev_no];
store_platform_no[active_dev_ctr]=platform_no;
store_dev_no[active_dev_ctr++]=dev_no;
return lws[platform_no][dev_no];
}
size_t select_default_device()
{
return select_device(0,0);
}
static size_t max_lws()
{
int i;
size_t max=0;
for(i=0;i<active_dev_ctr;++i)
if(max<lws[store_platform_no[i]][store_dev_no[i]])
max=lws[store_platform_no[i]][store_dev_no[i]];
return max;
}
void pbkdf2_divide_work(cl_uint *pass_api,cl_uint *salt_api,cl_uint saltlen_api,cl_uint *hash_out_api,cl_uint num)
{
double total_exec_time_inv=0;
int i;
unsigned int work_part,work_offset=0,lws_max=max_lws();
cl_int ret;
event_ctr=0;
memset(hash_out_api,0,num*sizeof(cl_uint));
/// Make num multiple of lws_max
if(num%lws_max!=0)
num=(num/lws_max + 1)*lws_max;
///Divide work only if number of keys is greater than 8192, else use first device selected
if(num>8192){
///Calculates t0tal Kernel Execution Speed
for(i=0;i<active_dev_ctr;++i){
total_exec_time_inv+=exec_time_inv[store_platform_no[i]][store_dev_no[i]];
}
///Calculate work division ratio
for(i=0;i<active_dev_ctr;++i)
exec_time_inv[store_platform_no[i]][store_dev_no[i]]/=total_exec_time_inv;
///Divide memory and work
for(i=0;i<active_dev_ctr;++i){
if(i==active_dev_ctr-1){
work_part= num- work_offset;
if(work_part%lws_max!=0)
work_part=(work_part/lws_max + 1)*lws_max;
}
else{
work_part=num*exec_time_inv[store_platform_no[i]][store_dev_no[i]];
if(work_part%lws_max!=0)
work_part=(work_part/lws_max + 1)*lws_max;
}
///call to exec_pbkdf2()
#ifdef _DEBUG
printf("Work Offset:%d Work Part Size:%d %d\n",work_offset,work_part,event_ctr);
#endif
exec_pbkdf2(pass_api+4*work_offset,salt_api,saltlen_api,hash_out_api+4*work_offset,work_part,store_platform_no[i],store_dev_no[i]);
work_offset+=work_part;
}
///Synchronize Device memory and Host memory
for(i=active_dev_ctr-1;i>=0;--i)
HANDLE_CLERROR(clFlush(cmdq[store_platform_no[i]][store_dev_no[i]]),"Flush Error");
for(i=0;i<active_dev_ctr;++i){
while(1){
HANDLE_CLERROR(clGetEventInfo(events[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof(cl_int),&ret,NULL),"Error in Get Event Info");
if((ret)==CL_COMPLETE) break;
#ifdef _DEBUG
printf("%d%d ", ret,i);
#endif
}
}
for(i=0;i<active_dev_ctr;++i)
HANDLE_CLERROR(clFinish(cmdq[store_platform_no[i]][store_dev_no[i]]),"Finish Error");
}
else{
exec_pbkdf2(pass_api,salt_api,saltlen_api,hash_out_api,num, store_platform_no[0],store_dev_no[0]);
HANDLE_CLERROR(clFinish(cmdq[store_platform_no[0]][store_dev_no[0]]),"Finish Error");
}
}
static gpu_mem_buffer exec_pbkdf2(cl_uint *pass_api,cl_uint *salt_api,cl_uint saltlen_api,cl_uint *hash_out_api,cl_uint num,int platform_no,int dev_no )
{
cl_event evnt;
size_t N=num,M=lws[platform_no][dev_no];
HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq[platform_no][dev_no],gpu_buffer[platform_no][dev_no].pass_gpu,CL_TRUE,0,4*num*sizeof(cl_uint),pass_api,0,NULL,NULL ), "Copy data to gpu");
HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq[platform_no][dev_no],gpu_buffer[platform_no][dev_no].salt_gpu,CL_TRUE,0,(MAX_SALT_LENGTH/2 + 1)*sizeof(cl_uint),salt_api,0,NULL,NULL ), "Copy data to gpu");
HANDLE_CLERROR(clSetKernelArg(krnl[platform_no][dev_no],2,sizeof(cl_uint),&saltlen_api),"Set Kernel Arg FAILED arg2");
HANDLE_CLERROR(clSetKernelArg(krnl[platform_no][dev_no],3,sizeof(cl_uint),&num),"Set Kernel Arg FAILED arg3");
err=clEnqueueNDRangeKernel(cmdq[platform_no][dev_no],krnl[platform_no][dev_no],1,NULL,&N,&M,0,NULL,&evnt);
if(err){
if(PROFILE){
lws[platform_no][dev_no]=lws[platform_no][dev_no]/2;
}
else
HANDLE_CLERROR(err,"Enque Kernel Failed");
return gpu_buffer[platform_no][dev_no];
}
if(PROFILE){
cl_ulong startTime, endTime;
HANDLE_CLERROR(CL_SUCCESS!=clWaitForEvents(1,&evnt),"SYNC FAILED");
HANDLE_CLERROR(clFinish(cmdq[platform_no][dev_no]), "clFinish error");
clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
if ((endTime - startTime) < kernelExecTimeNs) {
kernelExecTimeNs = endTime - startTime;
//printf("%d\n",(int)kernelExecTimeNs);
lws[platform_no][dev_no] =lws[platform_no][dev_no]*2;
exec_time_inv[platform_no][dev_no]= (long double)pow(10,9)/(long double)kernelExecTimeNs;
}
}
else{
HANDLE_CLERROR(clEnqueueReadBuffer(cmdq[platform_no][dev_no],gpu_buffer[platform_no][dev_no].hash_out_gpu,CL_FALSE,0,4*num*sizeof(cl_uint),hash_out_api, 1, &evnt, &events[event_ctr++]),"Write FAILED");
}
return gpu_buffer[platform_no][dev_no];
}