94b2b13d
Pedro Roque
PHACT source
|
1
2
3
4
|
/*
* solve.c
*
* Created on: 28/01/2015
|
4d26a735
Pedro Roque
Increased recogni...
|
5
|
* Author: pedro
|
94b2b13d
Pedro Roque
PHACT source
|
6
7
8
9
10
11
12
13
14
15
16
17
|
*/
#include "solve.h"
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include "bitmaps.h"
#include "devices.h"
#include "domains.h"
#include "intervals.h"
|
4d26a735
Pedro Roque
Increased recogni...
|
18
|
#include "utils/cl_errors.h"
|
94b2b13d
Pedro Roque
PHACT source
|
19
20
21
22
23
24
25
26
27
|
#include "variables.h"
/*
* Return the number of solutions found or 0 if no solution is found
* dev_args - device_args structure about this device
* dev_info - device_info structure about this device
* depth - Tree expansion depth needed to get n_ss disjoint search spaces
* n_ss - total number of sub-search spaces
* stats_lock - mutex to control accesses to statistics structure
|
4d26a735
Pedro Roque
Increased recogni...
|
28
|
* */
|
94b2b13d
Pedro Roque
PHACT source
|
29
|
cl_ulong count_sols(device_args* dev_args, device_info* dev_info, unsigned int depth, unsigned int n_ss, pthread_mutex_t* stats_lock, bool filtering) {
|
4d26a735
Pedro Roque
Increased recogni...
|
30
|
|
94b2b13d
Pedro Roque
PHACT source
|
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
cl_ulong n_solutions = 0; // number of solutions found on this kernels execution
unsigned int i;
// buffer for atomics data (Most devices only have atomics for 32 bits variables)
// 0 - first sub-search to explore
// 1 - last sub-search to explore
// 2 - n_ss
// 3 - depth
// 4 - WIs still working for work-sharing
// 5 - 5+N_VS - n_repeat per variable
// 5+N_VS...5+N_VS+N_WG*N_WI_WG - number of solutions found per work-item
set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);
dev_args->atoms[4] = (unsigned int)dev_args->wi_total;
|
4d26a735
Pedro Roque
Increased recogni...
|
45
|
|
94b2b13d
Pedro Roque
PHACT source
|
46
47
48
49
50
51
52
53
54
55
56
|
for (i = 5 + N_VS; i < 5 + N_VS + dev_args->wi_total; i++) {
dev_args->atoms[i] = 0;
}
#if RUN_IN_CUDA
CUresult err = cuInit(0);
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
if (err != CUDA_SUCCESS) {
|
4d26a735
Pedro Roque
Increased recogni...
|
57
58
|
fprintf(stderr, "CUDA error in cuMemcpyHtoD atoms_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
|
94b2b13d
Pedro Roque
PHACT source
|
59
60
61
62
|
exit(-1);
}
} else {
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
63
|
|
94b2b13d
Pedro Roque
PHACT source
|
64
65
66
67
68
69
70
71
72
73
74
75
|
// Update atoms buffer data on device
cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
"clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
#if SHARED_SS > 0
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
4d26a735
Pedro Roque
Increased recogni...
|
76
|
|
94b2b13d
Pedro Roque
PHACT source
|
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyHtoD shared_stores_flag_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
// Update shared stored flags buffer data on device
cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0, dev_args->shared_stores_flag_size,
dev_args->shared_stores_flag, 0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
94
|
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
95
96
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
4d26a735
Pedro Roque
Increased recogni...
|
97
98
|
err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->wi_local, 1, 1,
|
94b2b13d
Pedro Roque
PHACT source
|
99
100
101
102
103
104
105
106
107
108
109
110
|
(unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuLaunchKernel %d\n", err);
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local, 0, NULL, NULL),
"clEnqueueNDRangeKernel", dev_info->dev_name);
#if RUN_IN_CUDA
}
|
4d26a735
Pedro Roque
Increased recogni...
|
111
112
|
#endif
|
94b2b13d
Pedro Roque
PHACT source
|
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
if (filtering) {
return get_filtering_results(dev_args, dev_info);
}
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH atoms_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
// Transfer number of solutions found
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
|
4d26a735
Pedro Roque
Increased recogni...
|
131
132
133
|
"clEnqueueReadBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
|
94b2b13d
Pedro Roque
PHACT source
|
134
|
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
135
|
|
94b2b13d
Pedro Roque
PHACT source
|
136
|
for (i = 5 + N_VS; i < 5 + N_VS + dev_args->wi_total; i++) {
|
4d26a735
Pedro Roque
Increased recogni...
|
137
|
n_solutions += dev_args->atoms[i];
|
94b2b13d
Pedro Roque
PHACT source
|
138
|
}
|
4d26a735
Pedro Roque
Increased recogni...
|
139
|
|
94b2b13d
Pedro Roque
PHACT source
|
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
|
if (N_DEVS > 1) {
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH props_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size, dev_args->props, 0,
NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
dev_info->last_props = 0;
for (i = 0; i < dev_args->wi_total; i++) {
dev_info->props_total += dev_args->props[i];
dev_info->last_props += dev_args->props[i];
}
}
|
4d26a735
Pedro Roque
Increased recogni...
|
166
|
// copy statistics from device to host
|
94b2b13d
Pedro Roque
PHACT source
|
167
|
if (PRINT_STATS) {
|
4d26a735
Pedro Roque
Increased recogni...
|
168
169
|
get_stats(dev_args, dev_info, stats_lock);
}
|
94b2b13d
Pedro Roque
PHACT source
|
170
171
172
173
174
175
176
177
178
179
180
181
182
183
|
// return number of solutions found
return n_solutions;
}
/*
* Return 1 if a solution is found, or 0 if no solution is found.
* dev_args - device_args structure about this device
* dev_info - device_info structure about this device
* sol_found - atomic flag for solution found
* depth - Tree expansion depth needed to get n_ss disjoint search spaces
* n_ss - total number of sub-search spaces
* stats_lock - mutex to control accesses to statistics structure
* */
|
4d26a735
Pedro Roque
Increased recogni...
|
184
|
cl_ulong find_one_sol(device_args* dev_args, device_info* dev_info, unsigned char* sol_found, unsigned int depth, unsigned int n_ss, pthread_mutex_t* stats_lock, bool filtering) {
|
94b2b13d
Pedro Roque
PHACT source
|
185
186
187
188
189
190
191
192
|
int sol_found_atom = 0; // Previous value of sol_found
unsigned int i;
// buffer for atomics data (Most devices only have atomics for 32 bits variables)
// 0 - first sub-search to explore
// 1 - last sub-search to explore
// 2 - n_ss
|
4d26a735
Pedro Roque
Increased recogni...
|
193
|
// 3 - depth
|
94b2b13d
Pedro Roque
PHACT source
|
194
195
|
// 4 - WIs still working for work-sharing
// 5 - 5+N_VS - n_repeat per variable
|
4d26a735
Pedro Roque
Increased recogni...
|
196
197
198
|
// 5+N_VS - solution found flag
set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);
|
94b2b13d
Pedro Roque
PHACT source
|
199
200
201
202
|
dev_args->atoms[4] = (unsigned int)dev_args->wi_total;
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
203
204
205
|
CUresult err = cuInit(0);
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
4d26a735
Pedro Roque
Increased recogni...
|
206
|
|
94b2b13d
Pedro Roque
PHACT source
|
207
208
|
err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
if (err != CUDA_SUCCESS) {
|
4d26a735
Pedro Roque
Increased recogni...
|
209
|
fprintf(stderr, "CUDA error in cuMemcpyHtoD atoms_mem_cu\n");
|
94b2b13d
Pedro Roque
PHACT source
|
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
|
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
// Update atoms buffer data on device
cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
"clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
#if SHARED_SS > 0
#if RUN_IN_CUDA
|
4d26a735
Pedro Roque
Increased recogni...
|
226
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
94b2b13d
Pedro Roque
PHACT source
|
227
228
|
err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
|
4d26a735
Pedro Roque
Increased recogni...
|
229
230
|
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyHtoD shared_stores_flag_mem_cu\n");
|
94b2b13d
Pedro Roque
PHACT source
|
231
232
233
234
235
236
237
238
239
240
241
242
243
244
|
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
// Update shared stored flags buffer data on device
cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0, dev_args->shared_stores_flag_size,
dev_args->shared_stores_flag, 0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
#endif
#if RUN_IN_CUDA
|
4d26a735
Pedro Roque
Increased recogni...
|
245
|
|
94b2b13d
Pedro Roque
PHACT source
|
246
247
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
4d26a735
Pedro Roque
Increased recogni...
|
248
249
250
|
err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->wi_local, 1, 1,
(unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
if (err != CUDA_SUCCESS) {
|
94b2b13d
Pedro Roque
PHACT source
|
251
252
253
254
255
256
257
258
259
|
fprintf(stderr, "CUDA error in cuLaunchKernel %d\n", err);
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local, 0, NULL, NULL), "clEnqueueNDRangeKernel",
dev_info->dev_name);
#if RUN_IN_CUDA
|
4d26a735
Pedro Roque
Increased recogni...
|
260
261
|
}
#endif
|
94b2b13d
Pedro Roque
PHACT source
|
262
263
|
if (filtering) {
|
4d26a735
Pedro Roque
Increased recogni...
|
264
265
266
|
return get_filtering_results(dev_args, dev_info);
}
|
94b2b13d
Pedro Roque
PHACT source
|
267
268
269
270
271
272
273
274
275
276
277
|
if (DOMAIN_TYPE == BITMAP_) {
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->bitmaps, dev_args->domains_mem_cu, dev_args->domains_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
|
4d26a735
Pedro Roque
Increased recogni...
|
278
279
|
} else {
#endif
|
94b2b13d
Pedro Roque
PHACT source
|
280
281
|
// Transfer possible solution
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->bitmaps, 0, NULL, NULL),
|
4d26a735
Pedro Roque
Increased recogni...
|
282
283
|
"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
|
}
#endif
} else if (DOMAIN_TYPE == INTERVAL) {
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->intervals, dev_args->domains_mem_cu, dev_args->domains_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
// Transfer possible solution
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->intervals, 0, NULL, NULL),
"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
}
if (N_DEVS > 1) {
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH props_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size, dev_args->props, 0,
NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
dev_info->last_props = 0;
for (i = 0; i < dev_args->wi_total; i++) {
|
4d26a735
Pedro Roque
Increased recogni...
|
328
329
|
dev_info->props_total += dev_args->props[i];
dev_info->last_props += dev_args->props[i];
|
94b2b13d
Pedro Roque
PHACT source
|
330
331
|
}
}
|
4d26a735
Pedro Roque
Increased recogni...
|
332
333
334
|
// copy statistics from device to host
if (PRINT_STATS) {
|
94b2b13d
Pedro Roque
PHACT source
|
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
|
get_stats(dev_args, dev_info, stats_lock);
}
// if solution found signalizes it for the other devices and saves it
// if using bitmap domains
if (DOMAIN_TYPE == BITMAP_) {
bitmap b_result;
b_clear(&b_result);
b_copy_dev_to_host(&b_result, dev_args->bitmaps, 0);
if (!b_is_empty(&b_result)) {
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
sol_found_atom = InterlockedCompareExchange(sol_found, 1, 0);
#else
sol_found_atom = __atomic_fetch_add(sol_found, 1, __ATOMIC_SEQ_CST);
#endif
if (sol_found_atom < 1) {
vs_copy_dev_to_host(VS, dev_args->bitmaps, N_VS);
}
return 1;
}
// if using interval domains
} else if (DOMAIN_TYPE == INTERVAL) {
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
4d26a735
Pedro Roque
Increased recogni...
|
365
|
err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
|
94b2b13d
Pedro Roque
PHACT source
|
366
|
if (err != CUDA_SUCCESS) {
|
4d26a735
Pedro Roque
Increased recogni...
|
367
368
369
|
fprintf(stderr, "CUDA error in cuMemcpyDtoH atoms_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
|
94b2b13d
Pedro Roque
PHACT source
|
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
|
}
} else {
#endif
// Transfer possible solution
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
"clEnqueueReadBuffer atoms_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
if (dev_args->atoms[5 + N_VS] != 0) {
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
sol_found_atom = InterlockedCompareExchange(sol_found, 1, 0);
#else
sol_found_atom = __atomic_fetch_add(sol_found, 1, __ATOMIC_SEQ_CST);
|
4d26a735
Pedro Roque
Increased recogni...
|
386
|
#endif
|
94b2b13d
Pedro Roque
PHACT source
|
387
388
|
if (sol_found_atom < 1) {
convert_intervals_to_vars(VS, dev_args->intervals, N_VS);
|
4d26a735
Pedro Roque
Increased recogni...
|
389
|
}
|
94b2b13d
Pedro Roque
PHACT source
|
390
391
392
|
return 1;
}
}
|
4d26a735
Pedro Roque
Increased recogni...
|
393
|
|
94b2b13d
Pedro Roque
PHACT source
|
394
|
return 0;
|
4d26a735
Pedro Roque
Increased recogni...
|
395
|
}
|
94b2b13d
Pedro Roque
PHACT source
|
396
397
|
/*
|
4d26a735
Pedro Roque
Increased recogni...
|
398
399
|
* Return 1 if an optimal solution was found, or 0 if no optimal solution was found.
* dev_args - device_args structure about this device
|
94b2b13d
Pedro Roque
PHACT source
|
400
401
|
* dev_info - device_info structure about this device
* val_to_opt - Value to optimize
|
4d26a735
Pedro Roque
Increased recogni...
|
402
403
404
|
* opt_lock - mutex to control accesses to value to optimize and best solution found
* depth - Tree expansion depth needed to get n_ss disjoint search spaces
* n_ss - total number of sub-search spaces
|
94b2b13d
Pedro Roque
PHACT source
|
405
|
* stats_lock - mutex to control accesses to statistics structure
|
4d26a735
Pedro Roque
Increased recogni...
|
406
|
* */
|
94b2b13d
Pedro Roque
PHACT source
|
407
408
409
410
411
412
413
414
415
|
cl_ulong find_best_sol( device_args* dev_args, device_info* dev_info, cl_uint* val_to_opt, pthread_mutex_t* opt_lock, unsigned int depth, unsigned int n_ss,
pthread_mutex_t* stats_lock, bool filtering) {
bool opt_sol_found = false;
unsigned int i;
// buffer for atomics data (Most devices only have atomics for 32 bits variables)
// 0 - first sub-search to explore
// 1 - last sub-search to explore
|
4d26a735
Pedro Roque
Increased recogni...
|
416
417
|
// 2 - n_ss
// 3 - depth
|
94b2b13d
Pedro Roque
PHACT source
|
418
419
420
421
422
423
|
// 4 - WIs still working for work-sharing
// 5 - 5+N_VS - n_repeat per variable
// 5+N_VS - solution found flag
// 6+N_VS - Value to optimize
// 7+N_VS - WIs still working for saving the best solution
set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);
|
4d26a735
Pedro Roque
Increased recogni...
|
424
|
|
94b2b13d
Pedro Roque
PHACT source
|
425
426
427
428
429
430
431
432
433
|
dev_args->atoms[4] = (unsigned int)dev_args->wi_total;
dev_args->atoms[5 + N_VS] = 0;
dev_args->atoms[6 + N_VS] = *val_to_opt;
dev_args->atoms[7 + N_VS] = (unsigned int)dev_args->wi_total;
#if RUN_IN_CUDA
CUresult err = cuInit(0);
|
4d26a735
Pedro Roque
Increased recogni...
|
434
435
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
94b2b13d
Pedro Roque
PHACT source
|
436
437
|
err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
if (err != CUDA_SUCCESS) {
|
4d26a735
Pedro Roque
Increased recogni...
|
438
439
|
fprintf(stderr, "CUDA error in cuMemcpyHtoD atoms_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
|
94b2b13d
Pedro Roque
PHACT source
|
440
|
exit(-1);
|
4d26a735
Pedro Roque
Increased recogni...
|
441
|
}
|
94b2b13d
Pedro Roque
PHACT source
|
442
443
444
445
446
447
448
449
450
451
452
453
|
} else {
#endif
// Update atoms buffer data on device
cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
"clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
#if SHARED_SS > 0
#if RUN_IN_CUDA
|
4d26a735
Pedro Roque
Increased recogni...
|
454
455
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
94b2b13d
Pedro Roque
PHACT source
|
456
457
|
err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
|
4d26a735
Pedro Roque
Increased recogni...
|
458
459
|
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyHtoD shared_stores_flag_mem_cu\n");
|
94b2b13d
Pedro Roque
PHACT source
|
460
|
cuCtxDestroy(dev_args->context_cu);
|
4d26a735
Pedro Roque
Increased recogni...
|
461
|
exit(-1);
|
94b2b13d
Pedro Roque
PHACT source
|
462
463
464
465
466
467
468
469
470
471
|
}
} else {
#endif
// Update shared stored flags buffer data on device
cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0, dev_args->shared_stores_flag_size, dev_args->shared_stores_flag,
0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
472
|
|
94b2b13d
Pedro Roque
PHACT source
|
473
474
|
#if RUN_IN_CUDA
|
4d26a735
Pedro Roque
Increased recogni...
|
475
476
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
94b2b13d
Pedro Roque
PHACT source
|
477
478
|
err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->wi_local, 1, 1,
(unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
|
4d26a735
Pedro Roque
Increased recogni...
|
479
480
|
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuLaunchKernel %d\n", err);
|
94b2b13d
Pedro Roque
PHACT source
|
481
482
483
484
485
486
487
488
489
490
491
|
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local, 0, NULL, NULL), "clEnqueueNDRangeKernel",
dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
492
493
|
if (filtering) {
return get_filtering_results(dev_args, dev_info);
|
94b2b13d
Pedro Roque
PHACT source
|
494
495
|
}
|
4d26a735
Pedro Roque
Increased recogni...
|
496
497
|
// Transfer best value found flag
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
498
499
500
501
502
503
504
505
506
507
508
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH atoms_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
509
510
511
|
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
"clEnqueueReadBuffer atoms_mem", dev_info->dev_name);
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
512
513
|
}
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
514
515
|
opt_sol_found = dev_args->atoms[5 + N_VS];
|
94b2b13d
Pedro Roque
PHACT source
|
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
|
if (opt_sol_found) {
if (DOMAIN_TYPE == BITMAP_) {
// Transfer possible solution
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->bitmaps, dev_args->domains_mem_cu, dev_args->domains_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->bitmaps, 0, NULL, NULL),
"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
} else if (DOMAIN_TYPE == INTERVAL) {
// Transfer possible solution
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->intervals, dev_args->domains_mem_cu, dev_args->domains_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->intervals, 0, NULL, NULL),
"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
}
}
if (N_DEVS > 1) {
#if RUN_IN_CUDA
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH props_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size, dev_args->props, 0,
NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
|
4d26a735
Pedro Roque
Increased recogni...
|
577
578
579
580
|
}
#endif
dev_info->last_props = 0;
|
94b2b13d
Pedro Roque
PHACT source
|
581
582
|
for (i = 0; i < dev_args->wi_total; i++) {
dev_info->props_total += dev_args->props[i];
|
94b2b13d
Pedro Roque
PHACT source
|
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
|
dev_info->last_props += dev_args->props[i];
}
}
// copy statistics from device to host
if (PRINT_STATS) {
get_stats(dev_args, dev_info, stats_lock);
}
// if optimal solution found
if (opt_sol_found) {
if (N_DEVS > 1) {
// lock access to the place to write the optimal solution and writes it
pthread_mutex_lock(opt_lock);
// if using bitmap domains
if (DOMAIN_TYPE == BITMAP_) {
vs_copy_dev_to_host(VS_LOCK, dev_args->bitmaps, N_VS);
} else if (DOMAIN_TYPE == INTERVAL) {
|
4d26a735
Pedro Roque
Increased recogni...
|
606
|
convert_intervals_to_vars(VS_LOCK, dev_args->intervals, N_VS);
|
94b2b13d
Pedro Roque
PHACT source
|
607
|
}
|
4d26a735
Pedro Roque
Increased recogni...
|
608
|
|
94b2b13d
Pedro Roque
PHACT source
|
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
|
if (OPT_MODE == DECREASE) {
if (VS_LOCK[VAR_ID_TO_OPT].max < VS_LOCK_BEST[VAR_ID_TO_OPT].max) {
opt_sol_found = true;
} else {
opt_sol_found = false;
}
} else {
if (VS_LOCK[VAR_ID_TO_OPT].min > VS_LOCK_BEST[VAR_ID_TO_OPT].min) {
opt_sol_found = true;
} else {
opt_sol_found = false;
}
}
if (opt_sol_found) {
*val_to_opt = dev_args->atoms[6 + N_VS];
vs_copy(VS_LOCK_BEST, VS_LOCK, N_VS);
}
pthread_mutex_unlock(opt_lock);
} else {
*val_to_opt = dev_args->atoms[6 + N_VS];
// if using bitmap domains
if (DOMAIN_TYPE == BITMAP_) {
vs_copy_dev_to_host(VS_LOCK_BEST, dev_args->bitmaps, N_VS);
} else if (DOMAIN_TYPE == INTERVAL) {
convert_intervals_to_vars(VS_LOCK_BEST, dev_args->intervals, N_VS);
}
}
}
if (opt_sol_found) {
return 1;
} else {
return 0;
}
}
/*
* Generates the data needed to each work-item for generating the sub-search spaces
* dev_args - device_args structure about this device
* dev_info - device_info structure about this device
* depth - Tree expansion depth needed to get n_ss disjoint search spaces
* n_ss - total number of sub-search spaces
*/
void set_strs_generat_data(device_args* dev_args, device_info* dev_info, unsigned int depth, unsigned int n_ss, bool filtering) {
unsigned int n_ss_new;
unsigned int depth_prev = depth;
unsigned int depth_new = depth;
unsigned int new_multiplier = 1;
unsigned int i, j;
// buffer for atomics data (Most devices only have atomics for 32 bits variables)
// 0 - first sub-search to explore
// 1 - last sub-search to explore
// 2 - n_ss
// 3 - depth
// 4 - WIs still working for work-sharing
// 5 - 5+N_VS - n_repeat per variable
// ...
if (filtering) {
for (i = 0; i < N_VS; i++) {
dev_info->exp_values[i] = 0;
dev_args->atoms[5 + i] = 0;
}
dev_info->n_ss_mult = 1;
dev_info->n_ss_mult_max = 1;
dev_info->first_store = 0;
dev_info->last_store = 1;
dev_args->atoms[0] = 0;
dev_args->atoms[1] = 1;
dev_args->atoms[2] = 1;
dev_args->atoms[3] = 0;
return;
}
for (i = 0; i < N_VS; i++) {
dev_info->exp_values[i] = EXP_VALUES[i];
dev_args->atoms[5 + i] = EXP_VALUES[i];
}
// calculate a valid n_ss multiplier bigger than the one provided
// get the max multiplier that can be applied to the number of ss inside each device
if (dev_info->n_ss_mult > dev_info->n_ss_mult_max) {
dev_info->n_ss_mult = dev_info->n_ss_mult_max;
}
for (i = depth_prev; new_multiplier < dev_info->n_ss_mult && new_multiplier < dev_info->n_ss_mult_max; i++) {
if (VS[i].n_vals > 1 && VS[i].to_label) {
|
4d26a735
Pedro Roque
Increased recogni...
|
711
|
new_multiplier *= VS[i].n_vals;
|
94b2b13d
Pedro Roque
PHACT source
|
712
713
714
715
716
717
718
719
720
721
722
|
dev_info->exp_values[i] = VS[i].n_vals;
} else {
dev_info->exp_values[i] = 1;
}
}
i--;
// if expanding all the previous tree nodes to new depth generate more than the required multiplier
if (new_multiplier > dev_info->n_ss_mult) {
new_multiplier /= VS[i].n_vals;
|
4d26a735
Pedro Roque
Increased recogni...
|
723
|
if (new_multiplier * 2 > dev_info->n_ss_mult_max) {
|
94b2b13d
Pedro Roque
PHACT source
|
724
725
726
727
|
dev_info->exp_values[i] = 0;
i--;
} else {
for (j = 2; j < VS[i].n_vals; j++) {
|
4d26a735
Pedro Roque
Increased recogni...
|
728
729
|
if (new_multiplier * j >= dev_info->n_ss_mult || new_multiplier * (j + 1) >= dev_info->n_ss_mult_max) {
new_multiplier *= j;
|
94b2b13d
Pedro Roque
PHACT source
|
730
731
|
dev_info->exp_values[i] = j;
break;
|
4d26a735
Pedro Roque
Increased recogni...
|
732
733
|
}
}
|
94b2b13d
Pedro Roque
PHACT source
|
734
|
if (j == VS[i].n_vals) {
|
4d26a735
Pedro Roque
Increased recogni...
|
735
|
new_multiplier *= VS[i].n_vals;
|
94b2b13d
Pedro Roque
PHACT source
|
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
|
dev_info->exp_values[i] = VS[i].n_vals;
}
}
}
depth_new = i + 1;
dev_info->n_ss_mult = new_multiplier;
n_ss_new = n_ss * dev_info->n_ss_mult;
// non labeling variables will not be expanded
for (i = 0; i < depth_new; i++) {
dev_args->atoms[5 + i] = dev_info->exp_values[i];
}
for (; i < N_VS; i++) {
dev_args->atoms[5 + i] = 0;
}
// 0 - first sub-search to explore
// 1 - last sub-search to explore
// 2 - n_ss
dev_args->atoms[0] = dev_info->first_store * dev_info->n_ss_mult;
dev_args->atoms[1] = dev_info->last_store * dev_info->n_ss_mult;
dev_args->atoms[2] = n_ss_new;
dev_args->atoms[3] = depth_new;
}
/*
* Load statistics from the device
* dev_args - device_args structure about this device
* dev_info - device_info structure about this device
|
4d26a735
Pedro Roque
Increased recogni...
|
765
766
767
|
* stats_lock - mutex to control accesses to statistics structure
*/
void get_stats(device_args* dev_args, device_info* dev_info, pthread_mutex_t* stats_lock) {
|
94b2b13d
Pedro Roque
PHACT source
|
768
769
|
unsigned int i;
|
4d26a735
Pedro Roque
Increased recogni...
|
770
771
|
// 0 - nodes_fail
// 1 - nodes_expl
|
94b2b13d
Pedro Roque
PHACT source
|
772
|
// 2 - backtracks
|
4d26a735
Pedro Roque
Increased recogni...
|
773
|
// 3 - labels
|
94b2b13d
Pedro Roque
PHACT source
|
774
775
776
777
778
779
780
|
// 4 - props_not_ok
// 5 - props_ok
// ... repeat per work-item
#if RUN_IN_CUDA
CUresult err = cuInit(0);
|
4d26a735
Pedro Roque
Increased recogni...
|
781
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
94b2b13d
Pedro Roque
PHACT source
|
782
783
784
785
786
|
err = cuMemcpyDtoH(dev_args->stats, dev_args->stats_mem_cu, dev_args->stats_size);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH stats_mem_cu\n");
cuCtxDestroy(dev_args->context_cu);
|
4d26a735
Pedro Roque
Increased recogni...
|
787
|
exit(-1);
|
94b2b13d
Pedro Roque
PHACT source
|
788
789
790
791
|
}
} else {
#endif
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->stats_mem, CL_TRUE, 0, dev_args->stats_size, dev_args->stats, 0, NULL, NULL), "clEnqueueReadBuffer stats_mem",
|
4d26a735
Pedro Roque
Increased recogni...
|
792
793
|
dev_info->dev_name);
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
794
795
|
}
#endif
|
4d26a735
Pedro Roque
Increased recogni...
|
796
797
798
799
|
if (N_DEVS > 1) {
// lock access to the place to write the optimal solution and writes it
pthread_mutex_lock(stats_lock);
|
94b2b13d
Pedro Roque
PHACT source
|
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
|
}
// copy statistics to host global counter
for (i = 0; i < dev_args->wi_total; i++) {
STATS.nodes_fail += dev_args->stats[i * 7];
STATS.nodes_expl += dev_args->stats[i * 7 + 1];
STATS.backtracks += dev_args->stats[i * 7 + 2];
STATS.labels += dev_args->stats[i * 7 + 3];
STATS.pruning += dev_args->stats[i * 7 + 4];
STATS.props_ok += dev_args->stats[i * 7 + 5];
if (dev_args->stats[i * 7 + 6] > STATS.max_depth) {
STATS.max_depth = dev_args->stats[i * 7 + 6];
}
}
STATS.search_spaces += dev_info->block_size * dev_info->n_ss_mult;
// clear counters on device buffer for next run
memset(dev_args->stats, 0, dev_args->stats_size);
|
4d26a735
Pedro Roque
Increased recogni...
|
819
820
|
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
821
822
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
4d26a735
Pedro Roque
Increased recogni...
|
823
824
825
|
err = cuMemcpyHtoD(dev_args->stats_mem_cu, dev_args->stats, dev_args->stats_size);
if (err != CUDA_SUCCESS) {
|
94b2b13d
Pedro Roque
PHACT source
|
826
|
fprintf(stderr, "CUDA error in cuMemcpyHtoD stats_mem_cu\n");
|
4d26a735
Pedro Roque
Increased recogni...
|
827
|
cuCtxDestroy(dev_args->context_cu);
|
94b2b13d
Pedro Roque
PHACT source
|
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
|
exit(-1);
}
} else {
#endif
cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->stats_mem, CL_TRUE, 0, dev_args->stats_size, dev_args->stats, 0, NULL, NULL), "clEnqueueWriteBuffer stats_mem",
dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
if (N_DEVS > 1) {
pthread_mutex_unlock(stats_lock);
}
}
bool get_filtering_results(device_args* dev_args, device_info* dev_info) {
|
4d26a735
Pedro Roque
Increased recogni...
|
845
|
if (DOMAIN_TYPE == BITMAP_) {
|
94b2b13d
Pedro Roque
PHACT source
|
846
847
848
|
#if RUN_IN_CUDA
|
4d26a735
Pedro Roque
Increased recogni...
|
849
|
CUresult err = cuInit(0);
|
94b2b13d
Pedro Roque
PHACT source
|
850
851
852
853
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
err = cuMemcpyDtoH(dev_args->filt_bitmaps, dev_args->filt_domains_mem_cu, dev_args->filt_domains_size);
|
4d26a735
Pedro Roque
Increased recogni...
|
854
855
|
if (err != CUDA_SUCCESS) {
fprintf(stderr, "CUDA error in cuMemcpyDtoH filt_domains_mem_cu in %s\n", dev_info->dev_name);
|
94b2b13d
Pedro Roque
PHACT source
|
856
857
|
cuCtxDestroy(dev_args->context_cu);
exit(-1);
|
4d26a735
Pedro Roque
Increased recogni...
|
858
859
860
861
|
}
} else {
#endif
// Transfer filtered CSP
|
94b2b13d
Pedro Roque
PHACT source
|
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
|
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_domains_mem, CL_TRUE, 0, dev_args->filt_domains_size, dev_args->filt_bitmaps, 0, NULL, NULL),
"clEnqueueReadBuffer filt_domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
bitmap b_result;
b_clear(&b_result);
b_copy_dev_to_host(&b_result, dev_args->filt_bitmaps, 0);
// consistent CSP after filtering
if (!b_is_empty(&b_result)) {
if (CS_IGNORE) {
unsigned int i;
|
4d26a735
Pedro Roque
Increased recogni...
|
877
878
|
#if RUN_IN_CUDA
|
94b2b13d
Pedro Roque
PHACT source
|
879
880
|
if (dev_info->type == CL_DEVICE_TYPE_GPU) {
|
4d26a735
Pedro Roque
Increased recogni...
|
881
882
883
|
err = cuMemcpyDtoH(dev_args->filt_cs, dev_args->filt_cs_mem_cu, dev_args->filt_cs_size);
if (err != CUDA_SUCCESS) {
|
94b2b13d
Pedro Roque
PHACT source
|
884
|
fprintf(stderr, "CUDA error in cuMemcpyDtoH filt_cs_mem_cu\n");
|
4d26a735
Pedro Roque
Increased recogni...
|
885
|
cuCtxDestroy(dev_args->context_cu);
|
94b2b13d
Pedro Roque
PHACT source
|
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
|
exit(-1);
}
} else {
#endif
// Transfer cs_ignore results
cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_cs_mem, CL_TRUE, 0, dev_args->filt_cs_size, dev_args->filt_cs, 0, NULL, NULL),
"clEnqueueReadBuffer filt_cs_mem", dev_info->dev_name);
#if RUN_IN_CUDA
}
#endif
for (i = 0; i < N_CS; i++) {
if (dev_args->filt_cs[i] == 1) {
CS[i].ignore = true;
} else {
CS[i].ignore = false;
}
}
}
|