Blame view

src/solve.c 29.1 KB
94b2b13d   Pedro Roque   PHACT source
1
2
3
4
/*
 * solve.c
 *
 *  Created on: 28/01/2015
4d26a735   Pedro Roque   Increased recogni...
5
 *      Author: pedro
94b2b13d   Pedro Roque   PHACT source
6
7
8
9
10
11
12
13
14
15
16
17
 */

#include "solve.h"

#include <stdbool.h>
#include <string.h>
#include <stdio.h>

#include "bitmaps.h"
#include "devices.h"
#include "domains.h"
#include "intervals.h"
4d26a735   Pedro Roque   Increased recogni...
18
#include "utils/cl_errors.h"
94b2b13d   Pedro Roque   PHACT source
19
20
21
22
23
24
25
26
27
#include "variables.h"

/*
 * Return the number of solutions found or 0 if no solution is found
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * depth - Tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
 * stats_lock - mutex to control accesses to statistics structure
4d26a735   Pedro Roque   Increased recogni...
28
 * */
94b2b13d   Pedro Roque   PHACT source
29
cl_ulong count_sols(device_args* dev_args, device_info* dev_info, unsigned int depth, unsigned int n_ss, pthread_mutex_t* stats_lock, bool filtering) {
4d26a735   Pedro Roque   Increased recogni...
30

94b2b13d   Pedro Roque   PHACT source
31
32
33
34
35
36
37
38
39
40
41
42
43
44
	cl_ulong n_solutions = 0;	// number of solutions found on this kernels execution
	unsigned int i;

	// buffer for atomics data (Most devices only have atomics for 32 bits variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	// 3 - depth
	// 4 - WIs still working for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
	// 5+N_VS...5+N_VS+N_WG*N_WI_WG - number of solutions found per work-item
	set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);

	dev_args->atoms[4] = (unsigned int)dev_args->wi_total;
4d26a735   Pedro Roque   Increased recogni...
45

94b2b13d   Pedro Roque   PHACT source
46
47
48
49
50
51
52
53
54
55
56
	for (i = 5 + N_VS; i < 5 + N_VS + dev_args->wi_total; i++) {
		dev_args->atoms[i] = 0;
	}

#if RUN_IN_CUDA
	CUresult err = cuInit(0);

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {

		err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
		if (err != CUDA_SUCCESS) {
4d26a735   Pedro Roque   Increased recogni...
57
58
			fprintf(stderr, "CUDA error in cuMemcpyHtoD atoms_mem_cu\n");
			cuCtxDestroy(dev_args->context_cu);
94b2b13d   Pedro Roque   PHACT source
59
60
61
62
			exit(-1);
		}
	} else {
#endif
4d26a735   Pedro Roque   Increased recogni...
63

94b2b13d   Pedro Roque   PHACT source
64
65
66
67
68
69
70
71
72
73
74
75
	// Update atoms buffer data on device
	cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0,  dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
			"clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);

#if RUN_IN_CUDA
	}
#endif

#if SHARED_SS > 0
#if RUN_IN_CUDA

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
4d26a735   Pedro Roque   Increased recogni...
76

94b2b13d   Pedro Roque   PHACT source
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
		err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuMemcpyHtoD shared_stores_flag_mem_cu\n");
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
	} else {
#endif
		// Update shared stored flags buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0,  dev_args->shared_stores_flag_size,
				dev_args->shared_stores_flag, 0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);

#if RUN_IN_CUDA
	}
#endif
#endif

4d26a735   Pedro Roque   Increased recogni...
94
#if RUN_IN_CUDA
94b2b13d   Pedro Roque   PHACT source
95
96

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
4d26a735   Pedro Roque   Increased recogni...
97
98

		err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->wi_local, 1, 1,
94b2b13d   Pedro Roque   PHACT source
99
100
101
102
103
104
105
106
107
108
109
110
				(unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuLaunchKernel %d\n", err);
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
	} else {
#endif
		cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local, 0, NULL, NULL),
				"clEnqueueNDRangeKernel", dev_info->dev_name);
#if RUN_IN_CUDA
	}
4d26a735   Pedro Roque   Increased recogni...
111
112
#endif

94b2b13d   Pedro Roque   PHACT source
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
	if (filtering) {
		return get_filtering_results(dev_args, dev_info);
	}

#if RUN_IN_CUDA

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {

	err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
	if (err != CUDA_SUCCESS) {
		fprintf(stderr, "CUDA error in cuMemcpyDtoH atoms_mem_cu\n");
		cuCtxDestroy(dev_args->context_cu);
		exit(-1);
	}
	} else {
#endif
	// Transfer number of solutions found
	cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
4d26a735   Pedro Roque   Increased recogni...
131
132
133
			"clEnqueueReadBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
94b2b13d   Pedro Roque   PHACT source
134
#endif
4d26a735   Pedro Roque   Increased recogni...
135

94b2b13d   Pedro Roque   PHACT source
136
	for (i = 5 + N_VS; i < 5 + N_VS + dev_args->wi_total; i++) {
4d26a735   Pedro Roque   Increased recogni...
137
		n_solutions += dev_args->atoms[i];
94b2b13d   Pedro Roque   PHACT source
138
	}
4d26a735   Pedro Roque   Increased recogni...
139

94b2b13d   Pedro Roque   PHACT source
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
	if (N_DEVS > 1) {
#if RUN_IN_CUDA

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

		err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuMemcpyDtoH props_mem_cu\n");
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
		} else {
	#endif
		cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size, dev_args->props, 0,
						NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

		dev_info->last_props = 0;
		for (i = 0; i < dev_args->wi_total; i++) {
			dev_info->props_total += dev_args->props[i];
			dev_info->last_props += dev_args->props[i];
		}
	}

4d26a735   Pedro Roque   Increased recogni...
166
	// copy statistics from device to host
94b2b13d   Pedro Roque   PHACT source
167
	if (PRINT_STATS) {
4d26a735   Pedro Roque   Increased recogni...
168
169
		get_stats(dev_args, dev_info, stats_lock);
	}
94b2b13d   Pedro Roque   PHACT source
170
171
172
173
174
175
176
177
178
179
180
181
182
183

	// return number of solutions found
	return n_solutions;
}

/*
 * Return 1 if a solution is found, or 0 if no solution is found.
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * sol_found - atomic flag for solution found
 * depth - Tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
 * stats_lock - mutex to control accesses to statistics structure
 * */
4d26a735   Pedro Roque   Increased recogni...
184
cl_ulong find_one_sol(device_args* dev_args, device_info* dev_info, unsigned char* sol_found, unsigned int depth, unsigned int n_ss, pthread_mutex_t* stats_lock, bool filtering) {
94b2b13d   Pedro Roque   PHACT source
185
186
187
188
189
190
191
192

	int sol_found_atom = 0; // Previous value of sol_found
	unsigned int i;

	// buffer for atomics data (Most devices only have atomics for 32 bits variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
4d26a735   Pedro Roque   Increased recogni...
193
	// 3 - depth
94b2b13d   Pedro Roque   PHACT source
194
195
	// 4 - WIs still working for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
4d26a735   Pedro Roque   Increased recogni...
196
197
198
	// 5+N_VS - solution found flag
	set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);

94b2b13d   Pedro Roque   PHACT source
199
200
201
202
	dev_args->atoms[4] = (unsigned int)dev_args->wi_total;

#if RUN_IN_CUDA

94b2b13d   Pedro Roque   PHACT source
203
204
205
	CUresult err = cuInit(0);

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
4d26a735   Pedro Roque   Increased recogni...
206

94b2b13d   Pedro Roque   PHACT source
207
208
		err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
		if (err != CUDA_SUCCESS) {
4d26a735   Pedro Roque   Increased recogni...
209
			fprintf(stderr, "CUDA error in cuMemcpyHtoD atoms_mem_cu\n");
94b2b13d   Pedro Roque   PHACT source
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
	} else {
#endif
		// Update atoms buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
				"clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif


#if SHARED_SS > 0
#if RUN_IN_CUDA

4d26a735   Pedro Roque   Increased recogni...
226
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
94b2b13d   Pedro Roque   PHACT source
227
228

		err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
4d26a735   Pedro Roque   Increased recogni...
229
230
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuMemcpyHtoD shared_stores_flag_mem_cu\n");
94b2b13d   Pedro Roque   PHACT source
231
232
233
234
235
236
237
238
239
240
241
242
243
244
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
	} else {
#endif
		// Update shared stored flags buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0,  dev_args->shared_stores_flag_size,
				dev_args->shared_stores_flag, 0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif
#endif

#if RUN_IN_CUDA
4d26a735   Pedro Roque   Increased recogni...
245

94b2b13d   Pedro Roque   PHACT source
246
247
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {

4d26a735   Pedro Roque   Increased recogni...
248
249
250
		err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->wi_local, 1, 1,
				(unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
		if (err != CUDA_SUCCESS) {
94b2b13d   Pedro Roque   PHACT source
251
252
253
254
255
256
257
258
259
			fprintf(stderr, "CUDA error in cuLaunchKernel %d\n", err);
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
	} else {
#endif
		cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local, 0, NULL, NULL), "clEnqueueNDRangeKernel",
				dev_info->dev_name);
#if RUN_IN_CUDA
4d26a735   Pedro Roque   Increased recogni...
260
261
	}
#endif
94b2b13d   Pedro Roque   PHACT source
262
263

	if (filtering) {
4d26a735   Pedro Roque   Increased recogni...
264
265
266
		return get_filtering_results(dev_args, dev_info);
	}

94b2b13d   Pedro Roque   PHACT source
267
268
269
270
271
272
273
274
275
276
277
	if (DOMAIN_TYPE == BITMAP_) {
#if RUN_IN_CUDA

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

			err = cuMemcpyDtoH(dev_args->bitmaps, dev_args->domains_mem_cu, dev_args->domains_size);
			if (err != CUDA_SUCCESS) {
				fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
				cuCtxDestroy(dev_args->context_cu);
				exit(-1);
			}
4d26a735   Pedro Roque   Increased recogni...
278
279
		} else {
#endif
94b2b13d   Pedro Roque   PHACT source
280
281
			// Transfer possible solution
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->bitmaps, 0, NULL, NULL),
4d26a735   Pedro Roque   Increased recogni...
282
283
					"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
94b2b13d   Pedro Roque   PHACT source
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
		}
#endif
	} else if (DOMAIN_TYPE == INTERVAL) {
#if RUN_IN_CUDA

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

			err = cuMemcpyDtoH(dev_args->intervals, dev_args->domains_mem_cu, dev_args->domains_size);
			if (err != CUDA_SUCCESS) {
				fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
				cuCtxDestroy(dev_args->context_cu);
				exit(-1);
			}
		} else {
#endif
			// Transfer possible solution
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->intervals, 0, NULL, NULL),
					"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif
	}

	if (N_DEVS > 1) {
#if RUN_IN_CUDA

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

			err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
			if (err != CUDA_SUCCESS) {
				fprintf(stderr, "CUDA error in cuMemcpyDtoH props_mem_cu\n");
				cuCtxDestroy(dev_args->context_cu);
				exit(-1);
			}
		} else {
#endif
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size, dev_args->props, 0,
							NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		dev_info->last_props = 0;
		for (i = 0; i < dev_args->wi_total; i++) {
4d26a735   Pedro Roque   Increased recogni...
328
329
			dev_info->props_total += dev_args->props[i];
			dev_info->last_props += dev_args->props[i];
94b2b13d   Pedro Roque   PHACT source
330
331
		}
	}
4d26a735   Pedro Roque   Increased recogni...
332
333
334

	// copy statistics from device to host
	if (PRINT_STATS) {
94b2b13d   Pedro Roque   PHACT source
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
		get_stats(dev_args, dev_info, stats_lock);
	}

	// if solution found signalizes it for the other devices and saves it
	// if using bitmap domains
	if (DOMAIN_TYPE == BITMAP_) {
		bitmap b_result;
		b_clear(&b_result);
		b_copy_dev_to_host(&b_result, dev_args->bitmaps, 0);

		if (!b_is_empty(&b_result)) {

#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
			sol_found_atom = InterlockedCompareExchange(sol_found, 1, 0);
#else
			sol_found_atom = __atomic_fetch_add(sol_found, 1, __ATOMIC_SEQ_CST);
#endif

			if (sol_found_atom < 1) {
				vs_copy_dev_to_host(VS, dev_args->bitmaps, N_VS);
			}
			return 1;
		}
		// if using interval domains
	} else if (DOMAIN_TYPE == INTERVAL) {

#if RUN_IN_CUDA

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

4d26a735   Pedro Roque   Increased recogni...
365
			err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
94b2b13d   Pedro Roque   PHACT source
366
			if (err != CUDA_SUCCESS) {
4d26a735   Pedro Roque   Increased recogni...
367
368
369
				fprintf(stderr, "CUDA error in cuMemcpyDtoH atoms_mem_cu\n");
				cuCtxDestroy(dev_args->context_cu);
				exit(-1);
94b2b13d   Pedro Roque   PHACT source
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
			}
		} else {
#endif
			// Transfer possible solution
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
					"clEnqueueReadBuffer atoms_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		if (dev_args->atoms[5 + N_VS] != 0) {

#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
			sol_found_atom = InterlockedCompareExchange(sol_found, 1, 0);
#else
			sol_found_atom = __atomic_fetch_add(sol_found, 1, __ATOMIC_SEQ_CST);
4d26a735   Pedro Roque   Increased recogni...
386
#endif
94b2b13d   Pedro Roque   PHACT source
387
388
			if (sol_found_atom < 1) {
				convert_intervals_to_vars(VS, dev_args->intervals, N_VS);
4d26a735   Pedro Roque   Increased recogni...
389
			}
94b2b13d   Pedro Roque   PHACT source
390
391
392
			return 1;
		}
	}
4d26a735   Pedro Roque   Increased recogni...
393

94b2b13d   Pedro Roque   PHACT source
394
	return 0;
4d26a735   Pedro Roque   Increased recogni...
395
}
94b2b13d   Pedro Roque   PHACT source
396
397

/*
4d26a735   Pedro Roque   Increased recogni...
398
399
 * Return 1 if an optimal solution was found, or 0 if no optimal solution was found.
 * dev_args - device_args structure about this device
94b2b13d   Pedro Roque   PHACT source
400
401
 * dev_info - device_info structure about this device
 * val_to_opt - Value to optimize
4d26a735   Pedro Roque   Increased recogni...
402
403
404
 * opt_lock - mutex to control accesses to value to optimize and best solution found
 * depth - Tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
94b2b13d   Pedro Roque   PHACT source
405
 * stats_lock - mutex to control accesses to statistics structure
4d26a735   Pedro Roque   Increased recogni...
406
 * */
94b2b13d   Pedro Roque   PHACT source
407
408
409
410
411
412
413
414
415
cl_ulong find_best_sol( device_args* dev_args, device_info* dev_info, cl_uint* val_to_opt, pthread_mutex_t* opt_lock, unsigned int depth, unsigned int n_ss,
		pthread_mutex_t* stats_lock, bool filtering) {

	bool opt_sol_found = false;
	unsigned int i;

	// buffer for atomics data (Most devices only have atomics for 32 bits variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
4d26a735   Pedro Roque   Increased recogni...
416
417
	// 2 - n_ss
	// 3 - depth
94b2b13d   Pedro Roque   PHACT source
418
419
420
421
422
423
	// 4 - WIs still working for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
	// 5+N_VS - solution found flag
	// 6+N_VS - Value to optimize
	// 7+N_VS - WIs still working for saving the best solution
	set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);
4d26a735   Pedro Roque   Increased recogni...
424

94b2b13d   Pedro Roque   PHACT source
425
426
427
428
429
430
431
432
433
	dev_args->atoms[4] = (unsigned int)dev_args->wi_total;
	dev_args->atoms[5 + N_VS] = 0;
	dev_args->atoms[6 + N_VS] = *val_to_opt;
	dev_args->atoms[7 + N_VS] = (unsigned int)dev_args->wi_total;

#if RUN_IN_CUDA

    CUresult err = cuInit(0);

4d26a735   Pedro Roque   Increased recogni...
434
435
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

94b2b13d   Pedro Roque   PHACT source
436
437
		err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
		if (err != CUDA_SUCCESS) {
4d26a735   Pedro Roque   Increased recogni...
438
439
			fprintf(stderr, "CUDA error in cuMemcpyHtoD atoms_mem_cu\n");
			cuCtxDestroy(dev_args->context_cu);
94b2b13d   Pedro Roque   PHACT source
440
			exit(-1);
4d26a735   Pedro Roque   Increased recogni...
441
		}
94b2b13d   Pedro Roque   PHACT source
442
443
444
445
446
447
448
449
450
451
452
453
	} else {
#endif
		// Update atoms buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
				"clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

#if SHARED_SS > 0

#if RUN_IN_CUDA
4d26a735   Pedro Roque   Increased recogni...
454
455

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
94b2b13d   Pedro Roque   PHACT source
456
457

		err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
4d26a735   Pedro Roque   Increased recogni...
458
459
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuMemcpyHtoD shared_stores_flag_mem_cu\n");
94b2b13d   Pedro Roque   PHACT source
460
			cuCtxDestroy(dev_args->context_cu);
4d26a735   Pedro Roque   Increased recogni...
461
			exit(-1);
94b2b13d   Pedro Roque   PHACT source
462
463
464
465
466
467
468
469
470
471
		}
	} else {
#endif
		// Update shared stored flags buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0, dev_args->shared_stores_flag_size, dev_args->shared_stores_flag,
				0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif
#endif
4d26a735   Pedro Roque   Increased recogni...
472

94b2b13d   Pedro Roque   PHACT source
473
474
#if RUN_IN_CUDA

4d26a735   Pedro Roque   Increased recogni...
475
476
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {

94b2b13d   Pedro Roque   PHACT source
477
478
		err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->wi_local, 1, 1,
				(unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
4d26a735   Pedro Roque   Increased recogni...
479
480
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuLaunchKernel %d\n", err);
94b2b13d   Pedro Roque   PHACT source
481
482
483
484
485
486
487
488
489
490
491
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
	} else {
#endif
		cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local, 0, NULL, NULL), "clEnqueueNDRangeKernel",
				dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

4d26a735   Pedro Roque   Increased recogni...
492
493
	if (filtering) {
		return get_filtering_results(dev_args, dev_info);
94b2b13d   Pedro Roque   PHACT source
494
495
	}

4d26a735   Pedro Roque   Increased recogni...
496
497
	// Transfer best value found flag
#if RUN_IN_CUDA
94b2b13d   Pedro Roque   PHACT source
498
499
500
501
502
503
504
505
506
507
508

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {

		err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuMemcpyDtoH atoms_mem_cu\n");
			cuCtxDestroy(dev_args->context_cu);
			exit(-1);
		}
	} else {
#endif
4d26a735   Pedro Roque   Increased recogni...
509
510
511
		cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size, dev_args->atoms, 0, NULL, NULL),
				"clEnqueueReadBuffer atoms_mem", dev_info->dev_name);
#if RUN_IN_CUDA
94b2b13d   Pedro Roque   PHACT source
512
513
		}
#endif
4d26a735   Pedro Roque   Increased recogni...
514
515

	opt_sol_found = dev_args->atoms[5 + N_VS];
94b2b13d   Pedro Roque   PHACT source
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576

	if (opt_sol_found) {

		if (DOMAIN_TYPE == BITMAP_) {
			// Transfer possible solution
#if RUN_IN_CUDA

				if (dev_info->type == CL_DEVICE_TYPE_GPU) {

				err = cuMemcpyDtoH(dev_args->bitmaps, dev_args->domains_mem_cu, dev_args->domains_size);
				if (err != CUDA_SUCCESS) {
					fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
					cuCtxDestroy(dev_args->context_cu);
					exit(-1);
				}
			} else {
#endif
				cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->bitmaps, 0, NULL, NULL),
						"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
			}
#endif

		} else if (DOMAIN_TYPE == INTERVAL) {
			// Transfer possible solution
#if RUN_IN_CUDA

			if (dev_info->type == CL_DEVICE_TYPE_GPU) {

				err = cuMemcpyDtoH(dev_args->intervals, dev_args->domains_mem_cu, dev_args->domains_size);
				if (err != CUDA_SUCCESS) {
					fprintf(stderr, "CUDA error in cuMemcpyDtoH domains_mem_cu\n");
					cuCtxDestroy(dev_args->context_cu);
					exit(-1);
				}
			} else {
#endif
				cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->intervals, 0, NULL, NULL),
						"clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
			}
#endif
		}
	}

	if (N_DEVS > 1) {
#if RUN_IN_CUDA

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

		err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
			if (err != CUDA_SUCCESS) {
				fprintf(stderr, "CUDA error in cuMemcpyDtoH props_mem_cu\n");
				cuCtxDestroy(dev_args->context_cu);
				exit(-1);
			}
		} else {
#endif
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size, dev_args->props, 0,
							NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
4d26a735   Pedro Roque   Increased recogni...
577
578
579
580
		}
#endif

		dev_info->last_props = 0;
94b2b13d   Pedro Roque   PHACT source
581
582
		for (i = 0; i < dev_args->wi_total; i++) {
			dev_info->props_total += dev_args->props[i];
94b2b13d   Pedro Roque   PHACT source
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
			dev_info->last_props += dev_args->props[i];
		}
	}

	// copy statistics from device to host
	if (PRINT_STATS) {
		get_stats(dev_args, dev_info, stats_lock);
	}

	// if optimal solution found
	if (opt_sol_found) {

		if (N_DEVS > 1) {
			// lock access to the place to write the optimal solution and writes it
			pthread_mutex_lock(opt_lock);

			// if using bitmap domains
			if (DOMAIN_TYPE == BITMAP_) {

				vs_copy_dev_to_host(VS_LOCK, dev_args->bitmaps, N_VS);

			} else if (DOMAIN_TYPE == INTERVAL) {

4d26a735   Pedro Roque   Increased recogni...
606
				convert_intervals_to_vars(VS_LOCK, dev_args->intervals, N_VS);
94b2b13d   Pedro Roque   PHACT source
607
			}
4d26a735   Pedro Roque   Increased recogni...
608

94b2b13d   Pedro Roque   PHACT source
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
			if (OPT_MODE == DECREASE) {

				if (VS_LOCK[VAR_ID_TO_OPT].max < VS_LOCK_BEST[VAR_ID_TO_OPT].max) {
					opt_sol_found = true;
				} else {
					opt_sol_found = false;
				}

			} else {

				if (VS_LOCK[VAR_ID_TO_OPT].min > VS_LOCK_BEST[VAR_ID_TO_OPT].min) {
					opt_sol_found = true;
				} else {
					opt_sol_found = false;
				}
			}

			if (opt_sol_found) {
				*val_to_opt = dev_args->atoms[6 + N_VS];

				vs_copy(VS_LOCK_BEST, VS_LOCK, N_VS);
			}

			pthread_mutex_unlock(opt_lock);

		} else {
			*val_to_opt = dev_args->atoms[6 + N_VS];

			// if using bitmap domains
			if (DOMAIN_TYPE == BITMAP_) {

				vs_copy_dev_to_host(VS_LOCK_BEST, dev_args->bitmaps, N_VS);

			} else if (DOMAIN_TYPE == INTERVAL) {

				convert_intervals_to_vars(VS_LOCK_BEST, dev_args->intervals, N_VS);
			}
		}
	}

	if (opt_sol_found) {
		return 1;

	} else {
		return 0;
	}
}

/*
 * Generates the data needed to each work-item for generating the sub-search spaces
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * depth - Tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
 */
void set_strs_generat_data(device_args* dev_args, device_info* dev_info, unsigned int depth, unsigned int n_ss, bool filtering) {
	unsigned int n_ss_new;
	unsigned int depth_prev = depth;
	unsigned int depth_new = depth;
	unsigned int new_multiplier = 1;
	unsigned int i, j;

	// buffer for atomics data (Most devices only have atomics for 32 bits variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	// 3 - depth
	// 4 - WIs still working for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
	// ...

	if (filtering) {

		for (i = 0; i < N_VS; i++) {
			dev_info->exp_values[i] = 0;
			dev_args->atoms[5 + i] = 0;
		}

		dev_info->n_ss_mult = 1;
		dev_info->n_ss_mult_max = 1;
		dev_info->first_store = 0;
		dev_info->last_store = 1;
		dev_args->atoms[0] = 0;
		dev_args->atoms[1] = 1;
		dev_args->atoms[2] = 1;
		dev_args->atoms[3] = 0;

		return;
	}

	for (i = 0; i < N_VS; i++) {
		dev_info->exp_values[i] = EXP_VALUES[i];
		dev_args->atoms[5 + i] = EXP_VALUES[i];
	}

	// calculate a valid n_ss multiplier bigger than the one provided
	// get the max multiplier that can be applied to the number of ss inside each device
	if (dev_info->n_ss_mult > dev_info->n_ss_mult_max) {
		dev_info->n_ss_mult = dev_info->n_ss_mult_max;
	}
	for (i = depth_prev; new_multiplier < dev_info->n_ss_mult && new_multiplier < dev_info->n_ss_mult_max; i++) {
		if (VS[i].n_vals > 1 && VS[i].to_label) {
4d26a735   Pedro Roque   Increased recogni...
711
			new_multiplier *= VS[i].n_vals;
94b2b13d   Pedro Roque   PHACT source
712
713
714
715
716
717
718
719
720
721
722
			dev_info->exp_values[i] = VS[i].n_vals;
		} else {
			dev_info->exp_values[i] = 1;
		}
	}
	i--;

	// if expanding all the previous tree nodes to new depth generate more than the required multiplier
	if (new_multiplier > dev_info->n_ss_mult) {
		new_multiplier /= VS[i].n_vals;

4d26a735   Pedro Roque   Increased recogni...
723
		if (new_multiplier * 2 > dev_info->n_ss_mult_max) {
94b2b13d   Pedro Roque   PHACT source
724
725
726
727
			dev_info->exp_values[i] = 0;
			i--;
		} else {
			for (j = 2; j < VS[i].n_vals; j++) {
4d26a735   Pedro Roque   Increased recogni...
728
729
				if (new_multiplier * j >= dev_info->n_ss_mult || new_multiplier * (j + 1) >= dev_info->n_ss_mult_max) {
					new_multiplier *= j;
94b2b13d   Pedro Roque   PHACT source
730
731
					dev_info->exp_values[i] = j;
					break;
4d26a735   Pedro Roque   Increased recogni...
732
733
				}
			}
94b2b13d   Pedro Roque   PHACT source
734
			if (j == VS[i].n_vals) {
4d26a735   Pedro Roque   Increased recogni...
735
				new_multiplier *= VS[i].n_vals;
94b2b13d   Pedro Roque   PHACT source
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
				dev_info->exp_values[i] = VS[i].n_vals;
			}
		}
	}
	depth_new = i + 1;
	dev_info->n_ss_mult = new_multiplier;
	n_ss_new = n_ss * dev_info->n_ss_mult;

	// non labeling variables will not be expanded
	for (i = 0; i < depth_new; i++) {
		dev_args->atoms[5 + i] = dev_info->exp_values[i];
	}
	for (; i < N_VS; i++) {
		dev_args->atoms[5 + i] = 0;
	}

	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	dev_args->atoms[0] = dev_info->first_store * dev_info->n_ss_mult;
	dev_args->atoms[1] = dev_info->last_store * dev_info->n_ss_mult;
	dev_args->atoms[2] = n_ss_new;
	dev_args->atoms[3] = depth_new;
}

/*
 * Load statistics from the device
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
4d26a735   Pedro Roque   Increased recogni...
765
766
767
 * stats_lock - mutex to control accesses to statistics structure
 */
void get_stats(device_args* dev_args, device_info* dev_info, pthread_mutex_t* stats_lock) {
94b2b13d   Pedro Roque   PHACT source
768
769
	unsigned int i;

4d26a735   Pedro Roque   Increased recogni...
770
771
	// 0 - nodes_fail
	// 1 - nodes_expl
94b2b13d   Pedro Roque   PHACT source
772
	// 2 - backtracks
4d26a735   Pedro Roque   Increased recogni...
773
	// 3 - labels
94b2b13d   Pedro Roque   PHACT source
774
775
776
777
778
779
780
	// 4 - props_not_ok
	// 5 - props_ok
	// ... repeat per work-item
#if RUN_IN_CUDA

    CUresult err = cuInit(0);

4d26a735   Pedro Roque   Increased recogni...
781
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
94b2b13d   Pedro Roque   PHACT source
782
783
784
785
786

		err = cuMemcpyDtoH(dev_args->stats, dev_args->stats_mem_cu, dev_args->stats_size);
		if (err != CUDA_SUCCESS) {
			fprintf(stderr, "CUDA error in cuMemcpyDtoH stats_mem_cu\n");
			cuCtxDestroy(dev_args->context_cu);
4d26a735   Pedro Roque   Increased recogni...
787
			exit(-1);
94b2b13d   Pedro Roque   PHACT source
788
789
790
791
		}
	} else {
#endif
	cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->stats_mem, CL_TRUE, 0, dev_args->stats_size, dev_args->stats, 0, NULL, NULL), "clEnqueueReadBuffer stats_mem",
4d26a735   Pedro Roque   Increased recogni...
792
793
			dev_info->dev_name);
#if RUN_IN_CUDA
94b2b13d   Pedro Roque   PHACT source
794
795
		}
#endif
4d26a735   Pedro Roque   Increased recogni...
796
797
798
799

	if (N_DEVS > 1) {
		// lock access to the place to write the optimal solution and writes it
		pthread_mutex_lock(stats_lock);
94b2b13d   Pedro Roque   PHACT source
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
	}

	// copy statistics to host global counter
	for (i = 0; i < dev_args->wi_total; i++) {
		STATS.nodes_fail += dev_args->stats[i * 7];
		STATS.nodes_expl += dev_args->stats[i * 7 + 1];
		STATS.backtracks += dev_args->stats[i * 7 + 2];
		STATS.labels += dev_args->stats[i * 7 + 3];
		STATS.pruning += dev_args->stats[i * 7 + 4];
		STATS.props_ok += dev_args->stats[i * 7 + 5];

		if (dev_args->stats[i * 7 + 6] > STATS.max_depth) {
			STATS.max_depth = dev_args->stats[i * 7 + 6];
		}
	}
	STATS.search_spaces += dev_info->block_size * dev_info->n_ss_mult;

	// clear counters on device buffer for next run
	memset(dev_args->stats, 0, dev_args->stats_size);
4d26a735   Pedro Roque   Increased recogni...
819
820

#if RUN_IN_CUDA
94b2b13d   Pedro Roque   PHACT source
821
822

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
4d26a735   Pedro Roque   Increased recogni...
823
824
825

	err = cuMemcpyHtoD(dev_args->stats_mem_cu, dev_args->stats, dev_args->stats_size);
    if (err != CUDA_SUCCESS) {
94b2b13d   Pedro Roque   PHACT source
826
        fprintf(stderr, "CUDA error in cuMemcpyHtoD stats_mem_cu\n");
4d26a735   Pedro Roque   Increased recogni...
827
        cuCtxDestroy(dev_args->context_cu);
94b2b13d   Pedro Roque   PHACT source
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
        exit(-1);
    }
	} else {
#endif
	cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->stats_mem, CL_TRUE, 0, dev_args->stats_size, dev_args->stats, 0, NULL, NULL), "clEnqueueWriteBuffer stats_mem",
			dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

	if (N_DEVS > 1) {
		pthread_mutex_unlock(stats_lock);
	}
}

bool get_filtering_results(device_args* dev_args, device_info* dev_info) {

4d26a735   Pedro Roque   Increased recogni...
845
	if (DOMAIN_TYPE == BITMAP_) {
94b2b13d   Pedro Roque   PHACT source
846
847
848

#if RUN_IN_CUDA

4d26a735   Pedro Roque   Increased recogni...
849
	    CUresult err = cuInit(0);
94b2b13d   Pedro Roque   PHACT source
850
851
852
853

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {

			err = cuMemcpyDtoH(dev_args->filt_bitmaps, dev_args->filt_domains_mem_cu, dev_args->filt_domains_size);
4d26a735   Pedro Roque   Increased recogni...
854
855
			if (err != CUDA_SUCCESS) {
				fprintf(stderr, "CUDA error in cuMemcpyDtoH filt_domains_mem_cu in %s\n", dev_info->dev_name);
94b2b13d   Pedro Roque   PHACT source
856
857
				cuCtxDestroy(dev_args->context_cu);
				exit(-1);
4d26a735   Pedro Roque   Increased recogni...
858
859
860
861
			}
		} else {
#endif
			// Transfer filtered CSP
94b2b13d   Pedro Roque   PHACT source
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_domains_mem, CL_TRUE, 0, dev_args->filt_domains_size, dev_args->filt_bitmaps, 0, NULL, NULL),
					"clEnqueueReadBuffer filt_domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		bitmap b_result;
		b_clear(&b_result);
		b_copy_dev_to_host(&b_result, dev_args->filt_bitmaps, 0);

		// consistent CSP after filtering
		if (!b_is_empty(&b_result)) {

			if (CS_IGNORE) {
				unsigned int i;
4d26a735   Pedro Roque   Increased recogni...
877
878

#if RUN_IN_CUDA
94b2b13d   Pedro Roque   PHACT source
879
880

				if (dev_info->type == CL_DEVICE_TYPE_GPU) {
4d26a735   Pedro Roque   Increased recogni...
881
882
883

					err = cuMemcpyDtoH(dev_args->filt_cs, dev_args->filt_cs_mem_cu, dev_args->filt_cs_size);
					if (err != CUDA_SUCCESS) {
94b2b13d   Pedro Roque   PHACT source
884
						fprintf(stderr, "CUDA error in cuMemcpyDtoH filt_cs_mem_cu\n");
4d26a735   Pedro Roque   Increased recogni...
885
						cuCtxDestroy(dev_args->context_cu);
94b2b13d   Pedro Roque   PHACT source
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
						exit(-1);
					}
				} else {
#endif
					// Transfer cs_ignore results
					cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_cs_mem, CL_TRUE, 0, dev_args->filt_cs_size, dev_args->filt_cs, 0, NULL, NULL),
							"clEnqueueReadBuffer filt_cs_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

				for (i = 0; i < N_CS; i++) {
					if (dev_args->filt_cs[i] == 1) {
						CS[i].ignore = true;

					} else {
						CS[i].ignore = false;
					}
				}
			}