/* * config_device.c * * Created on: 04/03/2016 * Author: pedro */ #include #include #include #include // for elapsed time calculation under windows #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) #include #include #include #else #include #endif #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #include "CL/cl.h" #include "CL/cl_platform.h" #include "config_device.h" #include "bitmaps.h" #include "kernels/cl_constraints.h" #include "kernels/cl_variables.h" #include "config.h" #include "devices.h" #include "constraints.h" #include "split.h" #include "utils/cl_errors.h" #include "utils/benchmark.h" #include "variables.h" #if RUN_IN_CUDA #include #include #endif /* * Initialize device to do labeling, propagation and backtracking on device * dev_info - device_info structure about this device * dev_args - device_args structure about this device * platform_args - arguments for the platform */ void init_device(device_info* dev_info, device_args* dev_args, bool filtering) { #if RUN_IN_CUDA if (dev_info->type == CL_DEVICE_TYPE_GPU) { struct timeval start_cu, end_cu; char start_time_cu[40]; // for elapse time calculation char end_time_cu[40]; // for elapse time calculation char elapsed_time_cu[40]; // for elapse time calculation char cu_build_opt[2000]; char cu_cs_usage[40]; int max_com_size = 2000; char command_cp[max_com_size]; const char kernel_filename[] = "src/kernels/cl_explore.cu"; const char fatbin_filename[] = "src/kernels/cl_explore.fatbin"; const char kernel_name[] = "explore"; CUresult err = cuInit(0); unsigned int k, l; // get first CUDA device err = cuDeviceGet(&dev_args->device_cu, 0); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error cuDeviceGet\n"); exit(-1); } char name[100]; cuDeviceGetName(name, 100, dev_args->device_cu); // -save-temps sprintf(cu_build_opt, " -fatbin -ccbin g++ --cudart=static -gencode arch=compute_30,code=sm_30 -gencode arch=compute_30,code=compute_30" " -D CUDA_VERSION=1 -D __OPENCL_VERSION__=120 -D CL_N_VS=%d -D CL_N_CS=%d -D CL_N_VS_TO_LABEL=%d -D CL_SPLIT_VALUES_EXT=%d -D CL_N_VS_CS=%d -D CL_N_CS_VS=%d" " -D CL_BITS=%d -D CL_WORD=%d -D CL_MEM=%d -D CL_LABEL_M=%d -D CL_ASSIGN_M=%d -D CL_WORK=%d -D CL_OPT_M=%d -D CL_VAR_ID_TO_OPT=%d -D CL_D_TYPE=%d" " -D CL_D_MAX=%d -D CL_D_MIN=%d -D CL_STATS=%u -D CL_INTS_CONST=%d -D CL_B_DS_CONST=%d -D CL_VS_CONST=%d -D CL_CS_CONST=%d -D CL_N_SHARED_SS=%d -D CL_N_DEVS=%d" " -D PRINT_SOLS=%d -D CL_PRE_LABELING=%d -D CL_CS_IGNORE=%d -D CL_BOOLEAN_VS=%d -D CL_TO_LABEL_THRESHOLD=%d -D CL_FILTERING=%d " " -D CL_USE_N_BUFFERS=%d -D CL_N_TERMS=%d -D CL_CHECK_ERRORS=%d -D CL_VERIFY_SOLS=%d -I src/ -I src/utils/ -I src/kernels/", N_VS, N_CS, dev_args->n_vs_to_label, dev_args->split_values_ext, dev_args->n_vs_cs, dev_args->n_cs_vs, CL_BITS_, CL_WORD_, dev_info->use_local_mem, LABEL_MODE, ASSIGN_MODE, WORK, OPT_MODE, VAR_ID_TO_OPT, DOMAIN_TYPE, D_MAX, D_MIN, PRINT_STATS, dev_args->ints_const, dev_args->b_ds_const, dev_args->cl_vs_const, dev_args->cl_cs_const, dev_args->n_shared_stores, N_DEVS, PRINT_SOLUTIONS, REV, CS_IGNORE, BOOLEAN_VS, TO_LABEL_THRESHOLD, filtering, dev_info->n_buffers, dev_info->n_terms, CL_CHECK_ERRORS, CL_VERIFY_SOLS); // to tell OpenCL compiler which constraints should be compiled and which ones uses reification for (k = 0; k < N_C_TYPES; k++) { #if COMPILE_ALL_CS == 0 if (USE_CS[k] == 1) { #endif sprintf(cu_cs_usage, " -D CS_%s=1", cs_get_type((c_kind)k)); strcat(cu_build_opt, cu_cs_usage); #if COMPILE_ALL_CS == 0 if (USE_CS_REIFI[k] == 1) { #endif sprintf(cu_cs_usage, " -D CS_R_%s=1", cs_get_type((c_kind)k)); strcat(cu_build_opt, cu_cs_usage); #if COMPILE_ALL_CS == 0 } } #endif } #if DEBUG_IN_CUDA fprintf(stderr, "Debug activated inside kernel (only available for Nvidia CPUs).\n\n"); char cu_add[100]; sprintf(cu_add, " -g -G -O0 "); strcat(cu_build_opt, cu_add); snprintf(command_cp, (unsigned long)max_com_size, "nvcc %s -o %s -v %s", kernel_filename, fatbin_filename, cu_build_opt); printf("\n\nNVCC command:\n%s\n\n", command_cp); #else snprintf(command_cp, (unsigned long)max_com_size, "nvcc %s -o %s %s", kernel_filename, fatbin_filename, cu_build_opt); #endif // for elapsed time calculation if (VERBOSE) { gettimeofday(&start_cu, NULL); } int nvcc_exit_status = system(command_cp); if (nvcc_exit_status) { printf("ERROR: nvcc exits with status code: %d\n", nvcc_exit_status); exit(-1); } if (VERBOSE) { gettimeofday(&end_cu, NULL); format_elapsed_time_s_ms(elapsed_time_cu, start_cu.tv_sec, start_cu.tv_usec, end_cu.tv_sec, end_cu.tv_usec); format_time_s_ms(start_time_cu, start_cu.tv_sec, start_cu.tv_usec); format_time_s_ms(end_time_cu, end_cu.tv_sec, end_cu.tv_usec); printf("%s...%s = %s (s.ms) -> CUDA kernel compiled for %s\n", start_time_cu, end_time_cu, elapsed_time_cu, name); } // for elapsed time calculation if (VERBOSE) { gettimeofday(&start_cu, NULL); } err = cuCtxCreate(&dev_args->context_cu, 0, dev_args->device_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error cuCtxCreate\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (VERBOSE) { gettimeofday(&end_cu, NULL); format_elapsed_time_s_ms(elapsed_time_cu, start_cu.tv_sec, start_cu.tv_usec, end_cu.tv_sec, end_cu.tv_usec); format_time_s_ms(start_time_cu, start_cu.tv_sec, start_cu.tv_usec); format_time_s_ms(end_time_cu, end_cu.tv_sec, end_cu.tv_usec); printf("%s...%s = %s (s.ms) -> CUDA context created\n", start_time_cu, end_time_cu, elapsed_time_cu); } // for elapsed time calculation if (VERBOSE) { gettimeofday(&start_cu, NULL); } err = cuModuleLoad(&dev_args->module_cu, fatbin_filename); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error loading the fatbin %s\n", fatbin_filename); cuCtxDestroy(dev_args->context_cu); exit(-0); } if (VERBOSE) { gettimeofday(&end_cu, NULL); format_elapsed_time_s_ms(elapsed_time_cu, start_cu.tv_sec, start_cu.tv_usec, end_cu.tv_sec, end_cu.tv_usec); format_time_s_ms(start_time_cu, start_cu.tv_sec, start_cu.tv_usec); format_time_s_ms(end_time_cu, end_cu.tv_sec, end_cu.tv_usec); printf("%s...%s = %s (s.ms) -> CUDA kernel loaded\n", start_time_cu, end_time_cu, elapsed_time_cu); } // for elapsed time calculation if (VERBOSE) { gettimeofday(&start_cu, NULL); } err = cuModuleGetFunction(&dev_args->function_cu, dev_args->module_cu, kernel_name); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error getting kernel function %s\n", kernel_name); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (VERBOSE) { gettimeofday(&end_cu, NULL); format_elapsed_time_s_ms(elapsed_time_cu, start_cu.tv_sec, start_cu.tv_usec, end_cu.tv_sec, end_cu.tv_usec); format_time_s_ms(start_time_cu, start_cu.tv_sec, start_cu.tv_usec); format_time_s_ms(end_time_cu, end_cu.tv_sec, end_cu.tv_usec); printf("%s...%s = %s (s.ms) -> CUDA function obtained\n", start_time_cu, end_time_cu, elapsed_time_cu); } // 0...cs_vs_idx - each constraint list of constrained variables ids placed per constraint order // cs_vs_idx...cs_vs_idx+vs_cs_idx - each variable list of constraints ids placed per variable order // cs_vs_idx+vs_cs_idx...cs_vs_idx+vs_cs_idx+n_const_cs - each constraint list of constants placed per constraint order err = cuMemAlloc(&dev_args->ints_mem_cu, dev_args->ints_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc ints_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->ints = calloc(dev_args->ints_size / sizeof(cl_int), sizeof(cl_int)); // buffer for cl_constr constant data err = cuMemAlloc(&dev_args->cl_cs_mem_cu, dev_args->cl_cs_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc cl_cs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->cl_cs = malloc(dev_args->cl_cs_size); unsigned int n_vs_cs_cnt_cu = 0; // Fills constraints and variables per constraint buffer for (k = 0; k < N_CS; k++) { dev_args->cl_cs[k].kind = CS[k].kind; dev_args->cl_cs[k].n_c_vs = CS[k].n_c_vs; dev_args->cl_cs[k].n_c_consts = CS[k].n_c_consts; dev_args->cl_cs[k].v_idx = n_vs_cs_cnt_cu; dev_args->cl_cs[k].constant_val = CS[k].constant_val; dev_args->cl_cs[k].reif_var_id = CS[k].reif_v_id; dev_args->cl_cs[k].reified = CS[k].reified; dev_args->cl_cs[k].boolean = CS[k].boolean; dev_args->cl_cs[k].c_id = CS[k].c_id; for (l = 0; l < CS[k].n_c_vs; l++) { dev_args->ints[n_vs_cs_cnt_cu++] = CS[k].c_vs[l]->v_id; } } // Fills variables and constraints per variable buffer int n_cs_vs_cnt_cu = 0; if (DOMAIN_TYPE == BITMAP_) { err = cuMemAlloc(&dev_args->cl_vs_mem_cu, dev_args->cl_vs_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc cl_vs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->cl_vs_bitmaps = malloc(dev_args->cl_vs_size); err = cuMemAlloc(&dev_args->b_ds_mem_cu, dev_args->b_ds_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc b_ds_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->b_ds = malloc(dev_args->b_ds_size); vs_copy_host_to_dev(dev_args->b_ds, VS, N_VS); for (k = 0; k < N_VS; k++) { dev_args->cl_vs_bitmaps[k].n_cs = VS[k].n_cs; dev_args->cl_vs_bitmaps[k].to_label = VS[k].to_label; dev_args->cl_vs_bitmaps[k].boolean = VS[k].boolean; dev_args->cl_vs_bitmaps[k].expanded = VS[k].expanded; dev_args->cl_vs_bitmaps[k].n_vals = VS[k].n_vals; if (VS[k].n_cs > 0) { dev_args->cl_vs_bitmaps[k].c_idx = (unsigned int)n_cs_vs_cnt_cu; for (l = 0; l < VS[k].n_cs; l++) { dev_args->ints[dev_args->n_vs_cs + (unsigned int)n_cs_vs_cnt_cu] = (int)VS[k].cs[l]->c_id; n_cs_vs_cnt_cu++; } } else { dev_args->cl_vs_bitmaps[k].c_idx = 0; } } } else if (DOMAIN_TYPE == INTERVAL) { // buffer for cl_var constant data err = cuMemAlloc(&dev_args->cl_vs_mem_cu, dev_args->cl_vs_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc cl_vs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->cl_vs_intervals = malloc(dev_args->cl_vs_size); for (k = 0; k < N_VS; k++) { dev_args->cl_vs_intervals[k].domain.s[0] = VS[k].domain_i.s[0]; dev_args->cl_vs_intervals[k].domain.s[1] = VS[k].domain_i.s[1]; dev_args->cl_vs_intervals[k].n_cs = VS[k].n_cs; dev_args->cl_vs_intervals[k].c_idx = (unsigned int)n_cs_vs_cnt_cu; dev_args->cl_vs_intervals[k].to_label = VS[k].to_label; dev_args->cl_vs_intervals[k].boolean = VS[k].boolean; dev_args->cl_vs_intervals[k].expanded = VS[k].expanded; dev_args->cl_vs_intervals[k].n_vals = VS[k].n_vals; if (VS[k].n_cs > 0) { dev_args->cl_vs_intervals[k].c_idx = (unsigned int)n_cs_vs_cnt_cu; for (l = 0; l < VS[k].n_cs; l++) { dev_args->ints[dev_args->n_vs_cs + (unsigned int)n_cs_vs_cnt_cu] = (int)VS[k].cs[l]->c_id; n_cs_vs_cnt_cu++; } } else { dev_args->cl_vs_intervals[k].c_idx = 0; } } } // Fills constraint constant values, if existent unsigned int cs_consts_cnt_cu = 0; for (k = 0; k < N_CS; k++) { if (CS[k].n_c_consts > 0) { dev_args->cl_cs[k].const_idx = cs_consts_cnt_cu; for (l = 0; l < CS[k].n_c_consts; l++) { dev_args->ints[dev_args->n_vs_cs + dev_args->n_cs_vs + cs_consts_cnt_cu] = CS[k].c_consts[l]; cs_consts_cnt_cu++; } } else { dev_args->cl_cs[k].const_idx = 0; } } // if not using local memory (using global memory only) if (!dev_info->use_local_mem) { // buffer for cl_vs_prop data err = cuMemAlloc(&dev_args->cl_vs_prop_mem_cu, dev_args->cl_vs_prop_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc cl_vs_prop_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } // buffer for cl_vs_prop data err = cuMemAlloc(&dev_args->vs_id_to_prop_mem_cu, dev_args->vs_id_to_prop_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc vs_id_to_prop_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } // buffer for backtracking history // 0...(n_vs_to_label+1)*N_VS*wi_total - backtracking history err = cuMemAlloc(&dev_args->backtrack_mem1_cu, dev_args->backtrack_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc backtrack_mem1_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } // more buffers for backtracking #if USE_MORE_BUFFERS if (dev_info->n_buffers > 1) { err = cuMemAlloc(&dev_args->backtrack_mem2_cu, dev_args->backtrack_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc backtrack_mem2_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (dev_info->n_buffers > 2) { err = cuMemAlloc(&dev_args->backtrack_mem3_cu, dev_args->backtrack_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc backtrack_mem3_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (dev_info->n_buffers > 3) { err = cuMemAlloc(&dev_args->backtrack_mem4_cu, dev_args->backtrack_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc backtrack_mem4_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } #endif // (dev_args->n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * dev_args->split_values_ext) * 2 - to use in kernel (hist_labeleds_id and hist_labeleds_n_vals) // n_terms * dev_args->wi_total - to use in propagators // D_MAX+1 - for ss generation err = cuMemAlloc(&dev_args->generic_mem_cu, dev_args->generic_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc generic_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (CS_IGNORE) { // N_CS * dev_args->wi_total - to flag CS_IGNORE err = cuMemAlloc(&dev_args->cs_ignore_mem_cu, dev_args->cs_ignore_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc cs_ignore_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } // if all solutions must be found if (WORK == CNT) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS...5+N_VS+N_WG*N_WI_WG - number of solutions found per work-item err = cuMemAlloc(&dev_args->atoms_mem_cu, dev_args->atoms_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc atoms_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->atoms = calloc(dev_args->atoms_size / sizeof(cl_uint), sizeof(cl_uint)); // if only one solution must be found } else if (WORK == ONE) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS - solution found flag err = cuMemAlloc(&dev_args->atoms_mem_cu, dev_args->atoms_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc atoms_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->atoms = calloc(dev_args->atoms_size / sizeof(cl_uint), sizeof(cl_uint)); // buffer for domains writable data // 0...N_VS - solution domains err = cuMemAlloc(&dev_args->domains_mem_cu, dev_args->domains_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (DOMAIN_TYPE == BITMAP_) { dev_args->bitmaps = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->bitmaps, 0, dev_args->domains_size); err = cuMemcpyHtoD(dev_args->domains_mem_cu, dev_args->bitmaps, dev_args->domains_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } else if (DOMAIN_TYPE == INTERVAL) { dev_args->intervals = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->intervals, 0, N_VS * 4); err = cuMemcpyHtoD(dev_args->domains_mem_cu, dev_args->intervals, dev_args->domains_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } // if optimization } else if (WORK == OPT) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS - solution found flag // 6+N_VS - Value to optimize // 7+N_VS - WIs still working for saving the best solution err = cuMemAlloc(&dev_args->atoms_mem_cu, dev_args->atoms_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc atoms_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->atoms = calloc(dev_args->atoms_size / sizeof(cl_uint), sizeof(cl_uint)); // buffer for domains writable data // 0...N_VS*(D_MAX+1) - (D_MAX+1) solution stores because concurrency control err = cuMemAlloc(&dev_args->domains_mem_cu, dev_args->domains_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (DOMAIN_TYPE == BITMAP_) { dev_args->bitmaps = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->bitmaps, 0, dev_args->domains_size); err = cuMemcpyHtoD(dev_args->domains_mem_cu, dev_args->bitmaps, dev_args->domains_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } else if (DOMAIN_TYPE == INTERVAL) { dev_args->intervals = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->intervals, 0, N_VS * (D_MAX + 1) * 4); err = cuMemcpyHtoD(dev_args->domains_mem_cu, dev_args->intervals, dev_args->domains_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } } if (N_DEVS > 1) { // to count number of propagations done per work-item, for rank calculation err = cuMemAlloc(&dev_args->props_mem_cu, dev_args->props_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc props_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->props = calloc(dev_args->props_size / sizeof(cl_ulong), sizeof(cl_ulong)); } #if SHARED_SS > 0 // for work-sharing after the ss in the block have finished err = cuMemAlloc(&dev_args->shared_stores_mem_cu, dev_args->shared_stores_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc shared_stores_mem_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } // flags for signaling the state of each work-sharing store // 0 - next shared SS to be picked // 1 - next shared SS to be filled // 2...number of SS already filled // 3..3+CL_N_SHARED_SS - V_ID that was labeled to generate this SS err = cuMemAlloc(&dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc shared_stores_flag_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->shared_stores_flag = calloc(dev_args->shared_stores_flag_size / sizeof(cl_int), sizeof(cl_int)); #endif if (PRINT_STATS) { // 0 - nodes_fail // 1 - nodes_expl // 2 - backtracks // 3 - labels // 4 - pruning // 5 - props_ok // 6 - max_depth // ... repeat per work-item err = cuMemAlloc(&dev_args->stats_mem_cu, dev_args->stats_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc stats_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->stats = malloc(dev_args->stats_size); memset(dev_args->stats, 0, dev_args->stats_size); err = cuMemcpyHtoD(dev_args->stats_mem_cu, dev_args->stats, dev_args->stats_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD stats_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (filtering) { // 0...N_VS - size of domains_mem buffer for the filtering result err = cuMemAlloc(&dev_args->filt_domains_mem_cu, dev_args->filt_domains_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc filt_domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (DOMAIN_TYPE == BITMAP_) { dev_args->filt_bitmaps = malloc(dev_args->filt_domains_size); } else if (DOMAIN_TYPE == INTERVAL) { dev_args->filt_intervals = malloc(dev_args->filt_domains_size); } if (CS_IGNORE) { // 0...N_CS - size of filt_cs_mem buffer for the filtering err = cuMemAlloc(&dev_args->filt_cs_mem_cu, dev_args->filt_cs_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemAlloc filt_cs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } dev_args->filt_cs = malloc(dev_args->filt_cs_size); } } // write permanent data to device buffers err = cuMemcpyHtoD(dev_args->ints_mem_cu, dev_args->ints, dev_args->ints_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD ints_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (DOMAIN_TYPE == BITMAP_) { err = cuMemcpyHtoD(dev_args->cl_vs_mem_cu, dev_args->cl_vs_bitmaps, dev_args->cl_vs_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD cl_vs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } err = cuMemcpyHtoD(dev_args->b_ds_mem_cu, dev_args->b_ds, dev_args->b_ds_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD b_ds_mem_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } else if (DOMAIN_TYPE == INTERVAL) { err = cuMemcpyHtoD(dev_args->cl_vs_mem_cu, dev_args->cl_vs_intervals, dev_args->cl_vs_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD cl_vs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } err = cuMemcpyHtoD(dev_args->cl_cs_mem_cu, dev_args->cl_cs, dev_args->cl_cs_size); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemcpyHtoD cl_cs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } // Set OpenCL kernel arguments cl_uint arg_number_cu = 8; #if USE_MORE_BUFFERS if (dev_info->n_buffers > 1) { arg_number_cu++; } if (dev_info->n_buffers > 2) { arg_number_cu++; } if (dev_info->n_buffers > 3) { arg_number_cu++; } #endif if (CS_IGNORE) { arg_number_cu++; } if (WORK == ONE || WORK == OPT) { arg_number_cu++; } if (DOMAIN_TYPE == BITMAP_) { arg_number_cu++; } if (PRINT_STATS) { arg_number_cu++; } if (N_DEVS > 1) { arg_number_cu++; } #if SHARED_SS > 0 arg_number_cu += 2; #endif if (filtering) { arg_number_cu++; if (CS_IGNORE) { arg_number_cu++; } } dev_args->kernel_args_cu = malloc(arg_number_cu * sizeof(CUdeviceptr)); arg_number_cu = 0; dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->atoms_mem_cu; dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->ints_mem_cu; dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->backtrack_mem1_cu; // more buffers for backtracking #if USE_MORE_BUFFERS if (dev_info->n_buffers > 1) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->backtrack_mem2_cu; } if (dev_info->n_buffers > 2) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->backtrack_mem3_cu; } if (dev_info->n_buffers > 3) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->backtrack_mem4_cu; } #endif dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->generic_mem_cu; if (CS_IGNORE) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->cs_ignore_mem_cu; } if (WORK == ONE || WORK == OPT) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->domains_mem_cu; } dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->cl_vs_mem_cu; dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->cl_cs_mem_cu; // if using local memory if (dev_info->use_local_mem) { dev_args->shared_memory_size_cu = dev_args->cl_vs_prop_size; dev_args->shared_memory_size_cu += dev_args->vs_id_to_prop_size; // if not using local memory } else { dev_args->shared_memory_size_cu = 0; dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->cl_vs_prop_mem_cu; dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->vs_id_to_prop_mem_cu; } if (DOMAIN_TYPE == BITMAP_) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->b_ds_mem_cu; } if (PRINT_STATS) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->stats_mem_cu; } if (N_DEVS > 1) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->props_mem_cu; } #if SHARED_SS > 0 dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->shared_stores_mem_cu; dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->shared_stores_flag_mem_cu; #endif if (filtering) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->filt_domains_mem_cu; if (CS_IGNORE) { dev_args->kernel_args_cu[arg_number_cu++] = &dev_args->filt_cs_mem_cu; } } } else { #endif dev_args->cq = NULL; // Queue for buffering devices operations dev_args->kernel = NULL; // kernel cl_int ret; // Returned value from each OpenCL host function unsigned int i, j; struct timeval start, end; char start_time[40]; // for elapse time calculation char end_time[40]; // for elapse time calculation char elapsed_time[40]; // for elapse time calculation // for elapsed time calculation if (VERBOSE) { gettimeofday(&start, NULL); } cl_context_properties contextProperties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) dev_info->platform_id, 0 }; dev_info->context = clCreateContext(contextProperties, 1, &dev_info->device_id, NULL, NULL, &ret); cl_check_error(ret, "clCreateContext", dev_info->dev_name); if (VERBOSE) { gettimeofday(&end, NULL); format_elapsed_time_s_ms(elapsed_time, start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); format_time_s_ms(start_time, start.tv_sec, start.tv_usec); format_time_s_ms(end_time, end.tv_sec, end.tv_usec); printf("%s...%s = %s (s.ms) -> OpenCL context created for %s (%d)\n", start_time, end_time, elapsed_time, dev_info->dev_name, dev_info->dev_type_n); } // Load kernel source file FILE *fp; const char kernel_file[] = "src/kernels/cl_explore.cu"; size_t src_size; char *src_str; fp = fopen(kernel_file, "r"); if (!fp) { fprintf(stderr, "Failed to load cl_explore kernel at %s.\n", kernel_file); exit(-1); } src_str = (char *) malloc(0x1000000); src_size = fread(src_str, 1, 0x1000000, fp); fclose(fp); // kernel build options to load header files needed in kernel // -Werror AMD_OCL_BUILD_OPTIONS_APPEND=-cl-opt-disable -cl-strict-aliasing AMD_OCL_BUILD_OPTIONS_APPEND=-save-temps // CL_CONFIG_USE_VECTORIZER=false oclgrind --inst-counts --check-api --data-races --uninitialized --uniform-writes // --constant-mem-size 65536 --global-mem-size 4231331840 --local-mem-size 49152 ./PHACT -E QUEENS 5 -D GPU -COUNT -V // // -cl-nv-verbose -save-temps -Werror char build_opt[2000]; sprintf(build_opt, " -Werror -D CUDA_VERSION=0 -cl-std=CL1.2 -D CL_N_VS=%d -D CL_N_CS=%d -D CL_N_VS_TO_LABEL=%d -D CL_SPLIT_VALUES_EXT=%d -D CL_N_VS_CS=%d -D CL_N_CS_VS=%d" " -D CL_BITS=%d -D CL_WORD=%d -D CL_MEM=%d -D CL_LABEL_M=%d -D CL_ASSIGN_M=%d -D CL_WORK=%d -D CL_OPT_M=%d -D CL_VAR_ID_TO_OPT=%d -D CL_D_TYPE=%d" " -D CL_D_MAX=%d -D CL_D_MIN=%d -D CL_STATS=%u -D CL_INTS_CONST=%d -D CL_B_DS_CONST=%d -D CL_VS_CONST=%d -D CL_CS_CONST=%d -D CL_N_SHARED_SS=%d -D CL_N_DEVS=%d" " -D PRINT_SOLS=%d -D CL_PRE_LABELING=%d -D CL_CS_IGNORE=%d -D CL_BOOLEAN_VS=%d -D CL_TO_LABEL_THRESHOLD=%d -D CL_FILTERING=%d " " -D CL_USE_N_BUFFERS=%d -D CL_N_TERMS=%d -D CL_CHECK_ERRORS=%d -D CL_VERIFY_SOLS=%d -I src/ -I src/utils/ -I src/kernels/", N_VS, N_CS, dev_args->n_vs_to_label, dev_args->split_values_ext, dev_args->n_vs_cs, dev_args->n_cs_vs, CL_BITS_, CL_WORD_, dev_info->use_local_mem, LABEL_MODE, ASSIGN_MODE, WORK, OPT_MODE, VAR_ID_TO_OPT, DOMAIN_TYPE, D_MAX, D_MIN, PRINT_STATS, dev_args->ints_const, dev_args->b_ds_const, dev_args->cl_vs_const, dev_args->cl_cs_const, dev_args->n_shared_stores, N_DEVS, PRINT_SOLUTIONS, REV, CS_IGNORE, BOOLEAN_VS, TO_LABEL_THRESHOLD, filtering, dev_info->n_buffers, dev_info->n_terms, CL_CHECK_ERRORS, CL_VERIFY_SOLS); #if DEBUG if (dev_info->type == CL_DEVICE_TYPE_CPU) { fprintf(stderr, "Debug activated inside kernel (only available for Intel CPUs).\n\n"); char add[100]; sprintf(add, " -g -s src/kernels/cl_explore.cu "); strcat(build_opt, add); } #endif #if CL_COMP_OPT == 0 fprintf(stderr, "The OpenCL compiler optimizations are disabled.\n\n"); char add[100]; sprintf(add, " -cl-opt-disable "); strcat(build_opt, add); #endif #if COMPILE_ALL_CS == 1 printf("All propagators are being compiled in kernel.\n\n"); #endif // to tell OpenCL compiler which constraints should be compiled and which ones uses reification char cs_usage[40]; for (i = 0; i < N_C_TYPES; i++) { #if COMPILE_ALL_CS == 0 if (USE_CS[i] == 1) { #endif sprintf(cs_usage, " -D CS_%s=1", cs_get_type((c_kind)i)); strcat(build_opt, cs_usage); #if COMPILE_ALL_CS == 0 if (USE_CS_REIFI[i] == 1) { #endif sprintf(cs_usage, " -D CS_R_%s=1", cs_get_type((c_kind)i)); strcat(build_opt, cs_usage); #if COMPILE_ALL_CS == 0 } } #endif } // Creates a program object dev_info->prog = clCreateProgramWithSource(dev_info->context, 1, (const char**) &src_str, (const size_t *) &src_size, &ret); cl_check_error(ret, "clCreateProgramWithSource", dev_info->dev_name); free(src_str); // for elapsed time calculation if (VERBOSE) { gettimeofday(&start, NULL); } // Compiles and links the kernel and check for errors cl_check_build_error(clBuildProgram(dev_info->prog, 1, &dev_info->device_id, build_opt, NULL, NULL), &dev_info->prog, &dev_info->device_id, dev_info->dev_name); if (VERBOSE) { gettimeofday(&end, NULL); format_elapsed_time_s_ms(elapsed_time, start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); format_time_s_ms(start_time, start.tv_sec, start.tv_usec); format_time_s_ms(end_time, end.tv_sec, end.tv_usec); printf("%s...%s = %s (s.ms) -> Kernel compiled for %s (%d)\n", start_time, end_time, elapsed_time, dev_info->dev_name, dev_info->dev_type_n); } // Create Command Queue dev_args->cq = clCreateCommandQueue(dev_info->context, dev_info->device_id, 0, &ret); cl_check_error(ret, "clCreateCommandQueue", dev_info->dev_name); // Create OpenCL kernels allowing it to be called from this source code dev_args->kernel = clCreateKernel(dev_info->prog, "explore", &ret); cl_check_error(ret, "clCreateKernel", dev_info->dev_name); // 0...cs_vs_idx - each constraint list of constrained variables ids placed per constraint order // cs_vs_idx...cs_vs_idx+vs_cs_idx - each variable list of constraints ids placed per variable order // cs_vs_idx+vs_cs_idx...cs_vs_idx+vs_cs_idx+n_const_cs - each constraint list of constants placed per constraint order dev_args->ints_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, dev_args->ints_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer int_p_mem", dev_info->dev_name); dev_args->ints = calloc(dev_args->ints_size / sizeof(cl_int), sizeof(cl_int)); // buffer for cl_constr constant data dev_args->cl_cs_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, dev_args->cl_cs_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer cl_cs_p_mem", dev_info->dev_name); dev_args->cl_cs = malloc(dev_args->cl_cs_size); // Fills constraints and variables per constraint buffer unsigned int n_vs_cs_cnt = 0; for (i = 0; i < N_CS; i++) { dev_args->cl_cs[i].kind = CS[i].kind; dev_args->cl_cs[i].n_c_vs = CS[i].n_c_vs; dev_args->cl_cs[i].n_c_consts = CS[i].n_c_consts; dev_args->cl_cs[i].v_idx = n_vs_cs_cnt; dev_args->cl_cs[i].constant_val = CS[i].constant_val; dev_args->cl_cs[i].reif_var_id = CS[i].reif_v_id; dev_args->cl_cs[i].reified = CS[i].reified; dev_args->cl_cs[i].boolean = CS[i].boolean; dev_args->cl_cs[i].c_id = CS[i].c_id; for (j = 0; j < CS[i].n_c_vs; j++) { dev_args->ints[n_vs_cs_cnt++] = CS[i].c_vs[j]->v_id; } } // Fills variables and constraints per variable buffer int n_cs_vs_cnt = 0; if (DOMAIN_TYPE == BITMAP_) { dev_args->cl_vs_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, dev_args->cl_vs_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer cl_vs_mem", dev_info->dev_name); dev_args->cl_vs_bitmaps = malloc(dev_args->cl_vs_size); dev_args->b_ds_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, dev_args->b_ds_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer bitmaps_aux_mem", dev_info->dev_name); dev_args->b_ds = malloc(dev_args->b_ds_size); vs_copy_host_to_dev(dev_args->b_ds, VS, N_VS); for (i = 0; i < N_VS; i++) { dev_args->cl_vs_bitmaps[i].n_cs = VS[i].n_cs; dev_args->cl_vs_bitmaps[i].to_label = VS[i].to_label; dev_args->cl_vs_bitmaps[i].boolean = VS[i].boolean; dev_args->cl_vs_bitmaps[i].expanded = VS[i].expanded; dev_args->cl_vs_bitmaps[i].n_vals = VS[i].n_vals; if (VS[i].n_cs > 0) { dev_args->cl_vs_bitmaps[i].c_idx = (unsigned int)n_cs_vs_cnt; for (j = 0; j < VS[i].n_cs; j++) { dev_args->ints[dev_args->n_vs_cs + (unsigned int)n_cs_vs_cnt] = (int)VS[i].cs[j]->c_id; n_cs_vs_cnt++; } } else { dev_args->cl_vs_bitmaps[i].c_idx = 0; } } } else if (DOMAIN_TYPE == INTERVAL) { // buffer for cl_var constant data dev_args->cl_vs_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, dev_args->cl_vs_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer cl_vs_p_mem", dev_info->dev_name); dev_args->cl_vs_intervals = malloc(dev_args->cl_vs_size); for (i = 0; i < N_VS; i++) { dev_args->cl_vs_intervals[i].domain.s[0] = VS[i].domain_i.s[0]; dev_args->cl_vs_intervals[i].domain.s[1] = VS[i].domain_i.s[1]; dev_args->cl_vs_intervals[i].n_cs = VS[i].n_cs; dev_args->cl_vs_intervals[i].c_idx = (unsigned int)n_cs_vs_cnt; dev_args->cl_vs_intervals[i].to_label = VS[i].to_label; dev_args->cl_vs_intervals[i].boolean = VS[i].boolean; dev_args->cl_vs_intervals[i].expanded = VS[i].expanded; dev_args->cl_vs_intervals[i].n_vals = VS[i].n_vals; if (VS[i].n_cs > 0) { dev_args->cl_vs_intervals[i].c_idx = (unsigned int)n_cs_vs_cnt; for (j = 0; j < VS[i].n_cs; j++) { dev_args->ints[dev_args->n_vs_cs + (unsigned int)n_cs_vs_cnt] = (int)VS[i].cs[j]->c_id; n_cs_vs_cnt++; } } else { dev_args->cl_vs_intervals[i].c_idx = 0; } } } // Fills constraint constant values, if existent unsigned int cs_consts_cnt = 0; for (i = 0; i < N_CS; i++) { if (CS[i].n_c_consts > 0) { dev_args->cl_cs[i].const_idx = cs_consts_cnt; for (j = 0; j < CS[i].n_c_consts; j++) { dev_args->ints[dev_args->n_vs_cs + dev_args->n_cs_vs + cs_consts_cnt] = CS[i].c_consts[j]; cs_consts_cnt++; } } else { dev_args->cl_cs[i].const_idx = 0; } } // if not using local memory (using global memory only) if (!dev_info->use_local_mem) { // buffer for cl_vs_prop data dev_args->cl_vs_prop_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->cl_vs_prop_size, NULL, &ret); // buffer for vs_id_to_prop data dev_args->vs_id_to_prop_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->vs_id_to_prop_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer vs_id_to_prop_mem", dev_info->dev_name); } // buffer for backtracking history // 0...(n_vs_to_label+1)*N_VS*wi_total - backtracking history dev_args->backtrack_mem1 = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->backtrack_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer backtrack_mem1", dev_info->dev_name); // more buffers for backtracking #if USE_MORE_BUFFERS if (dev_info->n_buffers > 1) { dev_args->backtrack_mem2 = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->backtrack_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer backtrack_mem2", dev_info->dev_name); } if (dev_info->n_buffers > 2) { dev_args->backtrack_mem3 = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->backtrack_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer backtrack_mem3", dev_info->dev_name); } if (dev_info->n_buffers > 3) { dev_args->backtrack_mem4 = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->backtrack_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer backtrack_mem4", dev_info->dev_name); } #endif // (dev_args->n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * dev_args->split_values_ext) * 2 - to use in kernel (hist_labeleds_id and hist_labeleds_n_vals) // n_terms * dev_args->wi_total - to use in propagators // D_MAX+1 - for ss generation dev_args->generic_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->generic_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer generic_mem", dev_info->dev_name); if (CS_IGNORE) { // N_CS * dev_args->wi_total - to flag CS_IGNORE dev_args->cs_ignore_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->cs_ignore_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer cs_ignore_mem", dev_info->dev_name); } // if all solutions must be found if (WORK == CNT) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS...5+N_VS+N_WG*N_WI_WG - number of solutions found per work-item dev_args->atoms_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->atoms_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer atoms_p_mem", dev_info->dev_name); dev_args->atoms = calloc(dev_args->atoms_size / sizeof(cl_uint), sizeof(cl_uint)); // if only one solution must be found } else if (WORK == ONE) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS - solution found flag dev_args->atoms_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->atoms_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer atoms_p_mem", dev_info->dev_name); dev_args->atoms = calloc(dev_args->atoms_size / sizeof(cl_uint), sizeof(cl_uint)); // buffer for domains writable data // 0...N_VS - solution domains dev_args->domains_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->domains_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer domains_mem", dev_info->dev_name); if (DOMAIN_TYPE == BITMAP_) { dev_args->bitmaps = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->bitmaps, 0, dev_args->domains_size); cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->bitmaps, 0, NULL, NULL), "clEnqueueWriteBuffer domains_mem", dev_info->dev_name); } else if (DOMAIN_TYPE == INTERVAL) { dev_args->intervals = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->intervals, 0, N_VS * 4); cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->intervals, 0, NULL, NULL), "clEnqueueWriteBuffer domains_mem", dev_info->dev_name); } // if optimization } else if (WORK == OPT) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS - solution found flag // 6+N_VS - Value to optimize // 7+N_VS - WIs still working for saving the best solution dev_args->atoms_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->atoms_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer atoms_p_mem", dev_info->dev_name); dev_args->atoms = calloc(dev_args->atoms_size / sizeof(cl_uint), sizeof(cl_uint)); // buffer for domains writable data // 0...N_VS*(D_MAX+1) - (D_MAX+1) solution stores because concurrency control dev_args->domains_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->domains_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer domains_mem", dev_info->dev_name); if (DOMAIN_TYPE == BITMAP_) { dev_args->bitmaps = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->bitmaps, 0, dev_args->domains_size); cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->bitmaps, 0, NULL, NULL), "clEnqueueWriteBuffer domains_mem", dev_info->dev_name); } else if (DOMAIN_TYPE == INTERVAL) { dev_args->intervals = malloc(dev_args->domains_size); // set buffer initial values to zero memset(dev_args->intervals, 0, N_VS * (D_MAX + 1) * 4); cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size, dev_args->intervals, 0, NULL, NULL), "clEnqueueWriteBuffer domains_mem", dev_info->dev_name); } } if (N_DEVS > 1) { // to count number of propagations done per work-item, for rank calculation dev_args->props_mem = clCreateBuffer(dev_info->context, CL_MEM_WRITE_ONLY, dev_args->props_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer props_mem", dev_info->dev_name); dev_args->props = calloc(dev_args->props_size / sizeof(cl_ulong), sizeof(cl_ulong)); } #if SHARED_SS > 0 // for work-sharing after the ss in the block have finished dev_args->shared_stores_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->shared_stores_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer shared_stores_mem", dev_info->dev_name); // flags for signaling the state of each work-sharing store // 0 - next shared SS to be picked // 1 - next shared SS to be filled // 2...number of SS already filled // 3..3+CL_N_SHARED_SS - V_ID that was labeled to generate this SS dev_args->shared_stores_flag_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->shared_stores_flag_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer shared_stores_flag_mem", dev_info->dev_name); dev_args->shared_stores_flag = calloc(dev_args->shared_stores_flag_size / sizeof(cl_int), sizeof(cl_int)); #endif if (PRINT_STATS) { // 0 - nodes_fail // 1 - nodes_expl // 2 - backtracks // 3 - labels // 4 - pruning // 5 - props_ok // 6 - max_depth // ... repeat per work-item dev_args->stats_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->stats_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer stats_mem", dev_info->dev_name); dev_args->stats = malloc(dev_args->stats_size); memset(dev_args->stats, 0, dev_args->stats_size); cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->stats_mem, CL_TRUE, 0, dev_args->stats_size, dev_args->stats, 0, NULL, NULL), "clEnqueueWriteBuffer stats_mem", dev_info->dev_name); } if (filtering) { // 0...N_VS - size of domains_mem buffer for the filtering result dev_args->filt_domains_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->filt_domains_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer filt_domains_mem", dev_info->dev_name); if (DOMAIN_TYPE == BITMAP_) { dev_args->filt_bitmaps = malloc(dev_args->filt_domains_size); } else if (DOMAIN_TYPE == INTERVAL) { dev_args->filt_intervals = malloc(dev_args->filt_domains_size); } if (CS_IGNORE) { // 0...N_CS - size of filt_cs_mem buffer for the filtering dev_args->filt_cs_mem = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, dev_args->filt_cs_size, NULL, &ret); cl_check_error(ret, "clCreateBuffer filt_cs_mem", dev_info->dev_name); dev_args->filt_cs = malloc(dev_args->filt_cs_size); } } // write permanent data to device buffers cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->ints_mem, CL_TRUE, 0, dev_args->ints_size, dev_args->ints, 0, NULL, NULL), "clEnqueueWriteBuffer int_p_mem", dev_info->dev_name); if (DOMAIN_TYPE == BITMAP_) { cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->cl_vs_mem, CL_TRUE, 0, dev_args->cl_vs_size, dev_args->cl_vs_bitmaps, 0, NULL, NULL), "clEnqueueWriteBuffer cl_vs_mem", dev_info->dev_name); cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->b_ds_mem, CL_TRUE, 0, dev_args->b_ds_size, dev_args->b_ds, 0, NULL, NULL), "clEnqueueWriteBuffer bitmaps_aux_mem", dev_info->dev_name); } else if (DOMAIN_TYPE == INTERVAL) { cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->cl_vs_mem, CL_TRUE, 0, dev_args->cl_vs_size, dev_args->cl_vs_intervals, 0, NULL, NULL), "clEnqueueWriteBuffer cl_vs_p_mem", dev_info->dev_name); } cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->cl_cs_mem, CL_TRUE, 0, dev_args->cl_cs_size, dev_args->cl_cs, 0, NULL, NULL), "clEnqueueWriteBuffer cl_cs_p_mem", dev_info->dev_name); // Set OpenCL kernel arguments cl_uint arg_number = 0; cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->atoms_mem), &dev_args->atoms_mem), "clSetKernelArg atoms_p_mem", dev_info->dev_name); cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->ints_mem), &dev_args->ints_mem), "clSetKernelArg int_p_mem", dev_info->dev_name); cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->backtrack_mem1), &dev_args->backtrack_mem1), "clSetKernelArg backtrack_mem1", dev_info->dev_name); // more buffers for backtracking #if USE_MORE_BUFFERS if (dev_info->n_buffers > 1) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->backtrack_mem2), &dev_args->backtrack_mem2), "clSetKernelArg backtrack_mem2", dev_info->dev_name); } if (dev_info->n_buffers > 2) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->backtrack_mem3), &dev_args->backtrack_mem3), "clSetKernelArg backtrack_mem3", dev_info->dev_name); } if (dev_info->n_buffers > 3) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->backtrack_mem4), &dev_args->backtrack_mem4), "clSetKernelArg backtrack_mem4", dev_info->dev_name); } #endif cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->generic_mem), &dev_args->generic_mem), "clSetKernelArg generic_mem", dev_info->dev_name); if (CS_IGNORE) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->cs_ignore_mem), &dev_args->cs_ignore_mem), "clSetKernelArg cs_ignore_mem", dev_info->dev_name); } if (WORK == ONE || WORK == OPT) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->domains_mem), &dev_args->domains_mem), "clSetKernelArg domains_mem", dev_info->dev_name); } cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->cl_vs_mem), &dev_args->cl_vs_mem), "clSetKernelArg cl_vs_mem", dev_info->dev_name); cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->cl_cs_mem), &dev_args->cl_cs_mem), "clSetKernelArg cl_cs_mem", dev_info->dev_name); // if using local memory if (dev_info->use_local_mem) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, dev_args->cl_vs_prop_size, NULL), "clSetKernelArg cl_vs_prop_mem", dev_info->dev_name); cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, dev_args->vs_id_to_prop_size, NULL), "clSetKernelArg vs_id_to_prop_mem", dev_info->dev_name); // if not using local memory } else { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->cl_vs_prop_mem), &dev_args->cl_vs_prop_mem), "clSetKernelArg cl_vs_prop_mem", dev_info->dev_name); cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->vs_id_to_prop_mem), &dev_args->vs_id_to_prop_mem), "clSetKernelArg vs_id_to_prop_mem", dev_info->dev_name); } if (DOMAIN_TYPE == BITMAP_) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->b_ds_mem), &dev_args->b_ds_mem), "clSetKernelArg bitmaps_aux_mem", dev_info->dev_name); } if (PRINT_STATS) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->stats_mem), &dev_args->stats_mem), "clSetKernelArg stats_mem", dev_info->dev_name); } if (N_DEVS > 1) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->props_mem), &dev_args->props_mem), "clSetKernelArg props_mem", dev_info->dev_name); } #if SHARED_SS > 0 cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->shared_stores_mem), &dev_args->shared_stores_mem), "clSetKernelArg shared_stores_mem", dev_info->dev_name); cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->shared_stores_flag_mem), &dev_args->shared_stores_flag_mem), "clSetKernelArg shared_stores_flag_mem", dev_info->dev_name); #endif if (filtering) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->filt_domains_mem), &dev_args->filt_domains_mem), "clSetKernelArg filt_domains_mem", dev_info->dev_name); if (CS_IGNORE) { cl_check_error(clSetKernelArg(dev_args->kernel, arg_number++, sizeof(dev_args->filt_cs_mem), &dev_args->filt_cs_mem), "clSetKernelArg filt_cs_mem", dev_info->dev_name); } } #if RUN_IN_CUDA } #endif } /* * Clear device objects * dev_args - device_args structure about this device * dev_info - device_info structure about this device */ void release_device(device_args* dev_args, device_info* dev_info, bool filtering) { #if RUN_IN_CUDA if (dev_info->type == CL_DEVICE_TYPE_GPU) { CUresult err = cuInit(0); err = cuMemFree(dev_args->atoms_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree atoms_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } err = cuMemFree(dev_args->ints_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree ints_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } err = cuMemFree(dev_args->backtrack_mem1_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree backtrack_mem1_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } // more buffers for backtracking #if USE_MORE_BUFFERS if (dev_info->n_buffers > 1) { err = cuMemFree(dev_args->backtrack_mem2_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree backtrack_mem2_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (dev_info->n_buffers > 2) { err = cuMemFree(dev_args->backtrack_mem3_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree backtrack_mem3_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (dev_info->n_buffers > 3) { err = cuMemFree(dev_args->backtrack_mem4_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree backtrack_mem4_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } #endif err = cuMemFree(dev_args->generic_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree generic_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (CS_IGNORE) { err = cuMemFree(dev_args->cs_ignore_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree cs_ignore_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (WORK == ONE || WORK == OPT) { err = cuMemFree(dev_args->domains_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } err = cuMemFree(dev_args->cl_vs_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree cl_vs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } err = cuMemFree(dev_args->cl_cs_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree cl_cs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } // if not using local memory if (!dev_info->use_local_mem) { err = cuMemFree(dev_args->vs_id_to_prop_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree vs_id_to_prop_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } err = cuMemFree(dev_args->cl_vs_prop_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree cl_vs_prop_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (PRINT_STATS) { err = cuMemFree(dev_args->stats_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree stats_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } free(dev_args->stats); } #if SHARED_SS > 0 err = cuMemFree(dev_args->shared_stores_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree shared_stores_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } err = cuMemFree(dev_args->shared_stores_flag_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree shared_stores_flag_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } #endif if (N_DEVS > 1) { err = cuMemFree(dev_args->props_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree props_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } if (filtering) { err = cuMemFree(dev_args->filt_domains_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree filt_domains_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } if (CS_IGNORE) { err = cuMemFree(dev_args->filt_cs_mem_cu); if (err != CUDA_SUCCESS) { fprintf(stderr, "CUDA error in cuMemFree filt_cs_mem_cu\n"); cuCtxDestroy(dev_args->context_cu); exit(-1); } } } free(dev_args->atoms); free(dev_args->ints); if (DOMAIN_TYPE == BITMAP_) { if (WORK == ONE || WORK == OPT) { free(dev_args->bitmaps); } free(dev_args->b_ds); free(dev_args->cl_vs_bitmaps); } else if (DOMAIN_TYPE == INTERVAL) { if (WORK == ONE || WORK == OPT) { free(dev_args->intervals); } free(dev_args->cl_vs_intervals); } free(dev_args->cl_cs); #if SHARED_SS > 0 free(dev_args->shared_stores_flag); #endif if (N_DEVS > 1) { free(dev_args->props); } if (filtering) { if (DOMAIN_TYPE == BITMAP_) { free(dev_args->filt_bitmaps); } else { free(dev_args->filt_intervals); } if (CS_IGNORE) { free(dev_args->filt_cs); } } cuCtxDestroy(dev_args->context_cu); } else { #endif cl_check_error(clFlush(dev_args->cq), "clFlush", dev_info->dev_name); cl_check_error(clFinish(dev_args->cq), "clFinish", dev_info->dev_name); cl_check_error(clReleaseKernel(dev_args->kernel), "clReleaseKernel", dev_info->dev_name); cl_check_error(clReleaseMemObject(dev_args->atoms_mem), "clReleaseMemObject atoms_p_mem", dev_info->dev_name); cl_check_error(clReleaseMemObject(dev_args->ints_mem), "clReleaseMemObject int_p_mem", dev_info->dev_name); cl_check_error(clReleaseMemObject(dev_args->backtrack_mem1), "clReleaseMemObject backtrack_mem1", dev_info->dev_name); // more buffers for backtracking #if USE_MORE_BUFFERS if (dev_info->n_buffers > 1) { cl_check_error(clReleaseMemObject(dev_args->backtrack_mem2), "clReleaseMemObject backtrack_mem2", dev_info->dev_name); } if (dev_info->n_buffers > 2) { cl_check_error(clReleaseMemObject(dev_args->backtrack_mem3), "clReleaseMemObject backtrack_mem3", dev_info->dev_name); } if (dev_info->n_buffers > 3) { cl_check_error(clReleaseMemObject(dev_args->backtrack_mem4), "clReleaseMemObject backtrack_mem4", dev_info->dev_name); } #endif cl_check_error(clReleaseMemObject(dev_args->generic_mem), "clReleaseMemObject generic_mem", dev_info->dev_name); if (CS_IGNORE) { cl_check_error(clReleaseMemObject(dev_args->cs_ignore_mem), "clReleaseMemObject cs_ignore_mem", dev_info->dev_name); } if (WORK == ONE || WORK == OPT) { cl_check_error(clReleaseMemObject(dev_args->domains_mem), "clReleaseMemObject domains_mem", dev_info->dev_name); } cl_check_error(clReleaseMemObject(dev_args->cl_vs_mem), "clReleaseMemObject cl_vs_mem", dev_info->dev_name); cl_check_error(clReleaseMemObject(dev_args->cl_cs_mem), "clReleaseMemObject cl_cs_mem", dev_info->dev_name); // if not using local memory if (!dev_info->use_local_mem) { cl_check_error(clReleaseMemObject(dev_args->vs_id_to_prop_mem), "clReleaseMemObject vs_id_to_prop_mem", dev_info->dev_name); cl_check_error(clReleaseMemObject(dev_args->cl_vs_prop_mem), "clReleaseMemObject cl_vs_prop_mem", dev_info->dev_name); } if (PRINT_STATS) { cl_check_error(clReleaseMemObject(dev_args->stats_mem), "clReleaseMemObject stats_mem", dev_info->dev_name); free(dev_args->stats); } #if SHARED_SS > 0 cl_check_error(clReleaseMemObject(dev_args->shared_stores_mem), "clReleaseMemObject shared_stores_mem", dev_info->dev_name); cl_check_error(clReleaseMemObject(dev_args->shared_stores_flag_mem), "clReleaseMemObject shared_stores_flag_mem", dev_info->dev_name); #endif if (N_DEVS > 1) { cl_check_error(clReleaseMemObject(dev_args->props_mem), "clReleaseMemObject props_mem", dev_info->dev_name); } if (filtering) { cl_check_error(clReleaseMemObject(dev_args->filt_domains_mem), "clReleaseMemObject filt_domains_mem", dev_info->dev_name); if (CS_IGNORE) { cl_check_error(clReleaseMemObject(dev_args->filt_cs_mem), "clReleaseMemObject filt_cs_mem", dev_info->dev_name); } } free(dev_args->atoms); free(dev_args->ints); if (DOMAIN_TYPE == BITMAP_) { if (WORK == ONE || WORK == OPT) { free(dev_args->bitmaps); } free(dev_args->b_ds); free(dev_args->cl_vs_bitmaps); } else if (DOMAIN_TYPE == INTERVAL) { if (WORK == ONE || WORK == OPT) { free(dev_args->intervals); } free(dev_args->cl_vs_intervals); } free(dev_args->cl_cs); #if SHARED_SS > 0 free(dev_args->shared_stores_flag); #endif if (N_DEVS > 1) { free(dev_args->props); } if (filtering) { if (DOMAIN_TYPE == BITMAP_) { free(dev_args->filt_bitmaps); } else { free(dev_args->filt_intervals); } if (CS_IGNORE) { free(dev_args->filt_cs); } } cl_check_error(clReleaseProgram(dev_info->prog), "clReleaseProgram", dev_info->dev_name); cl_check_error(clReleaseCommandQueue(dev_args->cq), "clReleaseCommandQueue", dev_info->dev_name); cl_check_error(clReleaseContext(dev_info->context), "clReleaseContext", dev_info->dev_name); #if RUN_IN_CUDA } #endif }