/*
 * solve.c
 *
 *  Created on: 28/01/2015
 *      Author: Pedro
 */

#include "solve.h"

/* System headers (assumed: the angle-bracket includes were missing here; pthread_mutex_t,
 * bool and memset are used below, so <pthread.h>, <stdbool.h> and <string.h> are restored) */
#include <pthread.h>
#include <stdbool.h>
#include <string.h>

#include "bitmaps.h"
#include "devices.h"
#include "domains.h"
#include "intervals.h"
#include "utils/dev_errors.h"
#include "variables.h"

/*
 * Return the number of solutions found, or 0 if no solution is found
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * depth - tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
 * stats_lock - mutex to control access to the statistics structure
 * filtering - true if being executed in the prefiltering phase
 */
cl_ulong count_sols(device_args *dev_args, device_info *dev_info, unsigned int depth, unsigned int n_ss,
		pthread_mutex_t *stats_lock, bool filtering) {
	cl_ulong n_solutions = 0;	// number of solutions found during this kernel execution
	unsigned int i;

	// buffer for atomics data (most devices only have atomics for 32-bit variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	// 3 - depth
	// 4 - WIs still working, for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
	// 5+N_VS...5+N_VS+N_WG*N_WI_WG - number of solutions found per work-item
	set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);

	dev_args->atoms[4] = (unsigned int) dev_args->wi_total;

	for (i = 5 + N_VS; i < 5 + N_VS + dev_args->wi_total; i++) {
		dev_args->atoms[i] = 0;
	}

#if RUN_IN_CUDA
	CUresult err = cuInit(0);

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
		cuda_check_error(err, "cuMemcpyHtoD atoms_mem_cu", dev_info->dev_name);
	} else {
#endif
		// Update atoms buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size,
				dev_args->atoms, 0, NULL, NULL), "clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

#if SHARED_SS > 0
#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
		cuda_check_error(err, "cuMemcpyHtoD shared_stores_flag_mem_cu", dev_info->dev_name);
	} else {
#endif
		// Update shared stores flag buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0, dev_args->shared_stores_flag_size,
				dev_args->shared_stores_flag, 0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif
#endif

#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1,
				(unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
		cuda_check_error(err, "cuLaunchKernel", dev_info->dev_name);
	} else {
#endif
		cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local,
				0, NULL, NULL), "clEnqueueNDRangeKernel", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

	if (filtering) {
		return get_filtering_results(dev_args, dev_info);
	}

#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
		cuda_check_error(err, "cuMemcpyDtoH atoms", dev_info->dev_name);
	} else {
#endif
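		// Note: per the atoms layout above, each work-item accumulates its own solution
		// count in atoms[5 + N_VS + wi]; the host reads the buffer back and sums those
		// per-work-item counters below to obtain the total for this launch.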
		// Transfer the number of solutions found
		cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size,
				dev_args->atoms, 0, NULL, NULL), "clEnqueueReadBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

	for (i = 5 + N_VS; i < 5 + N_VS + dev_args->wi_total; i++) {
		n_solutions += dev_args->atoms[i];
	}

	if (N_DEVS > 1) {
#if RUN_IN_CUDA
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
			cuda_check_error(err, "cuMemcpyDtoH props", dev_info->dev_name);
		} else {
#endif
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size,
					dev_args->props, 0, NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		dev_info->last_props = 0;
		for (i = 0; i < dev_args->wi_total; i++) {
			dev_info->props_total += dev_args->props[i];
			dev_info->last_props += dev_args->props[i];
		}
	}

	// copy statistics from device to host
	if (PRINT_STATS) {
		get_stats(dev_args, dev_info, stats_lock);
	}

	// return number of solutions found
	return n_solutions;
}

/*
 * Return 1 if a solution is found, or 0 if no solution is found
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * sol_found - atomic flag for solution found
 * depth - tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
 * stats_lock - mutex to control access to the statistics structure
 * filtering - true if being executed in the prefiltering phase
 */
cl_ulong find_one_sol(device_args *dev_args, device_info *dev_info, unsigned char *sol_found, unsigned int depth,
		unsigned int n_ss, pthread_mutex_t *stats_lock, bool filtering) {
	int sol_found_atom = 0;	// previous value of sol_found
	unsigned int i;

	// buffer for atomics data (most devices only have atomics for 32-bit variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	// 3 - depth
	// 4 - WIs still working, for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
	// 5+N_VS - solution found flag
	set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);

	dev_args->atoms[4] = (unsigned int) dev_args->wi_total;

#if RUN_IN_CUDA
	CUresult err = cuInit(0);

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
		cuda_check_error(err, "cuMemcpyHtoD atoms_mem_cu", dev_info->dev_name);
	} else {
#endif
		// Update atoms buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size,
				dev_args->atoms, 0, NULL, NULL), "clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

#if SHARED_SS > 0
#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
		cuda_check_error(err, "cuMemcpyHtoD shared_stores_flag_mem_cu", dev_info->dev_name);
	} else {
#endif
		// Update shared stores flag buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0, dev_args->shared_stores_flag_size,
				dev_args->shared_stores_flag, 0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif
#endif

#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
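		// Launch geometry note: the CUDA grid is wi_total / wi_local blocks of wi_local
		// threads each, with shared_memory_size_cu bytes of dynamic shared memory, which
		// mirrors the OpenCL enqueue below (global size wi_total, local size wi_local).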
		err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1,
				(unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
		cuda_check_error(err, "cuLaunchKernel function_cu", dev_info->dev_name);
	} else {
#endif
		cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local,
				0, NULL, NULL), "clEnqueueNDRangeKernel", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

	if (filtering) {
		return get_filtering_results(dev_args, dev_info);
	}

	if (DOMAIN_TYPE == BITMAP_) {
#if RUN_IN_CUDA
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->bitmaps, dev_args->domains_mem_cu, dev_args->domains_size);
			cuda_check_error(err, "cuMemcpyDtoH bitmaps", dev_info->dev_name);
		} else {
#endif
			// Transfer possible solution
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size,
					dev_args->bitmaps, 0, NULL, NULL), "clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif
	} else if (DOMAIN_TYPE == INTERVAL) {
#if RUN_IN_CUDA
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->intervals, dev_args->domains_mem_cu, dev_args->domains_size);
			cuda_check_error(err, "cuMemcpyDtoH intervals", dev_info->dev_name);
		} else {
#endif
			// Transfer possible solution
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size,
					dev_args->intervals, 0, NULL, NULL), "clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif
	}

	if (N_DEVS > 1) {
#if RUN_IN_CUDA
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
			cuda_check_error(err, "cuMemcpyDtoH props", dev_info->dev_name);
		} else {
#endif
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size,
					dev_args->props, 0, NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		dev_info->last_props = 0;
		for (i = 0; i < dev_args->wi_total; i++) {
			dev_info->props_total += dev_args->props[i];
			dev_info->last_props += dev_args->props[i];
		}
	}

	// copy statistics from device to host
	if (PRINT_STATS) {
		get_stats(dev_args, dev_info, stats_lock);
	}

	// if a solution was found, signal it to the other devices and save it
	// if using bitmap domains
	if (DOMAIN_TYPE == BITMAP_) {
		bitmap b_result;
		b_clear(&b_result);
		b_copy_dev_to_host(&b_result, dev_args->bitmaps, 0);

		if (!b_is_empty(&b_result)) {
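			// Note on the atomic below: both the Windows and the GCC paths return the
			// previous value of sol_found, so only the first device to raise the flag
			// (previous value 0) copies its solution into the host variables.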
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
			sol_found_atom = InterlockedCompareExchange(sol_found, 1, 0);
#else
			sol_found_atom = __atomic_fetch_add(sol_found, 1, __ATOMIC_SEQ_CST);
#endif

			if (sol_found_atom < 1) {
				vs_copy_dev_to_host(VS, dev_args->bitmaps, N_VS);
			}

			return 1;
		}

		// if using interval domains
	} else if (DOMAIN_TYPE == INTERVAL) {
#if RUN_IN_CUDA
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
			cuda_check_error(err, "cuMemcpyDtoH atoms", dev_info->dev_name);
		} else {
#endif
			// Transfer possible solution
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size,
					dev_args->atoms, 0, NULL, NULL), "clEnqueueReadBuffer atoms_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		if (dev_args->atoms[5 + N_VS] != 0) {
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
			sol_found_atom = InterlockedCompareExchange(sol_found, 1, 0);
#else
			sol_found_atom = __atomic_fetch_add(sol_found, 1, __ATOMIC_SEQ_CST);
#endif

			if (sol_found_atom < 1) {
				convert_intervals_to_vars(VS, dev_args->intervals, N_VS);
			}

			return 1;
		}
	}

	return 0;
}

/*
 * Return 1 if an optimal solution was found, or 0 if no optimal solution was found
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * val_to_opt - value to optimize
 * opt_lock - mutex to control access to the value to optimize and the best solution found
 * depth - tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
 * stats_lock - mutex to control access to the statistics structure
 * filtering - true if being executed in the prefiltering phase
 */
cl_ulong find_best_sol(device_args *dev_args, device_info *dev_info, cl_uint *val_to_opt, pthread_mutex_t *opt_lock,
		unsigned int depth, unsigned int n_ss, pthread_mutex_t *stats_lock, bool filtering) {
	bool opt_sol_found = false;
	unsigned int i;

	// buffer for atomics data (most devices only have atomics for 32-bit variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	// 3 - depth
	// 4 - WIs still working, for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
	// 5+N_VS - solution found flag
	// 6+N_VS - value to optimize
	// 7+N_VS - WIs still working, for saving the best solution
	set_strs_generat_data(dev_args, dev_info, depth, n_ss, filtering);

	dev_args->atoms[4] = (unsigned int) dev_args->wi_total;
	dev_args->atoms[5 + N_VS] = 0;
	dev_args->atoms[6 + N_VS] = *val_to_opt;
	dev_args->atoms[7 + N_VS] = (unsigned int) dev_args->wi_total;

#if RUN_IN_CUDA
	CUresult err = cuInit(0);

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyHtoD(dev_args->atoms_mem_cu, dev_args->atoms, dev_args->atoms_size);
		cuda_check_error(err, "cuMemcpyHtoD atoms_mem_cu", dev_info->dev_name);
	} else {
#endif
		// Update atoms buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size,
				dev_args->atoms, 0, NULL, NULL), "clEnqueueWriteBuffer atoms_p_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

#if SHARED_SS > 0
#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyHtoD(dev_args->shared_stores_flag_mem_cu, dev_args->shared_stores_flag, dev_args->shared_stores_flag_size);
		cuda_check_error(err, "cuMemcpyHtoD shared_stores_flag_mem_cu", dev_info->dev_name);
	} else {
#endif
		// Update shared stores flag buffer data on device
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->shared_stores_flag_mem, CL_TRUE, 0, dev_args->shared_stores_flag_size,
				dev_args->shared_stores_flag, 0, NULL, NULL), "clEnqueueWriteBuffer shared_stores_flag_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif
#endif

#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuLaunchKernel(dev_args->function_cu, (unsigned int)dev_args->wi_total / (unsigned int)dev_args->wi_local, 1, 1,
				(unsigned int)dev_args->wi_local, 1, 1, (unsigned int)dev_args->shared_memory_size_cu, 0, dev_args->kernel_args_cu, 0);
		cuda_check_error(err, "cuLaunchKernel function_cu", dev_info->dev_name);
	} else {
#endif
		cl_check_error(clEnqueueNDRangeKernel(dev_args->cq, dev_args->kernel, 1, NULL, &dev_args->wi_total, &dev_args->wi_local,
				0, NULL, NULL), "clEnqueueNDRangeKernel", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

	if (filtering) {
		return get_filtering_results(dev_args, dev_info);
	}
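	// Read-back note: per the atoms layout above, atoms[5 + N_VS] reports whether this
	// launch improved the solution and atoms[6 + N_VS] holds the objective value it
	// reached; the whole atoms buffer is transferred back below to inspect them.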
	// Transfer best value found flag
#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyDtoH(dev_args->atoms, dev_args->atoms_mem_cu, dev_args->atoms_size);
		cuda_check_error(err, "cuMemcpyDtoH atoms", dev_info->dev_name);
	} else {
#endif
		cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->atoms_mem, CL_TRUE, 0, dev_args->atoms_size,
				dev_args->atoms, 0, NULL, NULL), "clEnqueueReadBuffer atoms_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

	opt_sol_found = dev_args->atoms[5 + N_VS];

	if (opt_sol_found) {
		if (DOMAIN_TYPE == BITMAP_) {
			// Transfer possible solution
#if RUN_IN_CUDA
			if (dev_info->type == CL_DEVICE_TYPE_GPU) {
				err = cuMemcpyDtoH(dev_args->bitmaps, dev_args->domains_mem_cu, dev_args->domains_size);
				cuda_check_error(err, "cuMemcpyDtoH bitmaps", dev_info->dev_name);
			} else {
#endif
				cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size,
						dev_args->bitmaps, 0, NULL, NULL), "clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
			}
#endif
		} else if (DOMAIN_TYPE == INTERVAL) {
			// Transfer possible solution
#if RUN_IN_CUDA
			if (dev_info->type == CL_DEVICE_TYPE_GPU) {
				err = cuMemcpyDtoH(dev_args->intervals, dev_args->domains_mem_cu, dev_args->domains_size);
				cuda_check_error(err, "cuMemcpyDtoH intervals", dev_info->dev_name);
			} else {
#endif
				cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->domains_mem, CL_TRUE, 0, dev_args->domains_size,
						dev_args->intervals, 0, NULL, NULL), "clEnqueueReadBuffer domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
			}
#endif
		}
	}

	if (N_DEVS > 1) {
#if RUN_IN_CUDA
		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->props, dev_args->props_mem_cu, dev_args->props_size);
			cuda_check_error(err, "cuMemcpyDtoH props", dev_info->dev_name);
		} else {
#endif
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->props_mem, CL_TRUE, 0, dev_args->props_size,
					dev_args->props, 0, NULL, NULL), "clEnqueueReadBuffer props_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		dev_info->last_props = 0;
		for (i = 0; i < dev_args->wi_total; i++) {
			dev_info->props_total += dev_args->props[i];
			dev_info->last_props += dev_args->props[i];
		}
	}

	// copy statistics from device to host
	if (PRINT_STATS) {
		get_stats(dev_args, dev_info, stats_lock);
	}

	// if an optimal solution was found
	if (opt_sol_found) {
		if (N_DEVS > 1) {
			// lock access to the best solution storage and write it
			pthread_mutex_lock(opt_lock);

			// if using bitmap domains
			if (DOMAIN_TYPE == BITMAP_) {
				vs_copy_dev_to_host(VS_LOCK, dev_args->bitmaps, N_VS);
			} else if (DOMAIN_TYPE == INTERVAL) {
				convert_intervals_to_vars(VS_LOCK, dev_args->intervals, N_VS);
			}

			// keep this solution only if it still improves on the best one saved so far
			// (another device may have updated VS_LOCK_BEST in the meantime)
			if (OPT_MODE == DECREASE) {
				if (VS_LOCK[VAR_ID_TO_OPT].max < VS_LOCK_BEST[VAR_ID_TO_OPT].max) {
					opt_sol_found = true;
				} else {
					opt_sol_found = false;
				}
			} else {
				if (VS_LOCK[VAR_ID_TO_OPT].min > VS_LOCK_BEST[VAR_ID_TO_OPT].min) {
					opt_sol_found = true;
				} else {
					opt_sol_found = false;
				}
			}

			if (opt_sol_found) {
				*val_to_opt = dev_args->atoms[6 + N_VS];
				vs_copy(VS_LOCK_BEST, VS_LOCK, N_VS);
			}

			pthread_mutex_unlock(opt_lock);

		} else {
			*val_to_opt = dev_args->atoms[6 + N_VS];

			if (*val_to_opt > D_MAX) {
				*val_to_opt = 0;
			}

			// if using bitmap domains
			if (DOMAIN_TYPE == BITMAP_) {
				vs_copy_dev_to_host(VS_LOCK_BEST, dev_args->bitmaps, N_VS);
			} else if (DOMAIN_TYPE == INTERVAL) {
				convert_intervals_to_vars(VS_LOCK_BEST, dev_args->intervals, N_VS);
			}
		}
	}

	if (opt_sol_found) {
		return 1;
	} else {
		return 0;
	}
}
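/*
 * The three entry points above (count_sols, find_one_sol and find_best_sol) share the
 * same launch skeleton: fill the atoms buffer, copy it to the device, optionally copy the
 * shared-stores flags, run the kernel, then read results and statistics back. They differ
 * only in what is read back and in how the result is published to the host.
 */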
/*
 * Generates the data needed by each work-item to generate the sub-search spaces
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * depth - tree expansion depth needed to get n_ss disjoint search spaces
 * n_ss - total number of sub-search spaces
 * filtering - true if being executed in the prefiltering phase
 */
void set_strs_generat_data(device_args *dev_args, device_info *dev_info, unsigned int depth, unsigned int n_ss, bool filtering) {
	unsigned int n_ss_new;
	unsigned int depth_prev = depth;
	unsigned int depth_new = depth;
	unsigned int new_multiplier = 1;
	unsigned int i, j;

	// buffer for atomics data (most devices only have atomics for 32-bit variables)
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	// 3 - depth
	// 4 - WIs still working, for work-sharing
	// 5 - 5+N_VS - n_repeat per variable
	// ...

	if (filtering) {
		for (i = 0; i < N_VS; i++) {
			dev_info->exp_values[i] = 0;
			dev_args->atoms[5 + i] = 0;
		}

		dev_info->n_ss_mult = 1;
		dev_info->n_ss_mult_max = 1;
		dev_info->first_store = 0;
		dev_info->last_store = 1;

		dev_args->atoms[0] = 0;
		dev_args->atoms[1] = 1;
		dev_args->atoms[2] = 1;
		dev_args->atoms[3] = 0;

		return;
	}

	for (i = 0; i < N_VS; i++) {
		dev_info->exp_values[i] = EXP_VALUES[i];
		dev_args->atoms[5 + i] = EXP_VALUES[i];
	}

	// calculate a valid n_ss multiplier bigger than the one provided
	// get the max multiplier that can be applied to the number of ss inside each device
	if (dev_info->n_ss_mult > dev_info->n_ss_mult_max) {
		dev_info->n_ss_mult = dev_info->n_ss_mult_max;
	}

	for (i = depth_prev; new_multiplier < dev_info->n_ss_mult && new_multiplier < dev_info->n_ss_mult_max; i++) {
		if (VS[i].n_vals > 1 && VS[i].to_label) {
			new_multiplier *= VS[i].n_vals;
			dev_info->exp_values[i] = VS[i].n_vals;
		} else {
			dev_info->exp_values[i] = 1;
		}
	}
	i--;

	// if expanding all the previous tree nodes to the new depth generates more than the required multiplier
	if (new_multiplier > dev_info->n_ss_mult) {
		new_multiplier /= VS[i].n_vals;

		if (new_multiplier * 2 > dev_info->n_ss_mult_max) {
			dev_info->exp_values[i] = 0;
			i--;
		} else {
			for (j = 2; j < VS[i].n_vals; j++) {
				if (new_multiplier * j >= dev_info->n_ss_mult || new_multiplier * (j + 1) >= dev_info->n_ss_mult_max) {
					new_multiplier *= j;
					dev_info->exp_values[i] = j;
					break;
				}
			}

			if (j == VS[i].n_vals) {
				new_multiplier *= VS[i].n_vals;
				dev_info->exp_values[i] = VS[i].n_vals;
			}
		}
	}

	depth_new = i + 1;
	dev_info->n_ss_mult = new_multiplier;
	n_ss_new = n_ss * dev_info->n_ss_mult;

	// non-labeling variables will not be expanded
	for (i = 0; i < depth_new; i++) {
		dev_args->atoms[5 + i] = dev_info->exp_values[i];
	}
	for (; i < N_VS; i++) {
		dev_args->atoms[5 + i] = 0;
	}

	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	dev_args->atoms[0] = dev_info->first_store * dev_info->n_ss_mult;
	dev_args->atoms[1] = dev_info->last_store * dev_info->n_ss_mult;
	dev_args->atoms[2] = n_ss_new;
	dev_args->atoms[3] = depth_new;
}
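/*
 * Illustrative example (hypothetical numbers) of what set_strs_generat_data hands to the
 * kernel: if this device was assigned stores 2 to 4 of n_ss = 8 sub-search spaces and the
 * expansion above settles on n_ss_mult = 3, then atoms[0] = 2 * 3 = 6, atoms[1] = 4 * 3 = 12,
 * atoms[2] = 8 * 3 = 24 and atoms[3] = depth_new, i.e. each original store is split into
 * 3 deeper sub-search spaces.
 */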
/*
 * Load statistics from the device
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 * stats_lock - mutex to control access to the statistics structure
 */
void get_stats(device_args *dev_args, device_info *dev_info, pthread_mutex_t *stats_lock) {
	unsigned int i;

	// 0 - nodes_fail
	// 1 - nodes_expl
	// 2 - backtracks
	// 3 - labels
	// 4 - props_not_ok
	// 5 - props_ok
	// ... repeated per work-item

#if RUN_IN_CUDA
	CUresult err = cuInit(0);

	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyDtoH(dev_args->stats, dev_args->stats_mem_cu, dev_args->stats_size);
		cuda_check_error(err, "cuMemcpyDtoH stats", dev_info->dev_name);
	} else {
#endif
		cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->stats_mem, CL_TRUE, 0, dev_args->stats_size,
				dev_args->stats, 0, NULL, NULL), "clEnqueueReadBuffer stats_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

	if (N_DEVS > 1) {
		// lock access to the statistics structure before updating it
		pthread_mutex_lock(stats_lock);
	}

	// copy statistics to the host global counters
	for (i = 0; i < dev_args->wi_total; i++) {
		STATS.nodes_fail += dev_args->stats[i * 7];
		STATS.nodes_expl += dev_args->stats[i * 7 + 1];
		STATS.backtracks += dev_args->stats[i * 7 + 2];
		STATS.labels += dev_args->stats[i * 7 + 3];
		STATS.pruning += dev_args->stats[i * 7 + 4];
		STATS.props_ok += dev_args->stats[i * 7 + 5];

		if (dev_args->stats[i * 7 + 6] > STATS.max_depth) {
			STATS.max_depth = dev_args->stats[i * 7 + 6];
		}
	}

	STATS.search_spaces += dev_info->block_size * dev_info->n_ss_mult;

	// clear counters on the device buffer for the next run
	memset(dev_args->stats, 0, dev_args->stats_size);

#if RUN_IN_CUDA
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		err = cuMemcpyHtoD(dev_args->stats_mem_cu, dev_args->stats, dev_args->stats_size);
		cuda_check_error(err, "cuMemcpyHtoD stats_mem_cu", dev_info->dev_name);
	} else {
#endif
		cl_check_error(clEnqueueWriteBuffer(dev_args->cq, dev_args->stats_mem, CL_TRUE, 0, dev_args->stats_size,
				dev_args->stats, 0, NULL, NULL), "clEnqueueWriteBuffer stats_mem", dev_info->dev_name);
#if RUN_IN_CUDA
	}
#endif

	if (N_DEVS > 1) {
		pthread_mutex_unlock(stats_lock);
	}
}
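/*
 * Load the prefiltering results from the device
 * Returns 1 if the filtered CSP is still consistent, after copying the filtered domains
 * back to the host variables (and, if CS_IGNORE, flagging the constraints that the device
 * marked as ignorable), or 0 if the filtered CSP is inconsistent
 * dev_args - device_args structure about this device
 * dev_info - device_info structure about this device
 */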
bool get_filtering_results(device_args *dev_args, device_info *dev_info) {
	if (DOMAIN_TYPE == BITMAP_) {
#if RUN_IN_CUDA
		CUresult err = cuInit(0);

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->filt_bitmaps, dev_args->filt_domains_mem_cu, dev_args->filt_domains_size);
			cuda_check_error(err, "cuMemcpyDtoH filt_bitmaps", dev_info->dev_name);
		} else {
#endif
			// Transfer filtered CSP
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_domains_mem, CL_TRUE, 0, dev_args->filt_domains_size,
					dev_args->filt_bitmaps, 0, NULL, NULL), "clEnqueueReadBuffer filt_domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		bitmap b_result;
		b_clear(&b_result);
		b_copy_dev_to_host(&b_result, dev_args->filt_bitmaps, 0);

		// consistent CSP after filtering
		if (!b_is_empty(&b_result)) {
			if (CS_IGNORE) {
				unsigned int i;

#if RUN_IN_CUDA
				if (dev_info->type == CL_DEVICE_TYPE_GPU) {
					err = cuMemcpyDtoH(dev_args->filt_cs, dev_args->filt_cs_mem_cu, dev_args->filt_cs_size);
					cuda_check_error(err, "cuMemcpyDtoH filt_cs", dev_info->dev_name);
				} else {
#endif
					// Transfer cs_ignore results
					cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_cs_mem, CL_TRUE, 0, dev_args->filt_cs_size,
							dev_args->filt_cs, 0, NULL, NULL), "clEnqueueReadBuffer filt_cs_mem", dev_info->dev_name);
#if RUN_IN_CUDA
				}
#endif

				for (i = 0; i < N_CS; i++) {
					if (dev_args->filt_cs[i] == 1) {
						CS[i].ignore = true;
					} else {
						CS[i].ignore = false;
					}
				}
			}

			vs_copy_dev_to_host(VS, dev_args->filt_bitmaps, N_VS);
			return 1;
		}

	} else {
#if RUN_IN_CUDA
		CUresult err = cuInit(0);

		if (dev_info->type == CL_DEVICE_TYPE_GPU) {
			err = cuMemcpyDtoH(dev_args->filt_intervals, dev_args->filt_domains_mem_cu, dev_args->filt_domains_size);
			cuda_check_error(err, "cuMemcpyDtoH filt_intervals", dev_info->dev_name);
		} else {
#endif
			// Transfer filtered CSP
			cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_domains_mem, CL_TRUE, 0, dev_args->filt_domains_size,
					dev_args->filt_intervals, 0, NULL, NULL), "clEnqueueReadBuffer filt_domains_mem", dev_info->dev_name);
#if RUN_IN_CUDA
		}
#endif

		// consistent CSP after filtering
		if (dev_args->filt_intervals[0].s0 <= dev_args->filt_intervals[0].s1) {
			if (CS_IGNORE) {
				unsigned int i;

#if RUN_IN_CUDA
				if (dev_info->type == CL_DEVICE_TYPE_GPU) {
					err = cuMemcpyDtoH(dev_args->filt_cs, dev_args->filt_cs_mem_cu, dev_args->filt_cs_size);
					cuda_check_error(err, "cuMemcpyDtoH filt_cs", dev_info->dev_name);
				} else {
#endif
					// Transfer cs_ignore results
					cl_check_error(clEnqueueReadBuffer(dev_args->cq, dev_args->filt_cs_mem, CL_TRUE, 0, dev_args->filt_cs_size,
							dev_args->filt_cs, 0, NULL, NULL), "clEnqueueReadBuffer filt_cs_mem", dev_info->dev_name);
#if RUN_IN_CUDA
				}
#endif

				for (i = 0; i < N_CS; i++) {
					if (dev_args->filt_cs[i] == 1) {
						CS[i].ignore = true;
					} else {
						CS[i].ignore = false;
					}
				}
			}

			convert_intervals_to_vars(VS, dev_args->filt_intervals, N_VS);
			return 1;
		}
	}

	return 0;
}