/*
 * split.c
 *
 *  Created on: 19/02/2015
 *      Author: pedro
 */

#include "split.h"

#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
#include "utils\pthread_win32\pthread.h"
#include "windows.h"
#else
#include <pthread.h>
#include <sys/time.h>
#endif

#include "bitmaps.h"
#include "config.h"
#include "config_device.h"
#include "constraints.h"
#include "domains.h"
#include "intervals.h"
#include "solve.h"
#include "utils/benchmark.h"
#include "utils/cl_errors.h"
#include "variables.h"

unsigned int devs_working;		// number of devices still working
unsigned int devs_ranked;		// number of devices already ranked
pthread_mutex_t opt_lock;		// to synchronize threads for gathering optimization results
pthread_mutex_t stats_lock;		// to synchronize threads for gathering statistics results
pthread_barrier_t barrier;
unsigned int mult_max;			// ID of the variable that should be expanded when multiplying the ss
unsigned int best_sols_found;	// counter for number of best solutions found
unsigned int vs_labeled_at_exp;	// number of variables marked for labeling that were fully expanded when creating sub-search spaces
bool filtering = false;
bool all_GPUs = true;

/*
 * Splits the CSP among all the selected devices and work-items and solves it.
 * If only one solution, or the best solution, is wanted, it returns 1 if one is found, or 0 if none is found.
 * If all solutions must be found, it returns the number of solutions found.
 * */
cl_ulong solve_CSP() {
	unsigned int i, j;

	if (N_VS_ORIGINAL == 0) {
		N_VS_ORIGINAL = V_ID_CNTR;
	}
	if (N_CS_ORIGINAL == 0) {
		N_CS_ORIGINAL = C_ID_CNTR;
	}
	if (PRINT_CSP) {
		print_CSP();
	}

	if (N_DEVS > 1 && DOMAIN_TYPE == INTERVAL) {
		fprintf(stderr, "\nPHACT cannot use interval domains with more than one device at the same time.\n"
				"Please remove \"-INTERVALS\" from the command arguments, or use a single device.\n\n");
		exit(-1);
	}

	if (CS_IGNORE && DOMAIN_TYPE == INTERVAL) {
		CS_IGNORE = false;
	}

#if PRE_FILTER
	// Prunes domain values that are already inconsistent at CSP definition
	if (!filter_CSP()) {
		return 0;
	}

	// If these results are a solution, returns 1
	if (cs_check(false)) {
		return 1;
	}

	// remove constraints fixed after filtering
	if (CS_IGNORE) {
		cs_remove_ignored();
	}

	// all the constraints were fixed at filtering, but some variables are not singleton
	if (N_CS == 0) {
		j = 1;
		for (i = 0; i < N_VS; i++) {
			j *= VS[i].n_vals;
		}
		printf("\n");
		return j;
	}
#endif

	devs_ranked = 0;			// number of devices already ranked
	mult_max = 1;				// ID of the variable that should be expanded when multiplying the ss
	best_sols_found = 1;		// counter for number of best solutions found
	vs_labeled_at_exp = 0;		// number of variables marked for labeling that were fully expanded when creating sub-search spaces

	unsigned int n_ss = 0;		// Number of sub-search spaces created
	unsigned int depth = 0;		// Tree expansion depth needed to get n_ss disjoint search spaces
	unsigned int gpu_cntr = 0;	// Number of GPUs compatible with OpenCL present on the running machine
	unsigned int cpu_cntr = 0;	// Number of CPUs compatible with OpenCL present on the running machine
	unsigned int acc_cntr = 0;	// Number of ACCs compatible with OpenCL present on the running machine
	cl_ulong result = 0;		// Number of solutions found, or 0 or 1 if only one solution is wanted
	unsigned int next_str = 0;	// Index in stores where the next unexplored sub-search space is placed (atomic read and write)
	unsigned char sol_found = 0;	// To set to 1 when only one solution is wanted and is found (atomic
read and write) int thread_ret; // Value returned from each device thread unsigned int n_vs_cs; // number of all variables in all constraints unsigned int n_cs_vs; // number of all constraints in all variables unsigned int n_const_cs; // number of all constant values in all constraints with more than one constant value size_t l_mem_per_wi; // size in bytes of the local memory needed per work-item char* host_name = NULL; STATS.n_solutions = 0; STATS.search_spaces = 0; platf_args* platform_args; // each platform arguments for all devices of the same platform cl_device_id gpu_dev[MAX_DEVS]; // to save all GPUs cl_device_id cl_platform_id gpu_platf[MAX_DEVS]; // to save all GPUs cl_platform_id cl_device_id cpu_dev[MAX_DEVS]; // to save all CPUs cl_device_id cl_platform_id cpu_platf[MAX_DEVS]; // to save all CPUs cl_platform_id cl_device_id acc_dev[MAX_DEVS]; // to save all ACCs cl_device_id cl_platform_id acc_platf[MAX_DEVS]; // to save all ACCs cl_platform_id cl_platform_id* platfs = NULL; // to save all devices cl_platform_id cl_device_id* devs = NULL; // to save all devices cl_device_id cl_uint platf_cnt = 0; // number of platforms (Intel, Nvidia...) cl_uint dev_cnt = 0; // number of devices on each platform (CPU, MIC...) cl_int ret; // output of clGetPlatformIDs and clGetDeviceIDs size_t val_size; // Use command line heuristics for labeling and assignment, if existent // if not use default if (LABEL_MODE_COM != DEFAULT_L) { LABEL_MODE = LABEL_MODE_COM; } else if (LABEL_MODE == DEFAULT_L) { LABEL_MODE = LABEL_MODE_D; } if (ASSIGN_MODE_COM != DEFAULT_A) { ASSIGN_MODE = ASSIGN_MODE_COM; } else if (ASSIGN_MODE == DEFAULT_A) { ASSIGN_MODE = ASSIGN_MODE_D; } init_csp_and_d_bits(); // discover all platforms (Intel, Nvidia, AMD,...) ret = clGetPlatformIDs(0, NULL, &platf_cnt); cl_check_error(ret, "clGetPlatformIDs", "discovering devices"); platfs = (cl_platform_id*) malloc(platf_cnt * sizeof(cl_platform_id)); ret = clGetPlatformIDs(platf_cnt, platfs, NULL); cl_check_error(ret, "clGetPlatformIDs", "discovering devices"); platform_args = (platf_args*) malloc(platf_cnt * sizeof(platf_args)); // when all devices should be used if (ALL_DEVS) { N_DEVS = 0; // for each platform for (i = 0; i < platf_cnt; i++) { // discover all devices (GPUs, CPUs, MICs,...) ret = clGetDeviceIDs(platfs[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dev_cnt); cl_check_error(ret, "clGetDeviceIDs", "discovering devices"); devs = (cl_device_id*) malloc(sizeof(cl_device_id) * dev_cnt); ret = clGetDeviceIDs(platfs[i], CL_DEVICE_TYPE_ALL, dev_cnt, devs, NULL); cl_check_error(ret, "clGetDeviceIDs", "discovering devices"); platform_args[i].platform_id = platfs[i]; platform_args[i].n_devs = dev_cnt; // for each device for (j = 0; j < dev_cnt; j++) { // Identify the type of device (GPU, CPU, MIC,...) 
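			/*
			 * Note: CL_DEVICE_TYPE is a bitfield, which is why the device class is
			 * detected by masking it against CL_DEVICE_TYPE_GPU/CPU/ACCELERATOR below
			 * (a device may also report CL_DEVICE_TYPE_DEFAULT together with its real
			 * type). In this ALL_DEVS branch one DEVICES_INFO entry is filled per
			 * discovered device, while gpu_dev/cpu_dev/acc_dev (and the matching
			 * *_platf arrays) keep the OpenCL ids so they can be attached to the
			 * corresponding DEVICES_INFO entries later.
			 */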
cl_device_type cl_device_type; ret = clGetDeviceInfo(devs[j], CL_DEVICE_TYPE, sizeof(cl_device_type), &cl_device_type, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_TYPE"); // to identify the device that user wants to use if (cl_device_type & CL_DEVICE_TYPE_GPU) { DEVICES_INFO[N_DEVS].type = CL_DEVICE_TYPE_GPU; DEVICES_INFO[N_DEVS].dev_type_n = (int)gpu_cntr + 1; DEVICES_INFO[N_DEVS].n_wg = 0; DEVICES_INFO[N_DEVS].n_wi_wg = 0; gpu_platf[gpu_cntr] = platfs[i]; gpu_dev[gpu_cntr++] = devs[j]; } else if (cl_device_type & CL_DEVICE_TYPE_CPU) { DEVICES_INFO[N_DEVS].type = CL_DEVICE_TYPE_CPU; DEVICES_INFO[N_DEVS].dev_type_n = (int)cpu_cntr + 1; DEVICES_INFO[N_DEVS].n_wg = 0; DEVICES_INFO[N_DEVS].n_wi_wg = 0; cpu_platf[cpu_cntr] = platfs[i]; cpu_dev[cpu_cntr++] = devs[j]; // get device name ret = clGetDeviceInfo(devs[j], CL_DEVICE_NAME, 0, NULL, &val_size); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); char* aux_name = (char*) malloc(val_size); ret = clGetDeviceInfo(devs[j], CL_DEVICE_NAME, val_size, aux_name, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); // trim leading spaces on device name unsigned int del = 0; while (isspace((unsigned char )(aux_name[del]))) del++; host_name = (char*) malloc(val_size - del); strcpy(host_name, &aux_name[del]); free(aux_name); } else if (cl_device_type & CL_DEVICE_TYPE_ACCELERATOR) { DEVICES_INFO[N_DEVS].type = CL_DEVICE_TYPE_ACCELERATOR; DEVICES_INFO[N_DEVS].dev_type_n = (int)acc_cntr + 1; DEVICES_INFO[N_DEVS].n_wg = 0; DEVICES_INFO[N_DEVS].n_wi_wg = 0; acc_platf[acc_cntr] = platfs[i]; acc_dev[acc_cntr++] = devs[j]; } else { fprintf(stderr, "\nclGetDeviceInfo returned a device type that is not supported by the Solver.\n"); exit(-1); } N_DEVS++; } } // when user selected the devices to use } else { // for each platform for (i = 0; i < platf_cnt; i++) { // discover all devices (GPUs, CPUs, MICs,...) ret = clGetDeviceIDs(platfs[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dev_cnt); cl_check_error(ret, "clGetDeviceIDs", "discovering devices"); devs = (cl_device_id*) malloc(dev_cnt * sizeof(cl_device_id)); ret = clGetDeviceIDs(platfs[i], CL_DEVICE_TYPE_ALL, dev_cnt, devs, NULL); cl_check_error(ret, "clGetDeviceIDs", "discovering devices"); platform_args[i].platform_id = platfs[i]; platform_args[i].n_devs = dev_cnt; // for each device for (j = 0; j < dev_cnt; j++) { // Identify the type of device (GPU, CPU, MIC,...) 
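			/*
			 * In this branch the devices were already selected on the command line, so
			 * DEVICES_INFO is not filled here; the loop only records the platform and
			 * device ids per type (gpu_dev/cpu_dev/acc_dev) so that the requested
			 * "n-th GPU/CPU/ACC" can be resolved to a concrete OpenCL device once
			 * discovery is finished.
			 */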
cl_device_type cl_device_type; ret = clGetDeviceInfo(devs[j], CL_DEVICE_TYPE, sizeof(cl_device_type), &cl_device_type, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_TYPE"); // to identify the device that user wants to use if (cl_device_type & CL_DEVICE_TYPE_GPU) { gpu_platf[gpu_cntr] = platfs[i]; gpu_dev[gpu_cntr++] = devs[j]; } else if (cl_device_type & CL_DEVICE_TYPE_CPU) { cpu_platf[cpu_cntr] = platfs[i]; cpu_dev[cpu_cntr++] = devs[j]; // get device name ret = clGetDeviceInfo(devs[j], CL_DEVICE_NAME, 0, NULL, &val_size); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); char* aux_name = (char*) malloc(val_size); ret = clGetDeviceInfo(devs[j], CL_DEVICE_NAME, val_size, aux_name, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); // trim leading spaces on device name unsigned int del = 0; while (isspace((unsigned char )(aux_name[del]))) del++; host_name = (char*) malloc(val_size - del); strcpy(host_name, &aux_name[del]); free(aux_name); } else if (cl_device_type & CL_DEVICE_TYPE_ACCELERATOR) { acc_platf[acc_cntr] = platfs[i]; acc_dev[acc_cntr++] = devs[j]; } else { fprintf(stderr, "\nclGetDeviceInfo returned a device type that is not supported by the Solver.\n"); exit(-1); } } free(devs); } } free(platfs); if (N_GPUs > gpu_cntr || N_CPUs > cpu_cntr || N_ACCs > acc_cntr) { fprintf(stderr, "\nYou are trying to use a device that is not compatible with OpenCL, or that doesn't exist on this machine. " "Please check the selected devices.\n"); exit(-1); } for (i = 0; i < N_DEVS; i++) { // if user wants to use all the devices of a type if (DEVICES_INFO[i].dev_type_n == 0) { DEVICES_INFO[i].dev_type_n = 1; if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) { for (j = 1; j < gpu_cntr; j++) { DEVICES_INFO[N_DEVS].dev_type_n = DEVICES_INFO[i].dev_type_n + (int)j; DEVICES_INFO[N_DEVS].type = DEVICES_INFO[i].type; DEVICES_INFO[N_DEVS].n_wg = DEVICES_INFO[i].n_wg; DEVICES_INFO[N_DEVS].n_wi_wg = DEVICES_INFO[i].n_wi_wg; N_DEVS++; } } else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_CPU) { for (j = 1; j < cpu_cntr; j++) { DEVICES_INFO[N_DEVS].dev_type_n = DEVICES_INFO[i].dev_type_n + (int)j; DEVICES_INFO[N_DEVS].type = DEVICES_INFO[i].type; DEVICES_INFO[N_DEVS].n_wg = DEVICES_INFO[i].n_wg; DEVICES_INFO[N_DEVS].n_wi_wg = DEVICES_INFO[i].n_wi_wg; N_DEVS++; } all_GPUs = false; } else { for (j = 1; j < acc_cntr; j++) { DEVICES_INFO[N_DEVS].dev_type_n = DEVICES_INFO[i].dev_type_n + (int)j; DEVICES_INFO[N_DEVS].type = DEVICES_INFO[i].type; DEVICES_INFO[N_DEVS].n_wg = DEVICES_INFO[i].n_wg; DEVICES_INFO[N_DEVS].n_wi_wg = DEVICES_INFO[i].n_wi_wg; N_DEVS++; } all_GPUs = false; } } // save the platform_id, device_id and the n_wi_wg (if default) for all the devices to use if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) { DEVICES_INFO[i].platform_id = gpu_platf[DEVICES_INFO[i].dev_type_n - 1]; DEVICES_INFO[i].device_id = gpu_dev[DEVICES_INFO[i].dev_type_n - 1]; } else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_CPU) { DEVICES_INFO[i].platform_id = cpu_platf[DEVICES_INFO[i].dev_type_n - 1]; DEVICES_INFO[i].device_id = cpu_dev[DEVICES_INFO[i].dev_type_n - 1]; } else { DEVICES_INFO[i].platform_id = acc_platf[DEVICES_INFO[i].dev_type_n - 1]; DEVICES_INFO[i].device_id = acc_dev[DEVICES_INFO[i].dev_type_n - 1]; } // get number of compute units on this device ret = clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &DEVICES_INFO[i].compute_units, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_MAX_COMPUTE_UNITS"); // get maximum cores frequency ret = 
clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &DEVICES_INFO[i].max_freq, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_MAX_CLOCK_FREQUENCY"); // get the amount of local memory to check if it is enough ret = clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(DEVICES_INFO[i].local_mem_max_alloc), &DEVICES_INFO[i].local_mem_max_alloc, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_LOCAL_MEM_SIZE"); // set number of work-items and work-groups to use on each device if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) { DEVICES_INFO[i].use_local_mem = false; DEVICES_INFO[i].def_n_wi_wg = GPU_DEFAULT_N_WI; DEVICES_INFO[i].def_n_wg = GPU_DEFAULT_N_WG; } else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_CPU) { DEVICES_INFO[i].use_local_mem = true; DEVICES_INFO[i].def_n_wi_wg = 1; // default number of work-items per work-group to use DEVICES_INFO[i].def_n_wg = DEVICES_INFO[i].compute_units; // default number of work-groups to use } // MICs else { DEVICES_INFO[i].use_local_mem = true; DEVICES_INFO[i].def_n_wi_wg = 1; // default number of work-items per work-group to use DEVICES_INFO[i].def_n_wg = DEVICES_INFO[i].compute_units; // default number of work-groups to use } if (DEVICES_INFO[i].n_wg == 0) { DEVICES_INFO[i].n_wg = DEVICES_INFO[i].def_n_wg; } if (DEVICES_INFO[i].n_wi_wg == 0) { DEVICES_INFO[i].n_wi_wg = DEVICES_INFO[i].def_n_wi_wg; } DEVICES_INFO[i].stores_explored = 0; DEVICES_INFO[i].last_1ss_solv_time = 0; DEVICES_INFO[i].avg_1ss_solv_time = 0; DEVICES_INFO[i].max_1ss_solv_time = 0; DEVICES_INFO[i].n_ss_mult = 1; DEVICES_INFO[i].rank = 0; DEVICES_INFO[i].times_used = 0; DEVICES_INFO[i].ms_solve_time = 0; DEVICES_INFO[i].first_time_ranked = false; DEVICES_INFO[i].props_total = 0; DEVICES_INFO[i].last_props = 0; DEVICES_INFO[i].avg_time_prop = 0; DEVICES_INFO[i].last_time_prop = 0; DEVICES_INFO[i].max_time_prop = 0; DEVICES_INFO[i].ranked = false; DEVICES_INFO[i].working = true; DEVICES_INFO[i].last_explor_time = 0; DEVICES_INFO[i].n_fast_blocks = 0; DEVICES_INFO[i].sols_found = 0; DEVICES_INFO[i].n_buffers = 1; DEVICES_ARGS[i].split_values_ext = 1; DEVICES_INFO[i].exp_values = calloc(N_VS, sizeof(unsigned int)); DEVICES_INFO[i].n_empty_blocks = 0; } if (DOMAIN_TYPE == INTERVAL && !CAN_USE_INTERVALS) { fprintf(stderr, "\nThe domains of the variables of the current CSP contain non-contiguous values,\n" "which are not permitted when using interval domains in PHACT.\n" "Please remove \"-INTERVALS\" from the command arguments.\n\n"); exit(-1); } // if using intervals convert bitmaps to intervals if (DOMAIN_TYPE == INTERVAL) { set_interval_domains(); } // Reset variables set for labeling, because some of them may be singleton already and count the number of variables that should be labeled N_VS_TO_LABEL = vs_cnt_vs_to_label(VS, N_VS); n_vs_cs = (unsigned int)cs_cnt_vs(CS, N_CS); // count the number of variables in all constraints n_cs_vs = vs_cnt_cs(VS, N_VS); // count the number of constraints in all variables n_const_cs = (unsigned int)cs_cnt_constants(CS, N_CS); // count the number of constants constrained by all the constraints (if more than one) if (ASSIGN_MODE == SPLIT_VALS) { unsigned int split_values_ext = 1; unsigned int n_vals_ctr = 1; for (i = 0; i < N_VS; i++) { if (VS[i].to_label && VS[i].n_vals > n_vals_ctr) { n_vals_ctr = VS[i].n_vals; } } while (((unsigned int)(n_vals_ctr / 2)) > 0) { n_vals_ctr /= 2; split_values_ext++; } for (i = 0; i < N_DEVS; i++) { DEVICES_ARGS[i].split_values_ext = 
split_values_ext;
		}
	}

#if SORT_BY_LABEL
	// sort variables by the ones that may be labeled
	vs_sort_label_first(VS, N_VS);
#elif SORT_BY_MOST_USED_CONSTR
	// sort constraints on each variable by the constraint that is most common in the CSP
	vs_sort_constr(VS, N_VS);
#elif SORT_BY_LABEL_MORE_VALS
	// sort variables by the ones that may be labeled and that have more values on their domains
	vs_sort_label_more_vals_first();
#elif SORT_BY_LABEL_LESS_VALS
	// sort variables by the ones that may be labeled and that have fewer values on their domains
	vs_sort_label_less_vals_first();
#endif

	// split the search space and fill the stores
	split_ss(&depth, &n_ss, (unsigned int)N_VS_TO_LABEL);

	if (N_VS_TO_LABEL == 0) {
		fprintf(stderr, "\nNo CSP variable is marked for labeling. Please mark at least one variable for labeling.\n\n");
		exit(-1);
	}

	// variables that are fully expanded during sub-search spaces creation are already labeled
	N_VS_TO_LABEL -= (int)vs_labeled_at_exp;
	if (N_VS_TO_LABEL <= 0) {
		N_VS_TO_LABEL = 1;
	}

	if (USE_TTL) {
		fprintf(stderr, "\nTTL is enabled inside the kernel.\n\n");
	}

	if (!QUIET) {
		printf("\nSolving the CSP with:\n - Host: %s", host_name);
	}

	if (VERBOSE) {
		if (DOMAIN_TYPE == BITMAP_) {
			printf(" with %d-bits bitmap domains on %d-bits words\n", H_BITS, H_WORD);
		} else if (DOMAIN_TYPE == INTERVAL) {
			printf(" with interval domains\n");
		}
	} else if (!QUIET) {
		printf("\n");
	}

	// set revision to on (REV=1) or off (REV=0) by default. If PRE_LABELING==2, it is set to 1 if any propagator is capable of propagating
	// variables with more than one value in its domain
	if (PRE_LABELING == 0) {
		REV = 0;
	} else if (PRE_LABELING == 1) {
		REV = 1;
	}

	if (!QUIET) {
		printf(" - Device(s) with %s and %s heuristics", get_label_heur(), get_assign_heur());
	}
	if (REV == 1 && !QUIET) {
		printf(" (and revision)");
	}
	if (!QUIET) {
		printf(":\n");
	}

	if (host_name != NULL) {
		free(host_name);
	}

	if (DOMAIN_TYPE == BITMAP_) {
		l_mem_per_wi = (N_VS + 2) * sizeof(cl_ushort) + N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE);
	} else {
		l_mem_per_wi = (N_VS + 2) * sizeof(cl_ushort) + N_VS * sizeof(cl_var_p_interval);
	}

	// check if memory is enough. If not, reduce the number of wi, and wg if also needed.
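	/*
	 * l_mem_per_wi estimates the local memory each work-item needs to keep its
	 * variable store on chip: a fixed header of (N_VS + 2) cl_ushort entries plus
	 * one per-variable structure whose size depends on DOMAIN_SIZE (bitmaps) or
	 * sizeof(cl_var_p_interval) (intervals). The loop below compares
	 * l_mem_per_wi * n_wi_wg against CL_DEVICE_LOCAL_MEM_SIZE and disables local
	 * memory for the device when it does not fit. As a rough illustration
	 * (assuming a hypothetical CSP with N_VS = 100 and the 2-byte cl_ushort), the
	 * header alone is (100 + 2) * 2 = 204 bytes per work-item, before the
	 * per-variable domain structures are added.
	 */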
for (i = 0; i < N_DEVS; i++) { #if USE_LOCAL_MEM == 0 DEVICES_INFO[i].use_local_mem = false; #elif USE_LOCAL_MEM == 1 DEVICES_INFO[i].use_local_mem = true; #endif // get device name ret = clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_NAME, 0, NULL, &val_size); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); char* aux_name = (char*) malloc(val_size); ret = clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_NAME, val_size, aux_name, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); // trim leading spaces on device name unsigned int del = 0; while (isspace((unsigned char )(aux_name[del]))) del++; DEVICES_INFO[i].dev_name = (char*) malloc(val_size - del); strcpy(DEVICES_INFO[i].dev_name, &aux_name[del]); free(aux_name); if (DEVICES_INFO[i].use_local_mem && DEVICES_INFO[i].local_mem_max_alloc < l_mem_per_wi * DEVICES_INFO[i].n_wi_wg) { printf(" Due to the amount of memory required, local memory will not be used in %s.\n", DEVICES_INFO[i].dev_name); DEVICES_INFO[i].use_local_mem = false; } DEVICES_ARGS[i].n_vs_to_label = (unsigned int)N_VS_TO_LABEL; DEVICES_ARGS[i].n_vs_cs = n_vs_cs; DEVICES_ARGS[i].n_cs_vs = n_cs_vs; DEVICES_ARGS[i].n_const_cs = n_const_cs; // get the amount of allowable global memory per buffer to check if it is enough (or if more than one buffer is required) ret = clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(DEVICES_INFO[i].global_mem_max_alloc), &DEVICES_INFO[i].global_mem_max_alloc, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_MAX_MEM_ALLOC_SIZE"); // get the size of the global memory ret = clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(DEVICES_INFO[i].global_mem_size), &DEVICES_INFO[i].global_mem_size, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_GLOBAL_MEM_SIZE"); // get the maximum size of each constant memory buffer to check if it is enough ret = clGetDeviceInfo(DEVICES_INFO[i].device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(DEVICES_INFO[i].constant_mem_max_alloc), &DEVICES_INFO[i].constant_mem_max_alloc, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"); size_t n_wg_init = DEVICES_INFO[i].n_wg; size_t n_wi_wg_init = DEVICES_INFO[i].n_wi_wg; if (DEVICES_INFO[i].use_local_mem) { // set buffers size and check if global memory is enough DEVICES_INFO[i].n_wi_wg++; do { DEVICES_INFO[i].n_wi_wg--; // number of work-items that will be created on this device DEVICES_ARGS[i].wi_local = DEVICES_INFO[i].n_wi_wg; DEVICES_ARGS[i].wi_total = DEVICES_INFO[i].n_wg * DEVICES_INFO[i].n_wi_wg; set_buffs_size(&DEVICES_ARGS[i], &DEVICES_INFO[i], filtering); #if USE_MORE_BUFFERS } while (DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P && DEVICES_INFO[i].n_wi_wg > 1); #else } while (DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc && DEVICES_INFO[i].n_wi_wg > 1); #endif #if USE_MORE_BUFFERS // if global memory not enough for 1 wi_wg, exit if (DEVICES_INFO[i].n_wi_wg == 1 && DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P) { #else // if global memory not enough for 1 wi_wg, exit if (DEVICES_INFO[i].n_wi_wg == 1 && DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc) { #endif DEVICES_INFO[i].n_wg++; do { DEVICES_INFO[i].n_wg--; // number of work-items that will be created on this device DEVICES_ARGS[i].wi_local = DEVICES_INFO[i].n_wi_wg; DEVICES_ARGS[i].wi_total = DEVICES_INFO[i].n_wg * 
DEVICES_INFO[i].n_wi_wg; set_buffs_size(&DEVICES_ARGS[i], &DEVICES_INFO[i], filtering); #if USE_MORE_BUFFERS } while (DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P && DEVICES_INFO[i].n_wg > 1); #else } while (DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc && DEVICES_INFO[i].n_wg > 1); #endif } } else { DEVICES_INFO[i].n_wg++; do { DEVICES_INFO[i].n_wg--; // number of work-items that will be created on this device DEVICES_ARGS[i].wi_local = DEVICES_INFO[i].n_wi_wg; DEVICES_ARGS[i].wi_total = DEVICES_INFO[i].n_wg * DEVICES_INFO[i].n_wi_wg; set_buffs_size(&DEVICES_ARGS[i], &DEVICES_INFO[i], filtering); } #if USE_MORE_BUFFERS while (DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P && DEVICES_INFO[i].n_wg > 128); #else while (DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc && DEVICES_INFO[i].n_wg > 128); #endif if (DEVICES_INFO[i].n_wg <= 128 && DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc) { DEVICES_INFO[i].n_wi_wg++; do { DEVICES_INFO[i].n_wi_wg--; // number of work-items that will be created on this device DEVICES_ARGS[i].wi_local = DEVICES_INFO[i].n_wi_wg; DEVICES_ARGS[i].wi_total = DEVICES_INFO[i].n_wg * DEVICES_INFO[i].n_wi_wg; set_buffs_size(&DEVICES_ARGS[i], &DEVICES_INFO[i], filtering); } #if USE_MORE_BUFFERS while (DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P && DEVICES_INFO[i].n_wi_wg > 1); #else while (DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc && DEVICES_INFO[i].n_wi_wg > 1); #endif } #if USE_MORE_BUFFERS if (DEVICES_INFO[i].n_wg <= 128 && DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P && DEVICES_INFO[i].n_wi_wg == 1) { #else if (DEVICES_INFO[i].n_wg <= 128 && DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc && DEVICES_INFO[i].n_wi_wg == 1) { #endif DEVICES_INFO[i].n_wg++; do { DEVICES_INFO[i].n_wg--; // number of work-items that will be created on this device DEVICES_ARGS[i].wi_local = DEVICES_INFO[i].n_wi_wg; DEVICES_ARGS[i].wi_total = DEVICES_INFO[i].n_wg * DEVICES_INFO[i].n_wi_wg; set_buffs_size(&DEVICES_ARGS[i], &DEVICES_INFO[i], filtering); } #if USE_MORE_BUFFERS while (DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P && DEVICES_INFO[i].n_wg > 1); #else while (DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc && DEVICES_INFO[i].n_wg > 1); #endif } } if (n_wg_init != DEVICES_INFO[i].n_wg) { printf(" Due to the amount of memory required, the number of work-groups was reduced in %s.\n", DEVICES_INFO[i].dev_name); } if (n_wi_wg_init != DEVICES_INFO[i].n_wi_wg) { printf(" Due to the amount of memory required, the number of work-items per work-group was reduced in %s.\n", DEVICES_INFO[i].dev_name); } #if USE_MORE_BUFFERS // if global memory not enough for 1 wi_wg and 1 wg, exit if (DEVICES_INFO[i].global_mem_used > (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P) { fprintf(stderr, "\nPHACT is trying to use more global memory (%lu Mb) than the amount available (%f Mb) on %s (%d) with %lu " "work-item(s) and %lu work-group(s).\n If possible, please reduce the amount of work-items per work-group to use.", DEVICES_INFO[i].global_mem_used / 1000000, (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P / 1000000, DEVICES_INFO[i].dev_name, DEVICES_INFO[i].dev_type_n, DEVICES_INFO[i].n_wg, 
DEVICES_INFO[i].n_wi_wg); exit(-1); } #else // if global memory not enough for 1 wi_wg and 1 wg, exit if (DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc) { fprintf(stderr, "\nPHACT is trying to use more global memory (%lu Mb) than the amount available (%lu Mb) on %s (%d) with %lu " "work-item(s) and %lu work-group(s).\n If possible, please reduce the amount of work-items per work-group to use.", DEVICES_INFO[i].global_mem_used / 1000000, DEVICES_INFO[i].global_mem_max_alloc / 1000000, DEVICES_INFO[i].dev_name, DEVICES_INFO[i].dev_type_n, DEVICES_INFO[i].n_wg, DEVICES_INFO[i].n_wi_wg); exit(-1); } #endif #if USE_MORE_BUFFERS // calculate the number of buffers that are needed and divide the work-groups and work-items per them if (DEVICES_INFO[i].global_mem_used > DEVICES_INFO[i].global_mem_max_alloc) { DEVICES_INFO[i].n_buffers = (unsigned int)ceil(((double)DEVICES_INFO[i].global_mem_used * 1.0) / (double)DEVICES_INFO[i].global_mem_max_alloc); // ensures that the buffers are enough if (DEVICES_INFO[i].n_buffers > DEVICES_INFO[i].global_mem_size / DEVICES_INFO[i].global_mem_max_alloc) { DEVICES_INFO[i].n_wg--; } // for less calculations, all buffers must be assigned to the same number of work-groups while (DEVICES_INFO[i].n_wg % DEVICES_INFO[i].n_buffers > 0) { DEVICES_INFO[i].n_wg--; } DEVICES_ARGS[i].backtrack_size = (unsigned int)ceil(((double)DEVICES_ARGS[i].backtrack_size * 1.0) / (double)DEVICES_INFO[i].n_buffers); } #endif if (!QUIET) { printf(" - %s (%d) with %lu work-group(s) and %lu work-items(s) per work-group", DEVICES_INFO[i].dev_name, DEVICES_INFO[i].dev_type_n, DEVICES_INFO[i].n_wg, DEVICES_INFO[i].n_wi_wg); } if (VERBOSE) { if (DEVICES_INFO[i].use_local_mem) { printf(", local memory"); } #if USE_MORE_BUFFERS if ((double)DEVICES_INFO[i].global_mem_used / 1000000.0 > 1.0) { printf(", %.02f Mb (%u buffer(s)) of global memory (Max. %.02f Mb) and", (double)DEVICES_INFO[i].global_mem_used / 1000000.0, DEVICES_INFO[i].n_buffers, (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P / 1000000.0); } else { printf(", %.02f Kb (%u buffer(s)) of global memory (Max. %.02f Mb) and", (double)DEVICES_INFO[i].global_mem_used / 1000.0, DEVICES_INFO[i].n_buffers, (double)DEVICES_INFO[i].global_mem_size * USE_MORE_BUFFERS_P / 1000000.0); } #else if ((double)DEVICES_INFO[i].global_mem_used / 1000000.0 > 1.0) { printf(", %.02f Mb of global memory (Max. %.02f Mb) and", (double)DEVICES_INFO[i].global_mem_used / 1000000.0, (double)DEVICES_INFO[i].global_mem_max_alloc / 1000000.0); } else { printf(", %.02f Kb of global memory (Max. 
%.02f Mb) and", (double)DEVICES_INFO[i].global_mem_used / 1000.0, (double)DEVICES_INFO[i].global_mem_max_alloc / 1000000.0); } #endif if (DOMAIN_TYPE == BITMAP_) { printf(" %d-bits bitmap domains on %d-bits words", CL_BITS_, CL_WORD_); } else if (DOMAIN_TYPE == INTERVAL) { printf(" interval domains"); } #if SHARED_SS > 0 printf(" and %d shared sub-search spaces", DEVICES_ARGS[i].n_shared_stores); #endif } if (!QUIET) { printf("\n"); } } // calculate the expected speed when comparing the hardware of all the used devices calculate_rel_expect_speed(DEVICES_INFO); // set amount of sub-search spaces to send to each device at the beginning for (i = 0; i < N_DEVS; i++) { DEVICES_INFO[i].n_ss_mult_max = mult_max; // set n_ss_mult_max to the device number of cores if (mult_max > DEVICES_INFO[i].n_wi_wg * DEVICES_INFO[i].n_wg) { DEVICES_INFO[i].n_ss_mult_max = (unsigned int)(DEVICES_INFO[i].n_wi_wg * DEVICES_INFO[i].n_wg); } if (DEVICES_INFO[i].n_ss_mult_max > DEVICES_INFO[i].compute_units * 32) { if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) { DEVICES_INFO[i].n_ss_mult_max = DEVICES_INFO[i].compute_units * 32; } else { DEVICES_INFO[i].n_ss_mult_max = DEVICES_INFO[i].compute_units; } } if (N_DEVS == 1) { DEVICES_INFO[i].block_size = n_ss; } else if (WORK == CNT) { DEVICES_INFO[i].block_size = (unsigned int)(n_ss * CNT_INIT_PERC * DEVICES_INFO[i].rel_speed_expect); // WORK == ONE or OPT } else { DEVICES_INFO[i].block_size = (unsigned int)(n_ss * OPT_ONE_INIT_PERC * DEVICES_INFO[i].rel_speed_expect); } if (DEVICES_INFO[i].block_size == 0) { DEVICES_INFO[i].block_size = 1; } // multiplier for first block of sub-search spaces #if SS_MULTIPLIER if (DEVICES_INFO[i].block_size > 0) { //GPU if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU && DEVICES_INFO[i].block_size < SS_GPU / (double)(GPU_DEFAULT_N_WI / DEVICES_INFO[i].n_wi_wg) / (GPU_DEFAULT_N_WG / (double)DEVICES_INFO[i].n_wg * 1.0)) { DEVICES_INFO[i].n_ss_mult = (unsigned int)(SS_GPU / (GPU_DEFAULT_N_WI / (double)DEVICES_INFO[i].n_wi_wg * 1.0) / (double)(GPU_DEFAULT_N_WG / (double)DEVICES_INFO[i].n_wg * 1.0)) / DEVICES_INFO[i].block_size; // ACC } else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_ACCELERATOR && DEVICES_INFO[i].block_size < SS_ACC / (DEVICES_INFO[i].compute_units / (double)DEVICES_INFO[i].n_wg * 1.0)) { DEVICES_INFO[i].n_ss_mult = (unsigned int)(SS_ACC / (DEVICES_INFO[i].compute_units / (double)DEVICES_INFO[i].n_wg * 1.0) / DEVICES_INFO[i].block_size); // CPU } else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_CPU && DEVICES_INFO[i].block_size < SS_CPU * DEVICES_INFO[i].compute_units / (DEVICES_INFO[i].compute_units / (double)DEVICES_INFO[i].n_wg * 1.0)) { DEVICES_INFO[i].n_ss_mult = (unsigned int)((SS_CPU * DEVICES_INFO[i].compute_units) / (double)(DEVICES_INFO[i].compute_units / (double)DEVICES_INFO[i].n_wg * 1.0) / DEVICES_INFO[i].block_size); } if (DEVICES_INFO[i].n_ss_mult > mult_max) { DEVICES_INFO[i].n_ss_mult = mult_max; } else if (DEVICES_INFO[i].n_ss_mult == 0) { DEVICES_INFO[i].n_ss_mult = 1; } } #endif DEVICES_INFO[i].first_block_size = DEVICES_INFO[i].block_size; if (PRINT_SOLUTIONS && DEVICES_INFO[i].n_wi_wg * DEVICES_INFO[i].n_wg > 1) { fprintf(stderr, "The solutions will not be printed because one or more devices will be running more than one thread.\n\n"); PRINT_SOLUTIONS = false; } } if (!QUIET) { printf("\nTotal sub-search spaces: %u\n\n", n_ss); } // create one thread per device to solve different sub-search spaces in parallel pthread_t* threads = malloc(N_DEVS * sizeof(pthread_t)); threads_data* thread_data = 
malloc(N_DEVS * sizeof(threads_data));
	void* t_result;
	cl_ulong* results = malloc(N_DEVS * sizeof(cl_ulong));

	if (N_DEVS > 1) {
		if (WORK == OPT && pthread_mutex_init(&opt_lock, NULL) != 0) {
			fprintf(stderr, "ERROR: threads opt_lock not created\n");
			exit(-1);
		}
		if (pthread_mutex_init(&stats_lock, NULL) != 0) {
			fprintf(stderr, "ERROR: threads stats_lock not created\n");
			exit(-1);
		}
	}

	if (WORK == OPT) {
		VS_LOCK[VAR_ID_TO_OPT].min = VS[VAR_ID_TO_OPT].min;
		VS_LOCK[VAR_ID_TO_OPT].max = VS[VAR_ID_TO_OPT].max;
		VS_LOCK_BEST[VAR_ID_TO_OPT].min = VS[VAR_ID_TO_OPT].min;
		VS_LOCK_BEST[VAR_ID_TO_OPT].max = VS[VAR_ID_TO_OPT].max;
	}

	devs_working = N_DEVS;

	if (N_DEVS > 1) {
		pthread_barrier_init(&barrier, NULL, N_DEVS);
	}

	for (i = 0; i < N_DEVS; i++) {
		thread_data[i].depth = depth;
		thread_data[i].dev_info = DEVICES_INFO;
		thread_data[i].dev_args = DEVICES_ARGS;
		thread_data[i].dev_number = i;
		thread_data[i].next_str = &next_str;
		thread_data[i].val_to_opt = &VAL_TO_OPT;
		thread_data[i].sol_found = &sol_found;
		thread_data[i].n_ss = n_ss;
		thread_data[i].platform_args = platform_args;

		if (N_DEVS > 1) {
			thread_ret = pthread_create(&threads[i], NULL, solve_on_device, (void *) &thread_data[i]);
			if (thread_ret) {
				fprintf(stderr, "ERROR: return code from pthread_create devices threads is %d\n", thread_ret);
				exit(-1);
			}
		} else {
			result = (cl_ulong) solve_on_device(thread_data);
			results[0] = result;
		}
	}

	if (N_DEVS > 1) {
		// sum the result from all devices
		for (i = 0; i < N_DEVS; i++) {
			pthread_join(threads[i], &t_result);
			results[i] = (unsigned long)(intptr_t)t_result;
			result += results[i];
		}
	}

	if (N_DEVS > 1) {
		if (WORK == OPT) {
			pthread_mutex_destroy(&opt_lock);
		}
		pthread_mutex_destroy(&stats_lock);
	}

	if (!QUIET) {
		printf("\n");
	}

	for (i = 0; i < N_DEVS; i++) {
		STATS.solve_time += DEVICES_INFO[i].ms_solve_time;
		STATS.n_solutions += results[i];

		if (!QUIET) {
			if (DEVICES_INFO[i].stores_explored != 0) {
				printf("%s (%d) took %lu ms to find %lu solution(s) on %d store(s)", DEVICES_INFO[i].dev_name,
						DEVICES_INFO[i].dev_type_n, DEVICES_INFO[i].ms_solve_time, results[i], DEVICES_INFO[i].stores_explored);
				if (DEVICES_INFO[i].times_used > 1) {
					printf(" split into %u blocks,", DEVICES_INFO[i].times_used);
				}
				printf(" with an average of %.03f ms per sub-search space\n", DEVICES_INFO[i].avg_1ss_solv_time);
			} else {
				printf("The other devices solved the CSP before %s (%d) could begin\n", DEVICES_INFO[i].dev_name, DEVICES_INFO[i].dev_type_n);
			}
		}
	}

	// Compare the finish time between the first and the last device
	if (VERBOSE && N_DEVS > 1) {
		cl_ulong ms_first = DEVICES_INFO[0].ms_finish_time;
		cl_ulong ms_last = ms_first;
		unsigned int first_dev = 0;
		unsigned int last_dev = 0;

		for (i = 1; i < N_DEVS; i++) {
			if (DEVICES_INFO[i].ms_finish_time < ms_first) {
				ms_first = DEVICES_INFO[i].ms_finish_time;
				first_dev = i;
			} else if (DEVICES_INFO[i].ms_finish_time > ms_last) {
				ms_last = DEVICES_INFO[i].ms_finish_time;
				last_dev = i;
			}
		}

		printf("%s (%d) finished %lu ms before %s (%d)\n", DEVICES_INFO[first_dev].dev_name, DEVICES_INFO[first_dev].dev_type_n,
				ms_last - ms_first, DEVICES_INFO[last_dev].dev_name, DEVICES_INFO[last_dev].dev_type_n);
	}

	for (i = 0; i < N_DEVS; i++) {
		free(DEVICES_INFO[i].exp_values);
	}
	free(platform_args);
	free(threads);
	free(thread_data);
	free(results);

	if (WORK == OPT) {
		if (DOMAIN_TYPE == BITMAP_) {
			for (i = 0; i < N_VS; i++) {
				b_copy(&VS[i].domain_b, &VS_LOCK_BEST[i].domain_b);
			}
		} else {
			for (i = 0; i < N_VS; i++) {
				VS[i].domain_i = VS_LOCK_BEST[i].domain_i;
			}
		}
		for (i = 0; i < N_VS; i++) {
			VS[i].max = VS_LOCK_BEST[i].max;
VS[i].min = VS_LOCK_BEST[i].min; VS[i].n_vals = VS_LOCK_BEST[i].n_vals; } } #if VERIFY_SOL // If only one solution or best is to be found, but is incorrect if ((WORK == ONE || WORK == OPT) && result > 0) { result = cs_check(true); } #endif if (!QUIET) { printf("\n"); } // when more than one device finds a solution and only one is wanted, set result to 1, as only one solution is saved if (WORK != CNT && result > 1) { result = 1; } // remove solutions from backtracking count if (PRINT_STATS) { if (STATS.backtracks > result - 1) { STATS.backtracks -= result - 1; } } return result; } /* * Filter the CSP by pruning values from the variables, when possible, and without labeling * Use 1 thread on the CPU * Return true if CSP is consistent after filtering * */ bool filter_CSP() { best_sols_found = 1; // counter for number of best solutions found cl_ulong result = 0; // Number of solutions found, or 0 or 1 if only one solution is wanted unsigned char sol_found = 0; // To set to 1 when only one solution is wanted and is found (atomic read and write) unsigned int n_vs_cs; // number of all variables in all constraints unsigned int n_cs_vs; // number of all constraints in all variables unsigned int n_const_cs; // number of all constant values in all constraints with more than one constant value unsigned int next_str = 0; // Index in stores where the next unexplored sub-search space is placed (atomic read and write) size_t l_mem_per_wi; // size in bytes of the local memory needed per work-item char* host_name = NULL; unsigned int n_ss = 1; // Number of sub-search spaces created unsigned int depth = 0; // Tree expansion depth needed to get n_ss disjoint search spaces unsigned int i, j; char* aux_name; filtering = true; STATS.n_solutions = 0; device_info filt_dev_info; device_args filt_dev_args; platf_args* platform_args; // each platform arguments for all devices of the same platform cl_platform_id* platfs = NULL; // to save all devices cl_platform_id cl_device_id* devs = NULL; // to save all devices cl_device_id cl_uint platf_cnt = 0; // number of platforms (Intel, Nvidia...) cl_uint dev_cnt = 0; // number of devices on each platform (CPU, MIC...) cl_int ret; // output of clGetPlatformIDs and clGetDeviceIDs size_t val_size; // Use command line heuristics for labeling and assignment, if existent // if not use default if (LABEL_MODE_COM != DEFAULT_L) { LABEL_MODE = LABEL_MODE_COM; } else if (LABEL_MODE == DEFAULT_L) { LABEL_MODE = LABEL_MODE_D; } if (ASSIGN_MODE_COM != DEFAULT_A) { ASSIGN_MODE = ASSIGN_MODE_COM; } else if (ASSIGN_MODE == DEFAULT_A) { ASSIGN_MODE = ASSIGN_MODE_D; } init_csp_and_d_bits(); // discover all platforms (Intel, Nvidia, AMD,...) ret = clGetPlatformIDs(0, NULL, &platf_cnt); cl_check_error(ret, "clGetPlatformIDs", "discovering devices"); platfs = (cl_platform_id*) malloc(platf_cnt * sizeof(cl_platform_id)); ret = clGetPlatformIDs(platf_cnt, platfs, NULL); cl_check_error(ret, "clGetPlatformIDs", "discovering devices"); platform_args = (platf_args*) malloc(platf_cnt * sizeof(platf_args)); // for each platform for (i = 0; i < platf_cnt; i++) { // discover the first CPU ret = clGetDeviceIDs(platfs[i], CL_DEVICE_TYPE_CPU, 0, NULL, &dev_cnt); devs = (cl_device_id*) malloc(sizeof(cl_device_id) * dev_cnt); ret = clGetDeviceIDs(platfs[i], CL_DEVICE_TYPE_CPU, dev_cnt, devs, NULL); platform_args[i].platform_id = platfs[i]; platform_args[i].n_devs = dev_cnt; // for each device for (j = 0; j < dev_cnt; j++) { // Identify the type of device (GPU, CPU, MIC,...) 
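			/*
			 * filter_CSP only needs a single propagation pass, so the first OpenCL CPU
			 * found is configured with one work-group and one work-item (n_wg =
			 * n_wi_wg = 1), and its bookkeeping fields are initialized with the same
			 * defaults used in solve_CSP. The j++/break below plus the outer
			 * "if (j > 0) break" stop the discovery as soon as one CPU has been set up.
			 */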
cl_device_type cl_device_type; ret = clGetDeviceInfo(devs[j], CL_DEVICE_TYPE, sizeof(cl_device_type), &cl_device_type, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_TYPE"); filt_dev_info.device_id = devs[j]; filt_dev_info.platform_id = platfs[i]; filt_dev_info.type = CL_DEVICE_TYPE_CPU; filt_dev_info.dev_type_n = 1; filt_dev_info.n_wg = 1; filt_dev_info.n_wi_wg = 1; filt_dev_info.use_local_mem = true; filt_dev_info.def_n_wi_wg = 1; filt_dev_info.def_n_wg = 1; filt_dev_info.stores_explored = 0; filt_dev_info.last_1ss_solv_time = 0; filt_dev_info.avg_1ss_solv_time = 0; filt_dev_info.max_1ss_solv_time = 0; filt_dev_info.n_ss_mult = 1; filt_dev_info.rank = 0; filt_dev_info.times_used = 0; filt_dev_info.ms_solve_time = 0; filt_dev_info.first_time_ranked = false; filt_dev_info.props_total = 0; filt_dev_info.last_props = 0; filt_dev_info.avg_time_prop = 0; filt_dev_info.last_time_prop = 0; filt_dev_info.max_time_prop = 0; filt_dev_info.ranked = false; filt_dev_info.working = true; filt_dev_info.last_explor_time = 0; filt_dev_info.n_fast_blocks = 0; filt_dev_info.sols_found = 0; filt_dev_info.n_buffers = 1; filt_dev_info.exp_values = calloc(N_VS, sizeof(unsigned int)); filt_dev_info.n_empty_blocks = 0; filt_dev_args.split_values_ext = 1; // get device name ret = clGetDeviceInfo(devs[j], CL_DEVICE_NAME, 0, NULL, &val_size); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); aux_name = (char*) malloc(val_size); ret = clGetDeviceInfo(devs[j], CL_DEVICE_NAME, val_size, aux_name, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_NAME"); // get the amount of local memory to check if it is enough ret = clGetDeviceInfo(filt_dev_info.device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(filt_dev_info.local_mem_max_alloc), &filt_dev_info.local_mem_max_alloc, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_LOCAL_MEM_SIZE"); // trim leading spaces on device name unsigned int del = 0; while (isspace((unsigned char )(aux_name[del]))) del++; host_name = (char*) malloc(val_size - del); strcpy(host_name, &aux_name[del]); filt_dev_info.dev_name = (char*) malloc(val_size - del); strcpy(filt_dev_info.dev_name, &aux_name[del]); free(aux_name); j++; break; } if (j > 0) { break; } } free(devs); free(platfs); if (j == 0) { fprintf(stderr, "\nThis machine does not have a CPU for filtering the CSP.\n"); return false; } // if using intervals convert bitmaps to intervals if (DOMAIN_TYPE == INTERVAL) { set_interval_domains(); } // Reset variables set for labeling, because some of them may be singleton already and count the number of variables that should be labeled N_VS_TO_LABEL = vs_cnt_vs_to_label(VS, N_VS); n_vs_cs = (unsigned int)cs_cnt_vs(CS, N_CS); // count the number of variables in all constraints n_cs_vs = vs_cnt_cs(VS, N_VS); // count the number of constraints in all variables n_const_cs = (unsigned int)cs_cnt_constants(CS, N_CS); // count the number of constants constrained by all the constraints (if more than one) if (ASSIGN_MODE == SPLIT_VALS) { unsigned int split_values_ext = 1; unsigned int n_vals_ctr = 1; for (i = 0; i < N_VS; i++) { if (VS[i].to_label && VS[i].n_vals > n_vals_ctr) { n_vals_ctr = VS[i].n_vals; } } while (((unsigned int)(n_vals_ctr / 2)) > 0) { n_vals_ctr /= 2; split_values_ext++; } filt_dev_args.split_values_ext = split_values_ext; } if (N_VS_TO_LABEL == 0) { fprintf(stderr, "\nNo CSP variable is marked for labeling. 
Please mark at least one variable for labeling.\n\n"); exit(-1); } // variables that are fully expanded during sub-search spaces creation are already labeled if (N_VS_TO_LABEL <= 0) { N_VS_TO_LABEL = 1; } if (USE_TTL) { fprintf(stderr, "\nTTL is enabled inside the kernel.\n\n"); } if (!QUIET) { printf("\nFiltering the CSP with 1 thread of %s", host_name); } // set revision to on (REV=1) or off (REV=0) by default. If PRE_LABELING==2, it is set to 1 if any propagator is capable of propagating // variables with more than one value in its domain if (PRE_LABELING == 0) { REV = 0; } else if (PRE_LABELING == 1) { REV = 1; } if (!QUIET) { printf(" with %s and %s heuristics", get_label_heur(), get_assign_heur()); } if (REV == 1 && !QUIET) { printf(" (and revision)"); } if (host_name != NULL) { free(host_name); } if (DOMAIN_TYPE == BITMAP_) { l_mem_per_wi = (N_VS + 3) * sizeof(cl_ushort) + N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE); } else { l_mem_per_wi = (N_VS + 3) * sizeof(cl_ushort) + N_VS * sizeof(cl_var_p_interval); } // check if memory is enough. If not, reduce the number of wi, and wg if also needed. #if USE_LOCAL_MEM == 0 filt_dev_info.use_local_mem = false; #elif USE_LOCAL_MEM == 1 filt_dev_info.use_local_mem = true; #endif if (filt_dev_info.use_local_mem && filt_dev_info.local_mem_max_alloc < l_mem_per_wi) { printf("\nDue to the amount of memory required, local memory will not be used in %s.\n", filt_dev_info.dev_name); filt_dev_info.use_local_mem = false; } filt_dev_args.n_vs_to_label = (unsigned int)N_VS_TO_LABEL; filt_dev_args.n_vs_cs = n_vs_cs; filt_dev_args.n_cs_vs = n_cs_vs; filt_dev_args.n_const_cs = n_const_cs; // get the amount of global memory to check if it is enough ret = clGetDeviceInfo(filt_dev_info.device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(filt_dev_info.global_mem_max_alloc), &filt_dev_info.global_mem_max_alloc, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_MAX_MEM_ALLOC_SIZE"); // get the maximum size of each constant memory buffer to check if it is enough ret = clGetDeviceInfo(filt_dev_info.device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(filt_dev_info.constant_mem_max_alloc), &filt_dev_info.constant_mem_max_alloc, NULL); cl_check_error(ret, "clGetDeviceInfo", "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"); // number of work-items that will be created on this device filt_dev_args.wi_local = filt_dev_info.n_wi_wg; filt_dev_args.wi_total = filt_dev_info.n_wg * filt_dev_info.n_wi_wg; set_buffs_size(&filt_dev_args, &filt_dev_info, filtering); if (filt_dev_info.global_mem_used > filt_dev_info.global_mem_max_alloc) { fprintf(stderr, "\nPHACT is trying to use more global memory (%lu Mb) than the amount available (%lu Mb) on %s (%d)\n", filt_dev_info.global_mem_used / 1000000, filt_dev_info.global_mem_max_alloc / 1000000, filt_dev_info.dev_name, filt_dev_info.dev_type_n); exit(-1); } if (VERBOSE) { if (filt_dev_info.use_local_mem) { printf(", local memory"); } if ((double)filt_dev_info.global_mem_used / 1000000.0 > 1.0) { printf(", %.02f Mb of global memory (Max. %.02f Mb) and", (double)filt_dev_info.global_mem_used / 1000000.0, (double)filt_dev_info.global_mem_max_alloc / 1000000.0); } else { printf(", %.02f Kb of global memory (Max. 
%.02f Mb) and", (double)filt_dev_info.global_mem_used / 1000.0, (double)filt_dev_info.global_mem_max_alloc / 1000000.0); } if (DOMAIN_TYPE == BITMAP_) { printf(" %d-bits bitmap domains on %d-bits words", CL_BITS_, CL_WORD_); } else if (DOMAIN_TYPE == INTERVAL) { printf(" interval domains"); } #if SHARED_SS > 0 printf(" and %d shared sub-search spaces", DEVICES_ARGS[i].n_shared_stores); #endif } if (!QUIET) { printf("\n\n"); } filt_dev_info.n_ss_mult_max = mult_max; filt_dev_info.block_size = n_ss; filt_dev_info.n_ss_mult = 1; filt_dev_info.n_ss_mult_max = 1; filt_dev_info.first_block_size = 1; filt_dev_info.block_size = 1; // create one thread for the CPU threads_data thread_data; devs_working = 1; thread_data.depth = depth; thread_data.dev_info = &filt_dev_info; thread_data.dev_args = &filt_dev_args; thread_data.dev_number = 0; thread_data.next_str = &next_str; thread_data.val_to_opt = &VAL_TO_OPT; thread_data.sol_found = &sol_found; thread_data.n_ss = n_ss; thread_data.platform_args = platform_args; result = (cl_ulong) solve_on_device(&thread_data); free(platform_args); free(filt_dev_info.dev_name); filtering = false; STATS.solve_time = filt_dev_info.ms_solve_time; free(filt_dev_info.exp_values); return (bool)result; } /* * Thread responsible for solving sub-search spaces on a device * thread_arg - structure with all thread arguments */ void* solve_on_device(void* thread_arg) { struct threads_data* thread_d; thread_d = (struct threads_data*) thread_arg; unsigned int depth = thread_d->depth; // Tree expansion depth needed to get n_ss disjoint search spaces unsigned int n_ss = thread_d->n_ss; // Number of sub-search spaces created unsigned int* next_str = thread_d->next_str; // Index in stores where the next unexplored sub-search space is placed (atomic read and write) cl_uint* val_to_opt = thread_d->val_to_opt; // Max value on the domain of the variable to optimize (atomic read and write) unsigned char* sol_found = thread_d->sol_found; // To set to 1 when only one solution is wanted and is found (atomic read and write) device_info* this_dev_info = &thread_d->dev_info[thread_d->dev_number]; // Information about the device to use device_info* all_dev_info = thread_d->dev_info; // Information about all the device to use device_args* this_dev_args = &thread_d->dev_args[thread_d->dev_number]; // Device arguments (buffers, etc.) 
cl_ulong result = 0; // 0, 1 or number of solutions found by this device cl_ulong total_result = 0; // 0, 1 or number of solutions found by this device unsigned int stores_idx; // index of the next unexplored store to pick // To get and print elapsed times char elapsed_time[40]; char start_time[40]; char end_time[40]; struct timeval start, end; if (N_DEVS > 1 && !filtering) { pthread_barrier_wait(&barrier); } if (VERBOSE) { gettimeofday(&start, NULL); } // Initialize OpenCL objects on this device init_device(this_dev_info, this_dev_args, filtering); #if RUN_IN_CUDA if (N_DEVS > 1 && !filtering) { pthread_barrier_wait(&barrier); } #endif // calculate time spent for initializing the devices if (VERBOSE) { gettimeofday(&end, NULL); format_elapsed_time_s_ms(elapsed_time, start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); format_time_s_ms(start_time, start.tv_sec, start.tv_usec); format_time_s_ms(end_time, end.tv_sec, end.tv_usec); printf("%s...%s = %s (s.ms) -> %s (%d) was initialized\n", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n); } // Get the index of the unexplored sub-search space to explore next #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) stores_idx = InterlockedAdd(next_str, this_dev_info->block_size) - this_dev_info->block_size; #else stores_idx = __atomic_fetch_add(next_str, this_dev_info->block_size, __ATOMIC_SEQ_CST); #endif if (WORK == CNT) { while (stores_idx < n_ss) { this_dev_info->first_store = stores_idx; if (stores_idx + this_dev_info->block_size >= n_ss) { this_dev_info->block_size = n_ss - stores_idx; } this_dev_info->last_store = stores_idx + this_dev_info->block_size; if (!VERBOSE && !QUIET) { printf("%s (%d) will receive more %d stores\n", this_dev_info->dev_name, this_dev_info->dev_type_n, this_dev_info->block_size); } gettimeofday(&start, NULL); // solve the sub-search spaces on this device for finding all the solutions result = count_sols(this_dev_args, this_dev_info, depth, n_ss, &stats_lock, filtering); gettimeofday(&end, NULL); total_result += result; this_dev_info->sols_found += result; this_dev_info->last_explor_time = (double)(end.tv_sec - start.tv_sec) * 1000.0; this_dev_info->last_explor_time += (double)(end.tv_usec - start.tv_usec) / 1000.0; // update the number of stores this device explored and the number of times it was used this_dev_info->stores_explored += this_dev_info->block_size; this_dev_info->times_used++; // update the last elapsed time this device needs to solve on sub-search space and the cl_find_all_sol elapsed time this_dev_info->last_1ss_solv_time = (float)get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec) / (float)this_dev_info->block_size; this_dev_info->ms_solve_time += get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); this_dev_info->avg_1ss_solv_time = (float)this_dev_info->ms_solve_time / (float) this_dev_info->stores_explored; this_dev_info->last_time_prop = (float)((float)get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec) / ((double)this_dev_info->last_props * 1.0)); if (VERBOSE) { format_elapsed_time_s_ms(elapsed_time, start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); format_time_s_ms(start_time, start.tv_sec, start.tv_usec); format_time_s_ms(end_time, end.tv_sec, end.tv_usec); if (!filtering) { printf("%s...%s = %s (s.ms) -> %s (%d) found %lu solution(s) on %u store(s) (%u...%u)", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n, result, 
this_dev_info->block_size, this_dev_info->first_store, this_dev_info->last_store - 1); if (this_dev_info->n_ss_mult > 1) { printf(" expanded %u times (Max. %u times)", this_dev_info->n_ss_mult, this_dev_info->n_ss_mult_max); } printf(", taking %.03f ms per ss", this_dev_info->last_1ss_solv_time); if (this_dev_info->rank > 0.000) { printf(" with a rank of %.03f\n", this_dev_info->rank); } else { printf("\n"); } } else { printf("%s...%s = %s (s.ms) -> %s (%d) filtered the CSP\n", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n); } } if (*next_str < n_ss) { // calculate next amount of stores to send to device set_next_block_size(all_dev_info, thread_d->dev_number, n_ss, next_str); if (this_dev_info->block_size == 0) { break; } // Get the index of the unexplored sub-search space to explore next #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) stores_idx = InterlockedAdd(next_str, this_dev_info->block_size) - this_dev_info->block_size; #else stores_idx = __atomic_fetch_add(next_str, this_dev_info->block_size, __ATOMIC_SEQ_CST); #endif if (stores_idx < n_ss && stores_idx + this_dev_info->block_size > n_ss) { this_dev_info->block_size = n_ss - stores_idx; } } else { break; } } } else if (WORK == ONE) { while ((*sol_found) == 0 && (stores_idx < n_ss || (stores_idx == 1 && n_ss == 1))) { this_dev_info->first_store = stores_idx; if (stores_idx + this_dev_info->block_size >= n_ss) { this_dev_info->block_size = n_ss - stores_idx; } this_dev_info->last_store = stores_idx + this_dev_info->block_size; // for elapsed time calculation if (!VERBOSE && !QUIET) { printf("%s (%d) will receive more %d stores\n", this_dev_info->dev_name, this_dev_info->dev_type_n, this_dev_info->block_size); } gettimeofday(&start, NULL); // solve the sub-search spaces on this device for finding one solution result = find_one_sol(this_dev_args, this_dev_info, sol_found, depth, n_ss, &stats_lock, filtering); gettimeofday(&end, NULL); total_result += result; this_dev_info->sols_found += result; this_dev_info->last_explor_time = (float)(end.tv_sec - start.tv_sec) * 1000.0; this_dev_info->last_explor_time += (float)(end.tv_usec - start.tv_usec) / 1000.0; // update the number of stores this device explored and the number of times it was used this_dev_info->stores_explored += this_dev_info->block_size; this_dev_info->times_used++; // update the last elapsed time this device needs to solve on sub-search space and the cl_find_all_sol elapsed time this_dev_info->last_1ss_solv_time = (float)get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec) / (float) this_dev_info->block_size; this_dev_info->ms_solve_time += get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); this_dev_info->avg_1ss_solv_time = (float)this_dev_info->ms_solve_time / (float)this_dev_info->stores_explored; this_dev_info->last_time_prop = (float)((float)get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec) / ((float)this_dev_info->last_props * 1.0)); if (VERBOSE) { format_elapsed_time_s_ms(elapsed_time, start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); format_time_s_ms(start_time, start.tv_sec, start.tv_usec); format_time_s_ms(end_time, end.tv_sec, end.tv_usec); if (!filtering) { printf("%s...%s = %s (s.ms) -> %s (%d) found %lu solution(s) on %u store(s) (%u...%u)", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n, result, this_dev_info->block_size, this_dev_info->first_store, this_dev_info->last_store - 1); if 
(this_dev_info->n_ss_mult > 1) { printf(" expanded %u times (Max. %u times)", this_dev_info->n_ss_mult, this_dev_info->n_ss_mult_max); } printf(", taking %.03f ms per ss", this_dev_info->last_1ss_solv_time); if (this_dev_info->rank > 0.000) { printf(" with a previous rank of %.03f\n", this_dev_info->rank); } else { printf("\n"); } } else { printf("%s...%s = %s (s.ms) -> %s (%d) filtered the CSP\n", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n); } } // calculate next amount of stores to send to device if (*next_str < n_ss) { set_next_block_size(all_dev_info, thread_d->dev_number, n_ss, next_str); if (this_dev_info->block_size == 0) { break; } // Get the index of the unexplored sub-search space to explore next #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) stores_idx = InterlockedAdd(next_str, this_dev_info->block_size) - this_dev_info->block_size; #else stores_idx = __atomic_fetch_add(next_str, this_dev_info->block_size, __ATOMIC_SEQ_CST); #endif if (stores_idx < n_ss && stores_idx + this_dev_info->block_size > n_ss) { this_dev_info->block_size = n_ss - stores_idx; } } else { break; } } // if WORK == OPT } else { while (stores_idx < n_ss) { this_dev_info->first_store = stores_idx; if (stores_idx + this_dev_info->block_size >= n_ss) { this_dev_info->block_size = n_ss - stores_idx; } this_dev_info->last_store = stores_idx + this_dev_info->block_size; if (!VERBOSE && !QUIET) { printf("%s (%d) will receive more %d stores\n", this_dev_info->dev_name, this_dev_info->dev_type_n, this_dev_info->block_size); } gettimeofday(&start, NULL); // solve the sub-search spaces on this device for finding all the solutions result = find_best_sol(this_dev_args, this_dev_info, val_to_opt, &opt_lock, depth, n_ss, &stats_lock, filtering); gettimeofday(&end, NULL); total_result += result; best_sols_found += (unsigned int)result; this_dev_info->sols_found += result; this_dev_info->last_explor_time = (float)(end.tv_sec - start.tv_sec) * 1000.0; this_dev_info->last_explor_time += (float)(end.tv_usec - start.tv_usec) / 1000.0; // update the number of stores this device explored and the number of times it was used this_dev_info->stores_explored += this_dev_info->block_size; this_dev_info->times_used++; // update the last elapsed time this device needs to solve on sub-search space and the cl_find_all_sol elapsed time this_dev_info->last_1ss_solv_time = (float)get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec) / (float) this_dev_info->block_size; this_dev_info->ms_solve_time += get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); this_dev_info->avg_1ss_solv_time = (float)this_dev_info->ms_solve_time / (float)this_dev_info->stores_explored; this_dev_info->last_time_prop = (float)((float)get_elapsed_ms(start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec) / ((float)this_dev_info->last_props * 1.0)); if (VERBOSE) { format_elapsed_time_s_ms(elapsed_time, start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); format_time_s_ms(start_time, start.tv_sec, start.tv_usec); format_time_s_ms(end_time, end.tv_sec, end.tv_usec); if (!filtering) { printf("%s...%s = %s (s.ms) -> %s (%d) found %lu best solution(s) on %u store(s) (%u...%u)", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n, result, this_dev_info->block_size, this_dev_info->first_store, this_dev_info->last_store - 1); if (this_dev_info->n_ss_mult > 1) { printf(" expanded %u times (Max. 
%u times)", this_dev_info->n_ss_mult, this_dev_info->n_ss_mult_max); } printf(", taking %.03f ms per ss", this_dev_info->last_1ss_solv_time); if (this_dev_info->rank > 0.000) { printf(" with a previous rank of %.03f\n", this_dev_info->rank); } else { printf("\n"); } } else { printf("%s...%s = %s (s.ms) -> %s (%d) filtered the CSP\n", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n); } } if (*next_str < n_ss) { // calculate next amount of stores to send to device set_next_block_size(all_dev_info, thread_d->dev_number, n_ss, next_str); if (this_dev_info->block_size == 0) { break; } // Get the index of the unexplored sub-search space to explore next #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) stores_idx = InterlockedAdd(next_str, this_dev_info->block_size) - this_dev_info->block_size; #else stores_idx = __atomic_fetch_add(next_str, this_dev_info->block_size, __ATOMIC_SEQ_CST); #endif if (stores_idx < n_ss && stores_idx + this_dev_info->block_size > n_ss) { this_dev_info->block_size = n_ss - stores_idx; } } else { break; } } } // for elapsed time calculation if (VERBOSE) { gettimeofday(&start, NULL); } // clear device objects release_device(this_dev_args, this_dev_info, filtering); if (VERBOSE) { gettimeofday(&end, NULL); this_dev_info->ms_finish_time = (unsigned long)end.tv_sec * 1000 + (unsigned long)end.tv_usec / 1000; } if (VERBOSE) { gettimeofday(&end, NULL); format_elapsed_time_s_ms(elapsed_time, start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec); format_time_s_ms(start_time, start.tv_sec, start.tv_usec); format_time_s_ms(end_time, end.tv_sec, end.tv_usec); printf("%s...%s = %s (s.ms) -> %s (%d) was released\n", start_time, end_time, elapsed_time, this_dev_info->dev_name, this_dev_info->dev_type_n); } if (N_DEVS > 1 && !filtering) { return (void *) (intptr_t) total_result; } else { return (void *) (cl_ulong) total_result; } } /* * Load balancing between all devices * dev_info - all devices information * dev_idx - index of dev_info of the device to calculate the next amount of stores to send to device * n_ss - total number of sub-search spaces created * last_str_explored - last store explored */ void set_next_block_size(device_info* dev_info, unsigned int dev_idx, unsigned int n_ss, unsigned int* last_str_explored) { float avg_sum = 0; float* avg_prop_solv_time = malloc(N_DEVS * sizeof(float)); bool all_devs_used; unsigned int devs_ranked_; unsigned int devs_working_; unsigned int i; unsigned int fastest_dev = 0; unsigned int fastest_dev_prop_solv_time = UINT_MAX; unsigned int last_str_explored_; #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) last_str_explored_ = InterlockedAdd(last_str_explored, 0); #else last_str_explored_ = __atomic_fetch_add(last_str_explored, 0, __ATOMIC_SEQ_CST); #endif for (i = 0; i < N_DEVS; i++) { if (dev_info[i].working) { if (avg_prop_solv_time[i] > 0 && avg_prop_solv_time[i] < fastest_dev_prop_solv_time) { fastest_dev_prop_solv_time = (unsigned int)avg_prop_solv_time[i]; fastest_dev = i; } } } // More one device ranked #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) devs_ranked_ = InterlockedAdd(&devs_ranked, 0); devs_working_ = InterlockedAdd(&devs_working_, 0); #else devs_ranked_ = __atomic_add_fetch(&devs_ranked, 0, __ATOMIC_SEQ_CST); devs_working_ = __atomic_add_fetch(&devs_working, 0, __ATOMIC_SEQ_CST); #endif dev_info[dev_idx].n_ss_mult = 1; // only one device so take the remaining ss if (devs_working_ == 1) 
	// only one device is working, so it takes all the remaining ss
	if (devs_working_ == 1) {
		dev_info[dev_idx].block_size = n_ss - last_str_explored_;
	} else {
		// if counting all the solutions
		if (WORK == CNT) {
			if (dev_info[dev_idx].props_total == 0) {
				dev_info[dev_idx].props_total = 1;
			}
			if (dev_info[dev_idx].last_1ss_solv_time > dev_info[dev_idx].max_1ss_solv_time) {
				dev_info[dev_idx].max_1ss_solv_time = dev_info[dev_idx].last_1ss_solv_time;
			}
			if (dev_info[dev_idx].last_time_prop > dev_info[dev_idx].max_time_prop) {
				dev_info[dev_idx].max_time_prop = dev_info[dev_idx].last_time_prop;
			}

			// update the average time needed to run one propagator
			dev_info[dev_idx].avg_time_prop = (float)dev_info[dev_idx].ms_solve_time / (float)dev_info[dev_idx].props_total * 1000;

			for (i = 0; i < N_DEVS; i++) {
				avg_prop_solv_time[i] = dev_info[i].avg_time_prop;
			}

			// if this device wasn't ranked yet
			if (!dev_info[dev_idx].ranked && dev_info[dev_idx].times_used == N_FIRST_BLOCKS) {
				dev_info[dev_idx].ranked = true;

				// one more device ranked
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
				devs_ranked_ = InterlockedAdd(&devs_ranked, 1);
#else
				devs_ranked_ = __atomic_add_fetch(&devs_ranked, 1, __ATOMIC_SEQ_CST);
#endif
			}

			// if the device took more than MS_HALF_FIRST_BLOCKS ms to explore the previous block, halve the size of the next block (first N_FIRST_BLOCKS blocks only)
			if (dev_info[dev_idx].times_used < N_FIRST_BLOCKS && dev_info[dev_idx].last_explor_time > MS_HALF_FIRST_BLOCKS) {
				dev_info[dev_idx].block_size /= 2;
			}

			// if only one device has been ranked so far and this one has already explored its first N_FIRST_BLOCKS blocks, double the size of the next block, if that will not be too big
			if (devs_ranked_ == 1 && dev_info[dev_idx].times_used >= N_FIRST_BLOCKS
					&& dev_info[dev_idx].block_size * 2 < (n_ss - last_str_explored_) * PERCENT_REM_SS_DOUBLE) {
				dev_info[dev_idx].block_size *= 2;

			// from this point on, all the devices have already been ranked
			} else if (devs_ranked_ == N_DEVS) {
				// update the current device rank
				for (i = 0; i < N_DEVS; i++) {
					if (dev_info[i].working) {
						if (avg_prop_solv_time[i] > 0) {
							avg_sum += 1 / avg_prop_solv_time[i];
						}
					}
				}

				if (avg_prop_solv_time[dev_idx] > 0 && avg_sum > 0) {
					dev_info[dev_idx].rank = 1 / avg_prop_solv_time[dev_idx] / avg_sum;

					// next block size
					if (dev_info[dev_idx].type == CL_DEVICE_TYPE_CPU) {
						dev_info[dev_idx].block_size = (unsigned int)(dev_info[dev_idx].rank * (float)(n_ss - last_str_explored_) * PERCENT_REM_SS_RANK_CPU);
					} else if (dev_info[dev_idx].type == CL_DEVICE_TYPE_GPU) {
						dev_info[dev_idx].block_size = (unsigned int)(dev_info[dev_idx].rank * (float)(n_ss - last_str_explored_) * PERCENT_REM_SS_RANK_GPU);

					// ACC
					} else {
						dev_info[dev_idx].block_size = (unsigned int)(dev_info[dev_idx].rank * (float)(n_ss - last_str_explored_) * PERCENT_REM_SS_RANK_ACC);
					}
				}

				// if this device is estimated to take less than MS_TAKE_ALL ms to solve the remaining ss, it takes them all
				if ((float)(n_ss - last_str_explored_) * dev_info[dev_idx].avg_1ss_solv_time < MS_TAKE_ALL) {
					dev_info[dev_idx].block_size = n_ss - last_str_explored_;
				}
			}

		// if optimizing, try to deliver blocks that take about FAST_BLOCKS_MS_OPT ms to explore
		} else if (WORK == OPT) {
			if (dev_info[dev_idx].last_explor_time < FAST_BLOCKS_MS_OPT) {
				dev_info[dev_idx].n_fast_blocks++;
			} else {
				dev_info[dev_idx].n_fast_blocks = 0;

				if (dev_info[dev_idx].last_explor_time > FAST_BLOCKS_MS_OPT) {
					dev_info[dev_idx].block_size /= 2;
				}
			}

			if (dev_info[dev_idx].n_fast_blocks == N_FAST_BLOCKS_OPT) {
				dev_info[dev_idx].block_size += 1 + (unsigned int)(PERCENT_BLOCKS_ADD * (double)dev_info[dev_idx].block_size);
				dev_info[dev_idx].n_fast_blocks = 0;
			}
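			/*
			 * Illustrative example only (PERCENT_BLOCKS_ADD and N_FAST_BLOCKS_OPT are defined in the
			 * configuration; the values here are assumed for illustration): if PERCENT_BLOCKS_ADD
			 * were 0.5 and the previous block had 100 stores, then after N_FAST_BLOCKS_OPT consecutive
			 * fast blocks the next block would grow to 100 + 1 + 50 = 151 stores.
			 */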
		// if finding one solution and all the devices have explored at least N_FIRST_BLOCKS blocks each, try to deliver blocks that take about FAST_BLOCKS_MS_ONE ms to explore
		} else {
			all_devs_used = true;
			for (i = 0; i < N_DEVS; i++) {
				if (dev_info[i].times_used < N_FIRST_BLOCKS) {
					all_devs_used = false;
					break;
				}
			}

			if (dev_info[dev_idx].avg_1ss_solv_time != 0 && all_devs_used == true) {
				dev_info[dev_idx].block_size = (unsigned int)(FAST_BLOCKS_MS_ONE / dev_info[dev_idx].avg_1ss_solv_time);
			}

			if (dev_info[dev_idx].last_explor_time > FAST_BLOCKS_MS_ONE * 2) {
				dev_info[dev_idx].block_size /= 2;
			}
		}
	}

	if (dev_info[dev_idx].times_used / TIMES_USED_TRESHOLD > 1.0 && (SS_REM_PERC_TRESHOLD * n_ss) < (n_ss - last_str_explored_)) {
		dev_info[dev_idx].block_size *= (unsigned int)(dev_info[dev_idx].times_used / TIMES_USED_TRESHOLD);
	}

	// if the block size dropped to 0, stop this device (a slow GPU) or hand it a single store
	if (dev_info[dev_idx].block_size == 0) {
		dev_info[dev_idx].n_empty_blocks++;

		if (dev_info[dev_idx].n_empty_blocks >= N_EMPTY_BLOCKS && dev_idx != fastest_dev && dev_info[dev_idx].type == CL_DEVICE_TYPE_GPU && !all_GPUs) {
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
			devs_working_ = InterlockedDecrement(&devs_working);
#else
			devs_working_ = __atomic_sub_fetch(&devs_working, 1, __ATOMIC_SEQ_CST);
#endif
			if (devs_working_ > 0) {
				dev_info[dev_idx].block_size = 0;
				dev_info[dev_idx].working = false;
			} else {
				dev_info[dev_idx].block_size = n_ss - last_str_explored_;
			}
		} else {
			dev_info[dev_idx].block_size = 1;
		}
	} else {
		dev_info[dev_idx].n_empty_blocks = 0;
	}

#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
	devs_working_ = InterlockedAdd(&devs_working, 0);
#else
	devs_working_ = __atomic_add_fetch(&devs_working, 0, __ATOMIC_SEQ_CST);
#endif

	if (devs_working_ == 1 && dev_info[dev_idx].working) {
		dev_info[dev_idx].block_size = n_ss - last_str_explored_;
	}

#if SS_MULTIPLIER
	if (dev_info[dev_idx].block_size > 0) {
		// GPU
		if (dev_info[dev_idx].type == CL_DEVICE_TYPE_GPU
				&& dev_info[dev_idx].block_size < SS_GPU / (GPU_DEFAULT_N_WI / (double)dev_info[dev_idx].n_wi_wg * 1.0) / (GPU_DEFAULT_N_WG / (double)dev_info[dev_idx].n_wg)) {
			dev_info[dev_idx].n_ss_mult = (unsigned int)((SS_GPU / (GPU_DEFAULT_N_WI / (double)dev_info[dev_idx].n_wi_wg) / (GPU_DEFAULT_N_WG / (double)dev_info[dev_idx].n_wg * 1.0)) / dev_info[dev_idx].block_size);

		// ACC
		} else if (dev_info[dev_idx].type == CL_DEVICE_TYPE_ACCELERATOR
				&& dev_info[dev_idx].block_size < SS_ACC / (dev_info[dev_idx].compute_units / (double)dev_info[dev_idx].n_wg * 1.0)) {
			dev_info[dev_idx].n_ss_mult = (unsigned int)(SS_ACC / (dev_info[dev_idx].compute_units / (double)dev_info[dev_idx].n_wg * 1.0) / dev_info[dev_idx].block_size);

		// CPU
		} else if (DEVICES_INFO[dev_idx].type == CL_DEVICE_TYPE_CPU
				&& dev_info[dev_idx].block_size < SS_CPU * dev_info[dev_idx].compute_units / (dev_info[dev_idx].compute_units / (double)dev_info[dev_idx].n_wg * 1.0)) {
			dev_info[dev_idx].n_ss_mult = (unsigned int)((SS_CPU * dev_info[dev_idx].compute_units) / (dev_info[dev_idx].compute_units / (double)dev_info[dev_idx].n_wg * 1.0) / dev_info[dev_idx].block_size);
		}

		if (dev_info[dev_idx].n_ss_mult > mult_max) {
			dev_info[dev_idx].n_ss_mult = mult_max;
		} else if (dev_info[dev_idx].n_ss_mult == 0) {
			dev_info[dev_idx].n_ss_mult = 1;
		}
	}
#endif

	free(avg_prop_solv_time);
}
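/*
 * Illustrative example of the ranking used above (values chosen only for illustration): with three
 * working and ranked devices whose avg_prop_solv_time values are 2.0, 4.0 and 8.0, avg_sum is
 * 1/2 + 1/4 + 1/8 = 0.875, so the fastest device gets a rank of (1/2) / 0.875 ~= 0.571 and is
 * offered about 57% of the remaining sub-search spaces (scaled by PERCENT_REM_SS_RANK_CPU/GPU/ACC
 * according to its type), while the slowest one gets (1/8) / 0.875 ~= 0.143.
 */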
/*
 * Calculate the number of stores to create and fill all the stores with disjoint sub-trees
 * Stores are only filled down to depth, because all the remaining domains are equal in all the stores
 * depth - depth of the tree that must be expanded to fill the stores
 * n_ss - to save the number of stores to create
 * n_vs_to_label - number of variables marked for labeling
 */
void split_ss(unsigned int* depth, unsigned int* n_ss, unsigned int n_vs_to_label) {
	unsigned int n_strs;	// maximum number of ss to create
	unsigned int vs_to_label_cntr = 0;
	unsigned int i, j;

	EXP_VALUES = calloc(N_VS, sizeof(unsigned int));

	// if just one search space is to be used
	if ((N_DEVS == 1 && DEVICES_INFO[0].n_wg * DEVICES_INFO[0].n_wi_wg == 1 && N_SS == 0) || N_SS == 1) {
		(*depth) = 0;
		(*n_ss) = 1;

	} else {
		// if the user wants to use the default number of sub-search spaces
		if (N_SS == 0) {
			// if going to use more than one device
			if (N_DEVS > 1) {
				n_strs = 0;

				// base it on the number of CPU cores
				for (i = 0; i < N_DEVS; i++) {
					if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_CPU) {
						n_strs = SS_CPU * DEVICES_INFO[i].compute_units;
						break;
					}
				}

				// no CPU among the devices, so use the device type that needs the most sub-search spaces
				if (n_strs == 0) {
					for (i = 0; i < N_DEVS; i++) {
						// GPU
						if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) {
							if (SS_GPU > n_strs) {
								n_strs = SS_GPU;
							}

						// MIC
						} else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_ACCELERATOR) {
							if (SS_ACC > n_strs) {
								n_strs = SS_ACC;
							}

						// other device types
						} else {
							n_strs = SS_GPU;
						}
					}
				}

			// if only one device
			} else if (DEVICES_INFO[0].type == CL_DEVICE_TYPE_GPU) {
				if (DEVICES_INFO[0].n_wg == DEVICES_INFO[0].def_n_wg && DEVICES_INFO[0].n_wi_wg == DEVICES_INFO[0].def_n_wi_wg) {
					n_strs = SS_GPU;
				} else {
					n_strs = (unsigned int)((DEVICES_INFO[0].n_wg * DEVICES_INFO[0].n_wi_wg * SS_GPU) / (DEVICES_INFO[0].def_n_wg * DEVICES_INFO[0].def_n_wi_wg));
				}

			} else if (DEVICES_INFO[0].type == CL_DEVICE_TYPE_ACCELERATOR) {
				if (DEVICES_INFO[0].n_wg == DEVICES_INFO[0].def_n_wg && DEVICES_INFO[0].n_wi_wg == DEVICES_INFO[0].def_n_wi_wg) {
					n_strs = SS_ACC;
				} else {
					n_strs = (unsigned int)((DEVICES_INFO[0].n_wg * DEVICES_INFO[0].n_wi_wg * SS_ACC) / (DEVICES_INFO[0].def_n_wg * DEVICES_INFO[0].def_n_wi_wg));
				}

			} else {
				if (DEVICES_INFO[0].n_wg == DEVICES_INFO[0].def_n_wg && DEVICES_INFO[0].n_wi_wg == DEVICES_INFO[0].def_n_wi_wg) {
					n_strs = SS_CPU * DEVICES_INFO[0].compute_units;
				} else {
					n_strs = (unsigned int)((DEVICES_INFO[0].n_wg * DEVICES_INFO[0].n_wi_wg * SS_CPU * DEVICES_INFO[0].compute_units) / (DEVICES_INFO[0].def_n_wg * DEVICES_INFO[0].def_n_wi_wg));
				}
			}

			// cap at 1,000,000 (MAX_SS) per device
			if (n_strs > MAX_SS) {
				n_strs = MAX_SS;
			}
		} else {
			n_strs = N_SS;
		}

		// calculate the depth of the tree needed to expand
		*n_ss = 1;
		*depth = 0;
		i = 0;
		while ((*n_ss) < n_strs && vs_to_label_cntr < n_vs_to_label) {
			if (VS[i].to_label) {
				(*n_ss) *= VS[i].n_vals;
				EXP_VALUES[i] = VS[i].n_vals;

				// will be fully expanded during sub-search spaces creation, so it's already labeled
				VS[i].to_label = false;
				VS[i].expanded = true;
				vs_labeled_at_exp++;
				vs_to_label_cntr++;
			} else {
				EXP_VALUES[i] = 1;
			}
			(*depth)++;
			i++;
		}

		// if expanding all the tree nodes down to depth generates more than the required number of sub-search spaces
		if ((*n_ss) != n_strs) {
			i--;
			(*n_ss) /= VS[i].n_vals;

			// reset, because it will not be fully expanded
			VS[i].to_label = true;
			VS[i].expanded = false;
			vs_labeled_at_exp--;
			vs_to_label_cntr--;

			for (j = 2; j < VS[i].n_vals; j++) {
				if ((*n_ss) * j >= n_strs) {
					(*n_ss) *= j;
					EXP_VALUES[i] = j;
					break;
				}
			}

			if (j == VS[i].n_vals) {
				(*n_ss) *= VS[i].n_vals;
				EXP_VALUES[i] = VS[i].n_vals;
				VS[i].to_label = false;
				VS[i].expanded = true;
				vs_labeled_at_exp++;
				vs_to_label_cntr++;
			}
		}
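		/*
		 * Illustrative example of the expansion above (numbers chosen only for illustration):
		 * with n_strs = 512 and the first variables to label having 10 values each, the loop
		 * expands three of them (10 -> 100 -> 1000 >= 512), so *depth = 3. Since 1000 != 512,
		 * the last variable is backed off and the smallest j with 100 * j >= 512 is chosen,
		 * giving EXP_VALUES = {10, 10, 6, ...} and *n_ss = 600 disjoint sub-search spaces, with
		 * the third variable only partially expanded and therefore still marked for labeling.
		 */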
#if SS_MULTIPLIER
		unsigned long long mult_max_aux = 1;

		// get the max multiplier that can be applied to the number of ss inside each device
		if (*depth != N_VS) {
			for (i = (*depth); i < N_VS && vs_to_label_cntr < n_vs_to_label && mult_max_aux * (*n_ss) < UINT_MAX; i++) {
				if (VS[i].n_vals > 1 && VS[i].to_label) {
					vs_to_label_cntr++;
					mult_max_aux *= VS[i].n_vals;
				}
			}

			if (mult_max_aux * (*n_ss) > UINT_MAX) {
				if (i == N_VS) {
					i--;
				}
				mult_max_aux /= VS[i].n_vals;

				for (j = 2; j < VS[i].n_vals; j++) {
					if (mult_max_aux * (*n_ss) * j > UINT_MAX) {
						j--;
						mult_max_aux *= j;
						break;
					}
				}
			}

			mult_max = (unsigned int)mult_max_aux;
		}

		if (mult_max == 0) {
			mult_max = 1;
		}
#else
		mult_max = 1;
#endif
	}
}
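/*
 * Illustrative example of the multiplier computed above (values chosen only for illustration):
 * if, after the initial expansion, two of the remaining variables marked for labeling have 4
 * and 5 values, mult_max becomes 20 (as long as mult_max * n_ss stays below UINT_MAX), which
 * then caps n_ss_mult, the factor by which a device may expand its block of sub-search spaces.
 */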