/*
 * devices.c
 *
 *  Created on: 15/01/2017
 *      Author: pedro
 */

#include "devices.h"
#include "constraints.h"

#include <stdio.h>
#include <stdlib.h>

/*
 * Raw (not yet normalised) expected-speed score of device i.
 *
 * The score is max_freq * compute_units, scaled by a per-type cut-off
 * (GPU_CUTOFF / ACC_CUTOFF, none for the remaining device types) and by the
 * ratio between a reference amount of work-groups / work-items and the amount
 * actually configured for the device.
 * The device type is read from DEVICES_INFO[i], everything else from dev_info[i].
 *
 * dev_info - array with the information of all the devices that will be used
 * i - index of the device to score
 *
 * returns the raw score as a double
 */
static double raw_expect_speed(const device_info* dev_info, unsigned int i) {

	if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) {
		// both ratios are evaluated in floating point, so a device configured with more
		// work-groups than GPU_DEFAULT_N_WG does not collapse the divisor to zero
		return dev_info[i].max_freq * dev_info[i].compute_units / GPU_CUTOFF
				/ (GPU_DEFAULT_N_WI / (double)dev_info[i].n_wi_wg * 1.0)
				/ (GPU_DEFAULT_N_WG / (double)dev_info[i].n_wg * 1.0);
	}

	if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_ACCELERATOR) {
		return dev_info[i].max_freq * dev_info[i].compute_units / ACC_CUTOFF
				/ (dev_info[i].compute_units / (double)dev_info[i].n_wg * 1.0);
	}

	return dev_info[i].max_freq * dev_info[i].compute_units
			/ (dev_info[i].compute_units / (double)dev_info[i].n_wg * 1.0);
}

/*
 * Calculate the expected relative speed of each device when comparing the hardware of all the used devices.
 * Each device gets its raw score divided by the sum of the raw scores of all the devices.
 * With a single device the result is always 1.
 * The result is saved in "dev_info[i].rel_speed_expect"
 *
 * dev_info - array with the information of all the devices that will be used
 */
void calculate_rel_expect_speed(device_info* dev_info) {
	unsigned int total = 0;
	unsigned int i;

	if (N_DEVS == 1) {
		dev_info->rel_speed_expect = 1;
		return;
	}

	// The sum and the per-device values come from the same helper. Previously the GPU term of
	// the sum used an integer division (GPU_DEFAULT_N_WG / n_wg) that the per-device value did not,
	// so the normaliser could disagree with the values being normalised.
	// Each term is still truncated to an unsigned integer before being added, as before.
	for (i = 0; i < N_DEVS; i++) {
		total += (unsigned int)raw_expect_speed(dev_info, i);
	}

	// NOTE(review): if every raw score truncates to 0, total is 0 and the division below
	// is a floating point division by zero - confirm that cannot happen for supported devices
	for (i = 0; i < N_DEVS; i++) {
		dev_info[i].rel_speed_expect = (float)(raw_expect_speed(dev_info, i) / (double) total);
	}
}

/*
 * Set the size of the device buffers when fully exploring on the
device * dev_args - device_args structure of this device * dev_info - device_info structure about this device * */ void set_buffs_size(device_args* dev_args, device_info* dev_info, bool filtering) { #if USE_CONSTANT_MEM cl_ulong constant_mem_used = 0; #endif unsigned int i, n_terms; dev_info->global_mem_used = 0; // 0...cs_vs_idx - each constraint list of constrained variables ids placed per constraint order // cs_vs_idx...cs_vs_idx+vs_cs_idx - each variable list of constraints ids placed per variable order // cs_vs_idx+vs_cs_idx...cs_vs_idx+vs_cs_idx+n_const_cs - each constraint list of constants placed per constraint order dev_args->ints_size = (dev_args->n_vs_cs + dev_args->n_cs_vs + dev_args->n_const_cs) * sizeof(cl_int); #if USE_CONSTANT_MEM dev_args->ints_const = true; if (dev_args->ints_size > dev_info->constant_mem_max_alloc) { dev_args->ints_const = false; } else { constant_mem_used += dev_args->ints_size; } #else dev_args->ints_const = false; #endif dev_info->global_mem_used += dev_args->ints_size; if (DOMAIN_TYPE == BITMAP_) { dev_args->b_ds_size = N_VS * DOMAIN_SIZE; dev_args->cl_vs_size = N_VS * sizeof(cl_var_bitmap); #if USE_CONSTANT_MEM dev_args->b_ds_const = true; if (dev_args->b_ds_size + constant_mem_used > dev_info->constant_mem_max_alloc) { dev_args->b_ds_const = false; } else { constant_mem_used += dev_args->b_ds_size; } #else dev_args->b_ds_const = false; #endif dev_info->global_mem_used += dev_args->b_ds_size; } else if (DOMAIN_TYPE == INTERVAL) { dev_args->cl_vs_size = N_VS * sizeof(cl_var_interval); // size of buffer for cl_var constant data } #if USE_CONSTANT_MEM dev_args->cl_vs_const = true; if (dev_args->cl_vs_size + constant_mem_used > dev_info->constant_mem_max_alloc) { dev_args->cl_vs_const = false; } else { constant_mem_used += dev_args->cl_vs_size; } #else dev_args->cl_vs_const = false; #endif dev_info->global_mem_used += dev_args->cl_vs_size; // size of buffer for cl_constr constant data dev_args->cl_cs_size = N_CS * 
sizeof(cl_constr); #if USE_CONSTANT_MEM dev_args->cl_cs_const = true; if (dev_args->cl_cs_size + constant_mem_used > dev_info->constant_mem_max_alloc) { dev_args->cl_cs_const = false; } else { constant_mem_used += dev_args->cl_cs_size; } #else dev_args->cl_cs_const = false; #endif dev_info->global_mem_used += dev_args->cl_cs_size; // size of buffer for cl_vs_prop data // if using local memory if (dev_info->use_local_mem) { if (DOMAIN_TYPE == BITMAP_) { dev_args->cl_vs_prop_size = dev_args->wi_local * N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE); } else if (DOMAIN_TYPE == INTERVAL) { dev_args->cl_vs_prop_size = dev_args->wi_local * N_VS * sizeof(cl_var_p_interval); } dev_args->vs_id_to_prop_size = dev_args->wi_local * (N_VS + 3) * sizeof(cl_ushort); #if RUN_IN_CUDA // due to shared memory alignment in CUDA if (CL_WORD_ == 32) { while ((dev_args->vs_id_to_prop_size * 8) % 32 != 0) { dev_args->vs_id_to_prop_size++; } } else { // 64 while ((dev_args->vs_id_to_prop_size * 8) % 64 != 0) { dev_args->vs_id_to_prop_size++; } } #endif // if using only global memory } else { if (DOMAIN_TYPE == BITMAP_) { dev_args->cl_vs_prop_size = dev_args->wi_total * N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE); } else if (DOMAIN_TYPE == INTERVAL) { dev_args->cl_vs_prop_size = dev_args->wi_total * N_VS * sizeof(cl_var_p_interval); } dev_info->global_mem_used += dev_args->cl_vs_prop_size; dev_args->vs_id_to_prop_size = dev_args->wi_total * (N_VS + 3) * sizeof(cl_ushort); dev_info->global_mem_used += dev_args->vs_id_to_prop_size; } // buffer for backtracking data // 0...(n_vs_to_label+2+TO_LABEL_THRESHOLD)*N_VS*split_values_ext*wi_total - backtracking history dev_args->backtrack_size = (dev_args->n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * dev_args->split_values_ext * N_VS * dev_args->wi_total * DOMAIN_SIZE; dev_info->global_mem_used += dev_args->backtrack_size; // if all solutions must be found if (WORK == CNT) { // buffer for atomics data (Most 
devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS...5+N_VS+N_WG*N_WI_WG - number of solutions found per work-item dev_args->atoms_size = (5 + N_VS + dev_args->wi_total) * sizeof(cl_uint); // if only one solution must be found } else if (WORK == ONE) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS - solution found flag dev_args->atoms_size = (6 + N_VS) * sizeof(cl_uint); // buffer for saving the solution // 0...N_VS - solution domains dev_args->domains_size = N_VS * DOMAIN_SIZE; dev_info->global_mem_used += dev_args->domains_size; // if optimization } else if (WORK == OPT) { // buffer for atomics data (Most devices only have atomics for 32 bits variables) // 0 - first sub-search to explore // 1 - last sub-search to explore // 2 - n_ss // 3 - depth // 4 - WIs still working for work-sharing // 5 - 5+N_VS - n_repeat per variable // 5+N_VS - solution found flag // 6+N_VS - Value to optimize // 7+N_VS - WIs still working for saving the best solution dev_args->atoms_size = (8 + N_VS) * sizeof(cl_uint); // buffer for solutions // 0...N_VS*D_MAX+1 - solution stores because concurrency control dev_args->domains_size = (N_VS * (D_MAX + 1)) * DOMAIN_SIZE; dev_info->global_mem_used += dev_args->domains_size; } else { fprintf(stderr, "\nObjective of exploration not recognized.\n"); exit(-1); } dev_info->global_mem_used += dev_args->atoms_size; if (N_DEVS > 1) { // to count number of propagations done per work-item, for rank calculation dev_args->props_size = dev_args->wi_total * sizeof(cl_ulong); dev_info->global_mem_used += dev_args->props_size; } dev_args->n_shared_stores = 0; #if 
SHARED_SS > 0 // calculate number of shared stores needed for this device if (dev_info->type == CL_DEVICE_TYPE_GPU) { dev_args->n_shared_stores = dev_info->compute_units * (unsigned int)dev_args->wi_local; } else { dev_args->n_shared_stores = (unsigned int)dev_args->wi_total; } // for work-sharing after the ss in the block have finished dev_args->shared_stores_size = N_VS * dev_args->n_shared_stores * DOMAIN_SIZE; dev_info->global_mem_used += dev_args->shared_stores_size; // flags for signaling the state of each work-sharing store // 0 - next shared SS to be picked // 1 - next shared SS to be filled // 2...number of SS already filled // 3..3+CL_N_SHARED_SS - V_ID that was labeled to generate this SS dev_args->shared_stores_flag_size = (dev_args->n_shared_stores + 3) * sizeof(cl_int); dev_info->global_mem_used += dev_args->shared_stores_flag_size; #endif // 0 - nodes_fail // 1 - nodes_expl // 2 - backtracks // 3 - labels // 4 - pruning // 5 - props_ok // 6 - max_depth // ... repeat per work-item if (PRINT_STATS) { dev_args->stats_size = 7 * dev_args->wi_total * sizeof(cl_ulong); dev_info->global_mem_used += dev_args->stats_size; } if (filtering) { // 0...N_VS - size of domains_mem buffer for the filtering result dev_args->filt_domains_size = N_VS * DOMAIN_SIZE; dev_info->global_mem_used += dev_args->filt_domains_size; if (CS_IGNORE) { // 0...N_CS - size of filt_cs_size buffer for the filtering dev_args->filt_cs_size = N_CS * sizeof(cl_char); dev_info->global_mem_used += dev_args->filt_cs_size; } } // define max number of terms for memory allocation in kernel propagators n_terms = 0; if (USE_CS[LINEAR] || USE_CS[LINEAR_LT] || USE_CS[LINEAR_NE] || USE_CS[LINEAR_VAR] || USE_CS[SUM] || USE_CS[SUM_VAR] || USE_CS[ELEMENT_INT_VAR]) { for (i = 0; i < N_CS; i++) { if (CS[i].kind == LINEAR && CS[i].n_c_consts * 2 > n_terms) { n_terms = (unsigned int)CS[i].n_c_consts * 2; } else if (CS[i].kind == LINEAR_LT && CS[i].n_c_consts * 2 > n_terms) { n_terms = (unsigned 
int)CS[i].n_c_consts * 2; } else if (CS[i].kind == LINEAR_NE && CS[i].n_c_consts * 2 > n_terms) { n_terms = (unsigned int)CS[i].n_c_consts * 2; } else if (CS[i].kind == LINEAR_VAR && CS[i].n_c_consts * 2 > n_terms) { n_terms = (unsigned int)CS[i].n_c_consts * 2; } else if (CS[i].kind == SUM && CS[i].n_c_vs * 2 > n_terms) { n_terms = (unsigned int)CS[i].n_c_vs * 2; } else if (CS[i].kind == SUM_VAR && CS[i].n_c_vs * 2 > n_terms) { n_terms = (unsigned int)CS[i].n_c_vs * 2; } else if (CS[i].kind == ELEMENT_INT_VAR && (D_MAX + 1) * 2 > n_terms) { n_terms = (D_MAX + 1) * 2; } } } dev_info->n_terms = (int)n_terms; // (dev_args->n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * dev_args->split_values_ext) * 2 - to use in kernel (hist_labeleds_id and hist_labeleds_n_vals) // n_terms * dev_args->wi_total - to use in propagators // D_MAX+1*wi_total - for ss generation dev_args->generic_size = (n_terms + ((dev_args->n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * dev_args->split_values_ext) * 2 + D_MAX + 1) * dev_args->wi_total * sizeof(cl_int); if (CS_IGNORE) { dev_args->cs_ignore_size = N_CS * dev_args->wi_total * sizeof(cl_char); dev_info->global_mem_used += dev_args->cs_ignore_size; } dev_info->global_mem_used += dev_args->generic_size; }