/*
 * devices.c
 *
 *  Created on: 15/01/2017
 *      Author: pedro
 */

#include "devices.h"
#include "constraints.h"

#include <stdio.h>
#include <stdlib.h>

/*
 * calculate the expected speed when comparing the hardware of all the used devices. From 0 to 1, where 1 is the fastest.
 * The result is saved in "dev_info[i].rel_speed_expect"
 * dev_info - aray with the information of all the devices that will be used
 */
void calculate_rel_expect_speed(device_info* dev_info) {
	if (N_DEVS == 1) {
		(*dev_info).rel_speed_expect = 1;
	} else {
		unsigned int total = 0;
		unsigned int i;

		for (i = 0; i < N_DEVS; i++) {
			if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) {
				total += (unsigned int)(dev_info[i].max_freq * dev_info[i].compute_units / GPU_CUTOFF / (GPU_DEFAULT_N_WI / (double)dev_info[i].n_wi_wg * 1.0) /
						(double)(GPU_DEFAULT_N_WG / dev_info[i].n_wg));
			} else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_ACCELERATOR) {
				total += (unsigned int)(dev_info[i].max_freq * dev_info[i].compute_units / ACC_CUTOFF / (dev_info[i].compute_units / (double)dev_info[i].n_wg * 1.0));
			} else {
				total += (unsigned int)(dev_info[i].max_freq * dev_info[i].compute_units / (dev_info[i].compute_units / (double)dev_info[i].n_wg * 1.0));
			}
		}

		for (i = 0; i < N_DEVS; i++) {
			if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) {
				dev_info[i].rel_speed_expect = (float)(dev_info[i].max_freq * dev_info[i].compute_units / GPU_CUTOFF / (GPU_DEFAULT_N_WI / (double)dev_info[i].n_wi_wg * 1.0)
						/ (GPU_DEFAULT_N_WG / (double)dev_info[i].n_wg * 1.0) / (double) total);
			} else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_ACCELERATOR) {
				dev_info[i].rel_speed_expect = (float)(dev_info[i].max_freq * dev_info[i].compute_units / ACC_CUTOFF / (dev_info[i].compute_units /
						(double)dev_info[i].n_wg * 1.0) / (double) total);
			} else {
				dev_info[i].rel_speed_expect = (float)(dev_info[i].max_freq * dev_info[i].compute_units / (dev_info[i].compute_units /
						(double)dev_info[i].n_wg * 1.0) / (double) total);
			}
		}
	}
}

/*
 * Set the size of the device buffers when fully exploring on the device.
 * Every "*_size" field computed here is stored in dev_args, and the sizes of the buffers that are accounted as
 * global memory are accumulated in dev_info->global_mem_used (which is reset to 0 first).
 * When compiled with USE_CONSTANT_MEM, the "*_const" flags of dev_args tell which of the read-only buffers still
 * fit under dev_info->constant_mem_max_alloc, deciding in the order ints, b_ds, cl_vs, cl_cs; otherwise they are
 * all set to false.
 * The program is terminated with exit(-1) if WORK is not CNT, ONE or OPT.
 * dev_args - device_args structure of this device
 * dev_info - device_info structure about this device (global_mem_used and n_terms are written)
 * filtering - if true, the buffers for the filtering result are also sized (filt_domains_size and, when
 *             CS_IGNORE is set, filt_cs_size)
 */
void set_buffs_size(device_args* dev_args, device_info* dev_info, bool filtering) {

#if USE_CONSTANT_MEM
	// amount already reserved for the buffers flagged as constant
	cl_ulong const_mem_in_use = 0;
#endif

	unsigned int c_idx, max_terms;

	dev_info->global_mem_used = 0;

	// ---- integers buffer ----
	// 0...cs_vs_idx	- list of constrained variables ids of each constraint, placed per constraint order
	// cs_vs_idx...cs_vs_idx+vs_cs_idx	- list of constraints ids of each variable, placed per variable order
	// cs_vs_idx+vs_cs_idx...cs_vs_idx+vs_cs_idx+n_const_cs	- list of constants of each constraint, placed per constraint order
	dev_args->ints_size = (dev_args->n_vs_cs + dev_args->n_cs_vs + dev_args->n_const_cs) * sizeof(cl_int);

#if USE_CONSTANT_MEM
	if (dev_args->ints_size > dev_info->constant_mem_max_alloc) {
		dev_args->ints_const = false;
	} else {
		dev_args->ints_const = true;
		const_mem_in_use += dev_args->ints_size;
	}
#else
	dev_args->ints_const = false;
#endif
	// the size is accounted in global_mem_used whatever the value of the constant flag
	dev_info->global_mem_used += dev_args->ints_size;

	// ---- variables buffer (and bitmap domains buffer, for bitmap domains only) ----
	if (DOMAIN_TYPE == BITMAP_) {
		dev_args->cl_vs_size = N_VS * sizeof(cl_var_bitmap);
		dev_args->b_ds_size = N_VS * DOMAIN_SIZE;

#if USE_CONSTANT_MEM
		if (dev_args->b_ds_size + const_mem_in_use > dev_info->constant_mem_max_alloc) {
			dev_args->b_ds_const = false;
		} else {
			dev_args->b_ds_const = true;
			const_mem_in_use += dev_args->b_ds_size;
		}
#else
		dev_args->b_ds_const = false;
#endif
		dev_info->global_mem_used += dev_args->b_ds_size;

	} else if (DOMAIN_TYPE == INTERVAL) {
		// size of buffer for cl_var constant data
		dev_args->cl_vs_size = N_VS * sizeof(cl_var_interval);
	}

#if USE_CONSTANT_MEM
	if (dev_args->cl_vs_size + const_mem_in_use  > dev_info->constant_mem_max_alloc) {
		dev_args->cl_vs_const = false;
	} else {
		dev_args->cl_vs_const = true;
		const_mem_in_use += dev_args->cl_vs_size;
	}
#else
	dev_args->cl_vs_const = false;
#endif
	dev_info->global_mem_used += dev_args->cl_vs_size;

	// ---- constraints buffer: cl_constr constant data ----
	dev_args->cl_cs_size = N_CS * sizeof(cl_constr);

#if USE_CONSTANT_MEM
	if (dev_args->cl_cs_size + const_mem_in_use  > dev_info->constant_mem_max_alloc) {
		dev_args->cl_cs_const = false;
	} else {
		dev_args->cl_cs_const = true;
		const_mem_in_use += dev_args->cl_cs_size;
	}
#else
	dev_args->cl_cs_const = false;
#endif
	dev_info->global_mem_used += dev_args->cl_cs_size;

	// ---- cl_vs_prop data and ids of the variables to propagate ----
	if (!dev_info->use_local_mem) {
		// only global memory is used: one slice per work-item of the device, accounted as global memory
		dev_args->vs_id_to_prop_size = dev_args->wi_total * (N_VS + 3) * sizeof(cl_ushort);

		if (DOMAIN_TYPE == BITMAP_) {
			dev_args->cl_vs_prop_size = dev_args->wi_total * N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE);

		} else if (DOMAIN_TYPE == INTERVAL) {
			dev_args->cl_vs_prop_size = dev_args->wi_total * N_VS * sizeof(cl_var_p_interval);
		}

		dev_info->global_mem_used += dev_args->cl_vs_prop_size;
		dev_info->global_mem_used += dev_args->vs_id_to_prop_size;

	} else {
		// local memory is used: sized with wi_local and not added to global_mem_used
		dev_args->vs_id_to_prop_size = dev_args->wi_local * (N_VS + 3) * sizeof(cl_ushort);

		if (DOMAIN_TYPE == BITMAP_) {
			dev_args->cl_vs_prop_size = dev_args->wi_local * N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE);

		} else if (DOMAIN_TYPE == INTERVAL) {
			dev_args->cl_vs_prop_size = dev_args->wi_local * N_VS * sizeof(cl_var_p_interval);
		}

#if RUN_IN_CUDA
		{
			// due to shared memory alignment in CUDA: grow the size until its number of bits is a multiple
			// of 32 when CL_WORD_ is 32, or of 64 otherwise
			const int align_bits = (CL_WORD_ == 32) ? 32 : 64;

			while ((dev_args->vs_id_to_prop_size * 8) % align_bits != 0) {
				dev_args->vs_id_to_prop_size++;
			}
		}
#endif
	}

	// ---- backtracking buffer ----
	// 0...(n_vs_to_label+2+TO_LABEL_THRESHOLD)*N_VS*split_values_ext*wi_total - backtracking history
	dev_args->backtrack_size = (dev_args->n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * dev_args->split_values_ext * N_VS * dev_args->wi_total * DOMAIN_SIZE;
	dev_info->global_mem_used += dev_args->backtrack_size;

	// ---- atomics buffer and solution buffer, according to the objective of the exploration ----
	if (WORK != CNT && WORK != ONE && WORK != OPT) {
		fprintf(stderr, "\nObjective of exploration not recognized.\n");
		exit(-1);
	}

	// Atomics are kept in 32 bits variables (most devices only have atomics for 32 bits variables).
	// Entries common to every objective:
	// 0 - first sub-search to explore
	// 1 - last sub-search to explore
	// 2 - n_ss
	// 3 - depth
	// 4 - WIs still working for work-sharing
	// 5...5+N_VS - n_repeat per variable
	if (WORK == CNT) {
		// all solutions must be found
		// 5+N_VS...5+N_VS+wi_total - number of solutions found per work-item
		dev_args->atoms_size = (5 + N_VS + dev_args->wi_total) * sizeof(cl_uint);

	} else if (WORK == ONE) {
		// only one solution must be found
		// 5+N_VS - solution found flag
		dev_args->atoms_size = (6 + N_VS) * sizeof(cl_uint);

		// buffer for saving the solution
		// 0...N_VS - solution domains
		dev_args->domains_size = N_VS * DOMAIN_SIZE;
		dev_info->global_mem_used += dev_args->domains_size;

	} else {
		// optimization (WORK == OPT)
		// 5+N_VS - solution found flag
		// 6+N_VS - value to optimize
		// 7+N_VS - WIs still working for saving the best solution
		dev_args->atoms_size = (8 + N_VS) * sizeof(cl_uint);

		// buffer for solutions
		// 0...N_VS*(D_MAX+1) - solution stores, several are kept because of concurrency control
		dev_args->domains_size = (N_VS * (D_MAX + 1)) * DOMAIN_SIZE;
		dev_info->global_mem_used += dev_args->domains_size;
	}

	dev_info->global_mem_used += dev_args->atoms_size;

	// ---- propagations counter, only with more than one device ----
	if (N_DEVS > 1) {
		// number of propagations done per work-item, for rank calculation
		dev_args->props_size = dev_args->wi_total * sizeof(cl_ulong);
		dev_info->global_mem_used += dev_args->props_size;
	}

	// ---- work-sharing stores ----
	dev_args->n_shared_stores = 0;
#if SHARED_SS > 0

	// number of shared stores needed for this device
	if (dev_info->type != CL_DEVICE_TYPE_GPU) {
		dev_args->n_shared_stores = (unsigned int)dev_args->wi_total;
	} else {
		dev_args->n_shared_stores = dev_info->compute_units * (unsigned int)dev_args->wi_local;
	}

	// stores for work-sharing after the ss in the block have finished
	dev_args->shared_stores_size = N_VS * dev_args->n_shared_stores * DOMAIN_SIZE;
	dev_info->global_mem_used += dev_args->shared_stores_size;

	// flags signaling the state of each work-sharing store
	// 0 - next shared SS to be picked
	// 1 - next shared SS to be filled
	// 2 - number of SS already filled
	// 3...3+n_shared_stores - V_ID that was labeled to generate this SS
	dev_args->shared_stores_flag_size = (dev_args->n_shared_stores + 3) * sizeof(cl_int);
	dev_info->global_mem_used += dev_args->shared_stores_flag_size;
#endif

	// ---- statistics buffer: 7 counters, repeated per work-item ----
	// 0 - nodes_fail
	// 1 - nodes_expl
	// 2 - backtracks
	// 3 - labels
	// 4 - pruning
	// 5 - props_ok
	// 6 - max_depth
	if (PRINT_STATS) {
		dev_args->stats_size = 7 * dev_args->wi_total * sizeof(cl_ulong);
		dev_info->global_mem_used += dev_args->stats_size;
	}

	// ---- filtering buffers ----
	if (filtering) {
		// 0...N_VS - domains_mem buffer for the filtering result
		dev_args->filt_domains_size = N_VS * DOMAIN_SIZE;
		dev_info->global_mem_used += dev_args->filt_domains_size;

		if (CS_IGNORE) {
			// 0...N_CS - filt_cs buffer for the filtering
			dev_args->filt_cs_size = N_CS * sizeof(cl_char);
			dev_info->global_mem_used += dev_args->filt_cs_size;
		}
	}

	// ---- max number of terms for memory allocation in kernel propagators ----
	// the constraints are only scanned if at least one of the kinds that need terms is in use
	max_terms = 0;
	if (USE_CS[LINEAR] || USE_CS[LINEAR_LT] || USE_CS[LINEAR_NE] || USE_CS[LINEAR_VAR] || USE_CS[SUM] || USE_CS[SUM_VAR] || USE_CS[ELEMENT_INT_VAR]) {

		for (c_idx = 0; c_idx < N_CS; c_idx++) {
			if (CS[c_idx].kind == LINEAR || CS[c_idx].kind == LINEAR_LT || CS[c_idx].kind == LINEAR_NE || CS[c_idx].kind == LINEAR_VAR) {
				// linear kinds: two terms per constant of the constraint
				if (CS[c_idx].n_c_consts * 2 > max_terms) {
					max_terms = (unsigned int)CS[c_idx].n_c_consts * 2;
				}

			} else if (CS[c_idx].kind == SUM || CS[c_idx].kind == SUM_VAR) {
				// sum kinds: two terms per variable of the constraint
				if (CS[c_idx].n_c_vs * 2 > max_terms) {
					max_terms = (unsigned int)CS[c_idx].n_c_vs * 2;
				}

			} else if (CS[c_idx].kind == ELEMENT_INT_VAR) {
				// element kind: two terms per possible domain value
				if ((D_MAX + 1) * 2 > max_terms) {
					max_terms = (D_MAX + 1) * 2;
				}
			}
		}
	}
	dev_info->n_terms = (int)max_terms;

	// ---- generic buffer, per work-item ----
	// ((n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * split_values_ext) * 2 - to use in kernel (hist_labeleds_id and hist_labeleds_n_vals)
	// max_terms - to use in propagators
	// D_MAX+1 - for ss generation
	dev_args->generic_size = (max_terms + ((dev_args->n_vs_to_label + 2 + TO_LABEL_THRESHOLD) * dev_args->split_values_ext) * 2 + D_MAX + 1)
			* dev_args->wi_total * sizeof(cl_int);
	dev_info->global_mem_used += dev_args->generic_size;

	// ---- constraints to ignore: one cl_char per constraint and per work-item ----
	if (CS_IGNORE) {
		dev_args->cs_ignore_size = N_CS * dev_args->wi_total * sizeof(cl_char);
		dev_info->global_mem_used += dev_args->cs_ignore_size;
	}
}