devices.c 12.2 KB
Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355


/*
 * devices.c
 *
 *  Created on: 15/01/2017
 *      Author: Pedro
 */

#include "devices.h"
#include "constraints.h"

#include <stdio.h>
#include <stdlib.h>

/*
 * calculate the expected speed when comparing the hardware of all the used devices. From 0 to 1, where 1 is the fastest.
 * The result is saved in "dev_info[i].rel_speed_expect"
 * dev_info - aray with the information of all the devices that will be used
 */
void calculate_rel_expect_speed(device_info *dev_info) {
	if (N_DEVS == 1) {
		(*dev_info).rel_speed_expect = 1;
	} else {
		unsigned int total = 0;
		unsigned int i;

		for (i = 0; i < N_DEVS; i++) {
			if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) {
				total += (unsigned int) (dev_info[i].max_freq * dev_info[i].compute_units / GPU_CUTOFF / (GPU_DEFAULT_N_WI / (double) dev_info[i].n_wi_wg * 1.0)
						/ (double) (GPU_DEFAULT_N_WG / dev_info[i].n_wg));
			} else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_ACCELERATOR) {
				total += (unsigned int) (dev_info[i].max_freq * dev_info[i].compute_units / ACC_CUTOFF
						/ (dev_info[i].compute_units / (double) dev_info[i].n_wg * 1.0));
			} else {
				total += (unsigned int) (dev_info[i].max_freq * dev_info[i].compute_units / (dev_info[i].compute_units / (double) dev_info[i].n_wg * 1.0));
			}
		}

		for (i = 0; i < N_DEVS; i++) {
			if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_GPU) {
				dev_info[i].rel_speed_expect = (float) (dev_info[i].max_freq * dev_info[i].compute_units / GPU_CUTOFF
						/ (GPU_DEFAULT_N_WI / (double) dev_info[i].n_wi_wg * 1.0) / (GPU_DEFAULT_N_WG / (double) dev_info[i].n_wg * 1.0) / (double) total);
			} else if (DEVICES_INFO[i].type == CL_DEVICE_TYPE_ACCELERATOR) {
				dev_info[i].rel_speed_expect = (float) (dev_info[i].max_freq * dev_info[i].compute_units / ACC_CUTOFF
						/ (dev_info[i].compute_units / (double) dev_info[i].n_wg * 1.0) / (double) total);
			} else {
				dev_info[i].rel_speed_expect = (float) (dev_info[i].max_freq * dev_info[i].compute_units
						/ (dev_info[i].compute_units / (double) dev_info[i].n_wg * 1.0) / (double) total);
			}
		}
	}
}

/*
 * Set the size of the device buffers when fully exploring on the device.
 * Every "*_size" field of dev_args is computed in bytes and, except for the buffers sized per work-group
 * when local memory is used, added to dev_info->global_mem_used (which is reset to 0 at the start).
 * When USE_CONSTANT_MEM is enabled, the "*_const" flags of dev_args are set to true while the accumulated
 * size of the candidate buffers does not exceed dev_info->constant_mem_max_alloc; otherwise they are false.
 * dev_info->n_terms receives the maximum number of terms needed by the propagators of the constraints in use.
 * If WORK is not CNT, ONE or OPT, a message is printed and the program terminates with EXIT_FAILURE.
 * dev_args - device_args structure of this device
 * dev_info - device_info structure about this device
 * filtering - if being executed in the prefiltering phase
 */
void set_buffs_size(device_args *dev_args, device_info *dev_info, bool filtering) {

#if USE_CONSTANT_MEM
	// bytes already assigned to buffers flagged as constant
	cl_ulong constant_mem_used = 0;
#endif

	unsigned int i, n_terms;

	dev_info->global_mem_used = 0;

	// 0...cs_vs_idx	- each constraint list of constrained variables ids placed per constraint order
	// cs_vs_idx...cs_vs_idx+vs_cs_idx	- each variable list of constraints ids placed per variable order
	// cs_vs_idx+vs_cs_idx...cs_vs_idx+vs_cs_idx+n_const_cs	- each constraint list of constants placed per constraint order
	dev_args->ints_size = (dev_args->n_vs_cs + dev_args->n_cs_vs + dev_args->n_const_cs) * sizeof(cl_int);

#if USE_CONSTANT_MEM
	dev_args->ints_const = true;
	if (dev_args->ints_size > dev_info->constant_mem_max_alloc) {
		dev_args->ints_const = false;

	} else {
		constant_mem_used += dev_args->ints_size;
	}
#else
	dev_args->ints_const = false;
#endif
	// counted as global memory even when flagged as constant
	dev_info->global_mem_used += dev_args->ints_size;

	if (DOMAIN_TYPE == BITMAP_) {
		// size of buffer for the bitmap domains and for cl_var constant data
		dev_args->b_ds_size = N_VS * DOMAIN_SIZE;
		dev_args->cl_vs_size = N_VS * sizeof(cl_var_bitmap);

#if USE_CONSTANT_MEM
		dev_args->b_ds_const = true;
		if (dev_args->b_ds_size + constant_mem_used > dev_info->constant_mem_max_alloc) {
			dev_args->b_ds_const = false;

		} else {
			constant_mem_used += dev_args->b_ds_size;
		}
#else
		dev_args->b_ds_const = false;
#endif
		dev_info->global_mem_used += dev_args->b_ds_size;

	} else if (DOMAIN_TYPE == INTERVAL) {
		dev_args->cl_vs_size = N_VS * sizeof(cl_var_interval);	// size of buffer for cl_var constant data
	}

#if USE_CONSTANT_MEM
	dev_args->cl_vs_const = true;
	if (dev_args->cl_vs_size + constant_mem_used > dev_info->constant_mem_max_alloc) {
		dev_args->cl_vs_const = false;

	} else {
		constant_mem_used += dev_args->cl_vs_size;
	}
#else
	dev_args->cl_vs_const = false;
#endif
	dev_info->global_mem_used += dev_args->cl_vs_size;

	// size of buffer for cl_constr constant data
	dev_args->cl_cs_size = N_CS * sizeof(cl_constr);

#if USE_CONSTANT_MEM
	dev_args->cl_cs_const = true;
	if (dev_args->cl_cs_size + constant_mem_used > dev_info->constant_mem_max_alloc) {
		dev_args->cl_cs_const = false;

	} else {
		constant_mem_used += dev_args->cl_cs_size;
	}
#else
	dev_args->cl_cs_const = false;
#endif
	dev_info->global_mem_used += dev_args->cl_cs_size;

	// size of buffer for cl_vs_prop data
	// if using local memory (sized per wi_local and not added to global_mem_used)
	if (dev_info->use_local_mem) {
		if (DOMAIN_TYPE == BITMAP_) {
			dev_args->cl_vs_prop_size = dev_args->wi_local * N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE);

		} else if (DOMAIN_TYPE == INTERVAL) {
			dev_args->cl_vs_prop_size = dev_args->wi_local * N_VS * sizeof(cl_var_p_interval);
		}

		dev_args->vs_id_to_prop_size = dev_args->wi_local * (N_VS + 3) * sizeof(cl_ushort);

#if RUN_IN_CUDA
		// due to shared memory alignment in CUDA
		// grow the size one byte at a time until its number of bits is a multiple of the word size
		if (CL_WORD_ == 32) {
			while ((dev_args->vs_id_to_prop_size * 8) % 32 != 0) {
				dev_args->vs_id_to_prop_size++;
			}
		} else { // 64
			while ((dev_args->vs_id_to_prop_size * 8) % 64 != 0) {
				dev_args->vs_id_to_prop_size++;
			}
		}
#endif

		// if using only global memory (sized per wi_total and added to global_mem_used)
	} else {
		if (DOMAIN_TYPE == BITMAP_) {
			dev_args->cl_vs_prop_size = dev_args->wi_total * N_VS * (sizeof(cl_var_p_bitmap) - sizeof(cl_bitmap) + DOMAIN_SIZE);

		} else if (DOMAIN_TYPE == INTERVAL) {
			dev_args->cl_vs_prop_size = dev_args->wi_total * N_VS * sizeof(cl_var_p_interval);
		}
		dev_info->global_mem_used += dev_args->cl_vs_prop_size;

		dev_args->vs_id_to_prop_size = dev_args->wi_total * (N_VS + 3) * sizeof(cl_ushort);
		dev_info->global_mem_used += dev_args->vs_id_to_prop_size;
	}

	// buffer for backtracking data
	// 0...(n_vs_to_label+2)*N_VS*split_values_ext*wi_total - backtracking history
	dev_args->backtrack_size = (dev_args->n_vs_to_label + 2) * dev_args->split_values_ext * N_VS * dev_args->wi_total * DOMAIN_SIZE;

	dev_info->global_mem_used += dev_args->backtrack_size;

	// if all solutions must be found
	if (WORK == CNT) {
		// buffer for atomics data (Most devices only have atomics for 32 bits variables)
		// 0 - first sub-search to explore
		// 1 - last sub-search to explore
		// 2 - n_ss
		// 3 - depth
		// 4 - WIs still working for work-sharing
		// 5...5+N_VS - n_repeat per variable
		// 5+N_VS...5+N_VS+N_WG*N_WI_WG - number of solutions found per work-item
		dev_args->atoms_size = (5 + N_VS + dev_args->wi_total) * sizeof(cl_uint);

		// if only one solution must be found
	} else if (WORK == ONE) {
		// buffer for atomics data (Most devices only have atomics for 32 bits variables)
		// 0 - first sub-search to explore
		// 1 - last sub-search to explore
		// 2 - n_ss
		// 3 - depth
		// 4 - WIs still working for work-sharing
		// 5...5+N_VS - n_repeat per variable
		// 5+N_VS - solution found flag
		dev_args->atoms_size = (6 + N_VS) * sizeof(cl_uint);

		// buffer for saving the solution
		// 0...N_VS - solution domains
		dev_args->domains_size = N_VS * DOMAIN_SIZE;
		dev_info->global_mem_used += dev_args->domains_size;

		// if optimization
	} else if (WORK == OPT) {
		// buffer for atomics data (Most devices only have atomics for 32 bits variables)
		// 0 - first sub-search to explore
		// 1 - last sub-search to explore
		// 2 - n_ss
		// 3 - depth
		// 4 - WIs still working for work-sharing
		// 5...5+N_VS - n_repeat per variable
		// 5+N_VS - solution found flag
		// 6+N_VS - Value to optimize
		// 7+N_VS - WIs still working for saving the best solution
		dev_args->atoms_size = (8 + N_VS) * sizeof(cl_uint);

		// buffer for solutions
		// 0...N_VS*(D_MAX+1) - solution stores, because of concurrency control
		dev_args->domains_size = (N_VS * (D_MAX + 1)) * DOMAIN_SIZE;
		dev_info->global_mem_used += dev_args->domains_size;

	} else {
		printf("\nObjective of exploration not recognized.\n");

		// parenthesized so that the Cygwin exclusion applies to all the Windows macros
#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32)) && !defined(__CYGWIN__)
		printf("\nPress any key to exit\n");
		(void) getchar();
#endif

		// unrecognized objective is a fatal configuration error: report failure to the caller
		exit(EXIT_FAILURE);
	}

	dev_info->global_mem_used += dev_args->atoms_size;

	if (N_DEVS > 1) {
		// to count number of propagations done per work-item, for rank calculation
		dev_args->props_size = dev_args->wi_total * sizeof(cl_ulong);
		dev_info->global_mem_used += dev_args->props_size;
	}

	dev_args->n_shared_stores = 0;
#if SHARED_SS > 0

	// calculate number of shared stores needed for this device
	if (dev_info->type == CL_DEVICE_TYPE_GPU) {
		dev_args->n_shared_stores = dev_info->compute_units * (unsigned int)dev_args->wi_local;
	} else {
		dev_args->n_shared_stores = (unsigned int)dev_args->wi_total;
	}

	// for work-sharing after the ss in the block have finished
	dev_args->shared_stores_size = N_VS * dev_args->n_shared_stores * DOMAIN_SIZE;
	dev_info->global_mem_used += dev_args->shared_stores_size;

	// flags for signaling the state of each work-sharing store
	// 0 - next shared SS to be picked
	// 1 - next shared SS to be filled
	// 2 - number of SS already filled
	// 3...3+CL_N_SHARED_SS - V_ID that was labeled to generate this SS
	dev_args->shared_stores_flag_size = (dev_args->n_shared_stores + 3) * sizeof(cl_int);
	dev_info->global_mem_used += dev_args->shared_stores_flag_size;
#endif

	// 0 - nodes_fail
	// 1 - nodes_expl
	// 2 - backtracks
	// 3 - labels
	// 4 - pruning
	// 5 - props_ok
	// 6 - max_depth
	// ... repeat per work-item
	if (PRINT_STATS) {
		dev_args->stats_size = 7 * dev_args->wi_total * sizeof(cl_ulong);
		dev_info->global_mem_used += dev_args->stats_size;
	}

	if (filtering) {
		// 0...N_VS - size of domains_mem buffer for the filtering result
		dev_args->filt_domains_size = N_VS * DOMAIN_SIZE;
		dev_info->global_mem_used += dev_args->filt_domains_size;

		if (CS_IGNORE) {
			// 0...N_CS - size of filt_cs_size buffer for the filtering
			dev_args->filt_cs_size = N_CS * sizeof(cl_char);
			dev_info->global_mem_used += dev_args->filt_cs_size;
		}
	}

	// define max number of terms for memory allocation in kernel propagators
	// the constraints are only scanned if at least one of the kinds that need terms is in use
	n_terms = 0;
	if (USE_CS[INT_LIN_EQ] || USE_CS[INT_LIN_LE] || USE_CS[INT_LIN_NE] || USE_CS[INT_LIN_VAR] || USE_CS[SUM] || USE_CS[SUM_VAR] || USE_CS[ARRAY_INT_ELEMENT]
			|| USE_CS[BOOL_LIN_EQ] || USE_CS[BOOL_LIN_LE] || USE_CS[MINUS_EQ]) {

		// keep the largest requirement among all the constraints:
		// twice the number of constants (linear kinds), twice the number of variables (sums),
		// or a value derived from D_MAX (ARRAY_INT_ELEMENT and MINUS_EQ)
		for (i = 0; i < N_CS; i++) {
			if (CS[i].kind == INT_LIN_EQ && CS[i].n_c_consts * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_consts * 2;

			} else if (CS[i].kind == INT_LIN_LE && CS[i].n_c_consts * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_consts * 2;

			} else if (CS[i].kind == INT_LIN_NE && CS[i].n_c_consts * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_consts * 2;

			} else if (CS[i].kind == INT_LIN_VAR && CS[i].n_c_consts * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_consts * 2;

			} else if (CS[i].kind == SUM && CS[i].n_c_vs * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_vs * 2;

			} else if (CS[i].kind == SUM_VAR && CS[i].n_c_vs * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_vs * 2;

			} else if (CS[i].kind == ARRAY_INT_ELEMENT && (D_MAX + 1) * 2 > n_terms) {
				n_terms = (D_MAX + 1) * 2;

			} else if (CS[i].kind == BOOL_LIN_EQ && CS[i].n_c_consts * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_consts * 2;

			} else if (CS[i].kind == BOOL_LIN_LE && CS[i].n_c_consts * 2 > n_terms) {
				n_terms = (unsigned int) CS[i].n_c_consts * 2;

			} else if (CS[i].kind == MINUS_EQ && (D_MAX + 1) > n_terms) {
				n_terms = D_MAX + 1;

			}
		}
	}
	dev_info->n_terms = (int) n_terms;
	// ((dev_args->n_vs_to_label + 2) * dev_args->split_values_ext) * 2 - to use in kernel (hist_labeleds_id and hist_labeleds_n_vals)
	// n_terms * dev_args->wi_total - to use in propagators
	// (D_MAX+1)*wi_total - for ss generation
	// ...
	dev_args->generic_size = (n_terms + ((dev_args->n_vs_to_label + 2) * dev_args->split_values_ext) * 2 + D_MAX + 1) * dev_args->wi_total * sizeof(cl_int);

#if FZN_SEQ
	// ...
	// appended after the per work-item data - list of labeling heuristics that may be used
	dev_args->generic_size += ((unsigned int) FZN_SEQ_N_LABELS) * sizeof(cl_int);
#endif

	dev_info->global_mem_used += dev_args->generic_size;

	if (CS_IGNORE) {
		// one flag per constraint and per work-item
		dev_args->cs_ignore_size = N_CS * dev_args->wi_total * sizeof(cl_char);
		dev_info->global_mem_used += dev_args->cs_ignore_size;
	}
}