bucketsort.c 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640
  1. #define BUCKET_WARP_LOG_SIZE 5
  2. #define BUCKET_WARP_N 1
  3. #ifdef BUCKET_WG_SIZE_1
  4. #define BUCKET_THREAD_N BUCKET_WG_SIZE_1
  5. #else
  6. #define BUCKET_THREAD_N (BUCKET_WARP_N << BUCKET_WARP_LOG_SIZE)
  7. #endif
  8. #define BUCKET_BLOCK_MEMORY (DIVISIONS * BUCKET_WARP_N)
  9. #define BUCKET_BAND 128
  10. #define SIZE (1 << 22)
  11. #define DATA_SIZE (1024)
  12. #define MAX_SOURCE_SIZE (0x100000)
  13. #define HISTOGRAM_SIZE (1024 * sizeof(unsigned int))
  14. #include <fcntl.h>
  15. #include <float.h>
  16. #include <stdio.h>
  17. #include <stdlib.h>
  18. #include <string.h>
  19. #include <math.h>
  20. #include <unistd.h>
  21. #include <sys/types.h>
  22. #include <sys/stat.h>
  23. #include <CL/cl.h>
  24. #include "bucketsort.h"
  25. #include <time.h>
  26. ////////////////////////////////////////////////////////////////////////////////
  27. // Forward declarations
  28. ////////////////////////////////////////////////////////////////////////////////
  29. void calcPivotPoints(float *histogram, int histosize, int listsize,
  30. int divisions, float min, float max, float *pivotPoints,
  31. float histo_width);
  32. ////////////////////////////////////////////////////////////////////////////////
  33. // Globals
  34. ////////////////////////////////////////////////////////////////////////////////
  35. const int histosize = 1024;
  36. unsigned int* h_offsets = NULL;
  37. unsigned int* d_offsets = NULL;
  38. cl_mem d_offsets_buff;
  39. int *d_indice = NULL;
  40. cl_mem d_indice_buff;
  41. cl_mem d_input_buff;
  42. cl_mem d_indice_input_buff;
  43. float *pivotPoints = NULL;
  44. float *historesult = NULL;
  45. float *l_pivotpoints = NULL;
  46. cl_mem l_pivotpoints_buff;
  47. unsigned int *d_prefixoffsets = NULL;
  48. unsigned int *d_prefixoffsets_altered = NULL;
  49. cl_mem d_prefixoffsets_buff;
  50. cl_mem d_prefixoffsets_input_buff;
  51. unsigned int *l_offsets = NULL;
  52. cl_mem l_offsets_buff;
  53. unsigned int *d_Result1024;
  54. cl_device_id device_id; // compute device id
  55. cl_context bucketContext; // compute context
  56. cl_context histoContext;
  57. cl_command_queue bucketCommands; // compute command queue
  58. cl_command_queue histoCommands;
  59. cl_program bucketProgram; // compute program
  60. cl_program histoProgram;
  61. cl_kernel bucketcountKernel; // compute kernel
  62. cl_kernel histoKernel;
  63. cl_kernel bucketprefixKernel;
  64. cl_kernel bucketsortKernel;
  65. cl_mem histoInput;
  66. cl_mem histoOutput;
  67. cl_mem bucketOutput;
  68. cl_int err;
  69. cl_uint num_platforms;
  70. cl_event histoEvent;
  71. cl_event bucketCountEvent;
  72. cl_event bucketPrefixEvent;
  73. cl_event bucketSortEvent;
  74. double sum = 0;
  75. ////////////////////////////////////////////////////////////////////////////////
  76. // Initialize the bucketsort algorithm
  77. ////////////////////////////////////////////////////////////////////////////////
  78. void init_bucketsort(int listsize)
  79. {
  80. cl_uint num = 0;
  81. clGetPlatformIDs(0, NULL, &num);
  82. cl_platform_id platformID[num];
  83. clGetPlatformIDs(num, platformID, NULL);
  84. clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_CPU,0,NULL,&num);
  85. cl_device_id devices[num];
  86. err = clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_CPU,num,devices,NULL);
  87. // int gpu = 1;
  88. // err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 2, &device_id, NULL);
  89. if (err != CL_SUCCESS)
  90. {
  91. printf("Error: Failed to create a device group!\n");
  92. exit(1);
  93. }
  94. char name[128];
  95. clGetDeviceInfo(devices[0],CL_DEVICE_NAME,128,name,NULL);
  96. bucketContext = clCreateContext(0, 1, &devices[0], NULL, NULL, &err);
  97. bucketCommands = clCreateCommandQueue(bucketContext, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
  98. h_offsets = (unsigned int *) malloc(DIVISIONS * sizeof(unsigned int));
  99. for(int i = 0; i < DIVISIONS; i++){
  100. h_offsets[i] = 0;
  101. }
  102. d_offsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(unsigned int),NULL,NULL);
  103. pivotPoints = (float *)malloc(DIVISIONS * sizeof(float));
  104. d_indice_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, listsize * sizeof(int),NULL,NULL);
  105. d_indice_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, listsize * sizeof(int),NULL,NULL);
  106. d_indice = (int *)malloc(listsize * sizeof(int));
  107. historesult = (float *)malloc(histosize * sizeof(float));
  108. l_pivotpoints = (float *)malloc(DIVISIONS*sizeof(float));
  109. l_pivotpoints_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(float), NULL, NULL);
  110. l_offsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(unsigned int), NULL, NULL);
  111. int blocks = ((listsize - 1) / (BUCKET_THREAD_N * BUCKET_BAND)) + 1;
  112. d_prefixoffsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), NULL, NULL);
  113. d_prefixoffsets = (unsigned int *)malloc(blocks*BUCKET_BLOCK_MEMORY*sizeof(int));
  114. d_prefixoffsets_altered = (unsigned int *)malloc(blocks*BUCKET_BLOCK_MEMORY*sizeof(int));
  115. d_prefixoffsets_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), NULL, NULL);
  116. bucketOutput = clCreateBuffer(bucketContext, CL_MEM_READ_WRITE, (listsize + (DIVISIONS*4))*sizeof(float), NULL, NULL);
  117. FILE *fp;
  118. const char fileName[]="./bucketsort_kernels.cl";
  119. size_t source_size;
  120. char *source_str;
  121. fp = fopen(fileName, "r");
  122. if (!fp) {
  123. fprintf(stderr, "Failed to load bucket kernel.\n");
  124. exit(1);
  125. }
  126. source_str = (char *)malloc(MAX_SOURCE_SIZE);
  127. source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  128. fclose(fp);
  129. bucketProgram = clCreateProgramWithSource(bucketContext, 1, (const char **) &source_str, (const size_t)&source_size, &err);
  130. if (!bucketProgram)
  131. {
  132. printf("Error: Failed to create bucket compute program!\n");
  133. exit(1);
  134. }
  135. err = clBuildProgram(bucketProgram, 0, NULL, NULL, NULL, NULL);
  136. if (err != CL_SUCCESS)
  137. {
  138. size_t len;
  139. char buffer[2048];
  140. printf("Error: Failed to build bucket program executable!\n");
  141. clGetProgramBuildInfo(bucketProgram, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
  142. printf("%s\n", buffer);
  143. exit(1);
  144. }
  145. }
  146. ////////////////////////////////////////////////////////////////////////////////
  147. // Uninitialize the bucketsort algorithm
  148. ////////////////////////////////////////////////////////////////////////////////
  149. void finish_bucketsort()
  150. {
  151. clReleaseMemObject(d_offsets_buff);
  152. clReleaseMemObject(d_indice_buff);
  153. clReleaseMemObject(l_pivotpoints_buff);
  154. clReleaseMemObject(l_offsets_buff);
  155. clReleaseMemObject(d_prefixoffsets_buff);
  156. clReleaseMemObject(d_input_buff);
  157. clReleaseMemObject(d_indice_input_buff);
  158. clReleaseMemObject(bucketOutput);
  159. clReleaseProgram(bucketProgram);
  160. clReleaseKernel(bucketcountKernel);
  161. clReleaseKernel(bucketprefixKernel);
  162. clReleaseKernel(bucketsortKernel);
  163. clReleaseCommandQueue(bucketCommands);
  164. clReleaseContext(bucketContext);
  165. free(pivotPoints);
  166. free(h_offsets);
  167. free(historesult);
  168. }
  169. void histogramInit(int listsize) {
  170. cl_uint num = 0;
  171. clGetPlatformIDs(0, NULL, &num);
  172. cl_platform_id platformID[num];
  173. clGetPlatformIDs(num, platformID, NULL);
  174. clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_CPU,0,NULL,&num);
  175. num = 2;
  176. char name[128];
  177. clGetPlatformInfo(platformID[1], CL_PLATFORM_PROFILE,128,name,NULL);
  178. cl_device_id devices[num];
  179. err = clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_CPU,num,devices,NULL);
  180. // int gpu = 1;
  181. // err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 2, &device_id, NULL);
  182. if (err != CL_SUCCESS)
  183. {
  184. printf("Error: Failed to create a device group!\n");
  185. exit(1);
  186. }
  187. clGetDeviceInfo(devices[0],CL_DEVICE_NAME,128,name,NULL);
  188. printf("%s \n", name);
  189. cl_context_properties contextProperties[] =
  190. {
  191. CL_CONTEXT_PLATFORM,
  192. (cl_context_properties)platformID[num],
  193. 0
  194. };
  195. histoContext = clCreateContext(0, 1, &devices[0], NULL, NULL, &err);
  196. histoCommands = clCreateCommandQueue(histoContext, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
  197. histoInput = clCreateBuffer(histoContext, CL_MEM_READ_ONLY, listsize*(sizeof(float)), NULL, NULL);
  198. histoOutput = clCreateBuffer(histoContext, CL_MEM_READ_WRITE, 1024 * sizeof(unsigned int), NULL, NULL);
  199. FILE *fp;
  200. const char fileName[]="./histogram1024.cl";
  201. size_t source_size;
  202. char *source_str;
  203. fp = fopen(fileName, "r");
  204. if (!fp) {
  205. fprintf(stderr, "Failed to load kernel.\n");
  206. exit(1);
  207. }
  208. source_str = (char *)malloc(MAX_SOURCE_SIZE);
  209. source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  210. fclose(fp);
  211. histoProgram = clCreateProgramWithSource(histoContext, 1, (const char **) &source_str, (const size_t)&source_size, &err);
  212. if (!histoProgram)
  213. {
  214. printf("Error: Failed to create compute program!\n");
  215. exit(1);
  216. }
  217. err = clBuildProgram(histoProgram, 0, NULL, NULL, NULL, NULL);
  218. if (err != CL_SUCCESS)
  219. {
  220. size_t len;
  221. char buffer[2048];
  222. printf("Error: Failed to build program executable!\n");
  223. clGetProgramBuildInfo(histoProgram, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
  224. printf("%s\n", buffer);
  225. exit(1);
  226. }
  227. histoKernel = clCreateKernel(histoProgram, "histogram1024Kernel", &err);
  228. if (!histoKernel || err != CL_SUCCESS)
  229. {
  230. printf("Error: Failed to create compute kernel!\n");
  231. exit(1);
  232. }
  233. }
  234. void histogram1024GPU(unsigned int *h_Result, float *d_Data, float minimum, float maximum,int listsize){
  235. err = clEnqueueWriteBuffer(histoCommands, histoInput, CL_TRUE, 0, listsize*sizeof(float), d_Data, 0, NULL, NULL);
  236. if (err != CL_SUCCESS)
  237. {
  238. printf("Error: Failed to write to source array!\n");
  239. exit(1);
  240. }
  241. err = clEnqueueWriteBuffer(histoCommands, histoOutput, CL_TRUE, 0, DIVISIONS*sizeof(unsigned int), h_Result, 0, NULL, NULL);
  242. if (err != CL_SUCCESS)
  243. {
  244. printf("Error: Failed to write to source array!\n");
  245. exit(1);
  246. }
  247. err = 0;
  248. err = clSetKernelArg(histoKernel, 0, sizeof(cl_mem), &histoOutput);
  249. err = clSetKernelArg(histoKernel, 1, sizeof(cl_mem), &histoInput);
  250. err = clSetKernelArg(histoKernel, 2, sizeof(float), &minimum);
  251. err = clSetKernelArg(histoKernel, 3, sizeof(float), &maximum);
  252. err = clSetKernelArg(histoKernel, 4, sizeof(int), &listsize);
  253. if (err != CL_SUCCESS)
  254. {
  255. printf("Error: Failed to set kernel arguments! %d\n", err);
  256. exit(1);
  257. }
  258. size_t global = 6144;
  259. size_t local;
  260. #ifdef HISTO_WG_SIZE_0
  261. local = HISTO_WG_SIZE_0;
  262. #else
  263. local = 96;
  264. #endif
  265. err = clEnqueueNDRangeKernel(histoCommands, histoKernel, 1, NULL, &global, &local, 0, NULL, &histoEvent);
  266. if (err)
  267. {
  268. printf("Error: Failed to execute histogram kernel!\n");
  269. exit(1);
  270. }
  271. clWaitForEvents(1 , &histoEvent);
  272. clFinish(histoCommands);
  273. err = clEnqueueReadBuffer( histoCommands, histoOutput, CL_TRUE, 0, 1024 * sizeof(unsigned int), h_Result, 0, NULL, NULL );
  274. if (err != CL_SUCCESS)
  275. {
  276. printf("Error: Failed to read histo output array! %d\n", err);
  277. exit(1);
  278. }
  279. clFinish(histoCommands);
  280. cl_ulong time_start, time_end;
  281. double total_time;
  282. clGetEventProfilingInfo(histoEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  283. clGetEventProfilingInfo(histoEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  284. total_time = time_end - time_start;
  285. sum+= total_time/1000000.0;
  286. printf("Histogram Kernel Time: %0.3f \n", total_time/1000000);
  287. }
  288. void finish_histogram() {
  289. clReleaseProgram(histoProgram);
  290. clReleaseKernel(histoKernel);
  291. clReleaseCommandQueue(histoCommands);
  292. clReleaseContext(histoContext);
  293. clReleaseMemObject((histoInput));
  294. clReleaseMemObject((histoOutput));
  295. }
  296. ////////////////////////////////////////////////////////////////////////////////
  297. // Given the input array of floats and the min and max of the distribution,
  298. // sort the elements into float4 aligned buckets of roughly equal size
  299. ////////////////////////////////////////////////////////////////////////////////
  300. void bucketSort(float *d_input, float *d_output, int listsize,
  301. int *sizes, int *nullElements, float minimum, float maximum,
  302. unsigned int *origOffsets)
  303. {
  304. // ////////////////////////////////////////////////////////////////////////////
  305. // // First pass - Create 1024 bin histogram
  306. // ////////////////////////////////////////////////////////////////////////////
  307. histogramInit(listsize);
  308. histogram1024GPU(h_offsets, d_input, minimum, maximum, listsize);
  309. finish_histogram();
  310. for(int i=0; i<histosize; i++) historesult[i] = (float)h_offsets[i];
  311. // ///////////////////////////////////////////////////////////////////////////
  312. // // Calculate pivot points (CPU algorithm)
  313. // ///////////////////////////////////////////////////////////////////////////
  314. calcPivotPoints(historesult, histosize, listsize, DIVISIONS,
  315. minimum, maximum, pivotPoints,
  316. (maximum - minimum)/(float)histosize);
  317. //
  318. // ///////////////////////////////////////////////////////////////////////////
  319. // // Count the bucket sizes in new divisions
  320. // ///////////////////////////////////////////////////////////////////////////
  321. bucketcountKernel = clCreateKernel(bucketProgram, "bucketcount", &err);
  322. if (!bucketcountKernel || err != CL_SUCCESS)
  323. {
  324. printf("Error: Failed to create bucketsort compute kernel!\n");
  325. exit(1);
  326. }
  327. err = clEnqueueWriteBuffer(bucketCommands, l_pivotpoints_buff, CL_TRUE, 0, DIVISIONS*sizeof(float), pivotPoints, 0, NULL, NULL);
  328. if (err != CL_SUCCESS)
  329. {
  330. printf("Error: Failed to write to l_pivotpoints source array!\n");
  331. exit(1);
  332. }
  333. d_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, (listsize + (DIVISIONS*4))*sizeof(float),NULL,NULL);
  334. err = clEnqueueWriteBuffer(bucketCommands, d_input_buff, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_input, 0, NULL, NULL);
  335. if (err != CL_SUCCESS)
  336. {
  337. printf("Error: Failed to write to d_input_buff source array!\n");
  338. exit(1);
  339. }
  340. err = 0;
  341. err = clSetKernelArg(bucketcountKernel, 0, sizeof(cl_mem), &d_input_buff);
  342. err = clSetKernelArg(bucketcountKernel, 1, sizeof(cl_mem), &d_indice_buff);
  343. err = clSetKernelArg(bucketcountKernel, 2, sizeof(cl_mem), &d_prefixoffsets_buff);
  344. err = clSetKernelArg(bucketcountKernel, 3, sizeof(cl_int), &listsize);
  345. err = clSetKernelArg(bucketcountKernel, 4, sizeof(cl_mem), &l_pivotpoints_buff);
  346. if (err != CL_SUCCESS)
  347. {
  348. printf("Error: Failed to set kernel arguments! %d\n", err);
  349. exit(1);
  350. }
  351. int blocks =((listsize -1) / (BUCKET_THREAD_N*BUCKET_BAND)) + 1;
  352. size_t global[] = {blocks*BUCKET_THREAD_N,1,1};
  353. size_t local[] = {BUCKET_THREAD_N,1,1};
  354. err = clEnqueueNDRangeKernel(bucketCommands, bucketcountKernel, 3, NULL, global, local, 0, NULL, &bucketCountEvent);
  355. if (err)
  356. {
  357. printf("Error: Failed to execute bucket count kernel!\n");
  358. exit(1);
  359. }
  360. clWaitForEvents(1 , &bucketCountEvent);
  361. clFinish(bucketCommands);
  362. err = clEnqueueReadBuffer( bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(unsigned int), d_prefixoffsets, 0, NULL, NULL );
  363. if (err != CL_SUCCESS)
  364. {
  365. printf("Error: Failed to read prefix output array! %d\n", err);
  366. exit(1);
  367. }
  368. err = clEnqueueReadBuffer( bucketCommands, d_indice_buff, CL_TRUE, 0, listsize * sizeof(int), d_indice, 0, NULL, NULL );
  369. if (err != CL_SUCCESS)
  370. {
  371. printf("Error: Failed to read indice output array! %d\n", err);
  372. exit(1);
  373. }
  374. clFinish(bucketCommands);
  375. cl_ulong time_start, time_end;
  376. double total_time;
  377. clGetEventProfilingInfo(bucketCountEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  378. clGetEventProfilingInfo(bucketCountEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  379. total_time = time_end - time_start;
  380. sum+= total_time/1000000;
  381. printf("Bucket Count Kernel Time: %0.3f \n", total_time/1000000);
  382. //
  383. // ///////////////////////////////////////////////////////////////////////////
  384. // // Prefix scan offsets and align each division to float4 (required by
  385. // // mergesort)
  386. // ///////////////////////////////////////////////////////////////////////////
  387. #ifdef BUCKET_WG_SIZE_0
  388. size_t localpre[] = {BUCKET_WG_SIZE_0,1,1};
  389. #else
  390. size_t localpre[] = {128,1,1};
  391. #endif
  392. size_t globalpre[] = {(DIVISIONS),1,1};
  393. bucketprefixKernel = clCreateKernel(bucketProgram, "bucketprefixoffset", &err);
  394. if (!bucketprefixKernel || err != CL_SUCCESS)
  395. {
  396. printf("Error: Failed to create bucket prefix compute kernel!\n");
  397. exit(1);
  398. }
  399. err = clEnqueueWriteBuffer(bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets, 0, NULL, NULL);
  400. if (err != CL_SUCCESS)
  401. {
  402. printf("Error: Failed to write to prefix offsets source array!\n");
  403. exit(1);
  404. }
  405. err = 0;
  406. err = clSetKernelArg(bucketprefixKernel, 0, sizeof(cl_mem), &d_prefixoffsets_buff);
  407. err = clSetKernelArg(bucketprefixKernel, 1, sizeof(cl_mem), &d_offsets_buff);
  408. err = clSetKernelArg(bucketprefixKernel, 2, sizeof(cl_int), &blocks);
  409. if (err != CL_SUCCESS)
  410. {
  411. printf("Error: Failed to set kernel arguments! %d\n", err);
  412. exit(1);
  413. }
  414. err = clEnqueueNDRangeKernel(bucketCommands, bucketprefixKernel, 3, NULL, globalpre, localpre, 0, NULL, &bucketPrefixEvent);
  415. if (err)
  416. {
  417. printf("%d Error: Failed to execute bucket prefix kernel!\n", err);
  418. exit(1);
  419. }
  420. clWaitForEvents(1 , &bucketPrefixEvent);
  421. clFinish(bucketCommands);
  422. err = clEnqueueReadBuffer( bucketCommands, d_offsets_buff, CL_TRUE, 0, DIVISIONS * sizeof(unsigned int), h_offsets, 0, NULL, NULL );
  423. if (err != CL_SUCCESS)
  424. {
  425. printf("Error: Failed to read d_offsets output array! %d\n", err);
  426. exit(1);
  427. }
  428. err = clEnqueueReadBuffer( bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets_altered, 0, NULL, NULL );
  429. if (err != CL_SUCCESS)
  430. {
  431. printf("Error: Failed to read d_offsets output array! %d\n", err);
  432. exit(1);
  433. }
  434. clFinish(bucketCommands);
  435. clGetEventProfilingInfo(bucketPrefixEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  436. clGetEventProfilingInfo(bucketPrefixEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  437. total_time = time_end - time_start;
  438. sum+= total_time/1000000;
  439. printf("Bucket Prefix Kernel Time: %0.3f \n", total_time/1000000);
  440. // // copy the sizes from device to host
  441. origOffsets[0] = 0;
  442. for(int i=0; i<DIVISIONS; i++){
  443. origOffsets[i+1] = h_offsets[i] + origOffsets[i];
  444. if((h_offsets[i] % 4) != 0){
  445. nullElements[i] = (h_offsets[i] & ~3) + 4 - h_offsets[i];
  446. }
  447. else nullElements[i] = 0;
  448. }
  449. for(int i=0; i<DIVISIONS; i++) sizes[i] = (h_offsets[i] + nullElements[i])/4;
  450. for(int i=0; i<DIVISIONS; i++) {
  451. if((h_offsets[i] % 4) != 0) h_offsets[i] = (h_offsets[i] & ~3) + 4;
  452. }
  453. for(int i=1; i<DIVISIONS; i++) h_offsets[i] = h_offsets[i-1] + h_offsets[i];
  454. for(int i=DIVISIONS - 1; i>0; i--) h_offsets[i] = h_offsets[i-1];
  455. h_offsets[0] = 0;
  456. // ///////////////////////////////////////////////////////////////////////////
  457. // // Finally, sort the lot
  458. // ///////////////////////////////////////////////////////////////////////////
  459. bucketsortKernel = clCreateKernel(bucketProgram, "bucketsort", &err);
  460. if (!bucketsortKernel|| err != CL_SUCCESS)
  461. {
  462. printf("Error: Failed to create bucketsort compute kernel!\n");
  463. exit(1);
  464. }
  465. err = clEnqueueWriteBuffer(bucketCommands, l_offsets_buff, CL_TRUE, 0, DIVISIONS * sizeof(unsigned int), h_offsets, 0, NULL, NULL);
  466. if (err != CL_SUCCESS)
  467. {
  468. printf("Error: Failed to write to l_offsets source array!\n");
  469. exit(1);
  470. }
  471. err = clEnqueueWriteBuffer(bucketCommands, d_input_buff, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_input, 0, NULL, NULL);
  472. if (err != CL_SUCCESS)
  473. {
  474. printf("Error: Failed to write to d_input_buff source array!\n");
  475. exit(1);
  476. }
  477. err = clEnqueueWriteBuffer(bucketCommands, d_indice_input_buff, CL_TRUE, 0, listsize*sizeof(int), d_indice, 0, NULL, NULL);
  478. if (err != CL_SUCCESS)
  479. {
  480. printf("Error: Failed to write to d_input_buff source array!\n");
  481. exit(1);
  482. }
  483. err = clEnqueueWriteBuffer(bucketCommands, d_prefixoffsets_input_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets_altered, 0, NULL, NULL);
  484. if (err != CL_SUCCESS)
  485. {
  486. printf("Error: Failed to write to prefix offsets source array!\n");
  487. exit(1);
  488. }
  489. err = clEnqueueWriteBuffer(bucketCommands, bucketOutput, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_output, 0, NULL, NULL);
  490. if (err != CL_SUCCESS)
  491. {
  492. printf("Error: Failed to write to source array!\n");
  493. exit(1);
  494. }
  495. size_t localfinal[] = {BUCKET_THREAD_N,1,1};
  496. blocks = ((listsize - 1) / (BUCKET_THREAD_N * BUCKET_BAND)) + 1;
  497. size_t globalfinal[] = {blocks*BUCKET_THREAD_N,1,1};
  498. err = 0;
  499. err = clSetKernelArg(bucketsortKernel, 0, sizeof(cl_mem), &d_input_buff);
  500. err = clSetKernelArg(bucketsortKernel, 1, sizeof(cl_mem), &d_indice_input_buff);
  501. err = clSetKernelArg(bucketsortKernel, 2, sizeof(cl_mem), &bucketOutput);
  502. err = clSetKernelArg(bucketsortKernel, 3, sizeof(cl_int), &listsize);
  503. err = clSetKernelArg(bucketsortKernel, 4, sizeof(cl_mem), &d_prefixoffsets_input_buff);
  504. err = clSetKernelArg(bucketsortKernel, 5, sizeof(cl_mem), &l_offsets_buff);
  505. if (err != CL_SUCCESS)
  506. {
  507. printf("Error: Failed to set kernel arguments! %d\n", err);
  508. exit(1);
  509. }
  510. err = clEnqueueNDRangeKernel(bucketCommands, bucketsortKernel, 3, NULL, globalfinal, localfinal, 0, NULL, &bucketSortEvent);
  511. if (err)
  512. {
  513. printf("%d Error: Failed to execute bucketsort kernel!\n", err);
  514. }
  515. err = clEnqueueReadBuffer( bucketCommands, bucketOutput, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_output, 0, NULL, NULL );
  516. if (err != CL_SUCCESS)
  517. {
  518. printf("Error: Failed to read d_output array! %d\n", err);
  519. }
  520. clWaitForEvents(1 , &bucketSortEvent);
  521. clFinish(bucketCommands);
  522. clGetEventProfilingInfo(bucketSortEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  523. clGetEventProfilingInfo(bucketSortEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  524. total_time = time_end - time_start;
  525. sum+= total_time/1000000;
  526. printf("Bucket Sort Kernel Time: %0.3f \n", total_time/1000000);
  527. }
  528. double getBucketTime() {
  529. return sum;
  530. }
  531. ////////////////////////////////////////////////////////////////////////////////
  532. // Given a histogram of the list, figure out suitable pivotpoints that divide
  533. // the list into approximately listsize/divisions elements each
  534. ////////////////////////////////////////////////////////////////////////////////
  535. void calcPivotPoints(float *histogram, int histosize, int listsize,
  536. int divisions, float min, float max, float *pivotPoints, float histo_width)
  537. {
  538. float elemsPerSlice = listsize/(float)divisions;
  539. float startsAt = min;
  540. float endsAt = min + histo_width;
  541. float we_need = elemsPerSlice;
  542. int p_idx = 0;
  543. for(int i=0; i<histosize; i++)
  544. {
  545. if(i == histosize - 1){
  546. if(!(p_idx < divisions)){
  547. pivotPoints[p_idx++] = startsAt + (we_need/histogram[i]) * histo_width;
  548. }
  549. break;
  550. }
  551. while(histogram[i] > we_need){
  552. if(!(p_idx < divisions)){
  553. printf("i=%d, p_idx = %d, divisions = %d\n", i, p_idx, divisions);
  554. exit(0);
  555. }
  556. pivotPoints[p_idx++] = startsAt + (we_need/histogram[i]) * histo_width;
  557. startsAt += (we_need/histogram[i]) * histo_width;
  558. histogram[i] -= we_need;
  559. we_need = elemsPerSlice;
  560. }
  561. // grab what we can from what remains of it
  562. we_need -= histogram[i];
  563. startsAt = endsAt;
  564. endsAt += histo_width;
  565. }
  566. while(p_idx < divisions){
  567. pivotPoints[p_idx] = pivotPoints[p_idx-1];
  568. p_idx++;
  569. }
  570. }