bucketsort.c 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639
  1. #define BUCKET_WARP_LOG_SIZE 5
  2. #define BUCKET_WARP_N 1
  3. #ifdef BUCKET_WG_SIZE_1
  4. #define BUCKET_THREAD_N BUCKET_WG_SIZE_1
  5. #else
  6. #define BUCKET_THREAD_N (BUCKET_WARP_N << BUCKET_WARP_LOG_SIZE)
  7. #endif
  8. #define BUCKET_BLOCK_MEMORY (DIVISIONS * BUCKET_WARP_N)
  9. #define BUCKET_BAND 128
  10. #define SIZE (1 << 22)
  11. #define DATA_SIZE (1024)
  12. #define MAX_SOURCE_SIZE (0x100000)
  13. #define HISTOGRAM_SIZE (1024 * sizeof(unsigned int))
  14. #include <fcntl.h>
  15. #include <float.h>
  16. #include <stdio.h>
  17. #include <stdlib.h>
  18. #include <string.h>
  19. #include <math.h>
  20. #include <unistd.h>
  21. #include <sys/types.h>
  22. #include <sys/stat.h>
  23. #include <CL/cl.h>
  24. #include "bucketsort.h"
  25. #include <time.h>
  26. ////////////////////////////////////////////////////////////////////////////////
  27. // Forward declarations
  28. ////////////////////////////////////////////////////////////////////////////////
  29. void calcPivotPoints(float *histogram, int histosize, int listsize,
  30. int divisions, float min, float max, float *pivotPoints,
  31. float histo_width);
  32. ////////////////////////////////////////////////////////////////////////////////
  33. // Globals
  34. ////////////////////////////////////////////////////////////////////////////////
  35. const int histosize = 1024;
  36. unsigned int* h_offsets = NULL;
  37. unsigned int* d_offsets = NULL;
  38. cl_mem d_offsets_buff;
  39. int *d_indice = NULL;
  40. cl_mem d_indice_buff;
  41. cl_mem d_input_buff;
  42. cl_mem d_indice_input_buff;
  43. float *pivotPoints = NULL;
  44. float *historesult = NULL;
  45. float *l_pivotpoints = NULL;
  46. cl_mem l_pivotpoints_buff;
  47. unsigned int *d_prefixoffsets = NULL;
  48. unsigned int *d_prefixoffsets_altered = NULL;
  49. cl_mem d_prefixoffsets_buff;
  50. cl_mem d_prefixoffsets_input_buff;
  51. unsigned int *l_offsets = NULL;
  52. cl_mem l_offsets_buff;
  53. unsigned int *d_Result1024;
  54. cl_device_id device_id; // compute device id
  55. cl_context bucketContext; // compute context
  56. cl_context histoContext;
  57. cl_command_queue bucketCommands; // compute command queue
  58. cl_command_queue histoCommands;
  59. cl_program bucketProgram; // compute program
  60. cl_program histoProgram;
  61. cl_kernel bucketcountKernel; // compute kernel
  62. cl_kernel histoKernel;
  63. cl_kernel bucketprefixKernel;
  64. cl_kernel bucketsortKernel;
  65. cl_mem histoInput;
  66. cl_mem histoOutput;
  67. cl_mem bucketOutput;
  68. cl_int err;
  69. cl_uint num_platforms;
  70. cl_event histoEvent;
  71. cl_event bucketCountEvent;
  72. cl_event bucketPrefixEvent;
  73. cl_event bucketSortEvent;
  74. double sum = 0;
  75. ////////////////////////////////////////////////////////////////////////////////
  76. // Initialize the bucketsort algorithm
  77. ////////////////////////////////////////////////////////////////////////////////
  78. void init_bucketsort(int listsize)
  79. {
  80. cl_uint num = 0;
  81. clGetPlatformIDs(0, NULL, &num);
  82. cl_platform_id platformID[num];
  83. clGetPlatformIDs(num, platformID, NULL);
  84. clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_GPU,0,NULL,&num);
  85. cl_device_id devices[num];
  86. err = clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_GPU,num,devices,NULL);
  87. // int gpu = 1;
  88. // err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 2, &device_id, NULL);
  89. if (err != CL_SUCCESS)
  90. {
  91. printf("Error: Failed to create a device group!\n");
  92. exit(1);
  93. }
  94. char name[128];
  95. clGetDeviceInfo(devices[0],CL_DEVICE_NAME,128,name,NULL);
  96. bucketContext = clCreateContext(0, 1, &devices[0], NULL, NULL, &err);
  97. bucketCommands = clCreateCommandQueue(bucketContext, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
  98. h_offsets = (unsigned int *) malloc(DIVISIONS * sizeof(unsigned int));
  99. for(int i = 0; i < DIVISIONS; i++){
  100. h_offsets[i] = 0;
  101. }
  102. d_offsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(unsigned int),NULL,NULL);
  103. pivotPoints = (float *)malloc(DIVISIONS * sizeof(float));
  104. d_indice_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, listsize * sizeof(int),NULL,NULL);
  105. d_indice_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, listsize * sizeof(int),NULL,NULL);
  106. d_indice = (int *)malloc(listsize * sizeof(int));
  107. historesult = (float *)malloc(histosize * sizeof(float));
  108. l_pivotpoints = (float *)malloc(DIVISIONS*sizeof(float));
  109. l_pivotpoints_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(float), NULL, NULL);
  110. l_offsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(unsigned int), NULL, NULL);
  111. int blocks = ((listsize - 1) / (BUCKET_THREAD_N * BUCKET_BAND)) + 1;
  112. d_prefixoffsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), NULL, NULL);
  113. d_prefixoffsets = (unsigned int *)malloc(blocks*BUCKET_BLOCK_MEMORY*sizeof(int));
  114. d_prefixoffsets_altered = (unsigned int *)malloc(blocks*BUCKET_BLOCK_MEMORY*sizeof(int));
  115. d_prefixoffsets_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), NULL, NULL);
  116. bucketOutput = clCreateBuffer(bucketContext, CL_MEM_READ_WRITE, (listsize + (DIVISIONS*4))*sizeof(float), NULL, NULL);
  117. FILE *fp;
  118. const char fileName[]="./bucketsort_kernels.cl";
  119. size_t source_size;
  120. char *source_str;
  121. fp = fopen(fileName, "r");
  122. if (!fp) {
  123. fprintf(stderr, "Failed to load bucket kernel.\n");
  124. exit(1);
  125. }
  126. source_str = (char *)malloc(MAX_SOURCE_SIZE);
  127. source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  128. fclose(fp);
  129. bucketProgram = clCreateProgramWithSource(bucketContext, 1, (const char **) &source_str, (const size_t)&source_size, &err);
  130. if (!bucketProgram)
  131. {
  132. printf("Error: Failed to create bucket compute program!\n");
  133. exit(1);
  134. }
  135. err = clBuildProgram(bucketProgram, 0, NULL, NULL, NULL, NULL);
  136. if (err != CL_SUCCESS)
  137. {
  138. size_t len;
  139. char buffer[2048];
  140. printf("Error: Failed to build bucket program executable!\n");
  141. clGetProgramBuildInfo(bucketProgram, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
  142. printf("%s\n", buffer);
  143. exit(1);
  144. }
  145. }
  146. ////////////////////////////////////////////////////////////////////////////////
  147. // Uninitialize the bucketsort algorithm
  148. ////////////////////////////////////////////////////////////////////////////////
  149. void finish_bucketsort()
  150. {
  151. clReleaseMemObject(d_offsets_buff);
  152. clReleaseMemObject(d_indice_buff);
  153. clReleaseMemObject(l_pivotpoints_buff);
  154. clReleaseMemObject(l_offsets_buff);
  155. clReleaseMemObject(d_prefixoffsets_buff);
  156. clReleaseMemObject(d_input_buff);
  157. clReleaseMemObject(d_indice_input_buff);
  158. clReleaseMemObject(bucketOutput);
  159. clReleaseProgram(bucketProgram);
  160. clReleaseKernel(bucketcountKernel);
  161. clReleaseKernel(bucketprefixKernel);
  162. clReleaseKernel(bucketsortKernel);
  163. clReleaseCommandQueue(bucketCommands);
  164. clReleaseContext(bucketContext);
  165. free(pivotPoints);
  166. free(h_offsets);
  167. free(historesult);
  168. }
  169. void histogramInit(int listsize) {
  170. cl_uint num = 0;
  171. clGetPlatformIDs(0, NULL, &num);
  172. cl_platform_id platformID[num];
  173. clGetPlatformIDs(num, platformID, NULL);
  174. clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_GPU,0,NULL,&num);
  175. char name[128];
  176. clGetPlatformInfo(platformID[1], CL_PLATFORM_PROFILE,128,name,NULL);
  177. cl_device_id devices[num];
  178. err = clGetDeviceIDs(platformID[1],CL_DEVICE_TYPE_GPU,num,devices,NULL);
  179. // int gpu = 1;
  180. // err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 2, &device_id, NULL);
  181. if (err != CL_SUCCESS)
  182. {
  183. printf("Error: Failed to create a device group!\n");
  184. exit(1);
  185. }
  186. clGetDeviceInfo(devices[0],CL_DEVICE_NAME,128,name,NULL);
  187. printf("%s \n", name);
  188. cl_context_properties contextProperties[] =
  189. {
  190. CL_CONTEXT_PLATFORM,
  191. (cl_context_properties)platformID[num],
  192. 0
  193. };
  194. histoContext = clCreateContext(contextProperties, 1, &devices[0], NULL, NULL, &err);
  195. histoCommands = clCreateCommandQueue(histoContext, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
  196. histoInput = clCreateBuffer(histoContext, CL_MEM_READ_ONLY, listsize*(sizeof(float)), NULL, NULL);
  197. histoOutput = clCreateBuffer(histoContext, CL_MEM_READ_WRITE, 1024 * sizeof(unsigned int), NULL, NULL);
  198. FILE *fp;
  199. const char fileName[]="./histogram1024.cl";
  200. size_t source_size;
  201. char *source_str;
  202. fp = fopen(fileName, "r");
  203. if (!fp) {
  204. fprintf(stderr, "Failed to load kernel.\n");
  205. exit(1);
  206. }
  207. source_str = (char *)malloc(MAX_SOURCE_SIZE);
  208. source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  209. fclose(fp);
  210. histoProgram = clCreateProgramWithSource(histoContext, 1, (const char **) &source_str, (const size_t)&source_size, &err);
  211. if (!histoProgram)
  212. {
  213. printf("Error: Failed to create compute program!\n");
  214. exit(1);
  215. }
  216. err = clBuildProgram(histoProgram, 0, NULL, NULL, NULL, NULL);
  217. if (err != CL_SUCCESS)
  218. {
  219. size_t len;
  220. char buffer[2048];
  221. printf("Error: Failed to build program executable!\n");
  222. clGetProgramBuildInfo(histoProgram, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
  223. printf("%s\n", buffer);
  224. exit(1);
  225. }
  226. histoKernel = clCreateKernel(histoProgram, "histogram1024Kernel", &err);
  227. if (!histoKernel || err != CL_SUCCESS)
  228. {
  229. printf("Error: Failed to create compute kernel!\n");
  230. exit(1);
  231. }
  232. }
  233. void histogram1024GPU(unsigned int *h_Result, float *d_Data, float minimum, float maximum,int listsize){
  234. err = clEnqueueWriteBuffer(histoCommands, histoInput, CL_TRUE, 0, listsize*sizeof(float), d_Data, 0, NULL, NULL);
  235. if (err != CL_SUCCESS)
  236. {
  237. printf("Error: Failed to write to source array!\n");
  238. exit(1);
  239. }
  240. err = clEnqueueWriteBuffer(histoCommands, histoOutput, CL_TRUE, 0, DIVISIONS*sizeof(unsigned int), h_Result, 0, NULL, NULL);
  241. if (err != CL_SUCCESS)
  242. {
  243. printf("Error: Failed to write to source array!\n");
  244. exit(1);
  245. }
  246. err = 0;
  247. err = clSetKernelArg(histoKernel, 0, sizeof(cl_mem), &histoOutput);
  248. err = clSetKernelArg(histoKernel, 1, sizeof(cl_mem), &histoInput);
  249. err = clSetKernelArg(histoKernel, 2, sizeof(float), &minimum);
  250. err = clSetKernelArg(histoKernel, 3, sizeof(float), &maximum);
  251. err = clSetKernelArg(histoKernel, 4, sizeof(int), &listsize);
  252. if (err != CL_SUCCESS)
  253. {
  254. printf("Error: Failed to set kernel arguments! %d\n", err);
  255. exit(1);
  256. }
  257. size_t global = 6144;
  258. size_t local;
  259. #ifdef HISTO_WG_SIZE_0
  260. local = HISTO_WG_SIZE_0;
  261. #else
  262. local = 96;
  263. #endif
  264. err = clEnqueueNDRangeKernel(histoCommands, histoKernel, 1, NULL, &global, &local, 0, NULL, &histoEvent);
  265. if (err)
  266. {
  267. printf("Error: Failed to execute histogram kernel!\n");
  268. exit(1);
  269. }
  270. clWaitForEvents(1 , &histoEvent);
  271. clFinish(histoCommands);
  272. err = clEnqueueReadBuffer( histoCommands, histoOutput, CL_TRUE, 0, 1024 * sizeof(unsigned int), h_Result, 0, NULL, NULL );
  273. if (err != CL_SUCCESS)
  274. {
  275. printf("Error: Failed to read histo output array! %d\n", err);
  276. exit(1);
  277. }
  278. clFinish(histoCommands);
  279. cl_ulong time_start, time_end;
  280. double total_time;
  281. clGetEventProfilingInfo(histoEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  282. clGetEventProfilingInfo(histoEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  283. total_time = time_end - time_start;
  284. sum+= total_time/1000000.0;
  285. printf("Histogram Kernel Time: %0.3f \n", total_time/1000000);
  286. }
  287. void finish_histogram() {
  288. clReleaseProgram(histoProgram);
  289. clReleaseKernel(histoKernel);
  290. clReleaseCommandQueue(histoCommands);
  291. clReleaseContext(histoContext);
  292. clReleaseMemObject((histoInput));
  293. clReleaseMemObject((histoOutput));
  294. }
  295. ////////////////////////////////////////////////////////////////////////////////
  296. // Given the input array of floats and the min and max of the distribution,
  297. // sort the elements into float4 aligned buckets of roughly equal size
  298. ////////////////////////////////////////////////////////////////////////////////
  299. void bucketSort(float *d_input, float *d_output, int listsize,
  300. int *sizes, int *nullElements, float minimum, float maximum,
  301. unsigned int *origOffsets)
  302. {
  303. // ////////////////////////////////////////////////////////////////////////////
  304. // // First pass - Create 1024 bin histogram
  305. // ////////////////////////////////////////////////////////////////////////////
  306. histogramInit(listsize);
  307. histogram1024GPU(h_offsets, d_input, minimum, maximum, listsize);
  308. finish_histogram();
  309. for(int i=0; i<histosize; i++) historesult[i] = (float)h_offsets[i];
  310. // ///////////////////////////////////////////////////////////////////////////
  311. // // Calculate pivot points (CPU algorithm)
  312. // ///////////////////////////////////////////////////////////////////////////
  313. calcPivotPoints(historesult, histosize, listsize, DIVISIONS,
  314. minimum, maximum, pivotPoints,
  315. (maximum - minimum)/(float)histosize);
  316. //
  317. // ///////////////////////////////////////////////////////////////////////////
  318. // // Count the bucket sizes in new divisions
  319. // ///////////////////////////////////////////////////////////////////////////
  320. bucketcountKernel = clCreateKernel(bucketProgram, "bucketcount", &err);
  321. if (!bucketcountKernel || err != CL_SUCCESS)
  322. {
  323. printf("Error: Failed to create bucketsort compute kernel!\n");
  324. exit(1);
  325. }
  326. err = clEnqueueWriteBuffer(bucketCommands, l_pivotpoints_buff, CL_TRUE, 0, DIVISIONS*sizeof(float), pivotPoints, 0, NULL, NULL);
  327. if (err != CL_SUCCESS)
  328. {
  329. printf("Error: Failed to write to l_pivotpoints source array!\n");
  330. exit(1);
  331. }
  332. d_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, (listsize + (DIVISIONS*4))*sizeof(float),NULL,NULL);
  333. err = clEnqueueWriteBuffer(bucketCommands, d_input_buff, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_input, 0, NULL, NULL);
  334. if (err != CL_SUCCESS)
  335. {
  336. printf("Error: Failed to write to d_input_buff source array!\n");
  337. exit(1);
  338. }
  339. err = 0;
  340. err = clSetKernelArg(bucketcountKernel, 0, sizeof(cl_mem), &d_input_buff);
  341. err = clSetKernelArg(bucketcountKernel, 1, sizeof(cl_mem), &d_indice_buff);
  342. err = clSetKernelArg(bucketcountKernel, 2, sizeof(cl_mem), &d_prefixoffsets_buff);
  343. err = clSetKernelArg(bucketcountKernel, 3, sizeof(cl_int), &listsize);
  344. err = clSetKernelArg(bucketcountKernel, 4, sizeof(cl_mem), &l_pivotpoints_buff);
  345. if (err != CL_SUCCESS)
  346. {
  347. printf("Error: Failed to set kernel arguments! %d\n", err);
  348. exit(1);
  349. }
  350. int blocks =((listsize -1) / (BUCKET_THREAD_N*BUCKET_BAND)) + 1;
  351. size_t global[] = {blocks*BUCKET_THREAD_N,1,1};
  352. size_t local[] = {BUCKET_THREAD_N,1,1};
  353. err = clEnqueueNDRangeKernel(bucketCommands, bucketcountKernel, 3, NULL, global, local, 0, NULL, &bucketCountEvent);
  354. if (err)
  355. {
  356. printf("Error: Failed to execute bucket count kernel!\n");
  357. exit(1);
  358. }
  359. clWaitForEvents(1 , &bucketCountEvent);
  360. clFinish(bucketCommands);
  361. err = clEnqueueReadBuffer( bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(unsigned int), d_prefixoffsets, 0, NULL, NULL );
  362. if (err != CL_SUCCESS)
  363. {
  364. printf("Error: Failed to read prefix output array! %d\n", err);
  365. exit(1);
  366. }
  367. err = clEnqueueReadBuffer( bucketCommands, d_indice_buff, CL_TRUE, 0, listsize * sizeof(int), d_indice, 0, NULL, NULL );
  368. if (err != CL_SUCCESS)
  369. {
  370. printf("Error: Failed to read indice output array! %d\n", err);
  371. exit(1);
  372. }
  373. clFinish(bucketCommands);
  374. cl_ulong time_start, time_end;
  375. double total_time;
  376. clGetEventProfilingInfo(bucketCountEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  377. clGetEventProfilingInfo(bucketCountEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  378. total_time = time_end - time_start;
  379. sum+= total_time/1000000;
  380. printf("Bucket Count Kernel Time: %0.3f \n", total_time/1000000);
  381. //
  382. // ///////////////////////////////////////////////////////////////////////////
  383. // // Prefix scan offsets and align each division to float4 (required by
  384. // // mergesort)
  385. // ///////////////////////////////////////////////////////////////////////////
  386. #ifdef BUCKET_WG_SIZE_0
  387. size_t localpre[] = {BUCKET_WG_SIZE_0,1,1};
  388. #else
  389. size_t localpre[] = {128,1,1};
  390. #endif
  391. size_t globalpre[] = {(DIVISIONS),1,1};
  392. bucketprefixKernel = clCreateKernel(bucketProgram, "bucketprefixoffset", &err);
  393. if (!bucketprefixKernel || err != CL_SUCCESS)
  394. {
  395. printf("Error: Failed to create bucket prefix compute kernel!\n");
  396. exit(1);
  397. }
  398. err = clEnqueueWriteBuffer(bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets, 0, NULL, NULL);
  399. if (err != CL_SUCCESS)
  400. {
  401. printf("Error: Failed to write to prefix offsets source array!\n");
  402. exit(1);
  403. }
  404. err = 0;
  405. err = clSetKernelArg(bucketprefixKernel, 0, sizeof(cl_mem), &d_prefixoffsets_buff);
  406. err = clSetKernelArg(bucketprefixKernel, 1, sizeof(cl_mem), &d_offsets_buff);
  407. err = clSetKernelArg(bucketprefixKernel, 2, sizeof(cl_int), &blocks);
  408. if (err != CL_SUCCESS)
  409. {
  410. printf("Error: Failed to set kernel arguments! %d\n", err);
  411. exit(1);
  412. }
  413. err = clEnqueueNDRangeKernel(bucketCommands, bucketprefixKernel, 3, NULL, globalpre, localpre, 0, NULL, &bucketPrefixEvent);
  414. if (err)
  415. {
  416. printf("%d Error: Failed to execute bucket prefix kernel!\n", err);
  417. exit(1);
  418. }
  419. clWaitForEvents(1 , &bucketPrefixEvent);
  420. clFinish(bucketCommands);
  421. err = clEnqueueReadBuffer( bucketCommands, d_offsets_buff, CL_TRUE, 0, DIVISIONS * sizeof(unsigned int), h_offsets, 0, NULL, NULL );
  422. if (err != CL_SUCCESS)
  423. {
  424. printf("Error: Failed to read d_offsets output array! %d\n", err);
  425. exit(1);
  426. }
  427. err = clEnqueueReadBuffer( bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets_altered, 0, NULL, NULL );
  428. if (err != CL_SUCCESS)
  429. {
  430. printf("Error: Failed to read d_offsets output array! %d\n", err);
  431. exit(1);
  432. }
  433. clFinish(bucketCommands);
  434. clGetEventProfilingInfo(bucketPrefixEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  435. clGetEventProfilingInfo(bucketPrefixEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  436. total_time = time_end - time_start;
  437. sum+= total_time/1000000;
  438. printf("Bucket Prefix Kernel Time: %0.3f \n", total_time/1000000);
  439. // // copy the sizes from device to host
  440. origOffsets[0] = 0;
  441. for(int i=0; i<DIVISIONS; i++){
  442. origOffsets[i+1] = h_offsets[i] + origOffsets[i];
  443. if((h_offsets[i] % 4) != 0){
  444. nullElements[i] = (h_offsets[i] & ~3) + 4 - h_offsets[i];
  445. }
  446. else nullElements[i] = 0;
  447. }
  448. for(int i=0; i<DIVISIONS; i++) sizes[i] = (h_offsets[i] + nullElements[i])/4;
  449. for(int i=0; i<DIVISIONS; i++) {
  450. if((h_offsets[i] % 4) != 0) h_offsets[i] = (h_offsets[i] & ~3) + 4;
  451. }
  452. for(int i=1; i<DIVISIONS; i++) h_offsets[i] = h_offsets[i-1] + h_offsets[i];
  453. for(int i=DIVISIONS - 1; i>0; i--) h_offsets[i] = h_offsets[i-1];
  454. h_offsets[0] = 0;
  455. // ///////////////////////////////////////////////////////////////////////////
  456. // // Finally, sort the lot
  457. // ///////////////////////////////////////////////////////////////////////////
  458. bucketsortKernel = clCreateKernel(bucketProgram, "bucketsort", &err);
  459. if (!bucketsortKernel|| err != CL_SUCCESS)
  460. {
  461. printf("Error: Failed to create bucketsort compute kernel!\n");
  462. exit(1);
  463. }
  464. err = clEnqueueWriteBuffer(bucketCommands, l_offsets_buff, CL_TRUE, 0, DIVISIONS * sizeof(unsigned int), h_offsets, 0, NULL, NULL);
  465. if (err != CL_SUCCESS)
  466. {
  467. printf("Error: Failed to write to l_offsets source array!\n");
  468. exit(1);
  469. }
  470. err = clEnqueueWriteBuffer(bucketCommands, d_input_buff, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_input, 0, NULL, NULL);
  471. if (err != CL_SUCCESS)
  472. {
  473. printf("Error: Failed to write to d_input_buff source array!\n");
  474. exit(1);
  475. }
  476. err = clEnqueueWriteBuffer(bucketCommands, d_indice_input_buff, CL_TRUE, 0, listsize*sizeof(int), d_indice, 0, NULL, NULL);
  477. if (err != CL_SUCCESS)
  478. {
  479. printf("Error: Failed to write to d_input_buff source array!\n");
  480. exit(1);
  481. }
  482. err = clEnqueueWriteBuffer(bucketCommands, d_prefixoffsets_input_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets_altered, 0, NULL, NULL);
  483. if (err != CL_SUCCESS)
  484. {
  485. printf("Error: Failed to write to prefix offsets source array!\n");
  486. exit(1);
  487. }
  488. err = clEnqueueWriteBuffer(bucketCommands, bucketOutput, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_output, 0, NULL, NULL);
  489. if (err != CL_SUCCESS)
  490. {
  491. printf("Error: Failed to write to source array!\n");
  492. exit(1);
  493. }
  494. size_t localfinal[] = {BUCKET_THREAD_N,1,1};
  495. blocks = ((listsize - 1) / (BUCKET_THREAD_N * BUCKET_BAND)) + 1;
  496. size_t globalfinal[] = {blocks*BUCKET_THREAD_N,1,1};
  497. err = 0;
  498. err = clSetKernelArg(bucketsortKernel, 0, sizeof(cl_mem), &d_input_buff);
  499. err = clSetKernelArg(bucketsortKernel, 1, sizeof(cl_mem), &d_indice_input_buff);
  500. err = clSetKernelArg(bucketsortKernel, 2, sizeof(cl_mem), &bucketOutput);
  501. err = clSetKernelArg(bucketsortKernel, 3, sizeof(cl_int), &listsize);
  502. err = clSetKernelArg(bucketsortKernel, 4, sizeof(cl_mem), &d_prefixoffsets_input_buff);
  503. err = clSetKernelArg(bucketsortKernel, 5, sizeof(cl_mem), &l_offsets_buff);
  504. if (err != CL_SUCCESS)
  505. {
  506. printf("Error: Failed to set kernel arguments! %d\n", err);
  507. exit(1);
  508. }
  509. err = clEnqueueNDRangeKernel(bucketCommands, bucketsortKernel, 3, NULL, globalfinal, localfinal, 0, NULL, &bucketSortEvent);
  510. if (err)
  511. {
  512. printf("%d Error: Failed to execute bucketsort kernel!\n", err);
  513. }
  514. err = clEnqueueReadBuffer( bucketCommands, bucketOutput, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_output, 0, NULL, NULL );
  515. if (err != CL_SUCCESS)
  516. {
  517. printf("Error: Failed to read d_output array! %d\n", err);
  518. }
  519. clWaitForEvents(1 , &bucketSortEvent);
  520. clFinish(bucketCommands);
  521. clGetEventProfilingInfo(bucketSortEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
  522. clGetEventProfilingInfo(bucketSortEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
  523. total_time = time_end - time_start;
  524. sum+= total_time/1000000;
  525. printf("Bucket Sort Kernel Time: %0.3f \n", total_time/1000000);
  526. }
  527. double getBucketTime() {
  528. return sum;
  529. }
  530. ////////////////////////////////////////////////////////////////////////////////
  531. // Given a histogram of the list, figure out suitable pivotpoints that divide
  532. // the list into approximately listsize/divisions elements each
  533. ////////////////////////////////////////////////////////////////////////////////
  534. void calcPivotPoints(float *histogram, int histosize, int listsize,
  535. int divisions, float min, float max, float *pivotPoints, float histo_width)
  536. {
  537. float elemsPerSlice = listsize/(float)divisions;
  538. float startsAt = min;
  539. float endsAt = min + histo_width;
  540. float we_need = elemsPerSlice;
  541. int p_idx = 0;
  542. for(int i=0; i<histosize; i++)
  543. {
  544. if(i == histosize - 1){
  545. if(!(p_idx < divisions)){
  546. pivotPoints[p_idx++] = startsAt + (we_need/histogram[i]) * histo_width;
  547. }
  548. break;
  549. }
  550. while(histogram[i] > we_need){
  551. if(!(p_idx < divisions)){
  552. printf("i=%d, p_idx = %d, divisions = %d\n", i, p_idx, divisions);
  553. exit(0);
  554. }
  555. pivotPoints[p_idx++] = startsAt + (we_need/histogram[i]) * histo_width;
  556. startsAt += (we_need/histogram[i]) * histo_width;
  557. histogram[i] -= we_need;
  558. we_need = elemsPerSlice;
  559. }
  560. // grab what we can from what remains of it
  561. we_need -= histogram[i];
  562. startsAt = endsAt;
  563. endsAt += histo_width;
  564. }
  565. while(p_idx < divisions){
  566. pivotPoints[p_idx] = pivotPoints[p_idx-1];
  567. p_idx++;
  568. }
  569. }