bucketsort_kernels.cl

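/*
 * OpenCL kernels for a bucket sort over DIVISIONS (1024) buckets.
 *
 * Pipeline, as implied by the data flow between the kernels:
 *   1. bucketcount        - bins each input element against the pivot points and
 *                           records per-work-group bucket counts in d_prefixoffsets.
 *   2. bucketprefixoffset - turns the per-work-group counts into running offsets
 *                           and writes each bucket slot's total to d_offsets.
 *   3. bucketsort         - scatters every element to its final position in output,
 *                           combining the global per-bucket bases in l_offsets with
 *                           the per-work-group offsets left in d_prefixoffsets.
 */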
#define DIVISIONS (1 << 10)
#define LOG_DIVISIONS (10)
#define BUCKET_WARP_LOG_SIZE (5)
#define BUCKET_WARP_N (1)
#ifdef BUCKET_WG_SIZE_1
#define BUCKET_THREAD_N BUCKET_WG_SIZE_1
#else
#define BUCKET_THREAD_N (BUCKET_WARP_N << BUCKET_WARP_LOG_SIZE)
#endif
#define BUCKET_BLOCK_MEMORY (DIVISIONS * BUCKET_WARP_N)
#define BUCKET_BAND (128)
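
/*
 * addOffset: emulated atomic increment of a per-warp bucket counter in local
 * memory. The top bits of each counter carry the calling work-item's tag
 * (threadTag); the work-item keeps retrying until it reads back its own tag,
 * meaning its increment was the one that stuck. Returns the counter value
 * before the increment, i.e. this element's rank within the bucket.
 */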
int addOffset(volatile __local uint *s_offset, uint data, uint threadTag){
    uint count;
    do {
        count = s_offset[data] & 0x07FFFFFFU;
        count = threadTag | (count + 1);
        s_offset[data] = count;
    } while (s_offset[data] != count);
    return (count & 0x07FFFFFFU) - 1;
}
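
/*
 * bucketcount: for each input element, binary-search l_pivotpoints to find its
 * bucket (0..DIVISIONS-1), pack the element's per-warp rank and bucket index
 * into indice[tid], and accumulate per-warp bucket counts in local memory.
 * The counts are flushed to d_prefixoffsets at a per-work-group base.
 */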
__kernel void
bucketcount( global float *input, global int *indice, global uint *d_prefixoffsets, const int size, global float *l_pivotpoints)
{
    volatile __local uint s_offset[BUCKET_BLOCK_MEMORY];

    const uint threadTag = get_local_id(0) << (32 - BUCKET_WARP_LOG_SIZE);
    const int warpBase = (get_local_id(0) >> BUCKET_WARP_LOG_SIZE) * DIVISIONS;
    const int numThreads = get_global_size(0);

    // Zero this work-group's bucket counters.
    for (int i = get_local_id(0); i < BUCKET_BLOCK_MEMORY; i += get_local_size(0))
        s_offset[i] = 0;

    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

    for (int tid = get_global_id(0); tid < size; tid += numThreads) {
        float elem = input[tid];

        // Binary search over the sorted pivot points for elem's bucket index.
        int idx = DIVISIONS/2 - 1;
        int jump = DIVISIONS/4;
        float piv = l_pivotpoints[idx]; //s_pivotpoints[idx];
        while (jump >= 1) {
            idx = (elem < piv) ? (idx - jump) : (idx + jump);
            piv = l_pivotpoints[idx]; //s_pivotpoints[idx];
            jump /= 2;
        }
        idx = (elem < piv) ? idx : (idx + 1);

        // Pack (per-warp rank within the bucket, bucket index) into one int.
        indice[tid] = (addOffset(s_offset + warpBase, idx, threadTag) << LOG_DIVISIONS) + idx; //atomicInc(&offsets[idx], size + 1);
    }

    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

    // Flush this work-group's counters (tag bits stripped) to global memory.
    int prefixBase = get_group_id(0) * BUCKET_BLOCK_MEMORY;
    for (int i = get_local_id(0); i < BUCKET_BLOCK_MEMORY; i += get_local_size(0))
        d_prefixoffsets[prefixBase + i] = s_offset[i] & 0x07FFFFFFU;
}
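
/*
 * bucketprefixoffset: each work-item owns one bucket slot (the stride below
 * implies a global size of DIVISIONS work-items) and performs an exclusive
 * prefix sum over that slot's per-work-group counts in d_prefixoffsets,
 * writing the slot's total count to d_offsets[tid].
 */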
__kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_offsets, const int blocks) {
    int tid = get_global_id(0);
    int size = blocks * BUCKET_BLOCK_MEMORY;
    int sum = 0;

    for (int i = tid; i < size; i += DIVISIONS) {
        int x = d_prefixoffsets[i];
        d_prefixoffsets[i] = sum;
        sum += x;
    }

    d_offsets[tid] = sum;
}
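
/*
 * bucketsort: scatter pass. Each work-group rebuilds its local offset table from
 * the global per-bucket bases (l_offsets) plus its own exclusive prefix offsets
 * (d_prefixoffsets), then writes every element to output at its bucket base plus
 * the per-warp rank that bucketcount packed into indice[tid].
 */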
__kernel void
bucketsort(global float *input, global int *indice, __global float *output, const int size, global uint *d_prefixoffsets,
           global uint *l_offsets)
{
    volatile __local unsigned int s_offset[BUCKET_BLOCK_MEMORY];

    int prefixBase = get_group_id(0) * BUCKET_BLOCK_MEMORY;
    const int warpBase = (get_local_id(0) >> BUCKET_WARP_LOG_SIZE) * DIVISIONS;
    const int numThreads = get_global_size(0);

    for (int i = get_local_id(0); i < BUCKET_BLOCK_MEMORY; i += get_local_size(0)) {
        s_offset[i] = l_offsets[i & (DIVISIONS - 1)] + d_prefixoffsets[prefixBase + i];
    }

    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

    for (int tid = get_global_id(0); tid < size; tid += numThreads) {
        float elem = input[tid];
        int id = indice[tid];

        output[s_offset[warpBase + (id & (DIVISIONS - 1))] + (id >> LOG_DIVISIONS)] = elem;

        // Debug leftover: recomputes the destination index; unused unless the
        // printf below is re-enabled.
        int test = s_offset[warpBase + (id & (DIVISIONS - 1))] + (id >> LOG_DIVISIONS);
        // if(test == 2) {
        //     printf("EDLLAWD %f", elem);
        // }
    }
}
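
The host side is not part of this file. As a rough sketch of how the three kernels chain together, the following C snippet shows one plausible launch sequence, inferred only from the kernel argument lists and strides above. It assumes the program has already been built, all buffers allocated and filled (including the sorted pivot points and the exclusive scan of d_offsets into the buffer bound as l_offsets, which happens outside these kernels), and it uses the default work-group size of 32 (BUCKET_THREAD_N). Every handle name below is illustrative, not taken from any actual host code.

/* Hypothetical host-side launch sequence; not part of bucketsort_kernels.cl. */
#include <CL/cl.h>

/* Assumed buffer contents: d_input (size floats), d_pivots (1024 sorted floats),
 * d_indice (size ints), d_prefixoffsets (blocks * 1024 uints), d_offsets
 * (1024 uints), d_l_offsets (1024 uints, exclusive scan of d_offsets, computed
 * elsewhere). */
static void run_bucketsort(cl_command_queue q,
                           cl_kernel k_count, cl_kernel k_prefix, cl_kernel k_sort,
                           cl_mem d_input, cl_mem d_indice, cl_mem d_output,
                           cl_mem d_pivots, cl_mem d_prefixoffsets,
                           cl_mem d_offsets, cl_mem d_l_offsets,
                           cl_int size, cl_int blocks)
{
    size_t local = 32;                      /* BUCKET_THREAD_N with the defaults above */
    size_t global_count = (size_t)blocks * local;
    size_t global_prefix = 1 << 10;         /* DIVISIONS: one work-item per bucket slot */

    /* 1. Per-work-group bucket counts and per-element (rank, bucket) records. */
    clSetKernelArg(k_count, 0, sizeof(cl_mem), &d_input);
    clSetKernelArg(k_count, 1, sizeof(cl_mem), &d_indice);
    clSetKernelArg(k_count, 2, sizeof(cl_mem), &d_prefixoffsets);
    clSetKernelArg(k_count, 3, sizeof(cl_int), &size);
    clSetKernelArg(k_count, 4, sizeof(cl_mem), &d_pivots);
    clEnqueueNDRangeKernel(q, k_count, 1, NULL, &global_count, &local, 0, NULL, NULL);

    /* 2. Exclusive scan of the per-work-group counts; bucket totals land in d_offsets. */
    clSetKernelArg(k_prefix, 0, sizeof(cl_mem), &d_prefixoffsets);
    clSetKernelArg(k_prefix, 1, sizeof(cl_mem), &d_offsets);
    clSetKernelArg(k_prefix, 2, sizeof(cl_int), &blocks);
    clEnqueueNDRangeKernel(q, k_prefix, 1, NULL, &global_prefix, NULL, 0, NULL, NULL);

    /* (A scan of d_offsets into d_l_offsets would happen here; not shown.) */

    /* 3. Scatter every element to its final slot. */
    clSetKernelArg(k_sort, 0, sizeof(cl_mem), &d_input);
    clSetKernelArg(k_sort, 1, sizeof(cl_mem), &d_indice);
    clSetKernelArg(k_sort, 2, sizeof(cl_mem), &d_output);
    clSetKernelArg(k_sort, 3, sizeof(cl_int), &size);
    clSetKernelArg(k_sort, 4, sizeof(cl_mem), &d_prefixoffsets);
    clSetKernelArg(k_sort, 5, sizeof(cl_mem), &d_l_offsets);
    clEnqueueNDRangeKernel(q, k_sort, 1, NULL, &global_count, &local, 0, NULL, NULL);

    clFinish(q);
}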