mergesort.cl 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. #define DIVISIONS (1024)
  2. float4 sortElem(float4 r) {
  3. float4 nr;
  4. nr.x = (r.x > r.y) ? r.y : r.x;
  5. nr.y = (r.y > r.x) ? r.y : r.x;
  6. nr.z = (r.z > r.w) ? r.w : r.z;
  7. nr.w = (r.w > r.z) ? r.w : r.z;
  8. r.x = (nr.x > nr.z) ? nr.z : nr.x;
  9. r.y = (nr.y > nr.w) ? nr.w : nr.y;
  10. r.z = (nr.z > nr.x) ? nr.z : nr.x;
  11. r.w = (nr.w > nr.y) ? nr.w : nr.y;
  12. nr.x = r.x;
  13. nr.y = (r.y > r.z) ? r.z : r.y;
  14. nr.z = (r.z > r.y) ? r.z : r.y;
  15. nr.w = r.w;
  16. return nr;
  17. }
  18. float4 getLowest(float4 a, float4 b)
  19. {
  20. a.x = (a.x < b.w) ? a.x : b.w;
  21. a.y = (a.y < b.z) ? a.y : b.z;
  22. a.z = (a.z < b.y) ? a.z : b.y;
  23. a.w = (a.w < b.x) ? a.w : b.x;
  24. return a;
  25. }
  26. float4 getHighest(float4 a, float4 b)
  27. {
  28. b.x = (a.w >= b.x) ? a.w : b.x;
  29. b.y = (a.z >= b.y) ? a.z : b.y;
  30. b.z = (a.y >= b.z) ? a.y : b.z;
  31. b.w = (a.x >= b.w) ? a.x : b.w;
  32. return b;
  33. }
  34. __kernel void mergeSortFirst(__global float4 *input,__global float4 *result, const int listsize){
  35. int bx = get_group_id(0);
  36. if(bx*get_local_size(0) + get_local_id(0) < listsize/4){
  37. float4 r = input[bx*get_local_size(0)+ get_local_id(0)];
  38. result[bx * get_local_size(0) + get_local_id(0)] = sortElem(r);
  39. }
  40. }
  41. __kernel void
  42. mergeSortPass(__global float4 *input, __global float4 *result,const int nrElems,int threadsPerDiv, __global int *constStartAddr)
  43. {
  44. int gid = get_global_id(0);
  45. // The division to work on
  46. int division = gid / threadsPerDiv;
  47. if(division >= DIVISIONS) return;
  48. // The block within the division
  49. int int_gid = gid - division * threadsPerDiv;
  50. int Astart = constStartAddr[division] + int_gid * nrElems;
  51. int Bstart = Astart + nrElems/2;
  52. global float4 *resStart;
  53. resStart= &(result[Astart]);
  54. if(Astart >= constStartAddr[division + 1])
  55. return;
  56. if(Bstart >= constStartAddr[division + 1]){
  57. for(int i=0; i<(constStartAddr[division + 1] - Astart); i++)
  58. {
  59. resStart[i] = input[Astart + i];
  60. }
  61. return;
  62. }
  63. int aidx = 0;
  64. int bidx = 0;
  65. int outidx = 0;
  66. float4 a, b;
  67. a = input[Astart + aidx];
  68. b = input[Bstart + bidx];
  69. while(true)//aidx < nrElems/2)// || (bidx < nrElems/2 && (Bstart + bidx < constEndAddr[division])))
  70. {
  71. /**
  72. * For some reason, it's faster to do the texture fetches here than
  73. * after the merge
  74. */
  75. float4 nextA = input[Astart + aidx + 1];
  76. float4 nextB = input[Bstart + bidx + 1];
  77. float4 na = getLowest(a,b);
  78. float4 nb = getHighest(a,b);
  79. a = sortElem(na);
  80. b = sortElem(nb);
  81. // Now, a contains the lowest four elements, sorted
  82. resStart[outidx++] = a;
  83. bool elemsLeftInA;
  84. bool elemsLeftInB;
  85. elemsLeftInA = (aidx + 1 < nrElems/2); // Astart + aidx + 1 is allways less than division border
  86. elemsLeftInB = (bidx + 1 < nrElems/2) && (Bstart + bidx + 1 < constStartAddr[division + 1]);
  87. if(elemsLeftInA){
  88. if(elemsLeftInB){
  89. if(nextA.x < nextB.x) { aidx += 1; a = nextA; }
  90. else { bidx += 1; a = nextB; }
  91. }
  92. else {
  93. aidx += 1; a = nextA;
  94. }
  95. }
  96. else {
  97. if(elemsLeftInB){
  98. bidx += 1; a = nextB;
  99. }
  100. else {
  101. break;
  102. }
  103. }
  104. }
  105. resStart[outidx++] = b;
  106. }
  107. __kernel void
  108. mergepack(__global float *orig, __global float *result, __constant int *constStartAddr, __constant int *nullElems, __constant int *finalStartAddr)
  109. {
  110. int idx = get_global_id(0);
  111. int division = get_group_id(1);
  112. if((finalStartAddr[division] + idx) >= finalStartAddr[division + 1]) return;
  113. result[finalStartAddr[division] + idx] = orig[constStartAddr[division]*4 + nullElems[division] + idx];
  114. }