123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- #define DIVISIONS (1024)
- float4 sortElem(float4 r) {
- float4 nr;
-
- nr.x = (r.x > r.y) ? r.y : r.x;
- nr.y = (r.y > r.x) ? r.y : r.x;
- nr.z = (r.z > r.w) ? r.w : r.z;
- nr.w = (r.w > r.z) ? r.w : r.z;
-
- r.x = (nr.x > nr.z) ? nr.z : nr.x;
- r.y = (nr.y > nr.w) ? nr.w : nr.y;
- r.z = (nr.z > nr.x) ? nr.z : nr.x;
- r.w = (nr.w > nr.y) ? nr.w : nr.y;
-
- nr.x = r.x;
- nr.y = (r.y > r.z) ? r.z : r.y;
- nr.z = (r.z > r.y) ? r.z : r.y;
- nr.w = r.w;
- return nr;
- }
- float4 getLowest(float4 a, float4 b)
- {
- a.x = (a.x < b.w) ? a.x : b.w;
- a.y = (a.y < b.z) ? a.y : b.z;
- a.z = (a.z < b.y) ? a.z : b.y;
- a.w = (a.w < b.x) ? a.w : b.x;
- return a;
- }
- float4 getHighest(float4 a, float4 b)
- {
- b.x = (a.w >= b.x) ? a.w : b.x;
- b.y = (a.z >= b.y) ? a.z : b.y;
- b.z = (a.y >= b.z) ? a.y : b.z;
- b.w = (a.x >= b.w) ? a.x : b.w;
- return b;
- }
- __kernel void mergeSortFirst(__global float4 *input,__global float4 *result, const int listsize){
- int bx = get_group_id(0);
-
- if(bx*get_local_size(0) + get_local_id(0) < listsize/4){
- float4 r = input[bx*get_local_size(0)+ get_local_id(0)];
- result[bx * get_local_size(0) + get_local_id(0)] = sortElem(r);
- }
- }
- __kernel void
- mergeSortPass(__global float4 *input, __global float4 *result,const int nrElems,int threadsPerDiv, __global int *constStartAddr)
- {
- int gid = get_global_id(0);
- // The division to work on
- int division = gid / threadsPerDiv;
- if(division >= DIVISIONS) return;
- // The block within the division
- int int_gid = gid - division * threadsPerDiv;
- int Astart = constStartAddr[division] + int_gid * nrElems;
-
- int Bstart = Astart + nrElems/2;
- global float4 *resStart;
- resStart= &(result[Astart]);
-
- if(Astart >= constStartAddr[division + 1])
- return;
- if(Bstart >= constStartAddr[division + 1]){
- for(int i=0; i<(constStartAddr[division + 1] - Astart); i++)
- {
- resStart[i] = input[Astart + i];
- }
- return;
- }
-
- int aidx = 0;
- int bidx = 0;
- int outidx = 0;
- float4 a, b;
- a = input[Astart + aidx];
- b = input[Bstart + bidx];
-
- while(true)//aidx < nrElems/2)// || (bidx < nrElems/2 && (Bstart + bidx < constEndAddr[division])))
- {
- /**
- * For some reason, it's faster to do the texture fetches here than
- * after the merge
- */
- float4 nextA = input[Astart + aidx + 1];
- float4 nextB = input[Bstart + bidx + 1];
-
- float4 na = getLowest(a,b);
- float4 nb = getHighest(a,b);
- a = sortElem(na);
- b = sortElem(nb);
- // Now, a contains the lowest four elements, sorted
- resStart[outidx++] = a;
-
- bool elemsLeftInA;
- bool elemsLeftInB;
-
- elemsLeftInA = (aidx + 1 < nrElems/2); // Astart + aidx + 1 is allways less than division border
- elemsLeftInB = (bidx + 1 < nrElems/2) && (Bstart + bidx + 1 < constStartAddr[division + 1]);
-
- if(elemsLeftInA){
- if(elemsLeftInB){
- if(nextA.x < nextB.x) { aidx += 1; a = nextA; }
- else { bidx += 1; a = nextB; }
- }
- else {
- aidx += 1; a = nextA;
- }
- }
- else {
- if(elemsLeftInB){
- bidx += 1; a = nextB;
- }
- else {
- break;
- }
- }
-
- }
- resStart[outidx++] = b;
- }
- __kernel void
- mergepack(__global float *orig, __global float *result, __constant int *constStartAddr, __constant int *nullElems, __constant int *finalStartAddr)
- {
- int idx = get_global_id(0);
- int division = get_group_id(1);
-
- if((finalStartAddr[division] + idx) >= finalStartAddr[division + 1]) return;
- result[finalStartAddr[division] + idx] = orig[constStartAddr[division]*4 + nullElems[division] + idx];
- }
|