#define DIVISIONS               (1 << 10)
#define LOG_DIVISIONS	(10)
#define BUCKET_WARP_LOG_SIZE	(5)
#define BUCKET_WARP_N			(1)
#ifdef BUCKET_WG_SIZE_1
#define BUCKET_THREAD_N BUCKET_WG_SIZE_1
#else
#define BUCKET_THREAD_N			(BUCKET_WARP_N << BUCKET_WARP_LOG_SIZE)
#endif
#define BUCKET_BLOCK_MEMORY		(DIVISIONS * BUCKET_WARP_N)
#define BUCKET_BAND				(128)


int addOffset(volatile __local uint *s_offset, uint data, uint threadTag){
    uint count;

    do{
        count = s_offset[data] & 0x07FFFFFFU;
        count = threadTag | (count + 1);
        s_offset[data] = count;
    }while(s_offset[data] != count);

    return (count & 0x07FFFFFFU) - 1;
}

__kernel void
bucketcount( global float *input, global int *indice, global uint *d_prefixoffsets, const int size, global float *l_pivotpoints)
{
    
	volatile __local uint s_offset[BUCKET_BLOCK_MEMORY];
    
    const uint threadTag = get_local_id(0) << (32 - BUCKET_WARP_LOG_SIZE);
    const int warpBase = (get_local_id(0) >> BUCKET_WARP_LOG_SIZE) * DIVISIONS;
    const int numThreads = get_global_size(0);
	for (int i = get_local_id(0); i < BUCKET_BLOCK_MEMORY; i += get_local_size(0))
		s_offset[i] = 0;
    
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    
	for (int tid = get_global_id(0); tid < size; tid += numThreads) {
		float elem = input[tid];
        
		int idx  = DIVISIONS/2 - 1;
		int jump = DIVISIONS/4;
		float piv = l_pivotpoints[idx]; //s_pivotpoints[idx];
        
		while(jump >= 1){
			idx = (elem < piv) ? (idx - jump) : (idx + jump);
			piv = l_pivotpoints[idx]; //s_pivotpoints[idx];
			jump /= 2;
		}
		idx = (elem < piv) ? idx : (idx + 1);
        
		indice[tid] = (addOffset(s_offset + warpBase, idx, threadTag) << LOG_DIVISIONS) + idx;  //atomicInc(&offsets[idx], size + 1);
	}
    
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    
	int prefixBase = get_group_id(0) * BUCKET_BLOCK_MEMORY;
    
	for (int i = get_local_id(0); i < BUCKET_BLOCK_MEMORY; i += get_local_size(0))
		d_prefixoffsets[prefixBase + i] = s_offset[i] & 0x07FFFFFFU;
}

__kernel void bucketprefixoffset(global uint *d_prefixoffsets, global uint *d_offsets, const int blocks) {
	int tid = get_global_id(0);
	int size = blocks * BUCKET_BLOCK_MEMORY;
	int sum = 0;
    
	for (int i = tid; i < size; i += DIVISIONS) {
		int x = d_prefixoffsets[i];
		d_prefixoffsets[i] = sum;
		sum += x;
	}
    
	d_offsets[tid] = sum;
}

__kernel void
bucketsort(global float *input, global int *indice, __global float *output, const int size, global uint *d_prefixoffsets,
		   global uint *l_offsets)
{
	volatile __local unsigned int s_offset[BUCKET_BLOCK_MEMORY];
    
	int prefixBase = get_group_id(0) * BUCKET_BLOCK_MEMORY;
    const int warpBase = (get_local_id(0) >> BUCKET_WARP_LOG_SIZE) * DIVISIONS;
    const int numThreads = get_global_size(0);
    
	for (int i = get_local_id(0); i < BUCKET_BLOCK_MEMORY; i += get_local_size(0)){
		s_offset[i] = l_offsets[i & (DIVISIONS - 1)] + d_prefixoffsets[prefixBase + i];
    }
    
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

	for (int tid = get_global_id(0); tid < size; tid += numThreads) {
       
		float elem = input[tid];
		int id = indice[tid];
		output[s_offset[warpBase + (id & (DIVISIONS - 1))] + (id >> LOG_DIVISIONS)] = elem;
        int test = s_offset[warpBase + (id & (DIVISIONS - 1))] + (id >> LOG_DIVISIONS);
//        if(test == 2) {
//            printf("EDLLAWD %f", elem);
//        }
	}
}