/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


////////////////////////////////////////////////////////////////////////////////
// Common definition
////////////////////////////////////////////////////////////////////////////////
//Total number of possible data values
#define BIN_COUNT (1024) // Changed from 256
#define HISTOGRAM_SIZE (BIN_COUNT * sizeof(unsigned int))
//Machine warp size
#ifndef __DEVICE_EMULATION__
//G80's warp size is 32 threads
#define WARP_LOG_SIZE 5
#else
//Emulation currently doesn't execute threads in coherent groups of 32 threads,
//which effectively means warp size of 1 thread for emulation modes
#define WARP_LOG_SIZE 0
#endif
//Warps in thread block
#define  WARP_N 3
//Per-block number of elements in histograms
#define BLOCK_MEMORY (WARP_N * BIN_COUNT)
#define IMUL(a, b) mul24(a, b)

////////////////////////////////////////////////////////////////////////////////
// Main computation pass: compute per-workgroup partial histograms
////////////////////////////////////////////////////////////////////////////////

inline void addData1024(volatile __local uint *s_WarpHist, uint data, uint tag){
    uint count;
    do{
        count = s_WarpHist[data]  & 0x07FFFFFFU;
        count = tag | (count + 1);
        s_WarpHist[data] = count;
    }while(s_WarpHist[data] != count);
}

 __kernel void histogram1024Kernel(
                  __global uint *d_Result,
                  __global float *d_Data,
                     float minimum,
                     float maximum,
                    uint dataCount
                  ){
    const int gid = get_global_id(0);
    const int gsize = get_global_size(0);
    //Per-warp substorage storage
     int mulBase = (get_local_id(0) >> WARP_LOG_SIZE);
     const int warpBase = IMUL(mulBase, BIN_COUNT);
    __local unsigned int s_Hist[BLOCK_MEMORY];
     int test = 0;
     
//     if(get_global_id(0) == 0) {
//     for(int i = 0; i < 1024; i++) {
//             d_Result[i] = 0;
//         }
//     }
     const uint tag =  get_local_id(0) << (32 - WARP_LOG_SIZE);
    //Clear shared memory storage for current threadblock before processing
     for(uint i = get_local_id(0); i < BLOCK_MEMORY; i+=get_local_size(0)){
        s_Hist[i] = 0;
 }

    
    //Read through the entire input buffer, build per-warp histograms
     barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    for(int pos = get_global_id(0); pos < dataCount; pos += get_global_size(0)){
        uint data4 = ((d_Data[pos] - minimum)/(maximum - minimum)) * BIN_COUNT;
        addData1024(s_Hist + warpBase, data4 & 0x3FFU, tag);
    }
    
    //Per-block histogram reduction
     // Sum is adding to index 0, pls fix
     barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    for(int pos = get_local_id(0); pos < BIN_COUNT; pos += get_local_size(0)){
        uint sum = 0;
        for(int i = 0; i < BLOCK_MEMORY; i+= BIN_COUNT){
            sum += s_Hist[pos + i] & 0x07FFFFFFU;
        }
        atomic_add(d_Result+pos,sum);
    }
     
     
}