histogram1024.cl 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. /*
  2. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
  3. *
  4. * NVIDIA Corporation and its licensors retain all intellectual property and
  5. * proprietary rights in and to this software and related documentation.
  6. * Any use, reproduction, disclosure, or distribution of this software
  7. * and related documentation without an express license agreement from
  8. * NVIDIA Corporation is strictly prohibited.
  9. *
  10. * Please refer to the applicable NVIDIA end user license agreement (EULA)
  11. * associated with this source code for terms and conditions that govern
  12. * your use of this NVIDIA software.
  13. *
  14. */
  15. ////////////////////////////////////////////////////////////////////////////////
  16. // Common definition
  17. ////////////////////////////////////////////////////////////////////////////////
  18. //Total number of possible data values
  19. #define BIN_COUNT (1024) // Changed from 256
  20. #define HISTOGRAM_SIZE (BIN_COUNT * sizeof(unsigned int))
  21. //Machine warp size
  22. #ifndef __DEVICE_EMULATION__
  23. //G80's warp size is 32 threads
  24. #define WARP_LOG_SIZE 5
  25. #else
  26. //Emulation currently doesn't execute threads in coherent groups of 32 threads,
  27. //which effectively means warp size of 1 thread for emulation modes
  28. #define WARP_LOG_SIZE 0
  29. #endif
  30. //Warps in thread block
  31. #define WARP_N 3
  32. //Per-block number of elements in histograms
  33. #define BLOCK_MEMORY (WARP_N * BIN_COUNT)
  34. #define IMUL(a, b) mul24(a, b)
  35. ////////////////////////////////////////////////////////////////////////////////
  36. // Main computation pass: compute per-workgroup partial histograms
  37. ////////////////////////////////////////////////////////////////////////////////
  38. inline void addData1024(volatile __local uint *s_WarpHist, uint data, uint tag){
  39. uint count;
  40. do{
  41. count = s_WarpHist[data] & 0x07FFFFFFU;
  42. count = tag | (count + 1);
  43. s_WarpHist[data] = count;
  44. }while(s_WarpHist[data] != count);
  45. }
  46. __kernel void histogram1024Kernel(
  47. __global uint *d_Result,
  48. __global float *d_Data,
  49. float minimum,
  50. float maximum,
  51. uint dataCount
  52. ){
  53. const int gid = get_global_id(0);
  54. const int gsize = get_global_size(0);
  55. //Per-warp substorage storage
  56. int mulBase = (get_local_id(0) >> WARP_LOG_SIZE);
  57. const int warpBase = IMUL(mulBase, BIN_COUNT);
  58. __local unsigned int s_Hist[BLOCK_MEMORY];
  59. int test = 0;
  60. // if(get_global_id(0) == 0) {
  61. // for(int i = 0; i < 1024; i++) {
  62. // d_Result[i] = 0;
  63. // }
  64. // }
  65. const uint tag = get_local_id(0) << (32 - WARP_LOG_SIZE);
  66. //Clear shared memory storage for current threadblock before processing
  67. for(uint i = get_local_id(0); i < BLOCK_MEMORY; i+=get_local_size(0)){
  68. s_Hist[i] = 0;
  69. }
  70. //Read through the entire input buffer, build per-warp histograms
  71. barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
  72. for(int pos = get_global_id(0); pos < dataCount; pos += get_global_size(0)){
  73. uint data4 = ((d_Data[pos] - minimum)/(maximum - minimum)) * BIN_COUNT;
  74. addData1024(s_Hist + warpBase, data4 & 0x3FFU, tag);
  75. }
  76. //Per-block histogram reduction
  77. // Sum is adding to index 0, pls fix
  78. barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
  79. for(int pos = get_local_id(0); pos < BIN_COUNT; pos += get_local_size(0)){
  80. uint sum = 0;
  81. for(int i = 0; i < BLOCK_MEMORY; i+= BIN_COUNT){
  82. sum += s_Hist[pos + i] & 0x07FFFFFFU;
  83. }
  84. atomic_add(d_Result+pos,sum);
  85. }
  86. }