reduce_kernel.c

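// reduce: one pass of a two-array parallel reduction. Each block tree-sums up to NUMBER_THREADS
// consecutive elements of d_sums and d_sums2 (read at stride d_mul) in shared memory and writes
// its per-block total back to global memory at index bx*d_mul*NUMBER_THREADS. The host is expected
// to launch the kernel repeatedly, shrinking d_no and growing d_mul, until a single sum remains
// (see the usage sketch after the kernel). fp and NUMBER_THREADS are assumed to be defined
// elsewhere in the project (typically a float/double typedef and the block size).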
// statistical kernel
__global__ void reduce( long d_Ne,      // number of elements in array (unused in the kernel body)
                        int d_no,       // number of sums to reduce
                        int d_mul,      // stride between live elements in d_sums/d_sums2
                        fp *d_sums,     // pointer to partial sums variable (DEVICE GLOBAL MEMORY)
                        fp *d_sums2){

    // indexes
    int bx = blockIdx.x;                // current block index
    int tx = threadIdx.x;               // current thread index within the block
    int ei = (bx*NUMBER_THREADS)+tx;    // unique thread id; the grid may contain more threads than elements
    int nf = NUMBER_THREADS-(gridDim.x*NUMBER_THREADS-d_no);    // number of elements assigned to the last block
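    // (the grid overshoots d_no by gridDim.x*NUMBER_THREADS-d_no threads, all of them in the
    //  last block, so the last block holds exactly nf valid elements)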
    int df = 0;                         // divisibility factor for the last block

    // statistical
    __shared__ fp d_psum[NUMBER_THREADS];   // per-block partial sums, allocated in shared memory
    __shared__ fp d_psum2[NUMBER_THREADS];

    // counters
    int i;

    // copy data to shared memory
    if(ei<d_no){                        // only for valid elements; surplus threads skip the load
        d_psum[tx] = d_sums[ei*d_mul];
        d_psum2[tx] = d_sums2[ei*d_mul];
    }
    __syncthreads();                    // all loads must be visible before any thread reads a neighbor's slot
    // reduction of sums if all blocks are full (rare case)
    if(nf == NUMBER_THREADS){
        // sum of every 2, 4, ..., NUMBER_THREADS elements
        for(i=2; i<=NUMBER_THREADS; i=2*i){
            // sum of elements
            if((tx+1) % i == 0){        // every i-th thread adds in its neighbor i/2 slots to the left
                d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
                d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
            }
            // synchronization
            __syncthreads();
        }
        // final summation by the last thread in every block
        if(tx==(NUMBER_THREADS-1)){     // block result stored in global memory
            d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
            d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
        }
    }
    // reduction of sums if the last block is not full (common case)
    else{
        // full blocks (all except the last block)
        if(bx != (gridDim.x - 1)){
            // sum of every 2, 4, ..., NUMBER_THREADS elements
            for(i=2; i<=NUMBER_THREADS; i=2*i){
                // sum of elements
                if((tx+1) % i == 0){    // every i-th thread
                    d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
                    d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
                }
                // synchronization
                __syncthreads();
            }
            // final summation by the last thread in every block
            if(tx==(NUMBER_THREADS-1)){ // block result stored in global memory
                d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
                d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
            }
        }
        // the partially filled last block
        else{
            // figure out divisibility: df ends up as the largest power of two <= nf
            for(i=2; i<=NUMBER_THREADS; i=2*i){
                if(nf >= i){
                    df = i;
                }
            }
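            // (e.g., with NUMBER_THREADS = 512 and nf = 300, df = 256; the 44 leftover
            //  elements beyond the power-of-two prefix are added serially below)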
            // sum of every 2, 4, ..., df elements
            for(i=2; i<=df; i=2*i){
                // sum of elements (only the first df threads are busy)
                if((tx+1) % i == 0 && tx<df){   // every i-th busy thread
                    d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
                    d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
                }
                // synchronization (all threads)
                __syncthreads();
            }
            // remainder / final summation by the last busy thread
            if(tx==(df-1)){
                // fold in the nf-df remainder elements, reading at the same stride as the load
                for(i=(bx*NUMBER_THREADS)+df; i<(bx*NUMBER_THREADS)+nf; i++){
                    d_psum[tx] = d_psum[tx] + d_sums[i*d_mul];
                    d_psum2[tx] = d_psum2[tx] + d_sums2[i*d_mul];
                }
                // block result stored in global memory
                d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
                d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
            }
        }
    }
}
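
// Usage sketch (illustrative, not part of the original file): one plausible host-side loop
// driving the kernel above. The function name reduce_host and the initial values of no/mul
// are assumptions for illustration; the real harness that ships with this kernel may differ.
void reduce_host(long Ne, fp *d_sums, fp *d_sums2){
    int no = (int)Ne;                                   // partial sums still to be reduced
    int mul = 1;                                        // stride between live partial sums
    while(no > 1){
        int blocks = (no + NUMBER_THREADS - 1) / NUMBER_THREADS;    // ceil(no / NUMBER_THREADS)
        reduce<<<blocks, NUMBER_THREADS>>>(Ne, no, mul, d_sums, d_sums2);
        no = blocks;                                    // each block leaves one partial sum ...
        mul = mul * NUMBER_THREADS;                     // ... spaced mul*NUMBER_THREADS apart
    }
    // after the loop the grand totals sit in d_sums[0] and d_sums2[0]
}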