hotspot_kernel.cl 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. #define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
  2. __kernel void hotspot( int iteration, //number of iteration
  3. global float *power, //power input
  4. global float *temp_src, //temperature input/output
  5. global float *temp_dst, //temperature input/output
  6. int grid_cols, //Col of grid
  7. int grid_rows, //Row of grid
  8. int border_cols, // border offset
  9. int border_rows, // border offset
  10. float Cap, //Capacitance
  11. float Rx,
  12. float Ry,
  13. float Rz,
  14. float step) {
  15. local float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  16. local float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  17. local float temp_t[BLOCK_SIZE][BLOCK_SIZE]; // saving temporary temperature result
  18. float amb_temp = 80.0f;
  19. float step_div_Cap;
  20. float Rx_1,Ry_1,Rz_1;
  21. int bx = get_group_id(0);
  22. int by = get_group_id(1);
  23. int tx = get_local_id(0);
  24. int ty = get_local_id(1);
  25. step_div_Cap=step/Cap;
  26. Rx_1=1/Rx;
  27. Ry_1=1/Ry;
  28. Rz_1=1/Rz;
  29. // each block finally computes result for a small block
  30. // after N iterations.
  31. // it is the non-overlapping small blocks that cover
  32. // all the input data
  33. // calculate the small block size
  34. int small_block_rows = BLOCK_SIZE-iteration*2;//EXPAND_RATE
  35. int small_block_cols = BLOCK_SIZE-iteration*2;//EXPAND_RATE
  36. // calculate the boundary for the block according to
  37. // the boundary of its small block
  38. int blkY = small_block_rows*by-border_rows;
  39. int blkX = small_block_cols*bx-border_cols;
  40. int blkYmax = blkY+BLOCK_SIZE-1;
  41. int blkXmax = blkX+BLOCK_SIZE-1;
  42. // calculate the global thread coordination
  43. int yidx = blkY+ty;
  44. int xidx = blkX+tx;
  45. // load data if it is within the valid input range
  46. int loadYidx=yidx, loadXidx=xidx;
  47. int index = grid_cols*loadYidx+loadXidx;
  48. if(IN_RANGE(loadYidx, 0, grid_rows-1) && IN_RANGE(loadXidx, 0, grid_cols-1)){
  49. temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from global memory to shared memory
  50. power_on_cuda[ty][tx] = power[index];// Load the power data from global memory to shared memory
  51. }
  52. barrier(CLK_LOCAL_MEM_FENCE);
  53. // effective range within this block that falls within
  54. // the valid range of the input data
  55. // used to rule out computation outside the boundary.
  56. int validYmin = (blkY < 0) ? -blkY : 0;
  57. int validYmax = (blkYmax > grid_rows-1) ? BLOCK_SIZE-1-(blkYmax-grid_rows+1) : BLOCK_SIZE-1;
  58. int validXmin = (blkX < 0) ? -blkX : 0;
  59. int validXmax = (blkXmax > grid_cols-1) ? BLOCK_SIZE-1-(blkXmax-grid_cols+1) : BLOCK_SIZE-1;
  60. int N = ty-1;
  61. int S = ty+1;
  62. int W = tx-1;
  63. int E = tx+1;
  64. N = (N < validYmin) ? validYmin : N;
  65. S = (S > validYmax) ? validYmax : S;
  66. W = (W < validXmin) ? validXmin : W;
  67. E = (E > validXmax) ? validXmax : E;
  68. bool computed;
  69. for (int i=0; i<iteration ; i++){
  70. computed = false;
  71. if( IN_RANGE(tx, i+1, BLOCK_SIZE-i-2) && \
  72. IN_RANGE(ty, i+1, BLOCK_SIZE-i-2) && \
  73. IN_RANGE(tx, validXmin, validXmax) && \
  74. IN_RANGE(ty, validYmin, validYmax) ) {
  75. computed = true;
  76. temp_t[ty][tx] = temp_on_cuda[ty][tx] + step_div_Cap * (power_on_cuda[ty][tx] +
  77. (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] - 2.0f * temp_on_cuda[ty][tx]) * Ry_1 +
  78. (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] - 2.0f * temp_on_cuda[ty][tx]) * Rx_1 +
  79. (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
  80. }
  81. barrier(CLK_LOCAL_MEM_FENCE);
  82. if(i==iteration-1)
  83. break;
  84. if(computed) //Assign the computation range
  85. temp_on_cuda[ty][tx]= temp_t[ty][tx];
  86. barrier(CLK_LOCAL_MEM_FENCE);
  87. }
  88. // update the global memory
  89. // after the last iteration, only threads coordinated within the
  90. // small block perform the calculation and switch on ``computed''
  91. if (computed){
  92. temp_dst[index]= temp_t[ty][tx];
  93. }
  94. }