kernels.cl 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. #define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
  2. #define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x )
  3. #define MIN(a, b) ((a)<=(b) ? (a) : (b))
  4. __kernel void dynproc_kernel (int iteration,
  5. __global int* gpuWall,
  6. __global int* gpuSrc,
  7. __global int* gpuResults,
  8. int cols,
  9. int rows,
  10. int startStep,
  11. int border,
  12. int HALO,
  13. __local int* prev,
  14. __local int* result,
  15. __global int* outputBuffer)
  16. {
  17. int BLOCK_SIZE = get_local_size(0);
  18. int bx = get_group_id(0);
  19. int tx = get_local_id(0);
  20. // Each block finally computes result for a small block
  21. // after N iterations.
  22. // it is the non-overlapping small blocks that cover
  23. // all the input data
  24. // calculate the small block size.
  25. int small_block_cols = BLOCK_SIZE - (iteration*HALO*2);
  26. // calculate the boundary for the block according to
  27. // the boundary of its small block
  28. int blkX = (small_block_cols*bx) - border;
  29. int blkXmax = blkX+BLOCK_SIZE-1;
  30. // calculate the global thread coordination
  31. int xidx = blkX+tx;
  32. // effective range within this block that falls within
  33. // the valid range of the input data
  34. // used to rule out computation outside the boundary.
  35. int validXmin = (blkX < 0) ? -blkX : 0;
  36. int validXmax = (blkXmax > cols-1) ? BLOCK_SIZE-1-(blkXmax-cols+1) : BLOCK_SIZE-1;
  37. int W = tx-1;
  38. int E = tx+1;
  39. W = (W < validXmin) ? validXmin : W;
  40. E = (E > validXmax) ? validXmax : E;
  41. bool isValid = IN_RANGE(tx, validXmin, validXmax);
  42. if(IN_RANGE(xidx, 0, cols-1))
  43. {
  44. prev[tx] = gpuSrc[xidx];
  45. }
  46. barrier(CLK_LOCAL_MEM_FENCE);
  47. bool computed;
  48. for (int i = 0; i < iteration; i++)
  49. {
  50. computed = false;
  51. if( IN_RANGE(tx, i+1, BLOCK_SIZE-i-2) && isValid )
  52. {
  53. computed = true;
  54. int left = prev[W];
  55. int up = prev[tx];
  56. int right = prev[E];
  57. int shortest = MIN(left, up);
  58. shortest = MIN(shortest, right);
  59. int index = cols*(startStep+i)+xidx;
  60. result[tx] = shortest + gpuWall[index];
  61. // ===================================================================
  62. // add debugging info to the debug output buffer...
  63. if (tx==11 && i==0)
  64. {
  65. // set bufIndex to what value/range of values you want to know.
  66. int bufIndex = gpuSrc[xidx];
  67. // dont touch the line below.
  68. outputBuffer[bufIndex] = 1;
  69. }
  70. // ===================================================================
  71. }
  72. barrier(CLK_LOCAL_MEM_FENCE);
  73. if(i==iteration-1)
  74. {
  75. // we are on the last iteration, and thus don't need to
  76. // compute for the next step.
  77. break;
  78. }
  79. if(computed)
  80. {
  81. //Assign the computation range
  82. prev[tx] = result[tx];
  83. }
  84. barrier(CLK_LOCAL_MEM_FENCE);
  85. }
  86. // update the global memory
  87. // after the last iteration, only threads coordinated within the
  88. // small block perform the calculation and switch on "computed"
  89. if (computed)
  90. {
  91. gpuResults[xidx] = result[tx];
  92. }
  93. }