123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
/* True when min <= x <= max (inclusive on both ends). Evaluates x twice. */
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))

/*
 * Clamps the lvalue x into [min, max] in place and yields the clamped value.
 * Every argument and the whole expansion are parenthesized so the macro can
 * be embedded in a larger expression without the surrounding operators
 * binding into the inner ternary. Arguments are evaluated more than once,
 * so do not pass expressions with side effects.
 */
#define CLAMP_RANGE(x, min, max) \
    ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))

/* Smaller of a and b; returns a when they compare equal. Evaluates one argument twice. */
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
/*
 * dynproc_kernel
 *
 * Each work-item loads one element of gpuSrc into local memory, then for
 * `iteration` steps replaces it with the minimum of its left, own and right
 * neighbour from the previous step plus the matching gpuWall entry. After
 * the last step the work-items that computed in that step store their value
 * to gpuResults.
 *
 * Parameters:
 *   iteration    number of steps this launch performs; when it is <= 0 the
 *                loop body never runs and nothing is written to gpuResults
 *   gpuWall      cost array, read at cols*(startStep+i)+xidx for step i
 *   gpuSrc       input row, read at the global column xidx
 *   gpuResults   output row, written at the global column xidx
 *   cols         number of valid columns; xidx outside [0, cols-1] is ignored
 *   rows         not used by this kernel
 *   startStep    row offset added to the step index when addressing gpuWall
 *   border       subtracted from the work-group's first column (blkX)
 *   HALO         per-step halo width; each step trims HALO*2 columns from
 *                the work-group's final output width
 *   prev         local scratch, indexed by local id (needs >= local size ints)
 *   result       local scratch, indexed by local id (needs >= local size ints)
 *   outputBuffer debug buffer, see the debug section in the loop
 */
__kernel void dynproc_kernel (int iteration,
                              __global int* gpuWall,
                              __global int* gpuSrc,
                              __global int* gpuResults,
                              int cols,
                              int rows,
                              int startStep,
                              int border,
                              int HALO,
                              __local int* prev,
                              __local int* result,
                              __global int* outputBuffer)
{
    int BLOCK_SIZE = get_local_size(0);
    int bx = get_group_id(0);
    int tx = get_local_id(0);

    // Each block finally computes the result for a small block after N
    // iterations. It is the non-overlapping small blocks that cover all
    // the input data.

    // Calculate the small block size.
    int small_block_cols = BLOCK_SIZE - (iteration * HALO * 2);

    // Calculate the boundary for the block according to the boundary of
    // its small block.
    int blkX = (small_block_cols * bx) - border;
    int blkXmax = blkX + BLOCK_SIZE - 1;

    // Calculate the global column handled by this work-item.
    int xidx = blkX + tx;

    // Effective range within this block that falls within the valid range
    // of the input data; used to rule out computation outside the boundary.
    int validXmin = (blkX < 0) ? -blkX : 0;
    int validXmax = (blkXmax > cols - 1)
                        ? BLOCK_SIZE - 1 - (blkXmax - cols + 1)
                        : BLOCK_SIZE - 1;

    // Neighbour indices, clamped so edge work-items reuse the nearest
    // valid element instead of reading outside the valid range.
    int W = tx - 1;
    int E = tx + 1;
    W = (W < validXmin) ? validXmin : W;
    E = (E > validXmax) ? validXmax : E;

    bool isValid = IN_RANGE(tx, validXmin, validXmax);

    if (IN_RANGE(xidx, 0, cols - 1))
    {
        prev[tx] = gpuSrc[xidx];
    }

    // Make every prev[] load visible to the whole work-group.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Initialized so that the read after the loop is well defined even
    // when the loop body never executes (iteration <= 0).
    bool computed = false;

    for (int i = 0; i < iteration; i++)
    {
        computed = false;

        // The computing range shrinks by one element per side each step.
        if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) && isValid)
        {
            computed = true;

            int left = prev[W];
            int up = prev[tx];
            int right = prev[E];
            int shortest = MIN(left, up);
            shortest = MIN(shortest, right);

            int index = cols * (startStep + i) + xidx;
            result[tx] = shortest + gpuWall[index];

            // ===============================================================
            // add debugging info to the debug output buffer...
            // NOTE(review): bufIndex comes straight from gpuSrc and is not
            // range-checked here; the caller must size outputBuffer to cover
            // every value gpuSrc can hold - confirm against the host code.
            if (tx == 11 && i == 0)
            {
                // set bufIndex to what value/range of values you want to know.
                int bufIndex = gpuSrc[xidx];
                // dont touch the line below.
                outputBuffer[bufIndex] = 1;
            }
            // ===============================================================
        }

        // All reads of prev[] for this step must finish before it is
        // overwritten below.
        barrier(CLK_LOCAL_MEM_FENCE);

        if (i == iteration - 1)
        {
            // We are on the last iteration, and thus don't need to prepare
            // prev[] for a next step. i and iteration are the same for every
            // work-item, so the whole work-group leaves the loop together.
            break;
        }

        if (computed)
        {
            // Assign the computation range.
            prev[tx] = result[tx];
        }

        // Publish the updated prev[] before the next step reads it.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Update the global memory. After the last iteration, only work-items
    // located within the small block performed the calculation and
    // switched on "computed".
    if (computed)
    {
        gpuResults[xidx] = result[tx];
    }
}
|