123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346 |
- //========================================================================================================================================================================================================200
- // DEFINE / INCLUDE
- //========================================================================================================================================================================================================200
- //======================================================================================================================================================150
- // MAIN FUNCTION HEADER
- //======================================================================================================================================================150
- #include "./main.h"
- //======================================================================================================================================================150
- // End
- //======================================================================================================================================================150
- //========================================================================================================================================================================================================200
- // Extract KERNEL
- //========================================================================================================================================================================================================200
- __kernel void
- extract_kernel(long d_Ne,
- __global fp* d_I){ // pointer to input image (DEVICE GLOBAL MEMORY)
- // indexes
- int bx = get_group_id(0); // get current horizontal block index
- int tx = get_local_id(0); // get current horizontal thread index
- int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!!
- // copy input to output & log uncompress
- if(ei<d_Ne){ // do only for the number of elements, omit extra threads
- d_I[ei] = exp(d_I[ei]/255); // exponentiate input IMAGE and copy to output image
- }
- }
- //========================================================================================================================================================================================================200
- // Prepare KERNEL
- //========================================================================================================================================================================================================200
- __kernel void
- prepare_kernel( long d_Ne,
- __global fp* d_I, // pointer to output image (DEVICE GLOBAL MEMORY)
- __global fp* d_sums, // pointer to input image (DEVICE GLOBAL MEMORY)
- __global fp* d_sums2){
- // indexes
- int bx = get_group_id(0); // get current horizontal block index
- int tx = get_local_id(0); // get current horizontal thread index
- int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!!
- // copy input to output & log uncompress
- if(ei<d_Ne){ // do only for the number of elements, omit extra threads
- d_sums[ei] = d_I[ei];
- d_sums2[ei] = d_I[ei]*d_I[ei];
- }
- }
- //========================================================================================================================================================================================================200
- // Reduce KERNEL
- //========================================================================================================================================================================================================200
- __kernel void
- reduce_kernel( long d_Ne, // number of elements in array
- long d_no, // number of sums to reduce
- int d_mul, // increment
- __global fp* d_sums, // pointer to partial sums variable (DEVICE GLOBAL MEMORY)
- __global fp* d_sums2,
- int gridDim){
- // indexes
- int bx = get_group_id(0); // get current horizontal block index
- int tx = get_local_id(0); // get current horizontal thread index
- int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!!
- // int gridDim = (int)get_group_size(0)/(int)get_local_size(0); // number of workgroups
- int nf = NUMBER_THREADS-(gridDim*NUMBER_THREADS-d_no); // number of elements assigned to last block
- int df = 0; // divisibility factor for the last block
- // statistical
- __local fp d_psum[NUMBER_THREADS]; // data for block calculations allocated by every block in its shared memory
- __local fp d_psum2[NUMBER_THREADS];
- // counters
- int i;
- // copy data to shared memory
- if(ei<d_no){ // do only for the number of elements, omit extra threads
- d_psum[tx] = d_sums[ei*d_mul];
- d_psum2[tx] = d_sums2[ei*d_mul];
- }
- // Lingjie Zhang modificated at Nov 1, 2015
- // barrier(CLK_LOCAL_MEM_FENCE);
- barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); // Lukasz proposed, Ke modified 2015/12/12 22:31:00
- // end Lingjie Zhang modification
- // reduction of sums if all blocks are full (rare case)
- if(nf == NUMBER_THREADS){
- // sum of every 2, 4, ..., NUMBER_THREADS elements
- for(i=2; i<=NUMBER_THREADS; i=2*i){
- // sum of elements
- if((tx+1) % i == 0){ // every ith
- d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
- d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
- }
- // synchronization
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- // final sumation by last thread in every block
- if(tx==(NUMBER_THREADS-1)){ // block result stored in global memory
- d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
- d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
- }
- }
- // reduction of sums if last block is not full (common case)
- else{
- // for full blocks (all except for last block)
- if(bx != (gridDim - 1)){ //
- // sum of every 2, 4, ..., NUMBER_THREADS elements
- for(i=2; i<=NUMBER_THREADS; i=2*i){ //
- // sum of elements
- if((tx+1) % i == 0){ // every ith
- d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
- d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
- }
- // synchronization
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- // final sumation by last thread in every block
- if(tx==(NUMBER_THREADS-1)){ // block result stored in global memory
- d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
- d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
- }
- }
- // for not full block (last block)
- else{ //
- // figure out divisibility
- for(i=2; i<=NUMBER_THREADS; i=2*i){ //
- if(nf >= i){
- df = i;
- }
- }
- // sum of every 2, 4, ..., NUMBER_THREADS elements
- for(i=2; i<=df; i=2*i){ //
- // sum of elements (only busy threads)
- if((tx+1) % i == 0 && tx<df){ // every ith
- d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
- d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
- }
- // synchronization (all threads)
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- // remainder / final summation by last thread
- if(tx==(df-1)){ //
- // compute the remainder and final summation by last busy thread
- for(i=(bx*NUMBER_THREADS)+df; i<(bx*NUMBER_THREADS)+nf; i++){ //
- d_psum[tx] = d_psum[tx] + d_sums[i];
- d_psum2[tx] = d_psum2[tx] + d_sums2[i];
- }
- // final sumation by last thread in every block
- d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
- d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
- }
- }
- }
- }
- //========================================================================================================================================================================================================200
- // SRAD KERNEL
- //========================================================================================================================================================================================================200
- // BUG, IF STILL PRESENT, COULD BE SOMEWHERE IN THIS CODE, MEMORY ACCESS OUT OF BOUNDS
- __kernel void
- srad_kernel(fp d_lambda,
- int d_Nr,
- int d_Nc,
- long d_Ne,
- __global int* d_iN,
- __global int* d_iS,
- __global int* d_jE,
- __global int* d_jW,
- __global fp* d_dN,
- __global fp* d_dS,
- __global fp* d_dE,
- __global fp* d_dW,
- fp d_q0sqr,
- __global fp* d_c,
- __global fp* d_I){
- // indexes
- int bx = get_group_id(0); // get current horizontal block index
- int tx = get_local_id(0); // get current horizontal thread index
- int ei = bx*NUMBER_THREADS+tx; // more threads than actual elements !!!
- int row; // column, x position
- int col; // row, y position
- // variables
- fp d_Jc;
- fp d_dN_loc, d_dS_loc, d_dW_loc, d_dE_loc;
- fp d_c_loc;
- fp d_G2,d_L,d_num,d_den,d_qsqr;
-
- // figure out row/col location in new matrix
- row = (ei+1) % d_Nr - 1; // (0-n) row
- col = (ei+1) / d_Nr + 1 - 1; // (0-n) column
- if((ei+1) % d_Nr == 0){
- row = d_Nr - 1;
- col = col - 1;
- }
-
- if(ei<d_Ne){ // make sure that only threads matching jobs run
-
- // directional derivatives, ICOV, diffusion coefficent
- d_Jc = d_I[ei]; // get value of the current element
-
- // directional derivates (every element of IMAGE)(try to copy to shared memory or temp files)
- d_dN_loc = d_I[d_iN[row] + d_Nr*col] - d_Jc; // north direction derivative
- d_dS_loc = d_I[d_iS[row] + d_Nr*col] - d_Jc; // south direction derivative
- d_dW_loc = d_I[row + d_Nr*d_jW[col]] - d_Jc; // west direction derivative
- d_dE_loc = d_I[row + d_Nr*d_jE[col]] - d_Jc; // east direction derivative
-
- // normalized discrete gradient mag squared (equ 52,53)
- d_G2 = (d_dN_loc*d_dN_loc + d_dS_loc*d_dS_loc + d_dW_loc*d_dW_loc + d_dE_loc*d_dE_loc) / (d_Jc*d_Jc); // gradient (based on derivatives)
-
- // normalized discrete laplacian (equ 54)
- d_L = (d_dN_loc + d_dS_loc + d_dW_loc + d_dE_loc) / d_Jc; // laplacian (based on derivatives)
- // ICOV (equ 31/35)
- d_num = (0.5*d_G2) - ((1.0/16.0)*(d_L*d_L)) ; // num (based on gradient and laplacian)
- d_den = 1 + (0.25*d_L); // den (based on laplacian)
- d_qsqr = d_num/(d_den*d_den); // qsqr (based on num and den)
-
- // diffusion coefficent (equ 33) (every element of IMAGE)
- d_den = (d_qsqr-d_q0sqr) / (d_q0sqr * (1+d_q0sqr)) ; // den (based on qsqr and q0sqr)
- d_c_loc = 1.0 / (1.0+d_den) ; // diffusion coefficient (based on den)
-
- // saturate diffusion coefficent to 0-1 range
- if (d_c_loc < 0){ // if diffusion coefficient < 0
- d_c_loc = 0; // ... set to 0
- }
- else if (d_c_loc > 1){ // if diffusion coefficient > 1
- d_c_loc = 1; // ... set to 1
- }
- // save data to global memory
- d_dN[ei] = d_dN_loc;
- d_dS[ei] = d_dS_loc;
- d_dW[ei] = d_dW_loc;
- d_dE[ei] = d_dE_loc;
- d_c[ei] = d_c_loc;
-
- }
-
- }
- //========================================================================================================================================================================================================200
- // SRAD2 KERNEL
- //========================================================================================================================================================================================================200
- // BUG, IF STILL PRESENT, COULD BE SOMEWHERE IN THIS CODE, MEMORY ACCESS OUT OF BOUNDS
- __kernel void
- srad2_kernel( fp d_lambda,
- int d_Nr,
- int d_Nc,
- long d_Ne,
- __global int* d_iN,
- __global int* d_iS,
- __global int* d_jE,
- __global int* d_jW,
- __global fp* d_dN,
- __global fp* d_dS,
- __global fp* d_dE,
- __global fp* d_dW,
- __global fp* d_c,
- __global fp* d_I){
- // indexes
- int bx = get_group_id(0); // get current horizontal block index
- int tx = get_local_id(0); // get current horizontal thread index
- int ei = bx*NUMBER_THREADS+tx; // more threads than actual elements !!!
- int row; // column, x position
- int col; // row, y position
- // variables
- fp d_cN,d_cS,d_cW,d_cE;
- fp d_D;
- // figure out row/col location in new matrix
- row = (ei+1) % d_Nr - 1; // (0-n) row
- col = (ei+1) / d_Nr + 1 - 1; // (0-n) column
- if((ei+1) % d_Nr == 0){
- row = d_Nr - 1;
- col = col - 1;
- }
- if(ei<d_Ne){ // make sure that only threads matching jobs run
- // diffusion coefficent
- d_cN = d_c[ei]; // north diffusion coefficient
- d_cS = d_c[d_iS[row] + d_Nr*col]; // south diffusion coefficient
- d_cW = d_c[ei]; // west diffusion coefficient
- d_cE = d_c[row + d_Nr * d_jE[col]]; // east diffusion coefficient
- // divergence (equ 58)
- d_D = d_cN*d_dN[ei] + d_cS*d_dS[ei] + d_cW*d_dW[ei] + d_cE*d_dE[ei];// divergence
- // image update (equ 61) (every element of IMAGE)
- d_I[ei] = d_I[ei] + 0.25*d_lambda*d_D; // updates image (based on input time step and divergence)
- }
- }
- //========================================================================================================================================================================================================200
- // Compress KERNEL
- //========================================================================================================================================================================================================200
- __kernel void
- compress_kernel(long d_Ne,
- __global fp* d_I){ // pointer to output image (DEVICE GLOBAL MEMORY)
- // indexes
- int bx = get_group_id(0); // get current horizontal block index
- int tx = get_local_id(0); // get current horizontal thread index
- int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!!
- // copy input to output & log uncompress
- if(ei<d_Ne){ // do only for the number of elements, omit extra threads
- d_I[ei] = log(d_I[ei])*255; // exponentiate input IMAGE and copy to output image
- }
- }
- //========================================================================================================================================================================================================200
- // End
- //========================================================================================================================================================================================================200
|