//========================================================================================================================================================================================================200 // DEFINE / INCLUDE //========================================================================================================================================================================================================200 //======================================================================================================================================================150 // MAIN FUNCTION HEADER //======================================================================================================================================================150 #include "./main.h" //======================================================================================================================================================150 // End //======================================================================================================================================================150 //========================================================================================================================================================================================================200 // Extract KERNEL //========================================================================================================================================================================================================200 __kernel void extract_kernel(long d_Ne, __global fp* d_I){ // pointer to input image (DEVICE GLOBAL MEMORY) // indexes int bx = get_group_id(0); // get current horizontal block index int tx = get_local_id(0); // get current horizontal thread index int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!! // copy input to output & log uncompress if(ei= i){ df = i; } } // sum of every 2, 4, ..., NUMBER_THREADS elements for(i=2; i<=df; i=2*i){ // // sum of elements (only busy threads) if((tx+1) % i == 0 && tx 1){ // if diffusion coefficient > 1 d_c_loc = 1; // ... set to 1 } // save data to global memory d_dN[ei] = d_dN_loc; d_dS[ei] = d_dS_loc; d_dW[ei] = d_dW_loc; d_dE[ei] = d_dE_loc; d_c[ei] = d_c_loc; } } //========================================================================================================================================================================================================200 // SRAD2 KERNEL //========================================================================================================================================================================================================200 // BUG, IF STILL PRESENT, COULD BE SOMEWHERE IN THIS CODE, MEMORY ACCESS OUT OF BOUNDS __kernel void srad2_kernel( fp d_lambda, int d_Nr, int d_Nc, long d_Ne, __global int* d_iN, __global int* d_iS, __global int* d_jE, __global int* d_jW, __global fp* d_dN, __global fp* d_dS, __global fp* d_dE, __global fp* d_dW, __global fp* d_c, __global fp* d_I){ // indexes int bx = get_group_id(0); // get current horizontal block index int tx = get_local_id(0); // get current horizontal thread index int ei = bx*NUMBER_THREADS+tx; // more threads than actual elements !!! int row; // column, x position int col; // row, y position // variables fp d_cN,d_cS,d_cW,d_cE; fp d_D; // figure out row/col location in new matrix row = (ei+1) % d_Nr - 1; // (0-n) row col = (ei+1) / d_Nr + 1 - 1; // (0-n) column if((ei+1) % d_Nr == 0){ row = d_Nr - 1; col = col - 1; } if(ei