// kernel_gpu_opencl.cl
  1. //========================================================================================================================================================================================================200
  2. // DEFINE / INCLUDE
  3. //========================================================================================================================================================================================================200
  4. //======================================================================================================================================================150
  5. // MAIN FUNCTION HEADER
  6. //======================================================================================================================================================150
  7. #include "./main.h"
  8. //======================================================================================================================================================150
  9. // End
  10. //======================================================================================================================================================150
  11. //========================================================================================================================================================================================================200
  12. // Extract KERNEL
  13. //========================================================================================================================================================================================================200
  14. __kernel void
  15. extract_kernel(long d_Ne,
  16. __global fp* d_I){ // pointer to input image (DEVICE GLOBAL MEMORY)
  17. // indexes
  18. int bx = get_group_id(0); // get current horizontal block index
  19. int tx = get_local_id(0); // get current horizontal thread index
  20. int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!!
  21. // copy input to output & log uncompress
  22. if(ei<d_Ne){ // do only for the number of elements, omit extra threads
  23. d_I[ei] = exp(d_I[ei]/255); // exponentiate input IMAGE and copy to output image
  24. }
  25. }
  26. //========================================================================================================================================================================================================200
  27. // Prepare KERNEL
  28. //========================================================================================================================================================================================================200
  29. __kernel void
  30. prepare_kernel( long d_Ne,
  31. __global fp* d_I, // pointer to output image (DEVICE GLOBAL MEMORY)
  32. __global fp* d_sums, // pointer to input image (DEVICE GLOBAL MEMORY)
  33. __global fp* d_sums2){
  34. // indexes
  35. int bx = get_group_id(0); // get current horizontal block index
  36. int tx = get_local_id(0); // get current horizontal thread index
  37. int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!!
  38. // copy input to output & log uncompress
  39. if(ei<d_Ne){ // do only for the number of elements, omit extra threads
  40. d_sums[ei] = d_I[ei];
  41. d_sums2[ei] = d_I[ei]*d_I[ei];
  42. }
  43. }
  44. //========================================================================================================================================================================================================200
  45. // Reduce KERNEL
  46. //========================================================================================================================================================================================================200
__kernel void
reduce_kernel(	long d_Ne,             // number of elements in the original array (kept for host-call compatibility)
				long d_no,             // number of live partial sums to reduce in this pass
				int d_mul,             // stride between live partial sums in d_sums/d_sums2 (grows each pass)
				__global fp* d_sums,   // partial sums (DEVICE GLOBAL MEMORY); one block result written back per work-group
				__global fp* d_sums2,  // partial sums of squares, reduced in lockstep with d_sums
				int gridDim){          // number of work-groups launched for this pass (computed on the host)

	// Purpose: one pass of a strided tree reduction over d_sums and d_sums2.
	// Each work-group folds up to NUMBER_THREADS strided elements into one
	// value, stored at index bx*d_mul*NUMBER_THREADS; the host re-invokes the
	// kernel with a larger d_mul until a single sum remains.

	// indexes
	int bx = get_group_id(0);         // work-group index
	int tx = get_local_id(0);         // index within the work-group
	int ei = (bx*NUMBER_THREADS)+tx;  // global work-item id; may exceed d_no
	// int gridDim = (int)get_group_size(0)/(int)get_local_size(0);  // (historical: gridDim is now a host-supplied parameter)
	int nf = NUMBER_THREADS-(gridDim*NUMBER_THREADS-d_no);  // number of elements assigned to the LAST work-group
	int df = 0;                       // largest power of two <= nf (divisibility factor for the last work-group)

	// per-work-group scratch for the tree reduction
	__local fp d_psum[NUMBER_THREADS];
	__local fp d_psum2[NUMBER_THREADS];

	// counters
	int i;

	// stage this work-item's strided element into local memory
	// (work-items with ei >= d_no leave their slot uninitialized; the
	// reduction paths below never read those slots)
	if(ei<d_no){
		d_psum[tx] = d_sums[ei*d_mul];
		d_psum2[tx] = d_sums2[ei*d_mul];
	}

	// global fence included so the strided global loads above are complete
	// before any work-item starts combining (added per project history,
	// 2015-12-12; was CLK_LOCAL_MEM_FENCE only)
	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

	// Case 1: every work-group is completely full (rare) — plain tree reduction.
	if(nf == NUMBER_THREADS){
		// fold pairs at stride 1, 2, 4, ... so the total accumulates at the
		// highest-indexed lane
		for(i=2; i<=NUMBER_THREADS; i=2*i){
			// lanes whose 1-based index is a multiple of i absorb the lane i/2 below
			if((tx+1) % i == 0){
				d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
				d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
			}
			// all lanes must see this round's sums before the next round
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		// last lane holds the work-group total; publish it to global memory
		if(tx==(NUMBER_THREADS-1)){
			d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
			d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
		}
	}
	// Case 2: the last work-group is only partially filled (common).
	else{
		// full work-groups (all except the last) reduce exactly as in case 1
		if(bx != (gridDim - 1)){
			for(i=2; i<=NUMBER_THREADS; i=2*i){
				if((tx+1) % i == 0){
					d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
					d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
				}
				barrier(CLK_LOCAL_MEM_FENCE);
			}
			if(tx==(NUMBER_THREADS-1)){
				d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
				d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
			}
		}
		// last, partially-filled work-group
		else{
			// df = largest power of two that fits in nf; the tree reduction
			// below only runs over those df lanes
			for(i=2; i<=NUMBER_THREADS; i=2*i){
				if(nf >= i){
					df = i;
				}
			}
			// tree-reduce the first df lanes (all of which loaded valid data,
			// since df <= nf)
			for(i=2; i<=df; i=2*i){
				if((tx+1) % i == 0 && tx<df){
					d_psum[tx] = d_psum[tx] + d_psum[tx-i/2];
					d_psum2[tx] = d_psum2[tx] + d_psum2[tx-i/2];
				}
				// barrier is outside the if: ALL lanes of the group must reach it
				barrier(CLK_LOCAL_MEM_FENCE);
			}
			// lane df-1 holds the tree total; serially add the nf-df leftover
			// elements straight from global memory, then publish
			// NOTE(review): this remainder loop indexes d_sums[i] without the
			// d_mul stride the staging loads used — appears to assume d_mul==1
			// for the pass that exercises this path; confirm against the host code
			if(tx==(df-1)){
				for(i=(bx*NUMBER_THREADS)+df; i<(bx*NUMBER_THREADS)+nf; i++){
					d_psum[tx] = d_psum[tx] + d_sums[i];
					d_psum2[tx] = d_psum2[tx] + d_sums2[i];
				}
				d_sums[bx*d_mul*NUMBER_THREADS] = d_psum[tx];
				d_sums2[bx*d_mul*NUMBER_THREADS] = d_psum2[tx];
			}
		}
	}
}
  145. //========================================================================================================================================================================================================200
  146. // SRAD KERNEL
  147. //========================================================================================================================================================================================================200
  148. // BUG, IF STILL PRESENT, COULD BE SOMEWHERE IN THIS CODE, MEMORY ACCESS OUT OF BOUNDS
__kernel void
srad_kernel(fp d_lambda,          // time step (unused in this kernel; consumed by srad2_kernel)
			int d_Nr,             // number of image rows
			int d_Nc,             // number of image columns
			long d_Ne,            // total number of elements (d_Nr*d_Nc)
			__global int* d_iN,   // per-row index of the north neighbor (clamped at the border)
			__global int* d_iS,   // per-row index of the south neighbor (clamped at the border)
			__global int* d_jE,   // per-column index of the east neighbor (clamped at the border)
			__global int* d_jW,   // per-column index of the west neighbor (clamped at the border)
			__global fp* d_dN,    // output: north directional derivative per element
			__global fp* d_dS,    // output: south directional derivative per element
			__global fp* d_dE,    // output: east directional derivative per element
			__global fp* d_dW,    // output: west directional derivative per element
			fp d_q0sqr,           // q0^2: ICOV of a homogeneous region (from the reduction stage)
			__global fp* d_c,     // output: diffusion coefficient per element, clamped to [0,1]
			__global fp* d_I){    // input image, stored column-major (element (row,col) at row + d_Nr*col)

	// Purpose: SRAD pass 1 — compute the four directional derivatives and the
	// diffusion coefficient for every image element.
	// NOTE(review): literals 0.5, 1.0/16.0, 0.25 are double constants; if fp
	// is float this promotes the arithmetic to double and requires fp64
	// support — confirm intended precision against the host build flags.

	// indexes
	int bx = get_group_id(0);       // work-group index
	int tx = get_local_id(0);       // index within the work-group
	int ei = bx*NUMBER_THREADS+tx;  // flat element index; may exceed d_Ne
	int row;                        // 0-based row within the column-major image
	int col;                        // 0-based column within the column-major image

	// variables
	fp d_Jc;                                  // value of the current element
	fp d_dN_loc, d_dS_loc, d_dW_loc, d_dE_loc; // directional derivatives at this element
	fp d_c_loc;                               // diffusion coefficient at this element
	fp d_G2,d_L,d_num,d_den,d_qsqr;           // gradient^2, laplacian, ICOV intermediates

	// map the flat index ei to (row, col) in the column-major layout;
	// the special case fixes up the last row of each column, where the
	// modulo expression would otherwise wrap to row -1 / the next column
	row = (ei+1) % d_Nr - 1;
	col = (ei+1) / d_Nr + 1 - 1;
	if((ei+1) % d_Nr == 0){
		row = d_Nr - 1;
		col = col - 1;
	}

	if(ei<d_Ne){  // make sure that only threads matching jobs run

		// current element value
		d_Jc = d_I[ei];

		// directional derivatives: neighbor value minus center, using the
		// precomputed (border-clamped) neighbor index tables
		d_dN_loc = d_I[d_iN[row] + d_Nr*col] - d_Jc;  // north
		d_dS_loc = d_I[d_iS[row] + d_Nr*col] - d_Jc;  // south
		d_dW_loc = d_I[row + d_Nr*d_jW[col]] - d_Jc;  // west
		d_dE_loc = d_I[row + d_Nr*d_jE[col]] - d_Jc;  // east

		// normalized discrete gradient magnitude squared (equ 52,53)
		d_G2 = (d_dN_loc*d_dN_loc + d_dS_loc*d_dS_loc + d_dW_loc*d_dW_loc + d_dE_loc*d_dE_loc) / (d_Jc*d_Jc);

		// normalized discrete laplacian (equ 54)
		d_L = (d_dN_loc + d_dS_loc + d_dW_loc + d_dE_loc) / d_Jc;

		// instantaneous coefficient of variation, ICOV (equ 31/35)
		d_num = (0.5*d_G2) - ((1.0/16.0)*(d_L*d_L)) ;
		d_den = 1 + (0.25*d_L);
		d_qsqr = d_num/(d_den*d_den);

		// diffusion coefficient (equ 33): ~1 in homogeneous regions
		// (qsqr ~ q0sqr), ~0 at edges (qsqr >> q0sqr)
		d_den = (d_qsqr-d_q0sqr) / (d_q0sqr * (1+d_q0sqr)) ;
		d_c_loc = 1.0 / (1.0+d_den) ;

		// saturate the diffusion coefficient into [0,1]
		if (d_c_loc < 0){
			d_c_loc = 0;
		}
		else if (d_c_loc > 1){
			d_c_loc = 1;
		}

		// publish per-element results for srad2_kernel
		d_dN[ei] = d_dN_loc;
		d_dS[ei] = d_dS_loc;
		d_dW[ei] = d_dW_loc;
		d_dE[ei] = d_dE_loc;
		d_c[ei] = d_c_loc;
	}
}
  217. //========================================================================================================================================================================================================200
  218. // SRAD2 KERNEL
  219. //========================================================================================================================================================================================================200
  220. // BUG, IF STILL PRESENT, COULD BE SOMEWHERE IN THIS CODE, MEMORY ACCESS OUT OF BOUNDS
__kernel void
srad2_kernel(	fp d_lambda,          // diffusion time step
				int d_Nr,             // number of image rows
				int d_Nc,             // number of image columns
				long d_Ne,            // total number of elements (d_Nr*d_Nc)
				__global int* d_iN,   // per-row north-neighbor index (unused here)
				__global int* d_iS,   // per-row south-neighbor index (border-clamped)
				__global int* d_jE,   // per-column east-neighbor index (border-clamped)
				__global int* d_jW,   // per-column west-neighbor index (unused here)
				__global fp* d_dN,    // north derivatives from srad_kernel
				__global fp* d_dS,    // south derivatives from srad_kernel
				__global fp* d_dE,    // east derivatives from srad_kernel
				__global fp* d_dW,    // west derivatives from srad_kernel
				__global fp* d_c,     // diffusion coefficients from srad_kernel
				__global fp* d_I){    // image, stored column-major; updated in place

	// Purpose: SRAD pass 2 — assemble the divergence from the stored
	// derivatives and diffusion coefficients, then advance the image one
	// time step.

	// indexes
	int bx = get_group_id(0);       // work-group index
	int tx = get_local_id(0);       // index within the work-group
	int ei = bx*NUMBER_THREADS+tx;  // flat element index; may exceed d_Ne
	int row;                        // 0-based row within the column-major image
	int col;                        // 0-based column within the column-major image

	// variables
	fp d_cN,d_cS,d_cW,d_cE;  // diffusion coefficients on the four faces
	fp d_D;                  // divergence at this element

	// map the flat index ei to (row, col) in the column-major layout;
	// the special case fixes up the last row of each column (same scheme
	// as srad_kernel)
	row = (ei+1) % d_Nr - 1;
	col = (ei+1) / d_Nr + 1 - 1;
	if((ei+1) % d_Nr == 0){
		row = d_Nr - 1;
		col = col - 1;
	}

	if(ei<d_Ne){  // make sure that only threads matching jobs run

		// face coefficients: north and west reuse the center element's
		// coefficient, south and east take the neighbor's — this asymmetry
		// looks intentional per the SRAD discretization (c evaluated on the
		// "upwind" side of each face), but confirm against the reference
		// serial implementation
		d_cN = d_c[ei];                      // north face
		d_cS = d_c[d_iS[row] + d_Nr*col];    // south face (south neighbor's coefficient)
		d_cW = d_c[ei];                      // west face
		d_cE = d_c[row + d_Nr * d_jE[col]];  // east face (east neighbor's coefficient)

		// divergence of c * grad(I) (equ 58)
		d_D = d_cN*d_dN[ei] + d_cS*d_dS[ei] + d_cW*d_dW[ei] + d_cE*d_dE[ei];

		// explicit update step (equ 61): I += (lambda/4) * div
		d_I[ei] = d_I[ei] + 0.25*d_lambda*d_D;
	}
}
  264. //========================================================================================================================================================================================================200
  265. // Compress KERNEL
  266. //========================================================================================================================================================================================================200
  267. __kernel void
  268. compress_kernel(long d_Ne,
  269. __global fp* d_I){ // pointer to output image (DEVICE GLOBAL MEMORY)
  270. // indexes
  271. int bx = get_group_id(0); // get current horizontal block index
  272. int tx = get_local_id(0); // get current horizontal thread index
  273. int ei = (bx*NUMBER_THREADS)+tx; // unique thread id, more threads than actual elements !!!
  274. // copy input to output & log uncompress
  275. if(ei<d_Ne){ // do only for the number of elements, omit extra threads
  276. d_I[ei] = log(d_I[ei])*255; // exponentiate input IMAGE and copy to output image
  277. }
  278. }
  279. //========================================================================================================================================================================================================200
  280. // End
  281. //========================================================================================================================================================================================================200