// track_ellipse_kernel_opt.cl

#ifdef DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define FP_TYPE double
#define FP_CONST(num) num
#else
#define FP_TYPE float
#define FP_CONST(num) num##f
#endif

// Constants used in the MGVF computation
#define PI FP_CONST(3.14159)
#define ONE_OVER_PI (FP_CONST(1.0) / PI)
#define MU FP_CONST(0.5)
#define LAMBDA (FP_CONST(8.0) * MU + FP_CONST(1.0))
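// LAMBDA = 8 * MU + 1 is the normalizing denominator of the update performed
// below: eight MU-weighted neighbor terms plus one data-fidelity term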

// The number of threads per thread block
#define LOCAL_WORK_SIZE 256
#define NEXT_LOWEST_POWER_OF_TWO 256
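// NEXT_LOWEST_POWER_OF_TWO is the largest power of two not exceeding
// LOCAL_WORK_SIZE and is used by the convergence reduction; since 256 is
// itself a power of two, the pre-reduction fold below is a no-op here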

// Regularized version of the Heaviside step function:
// He(x) = (atan(x) / pi) + 0.5
FP_TYPE heaviside(FP_TYPE x) {
    return (atan(x) * ONE_OVER_PI) + FP_CONST(0.5);

    // A simpler, faster approximation of the Heaviside function:
    /* float out = 0.0;
       if (x > -0.0001) out = 0.5;
       if (x > 0.0001) out = 1.0;
       return out; */
}
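// Note: since atan(x) lies in (-pi/2, pi/2), He(x) smoothly maps the real
// line onto (0, 1), approximating the true step function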

// Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
__kernel void IMGVF_kernel(__global FP_TYPE *IMGVF_array, __global FP_TYPE *I_array,
                           __constant int *I_offsets, __constant int *m_array,
                           __constant int *n_array, FP_TYPE vx, FP_TYPE vy, FP_TYPE e,
                           int max_iterations, FP_TYPE cutoff) {
    // Shared copy of the matrix being computed
    __local FP_TYPE IMGVF[41 * 81];

    // Shared buffer used for two purposes:
    // 1) To temporarily store newly computed matrix values so that only
    //    values from the previous iteration are used in the computation.
    // 2) To store partial sums during the tree reduction which is performed
    //    at the end of each iteration to determine if the computation has converged.
    __local FP_TYPE buffer[LOCAL_WORK_SIZE];

    // Figure out which cell this thread block is working on
    int cell_num = get_group_id(0);

    // Get pointers to the current cell's input image and initial matrix
    int I_offset = I_offsets[cell_num];
    __global FP_TYPE *IMGVF_global = &(IMGVF_array[I_offset]);
    __global FP_TYPE *I = &(I_array[I_offset]);

    // Get the current cell's matrix dimensions
    int m = m_array[cell_num];
    int n = n_array[cell_num];

    // Compute the number of virtual thread blocks
    int IMGVF_Size = m * n;
    int tb_count = (IMGVF_Size + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;
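    // For example, at the maximum matrix size of 41 x 81 = 3321 elements,
    // each thread steps through ceil(3321 / 256) = 13 virtual thread blocks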

    // Load the initial IMGVF matrix into shared memory
    int thread_id = get_local_id(0);
    // i is initialized because old_i below reads it before the first
    // virtual thread block assigns it
    int thread_block, i = 0, j;
    for (thread_block = 0; thread_block < tb_count; thread_block++) {
        int offset = thread_block * LOCAL_WORK_SIZE + thread_id;
        if (offset < IMGVF_Size)
            IMGVF[offset] = IMGVF_global[offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Set the converged flag to false
    __local int cell_converged;
    if (thread_id == 0) cell_converged = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Constants used to iterate through virtual thread blocks
    const float one_nth = 1.0f / (float) n;
    const int tid_mod = thread_id % n;
    const int tbsize_mod = LOCAL_WORK_SIZE % n;
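    // These let the inner loop replace a per-block integer divide and modulo
    // with a float multiply (row index) and an incremental add-and-wrap
    // (column index)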

    // Constant used in the computation of Heaviside values
    FP_TYPE one_over_e = FP_CONST(1.0) / e;

    // Iteratively compute the IMGVF matrix until the computation has
    // converged or we have reached the maximum number of iterations
    int iterations = 0;
    while (!cell_converged && (iterations < max_iterations)) {
        // The total change to this thread's matrix elements in the current iteration
        FP_TYPE total_diff = FP_CONST(0.0);

        int old_i = 0, old_j = 0;
        j = tid_mod - tbsize_mod;

        // Iterate over virtual thread blocks
        for (thread_block = 0; thread_block < tb_count; thread_block++) {
            // Store the index of this thread's previous matrix element
            // (used in the buffering scheme below)
            old_i = i;
            old_j = j;

            // Determine the index of this thread's current matrix element
            int offset = thread_block * LOCAL_WORK_SIZE;
            i = (thread_id + offset) * one_nth;
            j += tbsize_mod;
            if (j >= n) j -= n;

            // new_val is initialized so that threads past the end of the matrix
            // store a defined (if unused) value into the buffer below
            FP_TYPE new_val = FP_CONST(0.0), old_val;

            // Make sure the thread has not gone off the end of the matrix
            if (i < m) {
                // Compute neighboring matrix element indices
                int rowU = (i == 0) ? 0 : i - 1;
                int rowD = (i == m - 1) ? m - 1 : i + 1;
                int colL = (j == 0) ? 0 : j - 1;
                int colR = (j == n - 1) ? n - 1 : j + 1;

                // Compute the difference between the matrix element and its eight neighbors
                old_val = IMGVF[(i * n) + j];
                FP_TYPE U  = IMGVF[(rowU * n) + j   ] - old_val;
                FP_TYPE D  = IMGVF[(rowD * n) + j   ] - old_val;
                FP_TYPE L  = IMGVF[(i    * n) + colL] - old_val;
                FP_TYPE R  = IMGVF[(i    * n) + colR] - old_val;
                FP_TYPE UR = IMGVF[(rowU * n) + colR] - old_val;
                FP_TYPE DR = IMGVF[(rowD * n) + colR] - old_val;
                FP_TYPE UL = IMGVF[(rowU * n) + colL] - old_val;
                FP_TYPE DL = IMGVF[(rowD * n) + colL] - old_val;

                // Compute the regularized Heaviside value for these differences
                FP_TYPE UHe  = heaviside((U  *  -vy)       * one_over_e);
                FP_TYPE DHe  = heaviside((D  *   vy)       * one_over_e);
                FP_TYPE LHe  = heaviside((L  *  -vx)       * one_over_e);
                FP_TYPE RHe  = heaviside((R  *   vx)       * one_over_e);
                FP_TYPE URHe = heaviside((UR * ( vx - vy)) * one_over_e);
                FP_TYPE DRHe = heaviside((DR * ( vx + vy)) * one_over_e);
                FP_TYPE ULHe = heaviside((UL * (-vx - vy)) * one_over_e);
                FP_TYPE DLHe = heaviside((DL * (-vx + vy)) * one_over_e);
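
                // Each difference is gated by a Heaviside weight whose argument's
                // sign follows the dot product of that neighbor's direction with
                // the motion vector (vx, vy); e controls how sharply the gate
                // transitions between 0 and 1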

                // Update the IMGVF value in two steps:
                // 1) Compute IMGVF += (mu / lambda)(UHe .*U  + DHe .*D  + LHe .*L  + RHe .*R +
                //                                   URHe.*UR + DRHe.*DR + ULHe.*UL + DLHe.*DL);
                new_val = old_val + (MU / LAMBDA) * (UHe  * U  + DHe  * D  + LHe  * L  + RHe  * R +
                                                     URHe * UR + DRHe * DR + ULHe * UL + DLHe * DL);

                // 2) Compute IMGVF -= (1 / lambda)(I .* (IMGVF - I))
                //    (FP_CONST keeps the literal in single precision when FP_TYPE is float)
                FP_TYPE vI = I[(i * n) + j];
                new_val -= ((FP_CONST(1.0) / LAMBDA) * vI * (new_val - vI));

                // Keep track of the total change of this thread's matrix elements
                total_diff += fabs(new_val - old_val);
            }
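
            // All threads must finish reading the previous iteration's values
            // before any of the buffered writes below can overwrite them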
            barrier(CLK_LOCAL_MEM_FENCE);

            // Save the previous virtual thread block's value (if it exists)
            if (thread_block > 0) {
                if (old_i < m) IMGVF[(old_i * n) + old_j] = buffer[thread_id];
            }
            if (thread_block < tb_count - 1) {
                // Write the new value to the buffer
                buffer[thread_id] = new_val;
            } else {
                // We've reached the final virtual thread block,
                // so write directly to the matrix
                if (i < m) IMGVF[(i * n) + j] = new_val;
            }

            // We need to synchronize between virtual thread blocks to prevent
            // threads from writing the values from the buffer to the actual
            // IMGVF matrix too early
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        // We need to compute the overall sum of the change at each matrix element
        // by performing a tree reduction across the whole thread block
        buffer[thread_id] = total_diff;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Account for thread block sizes that are not a power of 2
        if (thread_id >= NEXT_LOWEST_POWER_OF_TWO) {
            buffer[thread_id - NEXT_LOWEST_POWER_OF_TWO] += buffer[thread_id];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        // Perform the tree reduction
        int th;
        for (th = NEXT_LOWEST_POWER_OF_TWO / 2; th > 0; th /= 2) {
            if (thread_id < th) {
                buffer[thread_id] += buffer[thread_id + th];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
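
        // After the reduction, buffer[0] holds the sum of total_diff across
        // the entire work-group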

        // Figure out if we have converged
        if (thread_id == 0) {
            FP_TYPE mean = buffer[thread_id] / (FP_TYPE) (m * n);
            if (mean < cutoff) {
                // We have converged, so set the appropriate flag
                cell_converged = 1;
            }
        }

        // We need to synchronize to ensure that all threads
        // read the correct value of the convergence flag
        barrier(CLK_LOCAL_MEM_FENCE);

        // Keep track of the number of iterations we have performed
        iterations++;
    }

    // Save the final IMGVF matrix to global memory
    for (thread_block = 0; thread_block < tb_count; thread_block++) {
        int offset = thread_block * LOCAL_WORK_SIZE + thread_id;
        if (offset < IMGVF_Size)
            IMGVF_global[offset] = IMGVF[offset];
    }
}
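
/* Host-side launch sketch (illustrative, not part of the original kernel file).
   Because cell_num = get_group_id(0), the kernel expects exactly one work-group
   of LOCAL_WORK_SIZE work-items per cell, so the global size is
   num_cells * LOCAL_WORK_SIZE. Buffer and variable names below are hypothetical.

       size_t local_size  = 256;                    // must match LOCAL_WORK_SIZE
       size_t global_size = num_cells * local_size; // one work-group per cell

       clSetKernelArg(kernel, 0, sizeof(cl_mem), &IMGVF_buf);
       clSetKernelArg(kernel, 1, sizeof(cl_mem), &I_buf);
       clSetKernelArg(kernel, 2, sizeof(cl_mem), &I_offsets_buf);
       clSetKernelArg(kernel, 3, sizeof(cl_mem), &m_buf);
       clSetKernelArg(kernel, 4, sizeof(cl_mem), &n_buf);
       // ... args 5-9: vx, vy, e (as FP_TYPE), max_iterations, cutoff ...

       clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                              &global_size, &local_size, 0, NULL, NULL);
*/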