track_ellipse_kernel.cl 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. // Constants used in the MGVF computation
  2. #define PI 3.14159f
  3. #define ONE_OVER_PI (1.0f / PI)
  4. #define MU 0.5f
  5. #define LAMBDA (8.0f * MU + 1.0f)
  6. // The number of threads per thread block
  7. #define LOCAL_WORK_SIZE 256
  8. #define NEXT_LOWEST_POWER_OF_TWO 256
  9. // Regularized version of the Heaviside step function:
  10. // He(x) = (atan(x) / pi) + 0.5
  11. float heaviside(float x) {
  12. return (atan(x) * ONE_OVER_PI) + 0.5f;
  13. // A simpler, faster approximation of the Heaviside function
  14. /* float out = 0.0;
  15. if (x > -0.0001) out = 0.5;
  16. if (x > 0.0001) out = 1.0;
  17. return out; */
  18. }
  19. // Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
  20. __kernel void IMGVF_kernel(__global float *IMGVF_array, __global float *I_array, __constant int *I_offsets, int __constant *m_array,
  21. __constant int *n_array, float vx, float vy, float e, int max_iterations, float cutoff) {
  22. // Shared copy of the matrix being computed
  23. __local float IMGVF[41 * 81];
  24. // Shared buffer used for two purposes:
  25. // 1) To temporarily store newly computed matrix values so that only
  26. // values from the previous iteration are used in the computation.
  27. // 2) To store partial sums during the tree reduction which is performed
  28. // at the end of each iteration to determine if the computation has converged.
  29. __local float buffer[LOCAL_WORK_SIZE];
  30. // Figure out which cell this thread block is working on
  31. int cell_num = get_group_id(0);
  32. // Get pointers to current cell's input image and inital matrix
  33. int I_offset = I_offsets[cell_num];
  34. __global float *IMGVF_global = &(IMGVF_array[I_offset]);
  35. __global float *I = &(I_array[I_offset]);
  36. // Get current cell's matrix dimensions
  37. int m = m_array[cell_num];
  38. int n = n_array[cell_num];
  39. // Compute the number of virtual thread blocks
  40. int max = (m * n + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;
  41. // Load the initial IMGVF matrix into shared memory
  42. int thread_id = get_local_id(0);
  43. int thread_block, i, j;
  44. for (thread_block = 0; thread_block < max; thread_block++) {
  45. int offset = thread_block * LOCAL_WORK_SIZE;
  46. i = (thread_id + offset) / n;
  47. j = (thread_id + offset) % n;
  48. if (i < m) IMGVF[(i * n) + j] = IMGVF_global[(i * n) + j];
  49. }
  50. barrier(CLK_LOCAL_MEM_FENCE);
  51. // Set the converged flag to false
  52. __local int cell_converged;
  53. if (thread_id == 0) cell_converged = 0;
  54. barrier(CLK_LOCAL_MEM_FENCE);
  55. // Constants used to iterate through virtual thread blocks
  56. const float one_nth = 1.0f / (float) n;
  57. const int tid_mod = thread_id % n;
  58. const int tbsize_mod = LOCAL_WORK_SIZE % n;
  59. // Constant used in the computation of Heaviside values
  60. float one_over_e = 1.0f / e;
  61. // Iteratively compute the IMGVF matrix until the computation has
  62. // converged or we have reached the maximum number of iterations
  63. int iterations = 0;
  64. while ((! cell_converged) && (iterations < max_iterations)) {
  65. // The total change to this thread's matrix elements in the current iteration
  66. float total_diff = 0.0f;
  67. int old_i = 0, old_j = 0;
  68. j = tid_mod - tbsize_mod;
  69. // Iterate over virtual thread blocks
  70. for (thread_block = 0; thread_block < max; thread_block++) {
  71. // Store the index of this thread's previous matrix element
  72. // (used in the buffering scheme below)
  73. old_i = i;
  74. old_j = j;
  75. // Determine the index of this thread's current matrix element
  76. int offset = thread_block * LOCAL_WORK_SIZE;
  77. i = (thread_id + offset) * one_nth;
  78. j += tbsize_mod;
  79. if (j >= n) j -= n;
  80. float new_val = 0.0f, old_val = 0.0f;
  81. // Make sure the thread has not gone off the end of the matrix
  82. if (i < m) {
  83. // Compute neighboring matrix element indices
  84. int rowU = (i == 0) ? 0 : i - 1;
  85. int rowD = (i == m - 1) ? m - 1 : i + 1;
  86. int colL = (j == 0) ? 0 : j - 1;
  87. int colR = (j == n - 1) ? n - 1 : j + 1;
  88. // Compute the difference between the matrix element and its eight neighbors
  89. old_val = IMGVF[(i * n) + j];
  90. float U = IMGVF[(rowU * n) + j ] - old_val;
  91. float D = IMGVF[(rowD * n) + j ] - old_val;
  92. float L = IMGVF[(i * n) + colL] - old_val;
  93. float R = IMGVF[(i * n) + colR] - old_val;
  94. float UR = IMGVF[(rowU * n) + colR] - old_val;
  95. float DR = IMGVF[(rowD * n) + colR] - old_val;
  96. float UL = IMGVF[(rowU * n) + colL] - old_val;
  97. float DL = IMGVF[(rowD * n) + colL] - old_val;
  98. // Compute the regularized heaviside value for these differences
  99. float UHe = heaviside((U * -vy) * one_over_e);
  100. float DHe = heaviside((D * vy) * one_over_e);
  101. float LHe = heaviside((L * -vx ) * one_over_e);
  102. float RHe = heaviside((R * vx ) * one_over_e);
  103. float URHe = heaviside((UR * ( vx - vy)) * one_over_e);
  104. float DRHe = heaviside((DR * ( vx + vy)) * one_over_e);
  105. float ULHe = heaviside((UL * (-vx - vy)) * one_over_e);
  106. float DLHe = heaviside((DL * (-vx + vy)) * one_over_e);
  107. // Update the IMGVF value in two steps:
  108. // 1) Compute IMGVF += (mu / lambda)(UHe .*U + DHe .*D + LHe .*L + RHe .*R +
  109. // URHe.*UR + DRHe.*DR + ULHe.*UL + DLHe.*DL);
  110. new_val = old_val + (MU / LAMBDA) * (UHe * U + DHe * D + LHe * L + RHe * R +
  111. URHe * UR + DRHe * DR + ULHe * UL + DLHe * DL);
  112. // 2) Compute IMGVF -= (1 / lambda)(I .* (IMGVF - I))
  113. float vI = I[(i * n) + j];
  114. new_val -= ((1.0f / LAMBDA) * vI * (new_val - vI));
  115. }
  116. // Save the previous virtual thread block's value (if it exists)
  117. if (thread_block > 0) {
  118. offset = (thread_block - 1) * LOCAL_WORK_SIZE;
  119. if (old_i < m) IMGVF[(old_i * n) + old_j] = buffer[thread_id];
  120. }
  121. if (thread_block < max - 1) {
  122. // Write the new value to the buffer
  123. buffer[thread_id] = new_val;
  124. } else {
  125. // We've reached the final virtual thread block,
  126. // so write directly to the matrix
  127. if (i < m) IMGVF[(i * n) + j] = new_val;
  128. }
  129. // Keep track of the total change of this thread's matrix elements
  130. total_diff += fabs(new_val - old_val);
  131. // We need to synchronize between virtual thread blocks to prevent
  132. // threads from writing the values from the buffer to the actual
  133. // IMGVF matrix too early
  134. barrier(CLK_LOCAL_MEM_FENCE);
  135. }
  136. // We need to compute the overall sum of the change at each matrix element
  137. // by performing a tree reduction across the whole threadblock
  138. buffer[thread_id] = total_diff;
  139. barrier(CLK_LOCAL_MEM_FENCE);
  140. // Account for thread block sizes that are not a power of 2
  141. if (thread_id >= NEXT_LOWEST_POWER_OF_TWO) {
  142. buffer[thread_id - NEXT_LOWEST_POWER_OF_TWO] += buffer[thread_id];
  143. }
  144. barrier(CLK_LOCAL_MEM_FENCE);
  145. // Perform the tree reduction
  146. int th;
  147. for (th = NEXT_LOWEST_POWER_OF_TWO / 2; th > 0; th /= 2) {
  148. if (thread_id < th) {
  149. buffer[thread_id] += buffer[thread_id + th];
  150. }
  151. barrier(CLK_LOCAL_MEM_FENCE);
  152. }
  153. // Figure out if we have converged
  154. if(thread_id == 0) {
  155. float mean = buffer[thread_id] / (float) (m * n);
  156. if (mean < cutoff) {
  157. // We have converged, so set the appropriate flag
  158. cell_converged = 1;
  159. }
  160. }
  161. // We need to synchronize to ensure that all threads
  162. // read the correct value of the convergence flag
  163. barrier(CLK_LOCAL_MEM_FENCE);
  164. // Keep track of the number of iterations we have performed
  165. iterations++;
  166. }
  167. // Save the final IMGVF matrix to global memory
  168. for (thread_block = 0; thread_block < max; thread_block++) {
  169. int offset = thread_block * LOCAL_WORK_SIZE;
  170. i = (thread_id + offset) / n;
  171. j = (thread_id + offset) % n;
  172. if (i < m) IMGVF_global[(i * n) + j] = IMGVF[(i * n) + j];
  173. }
  174. // if (thread_id == 0) IMGVF_global[0] = (float) iterations;
  175. }