// Constants used in the MGVF computation
#define PI 3.14159f
#define ONE_OVER_PI (1.0f / PI)
#define MU 0.5f
#define LAMBDA (8.0f * MU + 1.0f)

// The number of threads per thread block
#define LOCAL_WORK_SIZE 256
#define NEXT_LOWEST_POWER_OF_TWO 256

// Regularized version of the Heaviside step function:
// He(x) = (atan(x) / pi) + 0.5
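// Because atan maps (-inf, +inf) onto (-pi/2, +pi/2), He(x) lies strictly
// between 0 and 1 and smoothly approximates the unit step (slope 1/pi at x = 0).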
float heaviside(float x) {
    return (atan(x) * ONE_OVER_PI) + 0.5f;
    // A simpler, faster approximation of the Heaviside function
    /* float out = 0.0;
       if (x > -0.0001) out = 0.5;
       if (x > 0.0001) out = 1.0;
       return out; */
}
// Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
__kernel void IMGVF_kernel(__global float *IMGVF_array, __global float *I_array,
                           __constant int *I_offsets, __constant int *m_array,
                           __constant int *n_array, float vx, float vy, float e,
                           int max_iterations, float cutoff) {

    // Shared copy of the matrix being computed
    __local float IMGVF[41 * 81];

    // Shared buffer used for two purposes:
    // 1) To temporarily store newly computed matrix values so that only
    //    values from the previous iteration are used in the computation.
    // 2) To store partial sums during the tree reduction which is performed
    //    at the end of each iteration to determine whether the computation has converged.
    __local float buffer[LOCAL_WORK_SIZE];

    // Figure out which cell this thread block is working on
    int cell_num = get_group_id(0);

    // Get pointers to the current cell's input image and initial IMGVF matrix
    int I_offset = I_offsets[cell_num];
    __global float *IMGVF_global = &(IMGVF_array[I_offset]);
    __global float *I = &(I_array[I_offset]);

    // Get the current cell's matrix dimensions
    int m = m_array[cell_num];
    int n = n_array[cell_num];

    // Compute the number of virtual thread blocks
    int max = (m * n + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;
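    // Each virtual thread block is one pass of the work-group over the matrix.
    // For example, a cell matrix at the maximum 41 x 81 size allocated above has
    // 3321 elements, so with 256 threads each thread makes ceil(3321 / 256) = 13 passes.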

    // Load the initial IMGVF matrix into shared memory
    int thread_id = get_local_id(0);
    int thread_block, i, j;
    for (thread_block = 0; thread_block < max; thread_block++) {
        int offset = thread_block * LOCAL_WORK_SIZE;
        i = (thread_id + offset) / n;
        j = (thread_id + offset) % n;
        if (i < m) IMGVF[(i * n) + j] = IMGVF_global[(i * n) + j];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Set the converged flag to false
    __local int cell_converged;
    if (thread_id == 0) cell_converged = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Constants used to iterate through virtual thread blocks
    const float one_nth = 1.0f / (float) n;
    const int tid_mod = thread_id % n;
    const int tbsize_mod = LOCAL_WORK_SIZE % n;
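    // These let the main loop derive each element's row with a multiply by 1/n
    // and advance its column incrementally (j += tbsize_mod, wrapped at n),
    // avoiding an integer division and modulo per element.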

    // Constant used in the computation of Heaviside values
    float one_over_e = 1.0f / e;

    // Iteratively compute the IMGVF matrix until the computation has
    // converged or we have reached the maximum number of iterations
    int iterations = 0;
    while ((!cell_converged) && (iterations < max_iterations)) {

        // The total change to this thread's matrix elements in the current iteration
        float total_diff = 0.0f;

        int old_i = 0, old_j = 0;
        j = tid_mod - tbsize_mod;

        // Iterate over virtual thread blocks
        for (thread_block = 0; thread_block < max; thread_block++) {
            // Store the index of this thread's previous matrix element
            // (used in the buffering scheme below)
            old_i = i;
            old_j = j;

            // Determine the index of this thread's current matrix element
            int offset = thread_block * LOCAL_WORK_SIZE;
            i = (thread_id + offset) * one_nth;
            j += tbsize_mod;
            if (j >= n) j -= n;

            float new_val = 0.0f, old_val = 0.0f;

            // Make sure the thread has not gone off the end of the matrix
            if (i < m) {
                // Compute neighboring matrix element indices
                int rowU = (i == 0) ? 0 : i - 1;
                int rowD = (i == m - 1) ? m - 1 : i + 1;
                int colL = (j == 0) ? 0 : j - 1;
                int colR = (j == n - 1) ? n - 1 : j + 1;

                // Compute the difference between the matrix element and its eight neighbors
                old_val = IMGVF[(i * n) + j];
                float U  = IMGVF[(rowU * n) + j   ] - old_val;
                float D  = IMGVF[(rowD * n) + j   ] - old_val;
                float L  = IMGVF[(i    * n) + colL] - old_val;
                float R  = IMGVF[(i    * n) + colR] - old_val;
                float UR = IMGVF[(rowU * n) + colR] - old_val;
                float DR = IMGVF[(rowD * n) + colR] - old_val;
                float UL = IMGVF[(rowU * n) + colL] - old_val;
                float DL = IMGVF[(rowD * n) + colL] - old_val;

                // Compute the regularized Heaviside value for each of these differences
                float UHe  = heaviside((U  * -vy)        * one_over_e);
                float DHe  = heaviside((D  *  vy)        * one_over_e);
                float LHe  = heaviside((L  * -vx)        * one_over_e);
                float RHe  = heaviside((R  *  vx)        * one_over_e);
                float URHe = heaviside((UR * ( vx - vy)) * one_over_e);
                float DRHe = heaviside((DR * ( vx + vy)) * one_over_e);
                float ULHe = heaviside((UL * (-vx - vy)) * one_over_e);
                float DLHe = heaviside((DL * (-vx + vy)) * one_over_e);

                // Update the IMGVF value in two steps:
                // 1) Compute IMGVF += (mu / lambda)(UHe .* U + DHe .* D + LHe .* L + RHe .* R +
                //                                   URHe .* UR + DRHe .* DR + ULHe .* UL + DLHe .* DL)
                new_val = old_val + (MU / LAMBDA) * (UHe  * U  + DHe  * D  + LHe  * L  + RHe  * R +
                                                     URHe * UR + DRHe * DR + ULHe * UL + DLHe * DL);
                // 2) Compute IMGVF -= (1 / lambda)(I .* (IMGVF - I))
                float vI = I[(i * n) + j];
                new_val -= ((1.0f / LAMBDA) * vI * (new_val - vI));
            }

            // Save the previous virtual thread block's value (if it exists)
            if (thread_block > 0) {
                offset = (thread_block - 1) * LOCAL_WORK_SIZE;
                if (old_i < m) IMGVF[(old_i * n) + old_j] = buffer[thread_id];
            }
            if (thread_block < max - 1) {
                // Write the new value to the buffer
                buffer[thread_id] = new_val;
            } else {
                // We've reached the final virtual thread block,
                // so write directly to the matrix
                if (i < m) IMGVF[(i * n) + j] = new_val;
            }

            // Keep track of the total change of this thread's matrix elements
            total_diff += fabs(new_val - old_val);

            // We need to synchronize between virtual thread blocks to prevent
            // threads from writing the values from the buffer to the actual
            // IMGVF matrix too early
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        // Compute the overall sum of the changes to the matrix elements
        // by performing a tree reduction across the whole thread block
        buffer[thread_id] = total_diff;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Account for thread block sizes that are not a power of two
        if (thread_id >= NEXT_LOWEST_POWER_OF_TWO) {
            buffer[thread_id - NEXT_LOWEST_POWER_OF_TWO] += buffer[thread_id];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
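        // Note: with LOCAL_WORK_SIZE equal to NEXT_LOWEST_POWER_OF_TWO, the fold
        // above never executes; it only matters if the work-group size is raised
        // above the reduction width.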

        // Perform the tree reduction
        int th;
        for (th = NEXT_LOWEST_POWER_OF_TWO / 2; th > 0; th /= 2) {
            if (thread_id < th) {
                buffer[thread_id] += buffer[thread_id + th];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
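        // buffer[0] now holds the sum of total_diff over the entire work-group,
        // i.e. the total absolute change across the whole matrix this iteration.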

        // Check whether the computation has converged
        if (thread_id == 0) {
            float mean = buffer[thread_id] / (float) (m * n);
            if (mean < cutoff) {
                // The computation has converged, so set the appropriate flag
                cell_converged = 1;
            }
        }

        // We need to synchronize to ensure that all threads
        // read the correct value of the convergence flag
        barrier(CLK_LOCAL_MEM_FENCE);

        // Keep track of the number of iterations we have performed
        iterations++;
    }

    // Save the final IMGVF matrix to global memory
    for (thread_block = 0; thread_block < max; thread_block++) {
        int offset = thread_block * LOCAL_WORK_SIZE;
        i = (thread_id + offset) / n;
        j = (thread_id + offset) % n;
        if (i < m) IMGVF_global[(i * n) + j] = IMGVF[(i * n) + j];
    }

    // if (thread_id == 0) IMGVF_global[0] = (float) iterations;
}
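
/*
 * Host-side launch sketch (illustrative only; the actual host code is not
 * shown here). Because the kernel indexes cells by get_group_id(0), it
 * expects exactly one work-group per cell, each LOCAL_WORK_SIZE threads wide.
 * The names used below (queue, kernel, num_cells, and the cl_mem buffers)
 * are assumptions made for this sketch:
 *
 *   size_t local_size  = 256;                     // must match LOCAL_WORK_SIZE
 *   size_t global_size = num_cells * local_size;  // one work-group per cell
 *
 *   clSetKernelArg(kernel, 0, sizeof(cl_mem),   &IMGVF_buf);
 *   clSetKernelArg(kernel, 1, sizeof(cl_mem),   &I_buf);
 *   clSetKernelArg(kernel, 2, sizeof(cl_mem),   &I_offsets_buf);
 *   clSetKernelArg(kernel, 3, sizeof(cl_mem),   &m_buf);
 *   clSetKernelArg(kernel, 4, sizeof(cl_mem),   &n_buf);
 *   clSetKernelArg(kernel, 5, sizeof(cl_float), &vx);
 *   clSetKernelArg(kernel, 6, sizeof(cl_float), &vy);
 *   clSetKernelArg(kernel, 7, sizeof(cl_float), &e);
 *   clSetKernelArg(kernel, 8, sizeof(cl_int),   &max_iterations);
 *   clSetKernelArg(kernel, 9, sizeof(cl_float), &cutoff);
 *
 *   clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size,
 *                          0, NULL, NULL);
 */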