- #ifdef DOUBLE_PRECISION
- #pragma OPENCL EXTENSION cl_khr_fp64: enable
- #define FP_TYPE double
- #define FP_CONST(num) num
- #else
- #define FP_TYPE float
- #define FP_CONST(num) num##f
- #endif
- // Constants used in the MGVF computation
- #define PI FP_CONST(3.14159)
- #define ONE_OVER_PI (FP_CONST(1.0) / PI)
- #define MU FP_CONST(0.5)
- #define LAMBDA (FP_CONST(8.0) * MU + FP_CONST(1.0))
- // The number of threads per thread block
- #define LOCAL_WORK_SIZE 256
- #define NEXT_LOWEST_POWER_OF_TWO 256
- // Regularized version of the Heaviside step function:
- // He(x) = (atan(x) / pi) + 0.5
- FP_TYPE heaviside(FP_TYPE x) {
- return (atan(x) * ONE_OVER_PI) + FP_CONST(0.5);
- // A simpler, faster approximation of the Heaviside function
- /* float out = 0.0;
- if (x > -0.0001) out = 0.5;
- if (x > 0.0001) out = 1.0;
- return out; */
- }
- // Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
- __kernel void IMGVF_kernel(__global FP_TYPE *IMGVF_array, __global FP_TYPE *I_array, __constant int *I_offsets, int __constant *m_array,
- __constant int *n_array, FP_TYPE vx, FP_TYPE vy, FP_TYPE e, int max_iterations, FP_TYPE cutoff) {
-
- // Shared copy of the matrix being computed
- __local FP_TYPE IMGVF[41 * 81];
-
- // Shared buffer used for two purposes:
- // 1) To temporarily store newly computed matrix values so that only
- // values from the previous iteration are used in the computation.
- // 2) To store partial sums during the tree reduction which is performed
- // at the end of each iteration to determine if the computation has converged.
- __local FP_TYPE buffer[LOCAL_WORK_SIZE];
-
- // Figure out which cell this thread block is working on
- int cell_num = get_group_id(0);
-
- // Get pointers to current cell's input image and inital matrix
- int I_offset = I_offsets[cell_num];
- __global FP_TYPE *IMGVF_global = &(IMGVF_array[I_offset]);
- __global FP_TYPE *I = &(I_array[I_offset]);
-
- // Get current cell's matrix dimensions
- int m = m_array[cell_num];
- int n = n_array[cell_num];
-
- // Compute the number of virtual thread blocks
- int IMGVF_Size = m * n;
- int tb_count = (IMGVF_Size + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;
-
- // Load the initial IMGVF matrix into shared memory
- int thread_id = get_local_id(0);
- int thread_block, i, j;
- for (thread_block = 0; thread_block < tb_count; thread_block++) {
- int offset = thread_block * LOCAL_WORK_SIZE + thread_id;
- if (offset < IMGVF_Size)
- IMGVF[offset] = IMGVF_global[offset];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Set the converged flag to false
- __local int cell_converged;
- if (thread_id == 0) cell_converged = 0;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Constants used to iterate through virtual thread blocks
- const float one_nth = 1.0f / (float) n;
- const int tid_mod = thread_id % n;
- const int tbsize_mod = LOCAL_WORK_SIZE % n;
-
- // Constant used in the computation of Heaviside values
- FP_TYPE one_over_e = FP_CONST(1.0) / e;
-
- // Iteratively compute the IMGVF matrix until the computation has
- // converged or we have reached the maximum number of iterations
- int iterations = 0;
- while ((! cell_converged) && (iterations < max_iterations)) {
-
- // The total change to this thread's matrix elements in the current iteration
- FP_TYPE total_diff = FP_CONST(0.0);
-
- int old_i = 0, old_j = 0;
- j = tid_mod - tbsize_mod;
-
- // Iterate over virtual thread blocks
- for (thread_block = 0; thread_block < tb_count; thread_block++) {
- // Store the index of this thread's previous matrix element
- // (used in the buffering scheme below)
- old_i = i;
- old_j = j;
-
- // Determine the index of this thread's current matrix element
- int offset = thread_block * LOCAL_WORK_SIZE;
- i = (thread_id + offset) * one_nth;
- j += tbsize_mod;
- if (j >= n) j -= n;
-
- FP_TYPE new_val, old_val;
-
- // Make sure the thread has not gone off the end of the matrix
- if (i < m) {
- // Compute neighboring matrix element indices
- int rowU = (i == 0) ? 0 : i - 1;
- int rowD = (i == m - 1) ? m - 1 : i + 1;
- int colL = (j == 0) ? 0 : j - 1;
- int colR = (j == n - 1) ? n - 1 : j + 1;
-
- // Compute the difference between the matrix element and its eight neighbors
- old_val = IMGVF[(i * n) + j];
- FP_TYPE U = IMGVF[(rowU * n) + j ] - old_val;
- FP_TYPE D = IMGVF[(rowD * n) + j ] - old_val;
- FP_TYPE L = IMGVF[(i * n) + colL] - old_val;
- FP_TYPE R = IMGVF[(i * n) + colR] - old_val;
- FP_TYPE UR = IMGVF[(rowU * n) + colR] - old_val;
- FP_TYPE DR = IMGVF[(rowD * n) + colR] - old_val;
- FP_TYPE UL = IMGVF[(rowU * n) + colL] - old_val;
- FP_TYPE DL = IMGVF[(rowD * n) + colL] - old_val;
-
- // Compute the regularized heaviside value for these differences
- FP_TYPE UHe = heaviside((U * -vy) * one_over_e);
- FP_TYPE DHe = heaviside((D * vy) * one_over_e);
- FP_TYPE LHe = heaviside((L * -vx ) * one_over_e);
- FP_TYPE RHe = heaviside((R * vx ) * one_over_e);
- FP_TYPE URHe = heaviside((UR * ( vx - vy)) * one_over_e);
- FP_TYPE DRHe = heaviside((DR * ( vx + vy)) * one_over_e);
- FP_TYPE ULHe = heaviside((UL * (-vx - vy)) * one_over_e);
- FP_TYPE DLHe = heaviside((DL * (-vx + vy)) * one_over_e);
-
- // Update the IMGVF value in two steps:
- // 1) Compute IMGVF += (mu / lambda)(UHe .*U + DHe .*D + LHe .*L + RHe .*R +
- // URHe.*UR + DRHe.*DR + ULHe.*UL + DLHe.*DL);
- new_val = old_val + (MU / LAMBDA) * (UHe * U + DHe * D + LHe * L + RHe * R +
- URHe * UR + DRHe * DR + ULHe * UL + DLHe * DL);
- // 2) Compute IMGVF -= (1 / lambda)(I .* (IMGVF - I))
- FP_TYPE vI = I[(i * n) + j];
- new_val -= ((1.0 / LAMBDA) * vI * (new_val - vI));
- // Keep track of the total change of this thread's matrix elements
- total_diff += fabs(new_val - old_val);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- // Save the previous virtual thread block's value (if it exists)
- if (thread_block > 0) {
- offset = (thread_block - 1) * LOCAL_WORK_SIZE;
- if (old_i < m) IMGVF[(old_i * n) + old_j] = buffer[thread_id];
- }
- if (thread_block < tb_count - 1) {
- // Write the new value to the buffer
- buffer[thread_id] = new_val;
- } else {
- // We've reached the final virtual thread block,
- // so write directly to the matrix
- if (i < m) IMGVF[(i * n) + j] = new_val;
- }
-
- // We need to synchronize between virtual thread blocks to prevent
- // threads from writing the values from the buffer to the actual
- // IMGVF matrix too early
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- // We need to compute the overall sum of the change at each matrix element
- // by performing a tree reduction across the whole threadblock
- buffer[thread_id] = total_diff;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Account for thread block sizes that are not a power of 2
- if (thread_id >= NEXT_LOWEST_POWER_OF_TWO) {
- buffer[thread_id - NEXT_LOWEST_POWER_OF_TWO] += buffer[thread_id];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Perform the tree reduction
- int th;
- for (th = NEXT_LOWEST_POWER_OF_TWO / 2; th > 0; th /= 2) {
- if (thread_id < th) {
- buffer[thread_id] += buffer[thread_id + th];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- // Figure out if we have converged
- if(thread_id == 0) {
- FP_TYPE mean = buffer[thread_id] / (FP_TYPE) (m * n);
- if (mean < cutoff) {
- // We have converged, so set the appropriate flag
- cell_converged = 1;
- }
- }
-
- // We need to synchronize to ensure that all threads
- // read the correct value of the convergence flag
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Keep track of the number of iterations we have performed
- iterations++;
- }
-
- // Save the final IMGVF matrix to global memory
- for (thread_block = 0; thread_block < tb_count; thread_block++) {
- int offset = thread_block * LOCAL_WORK_SIZE + thread_id;
- if (offset < IMGVF_Size)
- IMGVF_global[offset] = IMGVF[offset];
- }
- }