// Constants used in the MGVF computation
#define PI 3.14159f
#define ONE_OVER_PI (1.0f / PI)
#define MU 0.5f
#define LAMBDA (8.0f * MU + 1.0f)

// The number of threads per thread block
#define LOCAL_WORK_SIZE 256
#define NEXT_LOWEST_POWER_OF_TWO 256


// Regularized version of the Heaviside step function:
//  He(x) = (atan(x) / pi) + 0.5
float heaviside(float x) {
	return (atan(x) * ONE_OVER_PI) + 0.5f;

	// A simpler, faster approximation of the Heaviside function
	/*
	float out = 0.0;
	if (x > -0.0001) out = 0.5;
	if (x >  0.0001) out = 1.0;
	return out;
	*/
}


// Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
__kernel void IMGVF_kernel(__global float *IMGVF_array, __global float *I_array,
                           __constant int *I_offsets, __constant int *m_array, __constant int *n_array,
                           float vx, float vy, float e, int max_iterations, float cutoff) {

	// Shared copy of the matrix being computed
	__local float IMGVF[41 * 81];

	// Shared buffer used for two purposes:
	//  1) To temporarily store newly computed matrix values so that only
	//     values from the previous iteration are used in the computation.
	//  2) To store partial sums during the tree reduction which is performed
	//     at the end of each iteration to determine if the computation has converged.
	__local float buffer[LOCAL_WORK_SIZE];

	// Figure out which cell this thread block is working on
	int cell_num = get_group_id(0);

	// Get pointers to the current cell's input image and initial matrix
	int I_offset = I_offsets[cell_num];
	__global float *IMGVF_global = &(IMGVF_array[I_offset]);
	__global float *I = &(I_array[I_offset]);

	// Get the current cell's matrix dimensions
	int m = m_array[cell_num];
	int n = n_array[cell_num];

	// Compute the number of virtual thread blocks
	int max = (m * n + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;

	// Load the initial IMGVF matrix into shared memory
	int thread_id = get_local_id(0);
	int thread_block, i, j;
	for (thread_block = 0; thread_block < max; thread_block++) {
		int offset = thread_block * LOCAL_WORK_SIZE;
		i = (thread_id + offset) / n;
		j = (thread_id + offset) % n;
		if (i < m) IMGVF[(i * n) + j] = IMGVF_global[(i * n) + j];
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// Set the converged flag to false
	__local int cell_converged;
	if (thread_id == 0) cell_converged = 0;
	barrier(CLK_LOCAL_MEM_FENCE);

	// Constants used to iterate through virtual thread blocks
	const float one_nth = 1.0f / (float) n;
	const int tid_mod = thread_id % n;
	const int tbsize_mod = LOCAL_WORK_SIZE % n;

	// Constant used in the computation of Heaviside values
	float one_over_e = 1.0f / e;

	// Iteratively compute the IMGVF matrix until the computation has
	//  converged or we have reached the maximum number of iterations
	int iterations = 0;
	while ((! cell_converged) && (iterations < max_iterations)) {

		// The total change to this thread's matrix elements in the current iteration
		float total_diff = 0.0f;

		int old_i = 0, old_j = 0;
		j = tid_mod - tbsize_mod;

		// Iterate over virtual thread blocks
		for (thread_block = 0; thread_block < max; thread_block++) {
			// Store the index of this thread's previous matrix element
			//  (used in the buffering scheme below)
			old_i = i;
			old_j = j;

			// Determine the index of this thread's current matrix element
			int offset = thread_block * LOCAL_WORK_SIZE;
			i = (thread_id + offset) * one_nth;
			j += tbsize_mod;
			if (j >= n) j -= n;

			float new_val = 0.0f, old_val = 0.0f;

			// Make sure the thread has not gone off the end of the matrix
			if (i < m) {
				// Compute neighboring matrix element indices
				int rowU = (i == 0) ? 0 : i - 1;
				int rowD = (i == m - 1) ? m - 1 : i + 1;
				int colL = (j == 0) ? 0 : j - 1;
				int colR = (j == n - 1) ? n - 1 : j + 1;

				// Compute the difference between the matrix element and its eight neighbors
				old_val = IMGVF[(i * n) + j];
				float U  = IMGVF[(rowU * n) + j   ] - old_val;
				float D  = IMGVF[(rowD * n) + j   ] - old_val;
				float L  = IMGVF[(i    * n) + colL] - old_val;
				float R  = IMGVF[(i    * n) + colR] - old_val;
				float UR = IMGVF[(rowU * n) + colR] - old_val;
				float DR = IMGVF[(rowD * n) + colR] - old_val;
				float UL = IMGVF[(rowU * n) + colL] - old_val;
				float DL = IMGVF[(rowD * n) + colL] - old_val;

				// Compute the regularized heaviside value for these differences
				float UHe  = heaviside((U  *       -vy)  * one_over_e);
				float DHe  = heaviside((D  *        vy)  * one_over_e);
				float LHe  = heaviside((L  *  -vx      ) * one_over_e);
				float RHe  = heaviside((R  *   vx      ) * one_over_e);
				float URHe = heaviside((UR * ( vx - vy)) * one_over_e);
				float DRHe = heaviside((DR * ( vx + vy)) * one_over_e);
				float ULHe = heaviside((UL * (-vx - vy)) * one_over_e);
				float DLHe = heaviside((DL * (-vx + vy)) * one_over_e);

				// Update the IMGVF value in two steps:
				// 1) Compute IMGVF += (mu / lambda)(UHe .*U  + DHe .*D  + LHe .*L  + RHe .*R +
				//                                   URHe.*UR + DRHe.*DR + ULHe.*UL + DLHe.*DL);
				new_val = old_val + (MU / LAMBDA) * (UHe  * U  + DHe  * D  + LHe  * L  + RHe  * R +
				                                     URHe * UR + DRHe * DR + ULHe * UL + DLHe * DL);

				// 2) Compute IMGVF -= (1 / lambda)(I .* (IMGVF - I))
				float vI = I[(i * n) + j];
				new_val -= ((1.0f / LAMBDA) * vI * (new_val - vI));
			}

			// Save the previous virtual thread block's value (if it exists)
			if (thread_block > 0) {
				offset = (thread_block - 1) * LOCAL_WORK_SIZE;
				if (old_i < m) IMGVF[(old_i * n) + old_j] = buffer[thread_id];
			}
			if (thread_block < max - 1) {
				// Write the new value to the buffer
				buffer[thread_id] = new_val;
			} else {
				// We've reached the final virtual thread block,
				//  so write directly to the matrix
				if (i < m) IMGVF[(i * n) + j] = new_val;
			}

			// Keep track of the total change of this thread's matrix elements
			total_diff += fabs(new_val - old_val);

			// We need to synchronize between virtual thread blocks to prevent
			//  threads from writing the values from the buffer to the actual
			//  IMGVF matrix too early
			barrier(CLK_LOCAL_MEM_FENCE);
		}

		// We need to compute the overall sum of the change at each matrix element
		//  by performing a tree reduction across the whole threadblock
		buffer[thread_id] = total_diff;
		barrier(CLK_LOCAL_MEM_FENCE);

		// Account for thread block sizes that are not a power of 2
		if (thread_id >= NEXT_LOWEST_POWER_OF_TWO) {
			buffer[thread_id - NEXT_LOWEST_POWER_OF_TWO] += buffer[thread_id];
		}
		barrier(CLK_LOCAL_MEM_FENCE);

		// Perform the tree reduction
		int th;
		for (th = NEXT_LOWEST_POWER_OF_TWO / 2; th > 0; th /= 2) {
			if (thread_id < th) {
				buffer[thread_id] += buffer[thread_id + th];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		}

		// Figure out if we have converged
		if (thread_id == 0) {
			float mean = buffer[thread_id] / (float) (m * n);
			if (mean < cutoff) {
				// We have converged, so set the appropriate flag
				cell_converged = 1;
			}
		}

		// We need to synchronize to ensure that all threads
		//  read the correct value of the convergence flag
		barrier(CLK_LOCAL_MEM_FENCE);

		// Keep track of the number of iterations we have performed
		iterations++;
	}

	// Save the final IMGVF matrix to global memory
	for (thread_block = 0; thread_block < max; thread_block++) {
		int offset = thread_block * LOCAL_WORK_SIZE;
		i = (thread_id + offset) / n;
		j = (thread_id + offset) % n;
		if (i < m) IMGVF_global[(i * n) + j] = IMGVF[(i * n) + j];
	}

	// if (thread_id == 0) IMGVF_global[0] = (float) iterations;
}
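

/*
 * Host-side launch sketch (illustrative only; kept in a comment so this file
 * still compiles as OpenCL C). It shows how the NDRange is expected to be
 * sized: one work-group of LOCAL_WORK_SIZE work-items per cell, with the
 * group ID selecting the cell. The names used below (kernel, queue, d_IMGVF,
 * d_I, d_offsets, d_m, d_n, num_cells, vx, vy, e, max_iterations, cutoff)
 * are assumptions for this example and do not come from this file; the
 * benchmark's actual host code may differ.
 *
 *     size_t local_work_size  = 256;                          // must equal LOCAL_WORK_SIZE
 *     size_t global_work_size = local_work_size * num_cells;  // one work-group per cell
 *
 *     clSetKernelArg(kernel, 0, sizeof(cl_mem),   &d_IMGVF);   // IMGVF_array
 *     clSetKernelArg(kernel, 1, sizeof(cl_mem),   &d_I);       // I_array
 *     clSetKernelArg(kernel, 2, sizeof(cl_mem),   &d_offsets); // I_offsets
 *     clSetKernelArg(kernel, 3, sizeof(cl_mem),   &d_m);       // m_array
 *     clSetKernelArg(kernel, 4, sizeof(cl_mem),   &d_n);       // n_array
 *     clSetKernelArg(kernel, 5, sizeof(cl_float), &vx);
 *     clSetKernelArg(kernel, 6, sizeof(cl_float), &vy);
 *     clSetKernelArg(kernel, 7, sizeof(cl_float), &e);
 *     clSetKernelArg(kernel, 8, sizeof(cl_int),   &max_iterations);
 *     clSetKernelArg(kernel, 9, sizeof(cl_float), &cutoff);
 *
 *     clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
 *                            &global_work_size, &local_work_size,
 *                            0, NULL, NULL);
 */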