andreagus
/
rodinia-benchmark


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
							#ifdef __cplusplus
extern "C" {
#endif

//========================================================================================================================================================================================================200
//	INCLUDE/DEFINE (had to bring from ./../main.h here because feature of including headers in clBuildProgram does not work for some reason)
//========================================================================================================================================================================================================200

// #include <main.h>						// (in the directory SOMEHOW known to the OpenCL compiler function)

#define fp float

#define NUMBER_PAR_PER_BOX 100							// keep this low to allow more blocks that share shared memory to run concurrently, code does not work for larger than 110, more speedup can be achieved with larger number and no shared memory used

#define NUMBER_THREADS 128								// this should be roughly equal to NUMBER_PAR_PER_BOX for best performance

#define DOT(A,B) ((A.x)*(B.x)+(A.y)*(B.y)+(A.z)*(B.z))	// STABLE

//===============================================================================================================================================================================================================200
//	STRUCTURES (had to bring from ./../main.h here because feature of including headers in clBuildProgram does not work for some reason)
//===============================================================================================================================================================================================================200

typedef struct
{
	fp x, y, z;

} THREE_VECTOR;

typedef struct
{
	fp v, x, y, z;

} FOUR_VECTOR;

typedef struct nei_str
{

	// neighbor box
	int x, y, z;
	int number;
	long offset;

} nei_str;

typedef struct box_str
{

	// home box
	int x, y, z;
	int number;
	long offset;

	// neighbor boxes
	int nn;
	nei_str nei[26];

} box_str;

typedef struct par_str
{

	fp alpha;

} par_str;

typedef struct dim_str
{

	// input arguments
	int cur_arg;
	int arch_arg;
	int cores_arg;
	int boxes1d_arg;

	// system memory
	long number_boxes;
	long box_mem;
	long space_elem;
	long space_mem;
	long space_mem2;

} dim_str;

//========================================================================================================================================================================================================200
//	kernel_gpu_opencl KERNEL
//========================================================================================================================================================================================================200

__kernel void kernel_gpu_opencl(	par_str d_par_gpu,
					dim_str d_dim_gpu,
					__global box_str *d_box_gpu,
					__global FOUR_VECTOR *d_rv_gpu,
					__global fp *d_qv_gpu,
					__global FOUR_VECTOR *d_fv_gpu)
{

	//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
	//	THREAD PARAMETERS
	//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180

	int bx = get_group_id(0);															// get current horizontal block index (0-n)
	int tx = get_local_id(0);															// get current horizontal thread index (0-n)
	int wtx = tx;

	//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
	//	DO FOR THE NUMBER OF BOXES
	//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180

	if(bx<d_dim_gpu.number_boxes){

		//------------------------------------------------------------------------------------------------------------------------------------------------------160
		//	Extract input parameters
		//------------------------------------------------------------------------------------------------------------------------------------------------------160

		// parameters
		fp a2 = 2*d_par_gpu.alpha*d_par_gpu.alpha;

		// home box
		int first_i;
		// (enable the line below only if wanting to use shared memory)
		__local FOUR_VECTOR rA_shared[100];

		// nei box
		int pointer;
		int k = 0;
		int first_j;
		int j = 0;
		// (enable the two lines below only if wanting to use shared memory)
		__local FOUR_VECTOR rB_shared[100];
		__local fp qB_shared[100];

		// common
		fp r2;
		fp u2;
		fp vij;
		fp fs;
		fp fxij;
		fp fyij;
		fp fzij;
		THREE_VECTOR d;

		//------------------------------------------------------------------------------------------------------------------------------------------------------160
		//	Home box
		//------------------------------------------------------------------------------------------------------------------------------------------------------160

		//----------------------------------------------------------------------------------------------------------------------------------140
		//	Setup parameters
		//----------------------------------------------------------------------------------------------------------------------------------140

		// home box - box parameters
		first_i = d_box_gpu[bx].offset;

		//----------------------------------------------------------------------------------------------------------------------------------140
		//	Copy to shared memory
		//----------------------------------------------------------------------------------------------------------------------------------140

		// (enable the section below only if wanting to use shared memory)
		// home box - shared memory
		while(wtx<NUMBER_PAR_PER_BOX){
			rA_shared[wtx] = d_rv_gpu[first_i+wtx];
			wtx = wtx + NUMBER_THREADS;
		}
		wtx = tx;

		// (enable the section below only if wanting to use shared memory)
		// synchronize threads  - not needed, but just to be safe for now
		barrier(CLK_LOCAL_MEM_FENCE);

		//------------------------------------------------------------------------------------------------------------------------------------------------------160
		//	nei box loop
		//------------------------------------------------------------------------------------------------------------------------------------------------------160

		// loop over nei boxes of home box
		for (k=0; k<(1+d_box_gpu[bx].nn); k++){

			//----------------------------------------50
			//	nei box - get pointer to the right box
			//----------------------------------------50

			if(k==0){
				pointer = bx;													// set first box to be processed to home box
			}
			else{
				pointer = d_box_gpu[bx].nei[k-1].number;							// remaining boxes are nei boxes
			}

			//----------------------------------------------------------------------------------------------------------------------------------140
			//	Setup parameters
			//----------------------------------------------------------------------------------------------------------------------------------140

			// nei box - box parameters
			first_j = d_box_gpu[pointer].offset;

			//----------------------------------------------------------------------------------------------------------------------------------140
			//	Setup parameters
			//----------------------------------------------------------------------------------------------------------------------------------140

			// (enable the section below only if wanting to use shared memory)
			// nei box - shared memory
			while(wtx<NUMBER_PAR_PER_BOX){
				rB_shared[wtx] = d_rv_gpu[first_j+wtx];
				qB_shared[wtx] = d_qv_gpu[first_j+wtx];
				wtx = wtx + NUMBER_THREADS;
			}
			wtx = tx;

			// (enable the section below only if wanting to use shared memory)
			// synchronize threads because in next section each thread accesses data brought in by different threads here
			barrier(CLK_LOCAL_MEM_FENCE);

			//----------------------------------------------------------------------------------------------------------------------------------140
			//	Calculation
			//----------------------------------------------------------------------------------------------------------------------------------140

			// loop for the number of particles in the home box
			while(wtx<NUMBER_PAR_PER_BOX){

				// loop for the number of particles in the current nei box
				for (j=0; j<NUMBER_PAR_PER_BOX; j++){

					// (disable the section below only if wanting to use shared memory)
					// r2 = d_rv_gpu[first_i+wtx].v + d_rv_gpu[first_j+j].v - DOT(d_rv_gpu[first_i+wtx],d_rv_gpu[first_j+j]); 
					// u2 = a2*r2;
					// vij= exp(-u2);
					// fs = 2*vij;
					// d.x = d_rv_gpu[first_i+wtx].x  - d_rv_gpu[first_j+j].x;
					// fxij=fs*d.x;
					// d.y = d_rv_gpu[first_i+wtx].y  - d_rv_gpu[first_j+j].y;
					// fyij=fs*d.y;
					// d.z = d_rv_gpu[first_i+wtx].z  - d_rv_gpu[first_j+j].z;
					// fzij=fs*d.z;
					// d_fv_gpu[first_i+wtx].v +=  d_qv_gpu[first_j+j]*vij;
					// d_fv_gpu[first_i+wtx].x +=  d_qv_gpu[first_j+j]*fxij;
					// d_fv_gpu[first_i+wtx].y +=  d_qv_gpu[first_j+j]*fyij;
					// d_fv_gpu[first_i+wtx].z +=  d_qv_gpu[first_j+j]*fzij;

					// (enable the section below only if wanting to use shared memory)
					r2 = rA_shared[wtx].v + rB_shared[j].v - DOT(rA_shared[wtx],rB_shared[j]); 
					u2 = a2*r2;
					vij= exp(-u2);
					fs = 2*vij;
					d.x = rA_shared[wtx].x  - rB_shared[j].x;
					fxij=fs*d.x;
					d.y = rA_shared[wtx].y  - rB_shared[j].y;
					fyij=fs*d.y;
					d.z = rA_shared[wtx].z  - rB_shared[j].z;
					fzij=fs*d.z;
					d_fv_gpu[first_i+wtx].v +=  qB_shared[j]*vij;
					d_fv_gpu[first_i+wtx].x +=  qB_shared[j]*fxij;
					d_fv_gpu[first_i+wtx].y +=  qB_shared[j]*fyij;
					d_fv_gpu[first_i+wtx].z +=  qB_shared[j]*fzij;

				}

				// increment work thread index
				wtx = wtx + NUMBER_THREADS;

			}

			// reset work index
			wtx = tx;

			// synchronize after finishing force contributions from current nei box not to cause conflicts when starting next box
			barrier(CLK_LOCAL_MEM_FENCE);

			//----------------------------------------------------------------------------------------------------------------------------------140
			//	Calculation END
			//----------------------------------------------------------------------------------------------------------------------------------140

		}

		//------------------------------------------------------------------------------------------------------------------------------------------------------160
		//	nei box loop END
		//------------------------------------------------------------------------------------------------------------------------------------------------------160

	}

}

//========================================================================================================================================================================================================200
//	END kernel_gpu_opencl KERNEL
//========================================================================================================================================================================================================200

#ifdef __cplusplus
}
#endif