#ifdef __cplusplus
extern "C" {
#endif
// NOTE(review): the brace opened above for C++ builds is not closed anywhere in the visible part of this file;
// presumably a matching guarded "}" follows the kernel -- confirm.

//========================================================================================================================================================================================================200
//	INCLUDE/DEFINE (had to bring from ./../main.h here because feature of including headers in clBuildProgram does not work for some reason)
//========================================================================================================================================================================================================200

// The #include of the shared header stays disabled; the header would have to sit in a directory known to the OpenCL compiler function.

// Floating-point type used by every structure below.
#define fp float

// keep this low to allow more blocks that share shared memory to run concurrently, code does not work for larger than 110,
// more speedup can be achieved with larger number and no shared memory used
#define NUMBER_PAR_PER_BOX 100

// this should be roughly equal to NUMBER_PAR_PER_BOX for best performance
#define NUMBER_THREADS 128

// Dot product over the x, y and z members of A and B (a v member, if present, is ignored).
// Each argument is wrapped in parentheses before member access so that expressions such as *p or p[i] expand correctly.
// Both arguments are evaluated three times, so do not pass expressions with side effects.
#define DOT(A,B) (((A).x)*((B).x)+((A).y)*((B).y)+((A).z)*((B).z))	// STABLE

//===============================================================================================================================================================================================================200
//	STRUCTURES (had to bring from ./../main.h here because feature of including headers in clBuildProgram does not work for some reason)
//===============================================================================================================================================================================================================200

// Three-component vector.
typedef struct
{
	fp x, y, z;
} THREE_VECTOR;

// Four-component vector: v followed by x, y, z.
typedef struct
{
	fp v, x, y, z;
} FOUR_VECTOR;

// Descriptor of a neighbor box.
// NOTE(review): the meaning of number/offset is not visible here; presumably the box index and the start of its
// particles in the flat arrays -- confirm against the host code.
// NOTE(review): "long" is 64 bits in OpenCL C; the host-side copy of this structure must use a 64-bit type as well
// for the layouts to agree -- confirm.
typedef struct nei_str
{
	// neighbor box
	int x, y, z;
	int number;
	long offset;
} nei_str;

// Descriptor of a home box together with its list of neighbor boxes.
typedef struct box_str
{
	// home box
	int x, y, z;
	int number;
	long offset;

	// neighbor boxes
	int nn;				// presumably the count of valid entries in nei[] -- confirm
	nei_str nei[26];
} box_str;

// Parameters passed by value to the kernel.
typedef struct par_str
{
	fp alpha;
} par_str;

// Problem dimensions and sizes passed by value to the kernel.
typedef struct dim_str
{
	// input arguments
	int cur_arg;
	int arch_arg;
	int cores_arg;
	int boxes1d_arg;

	// system memory
	long number_boxes;
	long box_mem;
	long space_elem;
	long space_mem;
	long space_mem2;
} dim_str;

// NOTE(review): the kernel_gpu_opencl definition is cut off in this view (it stops at "if(bx"). Its text is kept on the
// next line exactly as found, collapsed on one line behind a leading "//", rather than reconstructed from a guess.
//========================================================================================================================================================================================================200 // kernel_gpu_opencl KERNEL //========================================================================================================================================================================================================200 __kernel void kernel_gpu_opencl( par_str d_par_gpu, dim_str d_dim_gpu, __global box_str *d_box_gpu, __global FOUR_VECTOR *d_rv_gpu, __global fp *d_qv_gpu, __global FOUR_VECTOR *d_fv_gpu) { //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 // THREAD PARAMETERS //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 int bx = get_group_id(0); // get current horizontal block index (0-n) int tx = get_local_id(0); // get current horizontal thread index (0-n) int wtx = tx; //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 // DO FOR THE NUMBER OF BOXES //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 if(bx