// kernel_gpu_opencl.cl — lavaMD OpenCL kernel (particle potential/force computation over 3D boxes)
  1. #ifdef __cplusplus
  2. extern "C" {
  3. #endif
  4. //========================================================================================================================================================================================================200
  5. // INCLUDE/DEFINE (had to bring from ./../main.h here because feature of including headers in clBuildProgram does not work for some reason)
  6. //========================================================================================================================================================================================================200
  7. // #include <main.h> // (in the directory SOMEHOW known to the OpenCL compiler function)
  8. #define fp float
  9. #define NUMBER_PAR_PER_BOX 100 // keep this low to allow more blocks that share shared memory to run concurrently, code does not work for larger than 110, more speedup can be achieved with larger number and no shared memory used
  10. #define NUMBER_THREADS 128 // this should be roughly equal to NUMBER_PAR_PER_BOX for best performance
  11. #define DOT(A,B) ((A.x)*(B.x)+(A.y)*(B.y)+(A.z)*(B.z)) // STABLE
  12. //===============================================================================================================================================================================================================200
  13. // STRUCTURES (had to bring from ./../main.h here because feature of including headers in clBuildProgram does not work for some reason)
  14. //===============================================================================================================================================================================================================200
// Plain 3-component vector; used for the per-pair displacement d inside the kernel.
typedef struct
{
fp x, y, z;
} THREE_VECTOR;
// 4-component particle record. In d_rv_gpu, v is the per-particle scalar folded
// into the squared-distance term (r2 = rA.v + rB.v - DOT(rA,rB)); in d_fv_gpu, v
// accumulates the potential while x/y/z accumulate the force components.
typedef struct
{
fp v, x, y, z;
} FOUR_VECTOR;
// Descriptor of one neighbor box of a home box.
typedef struct nei_str
{
// neighbor box
int x, y, z; // 3D grid coordinates of the neighbor box
int number; // linear index of the neighbor box (used to index d_box_gpu)
long offset; // base index of this box's particles in the particle arrays
} nei_str;
// Descriptor of one home box plus its neighbor list.
typedef struct box_str
{
// home box
int x, y, z; // 3D grid coordinates of this box
int number; // linear index of this box
long offset; // base index of this box's particles in d_rv_gpu/d_qv_gpu/d_fv_gpu
// neighbor boxes
int nn; // number of valid entries in nei[] (at most 26 in a 3x3x3 stencil)
nei_str nei[26];
} box_str;
// Simulation parameters passed by value to the kernel.
typedef struct par_str
{
fp alpha; // interaction constant; kernel uses a2 = 2*alpha^2
} par_str;
// Problem dimensions / memory sizes, mirrored from the host side.
// NOTE(review): only number_boxes is read by this kernel; the remaining fields
// appear to be host-side bookkeeping — confirm against main.h.
typedef struct dim_str
{
// input arguments
int cur_arg;
int arch_arg;
int cores_arg;
int boxes1d_arg;
// system memory
long number_boxes;
long box_mem;
long space_elem;
long space_mem;
long space_mem2;
} dim_str;
  58. //========================================================================================================================================================================================================200
  59. // kernel_gpu_opencl KERNEL
  60. //========================================================================================================================================================================================================200
  61. __kernel void kernel_gpu_opencl( par_str d_par_gpu,
  62. dim_str d_dim_gpu,
  63. __global box_str *d_box_gpu,
  64. __global FOUR_VECTOR *d_rv_gpu,
  65. __global fp *d_qv_gpu,
  66. __global FOUR_VECTOR *d_fv_gpu)
  67. {
  68. //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
  69. // THREAD PARAMETERS
  70. //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
  71. int bx = get_group_id(0); // get current horizontal block index (0-n)
  72. int tx = get_local_id(0); // get current horizontal thread index (0-n)
  73. int wtx = tx;
  74. //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
  75. // DO FOR THE NUMBER OF BOXES
  76. //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
  77. if(bx<d_dim_gpu.number_boxes){
  78. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  79. // Extract input parameters
  80. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  81. // parameters
  82. fp a2 = 2*d_par_gpu.alpha*d_par_gpu.alpha;
  83. // home box
  84. int first_i;
  85. // (enable the line below only if wanting to use shared memory)
  86. __local FOUR_VECTOR rA_shared[100];
  87. // nei box
  88. int pointer;
  89. int k = 0;
  90. int first_j;
  91. int j = 0;
  92. // (enable the two lines below only if wanting to use shared memory)
  93. __local FOUR_VECTOR rB_shared[100];
  94. __local fp qB_shared[100];
  95. // common
  96. fp r2;
  97. fp u2;
  98. fp vij;
  99. fp fs;
  100. fp fxij;
  101. fp fyij;
  102. fp fzij;
  103. THREE_VECTOR d;
  104. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  105. // Home box
  106. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  107. //----------------------------------------------------------------------------------------------------------------------------------140
  108. // Setup parameters
  109. //----------------------------------------------------------------------------------------------------------------------------------140
  110. // home box - box parameters
  111. first_i = d_box_gpu[bx].offset;
  112. //----------------------------------------------------------------------------------------------------------------------------------140
  113. // Copy to shared memory
  114. //----------------------------------------------------------------------------------------------------------------------------------140
  115. // (enable the section below only if wanting to use shared memory)
  116. // home box - shared memory
  117. while(wtx<NUMBER_PAR_PER_BOX){
  118. rA_shared[wtx] = d_rv_gpu[first_i+wtx];
  119. wtx = wtx + NUMBER_THREADS;
  120. }
  121. wtx = tx;
  122. // (enable the section below only if wanting to use shared memory)
  123. // synchronize threads - not needed, but just to be safe for now
  124. barrier(CLK_LOCAL_MEM_FENCE);
  125. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  126. // nei box loop
  127. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  128. // loop over nei boxes of home box
  129. for (k=0; k<(1+d_box_gpu[bx].nn); k++){
  130. //----------------------------------------50
  131. // nei box - get pointer to the right box
  132. //----------------------------------------50
  133. if(k==0){
  134. pointer = bx; // set first box to be processed to home box
  135. }
  136. else{
  137. pointer = d_box_gpu[bx].nei[k-1].number; // remaining boxes are nei boxes
  138. }
  139. //----------------------------------------------------------------------------------------------------------------------------------140
  140. // Setup parameters
  141. //----------------------------------------------------------------------------------------------------------------------------------140
  142. // nei box - box parameters
  143. first_j = d_box_gpu[pointer].offset;
  144. //----------------------------------------------------------------------------------------------------------------------------------140
  145. // Setup parameters
  146. //----------------------------------------------------------------------------------------------------------------------------------140
  147. // (enable the section below only if wanting to use shared memory)
  148. // nei box - shared memory
  149. while(wtx<NUMBER_PAR_PER_BOX){
  150. rB_shared[wtx] = d_rv_gpu[first_j+wtx];
  151. qB_shared[wtx] = d_qv_gpu[first_j+wtx];
  152. wtx = wtx + NUMBER_THREADS;
  153. }
  154. wtx = tx;
  155. // (enable the section below only if wanting to use shared memory)
  156. // synchronize threads because in next section each thread accesses data brought in by different threads here
  157. barrier(CLK_LOCAL_MEM_FENCE);
  158. //----------------------------------------------------------------------------------------------------------------------------------140
  159. // Calculation
  160. //----------------------------------------------------------------------------------------------------------------------------------140
  161. // loop for the number of particles in the home box
  162. while(wtx<NUMBER_PAR_PER_BOX){
  163. // loop for the number of particles in the current nei box
  164. for (j=0; j<NUMBER_PAR_PER_BOX; j++){
  165. // (disable the section below only if wanting to use shared memory)
  166. // r2 = d_rv_gpu[first_i+wtx].v + d_rv_gpu[first_j+j].v - DOT(d_rv_gpu[first_i+wtx],d_rv_gpu[first_j+j]);
  167. // u2 = a2*r2;
  168. // vij= exp(-u2);
  169. // fs = 2*vij;
  170. // d.x = d_rv_gpu[first_i+wtx].x - d_rv_gpu[first_j+j].x;
  171. // fxij=fs*d.x;
  172. // d.y = d_rv_gpu[first_i+wtx].y - d_rv_gpu[first_j+j].y;
  173. // fyij=fs*d.y;
  174. // d.z = d_rv_gpu[first_i+wtx].z - d_rv_gpu[first_j+j].z;
  175. // fzij=fs*d.z;
  176. // d_fv_gpu[first_i+wtx].v += d_qv_gpu[first_j+j]*vij;
  177. // d_fv_gpu[first_i+wtx].x += d_qv_gpu[first_j+j]*fxij;
  178. // d_fv_gpu[first_i+wtx].y += d_qv_gpu[first_j+j]*fyij;
  179. // d_fv_gpu[first_i+wtx].z += d_qv_gpu[first_j+j]*fzij;
  180. // (enable the section below only if wanting to use shared memory)
  181. r2 = rA_shared[wtx].v + rB_shared[j].v - DOT(rA_shared[wtx],rB_shared[j]);
  182. u2 = a2*r2;
  183. vij= exp(-u2);
  184. fs = 2*vij;
  185. d.x = rA_shared[wtx].x - rB_shared[j].x;
  186. fxij=fs*d.x;
  187. d.y = rA_shared[wtx].y - rB_shared[j].y;
  188. fyij=fs*d.y;
  189. d.z = rA_shared[wtx].z - rB_shared[j].z;
  190. fzij=fs*d.z;
  191. d_fv_gpu[first_i+wtx].v += qB_shared[j]*vij;
  192. d_fv_gpu[first_i+wtx].x += qB_shared[j]*fxij;
  193. d_fv_gpu[first_i+wtx].y += qB_shared[j]*fyij;
  194. d_fv_gpu[first_i+wtx].z += qB_shared[j]*fzij;
  195. }
  196. // increment work thread index
  197. wtx = wtx + NUMBER_THREADS;
  198. }
  199. // reset work index
  200. wtx = tx;
  201. // synchronize after finishing force contributions from current nei box not to cause conflicts when starting next box
  202. barrier(CLK_LOCAL_MEM_FENCE);
  203. //----------------------------------------------------------------------------------------------------------------------------------140
  204. // Calculation END
  205. //----------------------------------------------------------------------------------------------------------------------------------140
  206. }
  207. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  208. // nei box loop END
  209. //------------------------------------------------------------------------------------------------------------------------------------------------------160
  210. }
  211. }
  212. //========================================================================================================================================================================================================200
  213. // END kernel_gpu_opencl KERNEL
  214. //========================================================================================================================================================================================================200
  215. #ifdef __cplusplus
  216. }
  217. #endif