kernel_gpu_opencl_wrapper.c 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. #ifdef __cplusplus
  2. extern "C" {
  3. #endif
  4. //========================================================================================================================================================================================================200
  5. // DEFINE/INCLUDE
  6. //========================================================================================================================================================================================================200
  7. //======================================================================================================================================================150
  8. // LIBRARIES
  9. //======================================================================================================================================================150
  10. #include <string.h>
  11. #include <CL/cl.h> // (in library path provided to compiler) needed by OpenCL types and functions
  12. //======================================================================================================================================================150
  13. // MAIN FUNCTION HEADER
  14. //======================================================================================================================================================150
  15. #include "./../main.h" // (in the main program folder) needed to recognized input parameters
  16. //======================================================================================================================================================150
  17. // UTILITIES
  18. //======================================================================================================================================================150
  19. #include "./../util/opencl/opencl.h" // (in library path specified to compiler) needed by for device functions
  20. #include "./../util/timer/timer.h" // (in library path specified to compiler) needed by timer
  21. //======================================================================================================================================================150
  22. // KERNEL_GPU_OPENCL_WRAPPER FUNCTION HEADER
  23. //======================================================================================================================================================150
  24. #include "./kernel_gpu_opencl_wrapper.h" // (in the current directory)
  25. //========================================================================================================================================================================================================200
  26. // KERNEL_GPU_OPENCL_WRAPPER FUNCTION
  27. //========================================================================================================================================================================================================200
  28. void
  29. kernel_gpu_opencl_wrapper( par_str par_cpu,
  30. dim_str dim_cpu,
  31. box_str* box_cpu,
  32. FOUR_VECTOR* rv_cpu,
  33. fp* qv_cpu,
  34. FOUR_VECTOR* fv_cpu)
  35. {
  36. //======================================================================================================================================================150
  37. // CPU VARIABLES
  38. //======================================================================================================================================================150
  39. // timer
  40. long long time0;
  41. long long time1;
  42. long long time2;
  43. long long time3;
  44. long long time4;
  45. long long time5;
  46. long long time6;
  47. time0 = get_time();
  48. //======================================================================================================================================================150
  49. // GPU SETUP
  50. //======================================================================================================================================================150
  51. //====================================================================================================100
  52. // COMMON VARIABLES
  53. //====================================================================================================100
  54. // common variables
  55. cl_int error;
  56. //====================================================================================================100
  57. // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
  58. //====================================================================================================100
  59. // Get the number of available platforms
  60. cl_uint num_platforms;
  61. error = clGetPlatformIDs( 0,
  62. NULL,
  63. &num_platforms);
  64. if (error != CL_SUCCESS)
  65. fatal_CL(error, __LINE__);
  66. // Get the list of available platforms
  67. cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  68. error = clGetPlatformIDs( num_platforms,
  69. platforms,
  70. NULL);
  71. if (error != CL_SUCCESS)
  72. fatal_CL(error, __LINE__);
  73. // Select the 1st platform
  74. cl_platform_id platform = platforms[1];
  75. // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
  76. char pbuf[100];
  77. error = clGetPlatformInfo( platform,
  78. CL_PLATFORM_VENDOR,
  79. sizeof(pbuf),
  80. pbuf,
  81. NULL);
  82. if (error != CL_SUCCESS)
  83. fatal_CL(error, __LINE__);
  84. printf("Platform: %s\n", pbuf);
  85. //====================================================================================================100
  86. // CREATE CONTEXT FOR THE PLATFORM
  87. //====================================================================================================100
  88. // Create context properties for selected platform
  89. cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
  90. (cl_context_properties) platform,
  91. 0};
  92. // Create context for selected platform being GPU
  93. cl_context context;
  94. context = clCreateContextFromType( context_properties,
  95. CL_DEVICE_TYPE_CPU,
  96. NULL,
  97. NULL,
  98. &error);
  99. if (error != CL_SUCCESS)
  100. fatal_CL(error, __LINE__);
  101. //====================================================================================================100
  102. // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
  103. //====================================================================================================100
  104. // Get the number of devices (previousely selected for the context)
  105. size_t devices_size;
  106. error = clGetContextInfo( context,
  107. CL_CONTEXT_DEVICES,
  108. 0,
  109. NULL,
  110. &devices_size);
  111. if (error != CL_SUCCESS)
  112. fatal_CL(error, __LINE__);
  113. // Get the list of devices (previousely selected for the context)
  114. cl_device_id *devices = (cl_device_id *) malloc(devices_size);
  115. error = clGetContextInfo( context,
  116. CL_CONTEXT_DEVICES,
  117. devices_size,
  118. devices,
  119. NULL);
  120. if (error != CL_SUCCESS)
  121. fatal_CL(error, __LINE__);
  122. // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
  123. cl_device_id device;
  124. device = devices[0];
  125. // Get the name of the selected device (previousely selected for the context) and print it
  126. error = clGetDeviceInfo(device,
  127. CL_DEVICE_NAME,
  128. sizeof(pbuf),
  129. pbuf,
  130. NULL);
  131. if (error != CL_SUCCESS)
  132. fatal_CL(error, __LINE__);
  133. printf("Device: %s\n", pbuf);
  134. //====================================================================================================100
  135. // CREATE COMMAND QUEUE FOR THE DEVICE
  136. //====================================================================================================100
  137. // Create a command queue
  138. cl_command_queue command_queue;
  139. command_queue = clCreateCommandQueue( context,
  140. device,
  141. 0,
  142. &error);
  143. if (error != CL_SUCCESS)
  144. fatal_CL(error, __LINE__);
  145. //====================================================================================================100
  146. // CRATE PROGRAM, COMPILE IT
  147. //====================================================================================================100
  148. // Load kernel source code from file
  149. const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
  150. size_t sourceSize = strlen(source);
  151. // Create the program
  152. cl_program program = clCreateProgramWithSource( context,
  153. 1,
  154. &source,
  155. &sourceSize,
  156. &error);
  157. if (error != CL_SUCCESS)
  158. fatal_CL(error, __LINE__);
  159. // parameterized kernel dimension
  160. char clOptions[110];
  161. // sprintf(clOptions,"-I../../src");
  162. sprintf(clOptions,"-I.");
  163. #ifdef RD_WG_SIZE
  164. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
  165. #endif
  166. #ifdef RD_WG_SIZE_0
  167. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
  168. #endif
  169. #ifdef RD_WG_SIZE_0_0
  170. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
  171. #endif
  172. // Compile the program
  173. error = clBuildProgram( program,
  174. 1,
  175. &device,
  176. clOptions,
  177. NULL,
  178. NULL);
  179. // Print warnings and errors from compilation
  180. static char log[65536];
  181. memset(log, 0, sizeof(log));
  182. clGetProgramBuildInfo( program,
  183. device,
  184. CL_PROGRAM_BUILD_LOG,
  185. sizeof(log)-1,
  186. log,
  187. NULL);
  188. if (strstr(log,"warning:") || strstr(log, "error:"))
  189. printf("<<<<\n%s\n>>>>\n", log);
  190. if (error != CL_SUCCESS)
  191. fatal_CL(error, __LINE__);
  192. // Create kernel
  193. cl_kernel kernel;
  194. kernel = clCreateKernel(program,
  195. "kernel_gpu_opencl",
  196. &error);
  197. if (error != CL_SUCCESS)
  198. fatal_CL(error, __LINE__);
  199. //====================================================================================================100
  200. // INITIAL DRIVER OVERHEAD
  201. //====================================================================================================100
  202. // cudaThreadSynchronize();
  203. //====================================================================================================100
  204. // EXECUTION PARAMETERS
  205. //====================================================================================================100
  206. size_t local_work_size[1];
  207. local_work_size[0] = NUMBER_THREADS;
  208. size_t global_work_size[1];
  209. global_work_size[0] = dim_cpu.number_boxes * local_work_size[0];
  210. printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", global_work_size[0]/local_work_size[0], local_work_size[0]);
  211. time1 = get_time();
  212. //======================================================================================================================================================150
  213. // GPU MEMORY (MALLOC)
  214. //======================================================================================================================================================150
  215. //====================================================================================================100
  216. // GPU MEMORY COPY IN
  217. //====================================================================================================100
  218. //==================================================50
  219. // boxes
  220. //==================================================50
  221. cl_mem d_box_gpu;
  222. d_box_gpu = clCreateBuffer( context,
  223. CL_MEM_READ_WRITE,
  224. dim_cpu.box_mem,
  225. NULL,
  226. &error );
  227. if (error != CL_SUCCESS)
  228. fatal_CL(error, __LINE__);
  229. //==================================================50
  230. // rv
  231. //==================================================50
  232. cl_mem d_rv_gpu;
  233. d_rv_gpu = clCreateBuffer( context,
  234. CL_MEM_READ_WRITE,
  235. dim_cpu.space_mem,
  236. NULL,
  237. &error );
  238. if (error != CL_SUCCESS)
  239. fatal_CL(error, __LINE__);
  240. //==================================================50
  241. // qv
  242. //==================================================50
  243. cl_mem d_qv_gpu;
  244. d_qv_gpu = clCreateBuffer( context,
  245. CL_MEM_READ_WRITE,
  246. dim_cpu.space_mem2,
  247. NULL,
  248. &error );
  249. if (error != CL_SUCCESS)
  250. fatal_CL(error, __LINE__);
  251. //====================================================================================================100
  252. // GPU MEMORY COPY (IN & OUT)
  253. //====================================================================================================100
  254. //==================================================50
  255. // fv
  256. //==================================================50
  257. cl_mem d_fv_gpu;
  258. d_fv_gpu = clCreateBuffer( context,
  259. CL_MEM_READ_WRITE,
  260. dim_cpu.space_mem,
  261. NULL,
  262. &error );
  263. if (error != CL_SUCCESS)
  264. fatal_CL(error, __LINE__);
  265. time2 = get_time();
  266. //======================================================================================================================================================150
  267. // GPU MEMORY COPY IN
  268. //======================================================================================================================================================150
  269. //====================================================================================================100
  270. // GPU MEMORY COPY IN
  271. //====================================================================================================100
  272. //==================================================50
  273. // boxes
  274. //==================================================50
  275. error = clEnqueueWriteBuffer( command_queue, // command queue
  276. d_box_gpu, // destination
  277. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  278. 0, // offset in destination to write to
  279. dim_cpu.box_mem, // size to be copied
  280. box_cpu, // source
  281. 0, // # of events in the list of events to wait for
  282. NULL, // list of events to wait for
  283. NULL); // ID of this operation to be used by waiting operations
  284. if (error != CL_SUCCESS)
  285. fatal_CL(error, __LINE__);
  286. //==================================================50
  287. // rv
  288. //==================================================50
  289. error = clEnqueueWriteBuffer( command_queue,
  290. d_rv_gpu,
  291. 1,
  292. 0,
  293. dim_cpu.space_mem,
  294. rv_cpu,
  295. 0,
  296. 0,
  297. 0);
  298. if (error != CL_SUCCESS)
  299. fatal_CL(error, __LINE__);
  300. //==================================================50
  301. // qv
  302. //==================================================50
  303. error = clEnqueueWriteBuffer( command_queue,
  304. d_qv_gpu,
  305. 1,
  306. 0,
  307. dim_cpu.space_mem2,
  308. qv_cpu,
  309. 0,
  310. 0,
  311. 0);
  312. if (error != CL_SUCCESS)
  313. fatal_CL(error, __LINE__);
  314. //====================================================================================================100
  315. // GPU MEMORY COPY (IN & OUT)
  316. //====================================================================================================100
  317. //==================================================50
  318. // fv
  319. //==================================================50
  320. error = clEnqueueWriteBuffer( command_queue,
  321. d_fv_gpu,
  322. 1,
  323. 0,
  324. dim_cpu.space_mem,
  325. fv_cpu,
  326. 0,
  327. 0,
  328. 0);
  329. if (error != CL_SUCCESS)
  330. fatal_CL(error, __LINE__);
  331. time3 = get_time();
  332. //======================================================================================================================================================150
  333. // KERNEL
  334. //======================================================================================================================================================150
  335. // ???
  336. clSetKernelArg( kernel,
  337. 0,
  338. sizeof(par_str),
  339. (void *) &par_cpu);
  340. clSetKernelArg( kernel,
  341. 1,
  342. sizeof(dim_str),
  343. (void *) &dim_cpu);
  344. clSetKernelArg( kernel,
  345. 2,
  346. sizeof(cl_mem),
  347. (void *) &d_box_gpu);
  348. clSetKernelArg( kernel,
  349. 3,
  350. sizeof(cl_mem),
  351. (void *) &d_rv_gpu);
  352. clSetKernelArg( kernel,
  353. 4,
  354. sizeof(cl_mem),
  355. (void *) &d_qv_gpu);
  356. clSetKernelArg( kernel,
  357. 5,
  358. sizeof(cl_mem),
  359. (void *) &d_fv_gpu);
  360. // launch kernel - all boxes
  361. error = clEnqueueNDRangeKernel( command_queue,
  362. kernel,
  363. 1,
  364. NULL,
  365. global_work_size,
  366. local_work_size,
  367. 0,
  368. NULL,
  369. NULL);
  370. if (error != CL_SUCCESS)
  371. fatal_CL(error, __LINE__);
  372. // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
  373. error = clFinish(command_queue);
  374. if (error != CL_SUCCESS)
  375. fatal_CL(error, __LINE__);
  376. time4 = get_time();
  377. //======================================================================================================================================================150
  378. // GPU MEMORY COPY OUT
  379. //======================================================================================================================================================150
  380. //====================================================================================================100
  381. // GPU MEMORY COPY (IN & OUT)
  382. //====================================================================================================100
  383. //==================================================50
  384. // fv
  385. //==================================================50
  386. error = clEnqueueReadBuffer(command_queue, // The command queue.
  387. d_fv_gpu, // The image on the device.
  388. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  389. 0, // Offset. None in this case.
  390. dim_cpu.space_mem, // Size to copy.
  391. fv_cpu, // The pointer to the image on the host.
  392. 0, // Number of events in wait list. Not used.
  393. NULL, // Event wait list. Not used.
  394. NULL); // Event object for determining status. Not used.
  395. if (error != CL_SUCCESS)
  396. fatal_CL(error, __LINE__);
  397. // (enable for testing purposes only - prints some range of output, make sure not to initialize input in main.c with random numbers for comparison across runs)
  398. // int g;
  399. // int offset = 395;
  400. // for(g=0; g<10; g++){
  401. // printf("%f, %f, %f, %f\n", fv_cpu[offset+g].v, fv_cpu[offset+g].x, fv_cpu[offset+g].y, fv_cpu[offset+g].z);
  402. // }
  403. time5 = get_time();
  404. //======================================================================================================================================================150
  405. // GPU MEMORY DEALLOCATION
  406. //======================================================================================================================================================150
  407. // Release kernels...
  408. clReleaseKernel(kernel);
  409. // Now the program...
  410. clReleaseProgram(program);
  411. // Clean up the device memory...
  412. clReleaseMemObject(d_rv_gpu);
  413. clReleaseMemObject(d_qv_gpu);
  414. clReleaseMemObject(d_fv_gpu);
  415. clReleaseMemObject(d_box_gpu);
  416. // Flush the queue
  417. error = clFlush(command_queue);
  418. if (error != CL_SUCCESS)
  419. fatal_CL(error, __LINE__);
  420. // ...and finally, the queue and context.
  421. clReleaseCommandQueue(command_queue);
  422. // ???
  423. clReleaseContext(context);
  424. time6 = get_time();
  425. //======================================================================================================================================================150
  426. // DISPLAY TIMING
  427. //======================================================================================================================================================150
  428. printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
  429. printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
  430. printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
  431. printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
  432. printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
  433. printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
  434. printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
  435. printf("Total time:\n");
  436. printf("%.12f s\n", (float) (time6-time0) / 1000000);
  437. }
  438. //========================================================================================================================================================================================================200
  439. // END KERNEL_GPU_OPENCL_WRAPPER FUNCTION
  440. //========================================================================================================================================================================================================200
  441. #ifdef __cplusplus
  442. }
  443. #endif