kernel_gpu_opencl_wrapper.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. #ifdef __cplusplus
  2. extern "C" {
  3. #endif
  4. //========================================================================================================================================================================================================200
  5. // DEFINE/INCLUDE
  6. //========================================================================================================================================================================================================200
  7. //======================================================================================================================================================150
  8. // LIBRARIES
  9. //======================================================================================================================================================150
  10. #include <string.h>
  11. #include <CL/cl.h> // (in library path provided to compiler) needed by OpenCL types and functions
  12. //======================================================================================================================================================150
  13. // MAIN FUNCTION HEADER
  14. //======================================================================================================================================================150
  15. #include "./../main.h" // (in the main program folder) needed to recognized input parameters
  16. //======================================================================================================================================================150
  17. // UTILITIES
  18. //======================================================================================================================================================150
  19. #include "./../util/opencl/opencl.h" // (in library path specified to compiler) needed by for device functions
  20. #include "./../util/timer/timer.h" // (in library path specified to compiler) needed by timer
  21. //======================================================================================================================================================150
  22. // KERNEL_GPU_OPENCL_WRAPPER FUNCTION HEADER
  23. //======================================================================================================================================================150
  24. #include "./kernel_gpu_opencl_wrapper.h" // (in the current directory)
  25. //========================================================================================================================================================================================================200
  26. // KERNEL_GPU_OPENCL_WRAPPER FUNCTION
  27. //========================================================================================================================================================================================================200
  28. void
  29. kernel_gpu_opencl_wrapper(par_str par_cpu,
  30. dim_str dim_cpu,
  31. box_str* box_cpu,
  32. FOUR_VECTOR* rv_cpu,
  33. fp* qv_cpu,
  34. FOUR_VECTOR* fv_cpu,
  35. int platform_id,
  36. int device_id,
  37. int use_gpu)
  38. {
  39. //======================================================================================================================================================150
  40. // CPU VARIABLES
  41. //======================================================================================================================================================150
  42. // timer
  43. long long time0;
  44. long long time1;
  45. long long time2;
  46. long long time3;
  47. long long time4;
  48. long long time5;
  49. long long time6;
  50. time0 = get_time();
  51. //======================================================================================================================================================150
  52. // GPU SETUP
  53. //======================================================================================================================================================150
  54. //====================================================================================================100
  55. // COMMON VARIABLES
  56. //====================================================================================================100
  57. // common variables
  58. cl_int error;
  59. //====================================================================================================100
  60. // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
  61. //====================================================================================================100
  62. // Get the number of available platforms
  63. cl_uint num_platforms;
  64. error = clGetPlatformIDs( 0,
  65. NULL,
  66. &num_platforms);
  67. if (error != CL_SUCCESS)
  68. fatal_CL(error, __LINE__);
  69. // Get the list of available platforms
  70. cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  71. error = clGetPlatformIDs( num_platforms,
  72. platforms,
  73. NULL);
  74. if (error != CL_SUCCESS)
  75. fatal_CL(error, __LINE__);
  76. // Select the platform in accordance to platform_id passed as parameter
  77. cl_platform_id platform = platforms[platform_id];
  78. // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
  79. char pbuf[100];
  80. error = clGetPlatformInfo( platform,
  81. CL_PLATFORM_VENDOR,
  82. sizeof(pbuf),
  83. pbuf,
  84. NULL);
  85. if (error != CL_SUCCESS)
  86. fatal_CL(error, __LINE__);
  87. printf("Platform: %s\n", pbuf);
  88. //====================================================================================================100
  89. // CREATE CONTEXT FOR THE PLATFORM
  90. //====================================================================================================100
  91. // Selector for the device type in accordance to what passed as parameter
  92. cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  93. // Create context properties for selected platform
  94. cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
  95. (cl_context_properties) platform,
  96. 0};
  97. // Create context for selected platform being CPU/GPU
  98. cl_context context;
  99. context = clCreateContextFromType(context_properties,
  100. device_type,
  101. NULL,
  102. NULL,
  103. &error);
  104. if (error != CL_SUCCESS)
  105. fatal_CL(error, __LINE__);
  106. //====================================================================================================100
  107. // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
  108. //====================================================================================================100
  109. // Get the number of devices (previousely selected for the context)
  110. size_t devices_size;
  111. error = clGetContextInfo( context,
  112. CL_CONTEXT_DEVICES,
  113. 0,
  114. NULL,
  115. &devices_size);
  116. if (error != CL_SUCCESS)
  117. fatal_CL(error, __LINE__);
  118. // Get the list of devices (previousely selected for the context)
  119. cl_device_id *devices = (cl_device_id *) malloc(devices_size);
  120. error = clGetContextInfo( context,
  121. CL_CONTEXT_DEVICES,
  122. devices_size,
  123. devices,
  124. NULL);
  125. if (error != CL_SUCCESS)
  126. fatal_CL(error, __LINE__);
  127. // Select the device passed as parameter
  128. cl_device_id device;
  129. device = devices[device_id];
  130. // Get the name of the selected device (previousely selected for the context) and print it
  131. error = clGetDeviceInfo(device,
  132. CL_DEVICE_NAME,
  133. sizeof(pbuf),
  134. pbuf,
  135. NULL);
  136. if (error != CL_SUCCESS)
  137. fatal_CL(error, __LINE__);
  138. printf("Device: %s\n", pbuf);
  139. //====================================================================================================100
  140. // CREATE COMMAND QUEUE FOR THE DEVICE
  141. //====================================================================================================100
  142. // Create a command queue
  143. cl_command_queue command_queue;
  144. command_queue = clCreateCommandQueue( context,
  145. device,
  146. 0,
  147. &error);
  148. if (error != CL_SUCCESS)
  149. fatal_CL(error, __LINE__);
  150. //====================================================================================================100
  151. // CRATE PROGRAM, COMPILE IT
  152. //====================================================================================================100
  153. // Load kernel source code from file
  154. const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
  155. size_t sourceSize = strlen(source);
  156. // Create the program
  157. cl_program program = clCreateProgramWithSource( context,
  158. 1,
  159. &source,
  160. &sourceSize,
  161. &error);
  162. if (error != CL_SUCCESS)
  163. fatal_CL(error, __LINE__);
  164. // parameterized kernel dimension
  165. char clOptions[110];
  166. // sprintf(clOptions,"-I../../src");
  167. sprintf(clOptions,"-I.");
  168. #ifdef RD_WG_SIZE
  169. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
  170. #endif
  171. #ifdef RD_WG_SIZE_0
  172. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
  173. #endif
  174. #ifdef RD_WG_SIZE_0_0
  175. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
  176. #endif
  177. // Compile the program
  178. error = clBuildProgram( program,
  179. 1,
  180. &device,
  181. clOptions,
  182. NULL,
  183. NULL);
  184. // Print warnings and errors from compilation
  185. static char log[65536];
  186. memset(log, 0, sizeof(log));
  187. clGetProgramBuildInfo( program,
  188. device,
  189. CL_PROGRAM_BUILD_LOG,
  190. sizeof(log)-1,
  191. log,
  192. NULL);
  193. if (strstr(log,"warning:") || strstr(log, "error:"))
  194. printf("<<<<\n%s\n>>>>\n", log);
  195. if (error != CL_SUCCESS)
  196. fatal_CL(error, __LINE__);
  197. // Create kernel
  198. cl_kernel kernel;
  199. kernel = clCreateKernel(program,
  200. "kernel_gpu_opencl",
  201. &error);
  202. if (error != CL_SUCCESS)
  203. fatal_CL(error, __LINE__);
  204. //====================================================================================================100
  205. // INITIAL DRIVER OVERHEAD
  206. //====================================================================================================100
  207. // cudaThreadSynchronize();
  208. //====================================================================================================100
  209. // EXECUTION PARAMETERS
  210. //====================================================================================================100
  211. size_t local_work_size[1];
  212. local_work_size[0] = NUMBER_THREADS;
  213. size_t global_work_size[1];
  214. global_work_size[0] = dim_cpu.number_boxes * local_work_size[0];
  215. printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", global_work_size[0]/local_work_size[0], local_work_size[0]);
  216. time1 = get_time();
  217. //======================================================================================================================================================150
  218. // GPU MEMORY (MALLOC)
  219. //======================================================================================================================================================150
  220. //====================================================================================================100
  221. // GPU MEMORY COPY IN
  222. //====================================================================================================100
  223. //==================================================50
  224. // boxes
  225. //==================================================50
  226. cl_mem d_box_gpu;
  227. d_box_gpu = clCreateBuffer( context,
  228. CL_MEM_READ_WRITE,
  229. dim_cpu.box_mem,
  230. NULL,
  231. &error );
  232. if (error != CL_SUCCESS)
  233. fatal_CL(error, __LINE__);
  234. //==================================================50
  235. // rv
  236. //==================================================50
  237. cl_mem d_rv_gpu;
  238. d_rv_gpu = clCreateBuffer( context,
  239. CL_MEM_READ_WRITE,
  240. dim_cpu.space_mem,
  241. NULL,
  242. &error );
  243. if (error != CL_SUCCESS)
  244. fatal_CL(error, __LINE__);
  245. //==================================================50
  246. // qv
  247. //==================================================50
  248. cl_mem d_qv_gpu;
  249. d_qv_gpu = clCreateBuffer( context,
  250. CL_MEM_READ_WRITE,
  251. dim_cpu.space_mem2,
  252. NULL,
  253. &error );
  254. if (error != CL_SUCCESS)
  255. fatal_CL(error, __LINE__);
  256. //====================================================================================================100
  257. // GPU MEMORY COPY (IN & OUT)
  258. //====================================================================================================100
  259. //==================================================50
  260. // fv
  261. //==================================================50
  262. cl_mem d_fv_gpu;
  263. d_fv_gpu = clCreateBuffer( context,
  264. CL_MEM_READ_WRITE,
  265. dim_cpu.space_mem,
  266. NULL,
  267. &error );
  268. if (error != CL_SUCCESS)
  269. fatal_CL(error, __LINE__);
  270. time2 = get_time();
  271. //======================================================================================================================================================150
  272. // GPU MEMORY COPY IN
  273. //======================================================================================================================================================150
  274. //====================================================================================================100
  275. // GPU MEMORY COPY IN
  276. //====================================================================================================100
  277. //==================================================50
  278. // boxes
  279. //==================================================50
  280. error = clEnqueueWriteBuffer( command_queue, // command queue
  281. d_box_gpu, // destination
  282. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  283. 0, // offset in destination to write to
  284. dim_cpu.box_mem, // size to be copied
  285. box_cpu, // source
  286. 0, // # of events in the list of events to wait for
  287. NULL, // list of events to wait for
  288. NULL); // ID of this operation to be used by waiting operations
  289. if (error != CL_SUCCESS)
  290. fatal_CL(error, __LINE__);
  291. //==================================================50
  292. // rv
  293. //==================================================50
  294. error = clEnqueueWriteBuffer( command_queue,
  295. d_rv_gpu,
  296. 1,
  297. 0,
  298. dim_cpu.space_mem,
  299. rv_cpu,
  300. 0,
  301. 0,
  302. 0);
  303. if (error != CL_SUCCESS)
  304. fatal_CL(error, __LINE__);
  305. //==================================================50
  306. // qv
  307. //==================================================50
  308. error = clEnqueueWriteBuffer( command_queue,
  309. d_qv_gpu,
  310. 1,
  311. 0,
  312. dim_cpu.space_mem2,
  313. qv_cpu,
  314. 0,
  315. 0,
  316. 0);
  317. if (error != CL_SUCCESS)
  318. fatal_CL(error, __LINE__);
  319. //====================================================================================================100
  320. // GPU MEMORY COPY (IN & OUT)
  321. //====================================================================================================100
  322. //==================================================50
  323. // fv
  324. //==================================================50
  325. error = clEnqueueWriteBuffer( command_queue,
  326. d_fv_gpu,
  327. 1,
  328. 0,
  329. dim_cpu.space_mem,
  330. fv_cpu,
  331. 0,
  332. 0,
  333. 0);
  334. if (error != CL_SUCCESS)
  335. fatal_CL(error, __LINE__);
  336. time3 = get_time();
  337. //======================================================================================================================================================150
  338. // KERNEL
  339. //======================================================================================================================================================150
  340. // ???
  341. clSetKernelArg( kernel,
  342. 0,
  343. sizeof(par_str),
  344. (void *) &par_cpu);
  345. clSetKernelArg( kernel,
  346. 1,
  347. sizeof(dim_str),
  348. (void *) &dim_cpu);
  349. clSetKernelArg( kernel,
  350. 2,
  351. sizeof(cl_mem),
  352. (void *) &d_box_gpu);
  353. clSetKernelArg( kernel,
  354. 3,
  355. sizeof(cl_mem),
  356. (void *) &d_rv_gpu);
  357. clSetKernelArg( kernel,
  358. 4,
  359. sizeof(cl_mem),
  360. (void *) &d_qv_gpu);
  361. clSetKernelArg( kernel,
  362. 5,
  363. sizeof(cl_mem),
  364. (void *) &d_fv_gpu);
  365. // launch kernel - all boxes
  366. error = clEnqueueNDRangeKernel( command_queue,
  367. kernel,
  368. 1,
  369. NULL,
  370. global_work_size,
  371. local_work_size,
  372. 0,
  373. NULL,
  374. NULL);
  375. if (error != CL_SUCCESS)
  376. fatal_CL(error, __LINE__);
  377. // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
  378. error = clFinish(command_queue);
  379. if (error != CL_SUCCESS)
  380. fatal_CL(error, __LINE__);
  381. time4 = get_time();
  382. //======================================================================================================================================================150
  383. // GPU MEMORY COPY OUT
  384. //======================================================================================================================================================150
  385. //====================================================================================================100
  386. // GPU MEMORY COPY (IN & OUT)
  387. //====================================================================================================100
  388. //==================================================50
  389. // fv
  390. //==================================================50
  391. error = clEnqueueReadBuffer(command_queue, // The command queue.
  392. d_fv_gpu, // The image on the device.
  393. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  394. 0, // Offset. None in this case.
  395. dim_cpu.space_mem, // Size to copy.
  396. fv_cpu, // The pointer to the image on the host.
  397. 0, // Number of events in wait list. Not used.
  398. NULL, // Event wait list. Not used.
  399. NULL); // Event object for determining status. Not used.
  400. if (error != CL_SUCCESS)
  401. fatal_CL(error, __LINE__);
  402. // (enable for testing purposes only - prints some range of output, make sure not to initialize input in main.c with random numbers for comparison across runs)
  403. // int g;
  404. // int offset = 395;
  405. // for(g=0; g<10; g++){
  406. // printf("%f, %f, %f, %f\n", fv_cpu[offset+g].v, fv_cpu[offset+g].x, fv_cpu[offset+g].y, fv_cpu[offset+g].z);
  407. // }
  408. time5 = get_time();
  409. //======================================================================================================================================================150
  410. // GPU MEMORY DEALLOCATION
  411. //======================================================================================================================================================150
  412. // Release kernels...
  413. clReleaseKernel(kernel);
  414. // Now the program...
  415. clReleaseProgram(program);
  416. // Clean up the device memory...
  417. clReleaseMemObject(d_rv_gpu);
  418. clReleaseMemObject(d_qv_gpu);
  419. clReleaseMemObject(d_fv_gpu);
  420. clReleaseMemObject(d_box_gpu);
  421. // Flush the queue
  422. error = clFlush(command_queue);
  423. if (error != CL_SUCCESS)
  424. fatal_CL(error, __LINE__);
  425. // ...and finally, the queue and context.
  426. clReleaseCommandQueue(command_queue);
  427. // ???
  428. clReleaseContext(context);
  429. time6 = get_time();
  430. //======================================================================================================================================================150
  431. // DISPLAY TIMING
  432. //======================================================================================================================================================150
  433. printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
  434. printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
  435. printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
  436. printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
  437. printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
  438. printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
  439. printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
  440. printf("Total time:\n");
  441. printf("%.12f s\n", (float) (time6-time0) / 1000000);
  442. }
  443. //========================================================================================================================================================================================================200
  444. // END KERNEL_GPU_OPENCL_WRAPPER FUNCTION
  445. //========================================================================================================================================================================================================200
  446. #ifdef __cplusplus
  447. }
  448. #endif