kernel_gpu_opencl_wrapper.c 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657
  1. // #ifdef __cplusplus
  2. // extern "C" {
  3. // #endif
  4. //========================================================================================================================================================================================================200
  5. // DEFINE/INCLUDE
  6. //========================================================================================================================================================================================================200
  7. //======================================================================================================================================================150
  8. // LIBRARIES
  9. //======================================================================================================================================================150
  10. #include <CL/cl.h> // (in directory provided to compiler) needed by OpenCL types and functions
  11. #include <string.h> // (in directory known to compiler) needed by memset
  12. //======================================================================================================================================================150
  13. // COMMON
  14. //======================================================================================================================================================150
  15. #include "../common.h" // (in directory provided here)
  16. //======================================================================================================================================================150
  17. // UTILITIES
  18. //======================================================================================================================================================150
  19. #include "../util/opencl/opencl.h" // (in directory provided here)
  20. #include "../util/timer/timer.h" // (in directory provided here)
  21. //======================================================================================================================================================150
  22. // HEADER
  23. //======================================================================================================================================================150
  24. #include "./kernel_gpu_opencl_wrapper.h" // (in directory provided here)
  25. //========================================================================================================================================================================================================200
  26. // KERNEL_GPU_CUDA_WRAPPER FUNCTION
  27. //========================================================================================================================================================================================================200
  28. void
  29. kernel_gpu_opencl_wrapper( record *records,
  30. long records_mem,
  31. knode *knodes,
  32. long knodes_elem,
  33. long knodes_mem,
  34. int order,
  35. long maxheight,
  36. int count,
  37. long *currKnode,
  38. long *offset,
  39. int *keys,
  40. record *ans)
  41. {
  42. //======================================================================================================================================================150
  43. // CPU VARIABLES
  44. //======================================================================================================================================================150
  45. // timer
  46. long long time0;
  47. long long time1;
  48. long long time2;
  49. long long time3;
  50. long long time4;
  51. long long time5;
  52. long long time6;
  53. time0 = get_time();
  54. //======================================================================================================================================================150
  55. // GPU SETUP
  56. //======================================================================================================================================================150
  57. //====================================================================================================100
  58. // INITIAL DRIVER OVERHEAD
  59. //====================================================================================================100
  60. // cudaThreadSynchronize();
  61. //====================================================================================================100
  62. // COMMON VARIABLES
  63. //====================================================================================================100
  64. // common variables
  65. cl_int error;
  66. //====================================================================================================100
  67. // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
  68. //====================================================================================================100
  69. // Get the number of available platforms
  70. cl_uint num_platforms;
  71. error = clGetPlatformIDs( 0,
  72. NULL,
  73. &num_platforms);
  74. if (error != CL_SUCCESS)
  75. fatal_CL(error, __LINE__);
  76. // Get the list of available platforms
  77. cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  78. error = clGetPlatformIDs( num_platforms,
  79. platforms,
  80. NULL);
  81. if (error != CL_SUCCESS)
  82. fatal_CL(error, __LINE__);
  83. // Select the 1st platform
  84. cl_platform_id platform = platforms[0];
  85. // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
  86. char pbuf[100];
  87. error = clGetPlatformInfo( platform,
  88. CL_PLATFORM_VENDOR,
  89. sizeof(pbuf),
  90. pbuf,
  91. NULL);
  92. if (error != CL_SUCCESS)
  93. fatal_CL(error, __LINE__);
  94. printf("Platform: %s\n", pbuf);
  95. //====================================================================================================100
  96. // CREATE CONTEXT FOR THE PLATFORM
  97. //====================================================================================================100
  98. // Create context properties for selected platform
  99. cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
  100. (cl_context_properties) platform,
  101. 0};
  102. // Create context for selected platform being GPU
  103. cl_context context;
  104. context = clCreateContextFromType( context_properties,
  105. CL_DEVICE_TYPE_GPU,
  106. NULL,
  107. NULL,
  108. &error);
  109. if (error != CL_SUCCESS)
  110. fatal_CL(error, __LINE__);
  111. //====================================================================================================100
  112. // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
  113. //====================================================================================================100
  114. // Get the number of devices (previousely selected for the context)
  115. size_t devices_size;
  116. error = clGetContextInfo( context,
  117. CL_CONTEXT_DEVICES,
  118. 0,
  119. NULL,
  120. &devices_size);
  121. if (error != CL_SUCCESS)
  122. fatal_CL(error, __LINE__);
  123. // Get the list of devices (previousely selected for the context)
  124. cl_device_id *devices = (cl_device_id *) malloc(devices_size);
  125. error = clGetContextInfo( context,
  126. CL_CONTEXT_DEVICES,
  127. devices_size,
  128. devices,
  129. NULL);
  130. if (error != CL_SUCCESS)
  131. fatal_CL(error, __LINE__);
  132. // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
  133. cl_device_id device;
  134. device = devices[0];
  135. // Get the name of the selected device (previousely selected for the context) and print it
  136. error = clGetDeviceInfo(device,
  137. CL_DEVICE_NAME,
  138. sizeof(pbuf),
  139. pbuf,
  140. NULL);
  141. if (error != CL_SUCCESS)
  142. fatal_CL(error, __LINE__);
  143. printf("Device: %s\n", pbuf);
  144. //====================================================================================================100
  145. // CREATE COMMAND QUEUE FOR THE DEVICE
  146. //====================================================================================================100
  147. // Create a command queue
  148. cl_command_queue command_queue;
  149. command_queue = clCreateCommandQueue( context,
  150. device,
  151. 0,
  152. &error);
  153. if (error != CL_SUCCESS)
  154. fatal_CL(error, __LINE__);
  155. //====================================================================================================100
  156. // CREATE PROGRAM, COMPILE IT
  157. //====================================================================================================100
  158. // Load kernel source code from file
  159. const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
  160. size_t sourceSize = strlen(source);
  161. // Create the program
  162. cl_program program = clCreateProgramWithSource( context,
  163. 1,
  164. &source,
  165. &sourceSize,
  166. &error);
  167. if (error != CL_SUCCESS)
  168. fatal_CL(error, __LINE__);
  169. char clOptions[110];
  170. // sprintf(clOptions,"-I../../src");
  171. sprintf(clOptions,"-I./../");
  172. #ifdef DEFAULT_ORDER
  173. sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER);
  174. #endif
  175. // Compile the program
  176. error = clBuildProgram( program,
  177. 1,
  178. &device,
  179. clOptions,
  180. NULL,
  181. NULL);
  182. // Print warnings and errors from compilation
  183. static char log[65536];
  184. memset(log, 0, sizeof(log));
  185. clGetProgramBuildInfo( program,
  186. device,
  187. CL_PROGRAM_BUILD_LOG,
  188. sizeof(log)-1,
  189. log,
  190. NULL);
  191. printf("-----OpenCL Compiler Output-----\n");
  192. if (strstr(log,"warning:") || strstr(log, "error:"))
  193. printf("<<<<\n%s\n>>>>\n", log);
  194. printf("--------------------------------\n");
  195. if (error != CL_SUCCESS)
  196. fatal_CL(error, __LINE__);
  197. // Create kernel
  198. cl_kernel kernel;
  199. kernel = clCreateKernel(program,
  200. "findK",
  201. &error);
  202. if (error != CL_SUCCESS)
  203. fatal_CL(error, __LINE__);
  204. time1 = get_time();
  205. //====================================================================================================100
  206. // END
  207. //====================================================================================================100
  208. //======================================================================================================================================================150
  209. // GPU MEMORY (MALLOC)
  210. //======================================================================================================================================================150
  211. //====================================================================================================100
  212. // DEVICE IN
  213. //====================================================================================================100
  214. //==================================================50
  215. // recordsD
  216. //==================================================50
  217. cl_mem recordsD;
  218. recordsD = clCreateBuffer( context,
  219. CL_MEM_READ_WRITE,
  220. records_mem,
  221. NULL,
  222. &error );
  223. if (error != CL_SUCCESS)
  224. fatal_CL(error, __LINE__);
  225. //==================================================50
  226. // knodesD
  227. //==================================================50
  228. cl_mem knodesD;
  229. knodesD = clCreateBuffer( context,
  230. CL_MEM_READ_WRITE,
  231. knodes_mem,
  232. NULL,
  233. &error );
  234. if (error != CL_SUCCESS)
  235. fatal_CL(error, __LINE__);
  236. //==================================================50
  237. // currKnodeD
  238. //==================================================50
  239. cl_mem currKnodeD;
  240. currKnodeD = clCreateBuffer( context,
  241. CL_MEM_READ_WRITE,
  242. count*sizeof(long),
  243. NULL,
  244. &error );
  245. if (error != CL_SUCCESS)
  246. fatal_CL(error, __LINE__);
  247. //==================================================50
  248. // offsetD
  249. //==================================================50
  250. cl_mem offsetD;
  251. offsetD = clCreateBuffer( context,
  252. CL_MEM_READ_WRITE,
  253. count*sizeof(long),
  254. NULL,
  255. &error );
  256. if (error != CL_SUCCESS)
  257. fatal_CL(error, __LINE__);
  258. //==================================================50
  259. // keysD
  260. //==================================================50
  261. cl_mem keysD;
  262. keysD = clCreateBuffer( context,
  263. CL_MEM_READ_WRITE,
  264. count*sizeof(long),
  265. NULL,
  266. &error );
  267. if (error != CL_SUCCESS)
  268. fatal_CL(error, __LINE__);
  269. //==================================================50
  270. // END
  271. //==================================================50
  272. //====================================================================================================100
  273. // DEVICE IN/OUT
  274. //====================================================================================================100
  275. //==================================================50
  276. // ansD
  277. //==================================================50
  278. cl_mem ansD;
  279. ansD = clCreateBuffer( context,
  280. CL_MEM_READ_WRITE,
  281. count*sizeof(record),
  282. NULL,
  283. &error );
  284. if (error != CL_SUCCESS)
  285. fatal_CL(error, __LINE__);
  286. time2 = get_time();
  287. //==================================================50
  288. // END
  289. //==================================================50
  290. //====================================================================================================100
  291. // END
  292. //====================================================================================================100
  293. //======================================================================================================================================================150
  294. // GPU MEMORY COPY
  295. //======================================================================================================================================================150
  296. //====================================================================================================100
  297. // GPU MEMORY (MALLOC) COPY IN
  298. //====================================================================================================100
  299. //==================================================50
  300. // recordsD
  301. //==================================================50
  302. error = clEnqueueWriteBuffer( command_queue, // command queue
  303. recordsD, // destination
  304. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  305. 0, // offset in destination to write to
  306. records_mem, // size to be copied
  307. records, // source
  308. 0, // # of events in the list of events to wait for
  309. NULL, // list of events to wait for
  310. NULL); // ID of this operation to be used by waiting operations
  311. if (error != CL_SUCCESS)
  312. fatal_CL(error, __LINE__);
  313. //==================================================50
  314. // knodesD
  315. //==================================================50
  316. error = clEnqueueWriteBuffer( command_queue, // command queue
  317. knodesD, // destination
  318. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  319. 0, // offset in destination to write to
  320. knodes_mem, // size to be copied
  321. knodes, // source
  322. 0, // # of events in the list of events to wait for
  323. NULL, // list of events to wait for
  324. NULL); // ID of this operation to be used by waiting operations
  325. if (error != CL_SUCCESS)
  326. fatal_CL(error, __LINE__);
  327. //==================================================50
  328. // currKnodeD
  329. //==================================================50
  330. error = clEnqueueWriteBuffer( command_queue, // command queue
  331. currKnodeD, // destination
  332. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  333. 0, // offset in destination to write to
  334. count*sizeof(long), // size to be copied
  335. currKnode, // source
  336. 0, // # of events in the list of events to wait for
  337. NULL, // list of events to wait for
  338. NULL); // ID of this operation to be used by waiting operations
  339. if (error != CL_SUCCESS)
  340. fatal_CL(error, __LINE__);
  341. //==================================================50
  342. // offsetD
  343. //==================================================50
  344. error = clEnqueueWriteBuffer( command_queue, // command queue
  345. offsetD, // destination
  346. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  347. 0, // offset in destination to write to
  348. count*sizeof(long), // size to be copied
  349. offset, // source
  350. 0, // # of events in the list of events to wait for
  351. NULL, // list of events to wait for
  352. NULL); // ID of this operation to be used by waiting operations
  353. if (error != CL_SUCCESS)
  354. fatal_CL(error, __LINE__);
  355. //==================================================50
  356. // keysD
  357. //==================================================50
  358. error = clEnqueueWriteBuffer( command_queue, // command queue
  359. keysD, // destination
  360. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  361. 0, // offset in destination to write to
  362. count*sizeof(int), // size to be copied
  363. keys, // source
  364. 0, // # of events in the list of events to wait for
  365. NULL, // list of events to wait for
  366. NULL); // ID of this operation to be used by waiting operations
  367. if (error != CL_SUCCESS)
  368. fatal_CL(error, __LINE__);
  369. //==================================================50
  370. // END
  371. //==================================================50
  372. //====================================================================================================100
  373. // DEVICE IN/OUT
  374. //====================================================================================================100
  375. //==================================================50
  376. // ansD
  377. //==================================================50
  378. error = clEnqueueWriteBuffer( command_queue, // command queue
  379. ansD, // destination
  380. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  381. 0, // offset in destination to write to
  382. count*sizeof(record), // size to be copied
  383. ans, // source
  384. 0, // # of events in the list of events to wait for
  385. NULL, // list of events to wait for
  386. NULL); // ID of this operation to be used by waiting operations
  387. if (error != CL_SUCCESS)
  388. fatal_CL(error, __LINE__);
  389. time3 = get_time();
  390. //==================================================50
  391. // END
  392. //==================================================50
  393. //====================================================================================================100
  394. // END
  395. //====================================================================================================100
  396. //======================================================================================================================================================150
  397. // findK kernel
  398. //======================================================================================================================================================150
  399. //====================================================================================================100
  400. // Execution Parameters
  401. //====================================================================================================100
  402. size_t local_work_size[1];
  403. local_work_size[0] = order < 1024 ? order : 1024;
  404. size_t global_work_size[1];
  405. global_work_size[0] = count * local_work_size[0];
  406. printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
  407. //====================================================================================================100
  408. // Kernel Arguments
  409. //====================================================================================================100
  410. clSetKernelArg( kernel,
  411. 0,
  412. sizeof(long),
  413. (void *) &maxheight);
  414. clSetKernelArg( kernel,
  415. 1,
  416. sizeof(cl_mem),
  417. (void *) &knodesD);
  418. clSetKernelArg( kernel,
  419. 2,
  420. sizeof(long),
  421. (void *) &knodes_elem);
  422. clSetKernelArg( kernel,
  423. 3,
  424. sizeof(cl_mem),
  425. (void *) &recordsD);
  426. clSetKernelArg( kernel,
  427. 4,
  428. sizeof(cl_mem),
  429. (void *) &currKnodeD);
  430. clSetKernelArg( kernel,
  431. 5,
  432. sizeof(cl_mem),
  433. (void *) &offsetD);
  434. clSetKernelArg( kernel,
  435. 6,
  436. sizeof(cl_mem),
  437. (void *) &keysD);
  438. clSetKernelArg( kernel,
  439. 7,
  440. sizeof(cl_mem),
  441. (void *) &ansD);
  442. //====================================================================================================100
  443. // Kernel
  444. //====================================================================================================100
  445. error = clEnqueueNDRangeKernel( command_queue,
  446. kernel,
  447. 1,
  448. NULL,
  449. global_work_size,
  450. local_work_size,
  451. 0,
  452. NULL,
  453. NULL);
  454. if (error != CL_SUCCESS)
  455. fatal_CL(error, __LINE__);
  456. // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
  457. error = clFinish(command_queue);
  458. if (error != CL_SUCCESS)
  459. fatal_CL(error, __LINE__);
  460. time4 = get_time();
  461. //====================================================================================================100
  462. // END
  463. //====================================================================================================100
  464. //======================================================================================================================================================150
  465. // GPU MEMORY COPY (CONTD.)
  466. //======================================================================================================================================================150
  467. //====================================================================================================100
  468. // DEVICE IN/OUT
  469. //====================================================================================================100
  470. //==================================================50
  471. // ansD
  472. //==================================================50
  473. error = clEnqueueReadBuffer(command_queue, // The command queue.
  474. ansD, // The image on the device.
  475. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  476. 0, // Offset. None in this case.
  477. count*sizeof(record), // Size to copy.
  478. ans, // The pointer to the image on the host.
  479. 0, // Number of events in wait list. Not used.
  480. NULL, // Event wait list. Not used.
  481. NULL); // Event object for determining status. Not used.
  482. if (error != CL_SUCCESS)
  483. fatal_CL(error, __LINE__);
  484. time5 = get_time();
  485. //==================================================50
  486. // END
  487. //==================================================50
  488. //====================================================================================================100
  489. // END
  490. //====================================================================================================100
  491. //======================================================================================================================================================150
  492. // GPU MEMORY DEALLOCATION
  493. //======================================================================================================================================================150
  494. // Release kernels...
  495. clReleaseKernel(kernel);
  496. // Now the program...
  497. clReleaseProgram(program);
  498. // Clean up the device memory...
  499. clReleaseMemObject(recordsD);
  500. clReleaseMemObject(knodesD);
  501. clReleaseMemObject(currKnodeD);
  502. clReleaseMemObject(offsetD);
  503. clReleaseMemObject(keysD);
  504. clReleaseMemObject(ansD);
  505. // Flush the queue
  506. error = clFlush(command_queue);
  507. if (error != CL_SUCCESS)
  508. fatal_CL(error, __LINE__);
  509. // ...and finally, the queue and context.
  510. clReleaseCommandQueue(command_queue);
  511. // ???
  512. clReleaseContext(context);
  513. time6 = get_time();
  514. //======================================================================================================================================================150
  515. // DISPLAY TIMING
  516. //======================================================================================================================================================150
  517. printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
  518. printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
  519. printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
  520. printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
  521. printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
  522. printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
  523. printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
  524. printf("Total time:\n");
  525. printf("%.12f s\n", (float) (time6-time0) / 1000000);
  526. //======================================================================================================================================================150
  527. // END
  528. //======================================================================================================================================================150
  529. }
  530. //========================================================================================================================================================================================================200
  531. // END
  532. //========================================================================================================================================================================================================200
  533. // #ifdef __cplusplus
  534. // }
  535. // #endif