kernel_gpu_opencl_wrapper_2.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772
  1. // #ifdef __cplusplus
  2. // extern "C" {
  3. // #endif
  4. //========================================================================================================================================================================================================200
  5. // INCLUDE
  6. //========================================================================================================================================================================================================200
  7. //======================================================================================================================================================150
  8. // LIBRARIES
  9. //======================================================================================================================================================150
  #include <CL/cl.h>  // (in directory provided to compiler) needed by OpenCL types and functions
  #include <stdio.h>  // (in directory known to compiler) needed by printf, snprintf
  #include <stdlib.h> // (in directory known to compiler) needed by malloc, free
  #include <string.h> // (in directory known to compiler) needed by memset, strlen, strstr
  13. //======================================================================================================================================================150
  14. // COMMON
  15. //======================================================================================================================================================150
  16. #include "../common.h" // (in directory provided here)
  17. //======================================================================================================================================================150
  18. // UTILITIES
  19. //======================================================================================================================================================150
  20. #include "../util/timer/timer.h" // (in directory provided here)
  21. //======================================================================================================================================================150
  22. // HEADER
  23. //======================================================================================================================================================150
  24. #include "./kernel_gpu_opencl_wrapper_2.h" // (in directory provided here)
  25. //========================================================================================================================================================================================================200
  26. // FUNCTION
  27. //========================================================================================================================================================================================================200
  28. void
  29. kernel_gpu_opencl_wrapper_2(knode *knodes,
  30. long knodes_elem,
  31. long knodes_mem,
  32. int order,
  33. long maxheight,
  34. int count,
  35. long *currKnode,
  36. long *offset,
  37. long *lastKnode,
  38. long *offset_2,
  39. int *start,
  40. int *end,
  41. int *recstart,
  42. int *reclength)
  43. {
  44. //======================================================================================================================================================150
  45. // CPU VARIABLES
  46. //======================================================================================================================================================150
  47. // timer
  48. long long time0;
  49. long long time1;
  50. long long time2;
  51. long long time3;
  52. long long time4;
  53. long long time5;
  54. long long time6;
  55. time0 = get_time();
  56. //======================================================================================================================================================150
  57. // GPU SETUP
  58. //======================================================================================================================================================150
  59. //====================================================================================================100
  60. // INITIAL DRIVER OVERHEAD
  61. //====================================================================================================100
  62. // cudaThreadSynchronize();
  63. //====================================================================================================100
  64. // COMMON VARIABLES
  65. //====================================================================================================100
  66. // common variables
  67. cl_int error;
  68. //====================================================================================================100
  69. // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
  70. //====================================================================================================100
  71. // Get the number of available platforms
  72. cl_uint num_platforms;
  73. error = clGetPlatformIDs( 0,
  74. NULL,
  75. &num_platforms);
  76. if (error != CL_SUCCESS)
  77. fatal_CL(error, __LINE__);
  78. // Get the list of available platforms
  79. cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  80. error = clGetPlatformIDs( num_platforms,
  81. platforms,
  82. NULL);
  83. if (error != CL_SUCCESS)
  84. fatal_CL(error, __LINE__);
  85. // Select the 1st platform
  86. cl_platform_id platform = platforms[0];
  87. // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
  88. char pbuf[100];
  89. error = clGetPlatformInfo( platform,
  90. CL_PLATFORM_VENDOR,
  91. sizeof(pbuf),
  92. pbuf,
  93. NULL);
  94. if (error != CL_SUCCESS)
  95. fatal_CL(error, __LINE__);
  96. printf("Platform: %s\n", pbuf);
  97. //====================================================================================================100
  98. // CREATE CONTEXT FOR THE PLATFORM
  99. //====================================================================================================100
  100. // Create context properties for selected platform
  101. cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
  102. (cl_context_properties) platform,
  103. 0};
  104. // Create context for selected platform being GPU
  105. cl_context context;
  106. context = clCreateContextFromType( context_properties,
  107. CL_DEVICE_TYPE_GPU,
  108. NULL,
  109. NULL,
  110. &error);
  111. if (error != CL_SUCCESS)
  112. fatal_CL(error, __LINE__);
  113. //====================================================================================================100
  114. // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
  115. //====================================================================================================100
  116. // Get the number of devices (previousely selected for the context)
  117. size_t devices_size;
  118. error = clGetContextInfo( context,
  119. CL_CONTEXT_DEVICES,
  120. 0,
  121. NULL,
  122. &devices_size);
  123. if (error != CL_SUCCESS)
  124. fatal_CL(error, __LINE__);
  125. // Get the list of devices (previousely selected for the context)
  126. cl_device_id *devices = (cl_device_id *) malloc(devices_size);
  127. error = clGetContextInfo( context,
  128. CL_CONTEXT_DEVICES,
  129. devices_size,
  130. devices,
  131. NULL);
  132. if (error != CL_SUCCESS)
  133. fatal_CL(error, __LINE__);
  134. // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
  135. cl_device_id device;
  136. device = devices[0];
  137. // Get the name of the selected device (previousely selected for the context) and print it
  138. error = clGetDeviceInfo(device,
  139. CL_DEVICE_NAME,
  140. sizeof(pbuf),
  141. pbuf,
  142. NULL);
  143. if (error != CL_SUCCESS)
  144. fatal_CL(error, __LINE__);
  145. printf("Device: %s\n", pbuf);
  146. //====================================================================================================100
  147. // CREATE COMMAND QUEUE FOR THE DEVICE
  148. //====================================================================================================100
  149. // Create a command queue
  150. cl_command_queue command_queue;
  151. command_queue = clCreateCommandQueue( context,
  152. device,
  153. 0,
  154. &error);
  155. if (error != CL_SUCCESS)
  156. fatal_CL(error, __LINE__);
  157. //====================================================================================================100
  158. // CREATE PROGRAM, COMPILE IT
  159. //====================================================================================================100
  160. // Load kernel source code from file
  161. const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl");
  162. size_t sourceSize = strlen(source);
  163. // Create the program
  164. cl_program program = clCreateProgramWithSource( context,
  165. 1,
  166. &source,
  167. &sourceSize,
  168. &error);
  169. if (error != CL_SUCCESS)
  170. fatal_CL(error, __LINE__);
  171. char clOptions[110];
  172. // sprintf(clOptions,"-I../../src");
  173. sprintf(clOptions,"-I./../");
  174. #ifdef DEFAULT_ORDER_2
  175. sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2);
  176. #endif
  177. // Compile the program
  178. error = clBuildProgram( program,
  179. 1,
  180. &device,
  181. clOptions,
  182. NULL,
  183. NULL);
  184. // Print warnings and errors from compilation
  185. static char log[65536];
  186. memset(log, 0, sizeof(log));
  187. clGetProgramBuildInfo( program,
  188. device,
  189. CL_PROGRAM_BUILD_LOG,
  190. sizeof(log)-1,
  191. log,
  192. NULL);
  193. printf("-----OpenCL Compiler Output-----\n");
  194. if (strstr(log,"warning:") || strstr(log, "error:"))
  195. printf("<<<<\n%s\n>>>>\n", log);
  196. printf("--------------------------------\n");
  197. if (error != CL_SUCCESS)
  198. fatal_CL(error, __LINE__);
  199. // Create kernel
  200. cl_kernel kernel;
  201. kernel = clCreateKernel(program,
  202. "findRangeK",
  203. &error);
  204. if (error != CL_SUCCESS)
  205. fatal_CL(error, __LINE__);
  206. time1 = get_time();
  207. //====================================================================================================100
  208. // END
  209. //====================================================================================================100
  210. //======================================================================================================================================================150
  211. // GPU MEMORY MALLOC
  212. //======================================================================================================================================================150
  213. //====================================================================================================100
  214. // DEVICE IN
  215. //====================================================================================================100
  216. //==================================================50
  217. // knodesD
  218. //==================================================50
  219. cl_mem knodesD;
  220. knodesD = clCreateBuffer( context,
  221. CL_MEM_READ_WRITE,
  222. knodes_mem,
  223. NULL,
  224. &error );
  225. if (error != CL_SUCCESS)
  226. fatal_CL(error, __LINE__);
  227. //==================================================50
  228. // currKnodeD
  229. //==================================================50
  230. cl_mem currKnodeD;
  231. currKnodeD = clCreateBuffer(context,
  232. CL_MEM_READ_WRITE,
  233. count*sizeof(long),
  234. NULL,
  235. &error );
  236. if (error != CL_SUCCESS)
  237. fatal_CL(error, __LINE__);
  238. //==================================================50
  239. // offsetD
  240. //==================================================50
  241. cl_mem offsetD;
  242. offsetD = clCreateBuffer( context,
  243. CL_MEM_READ_WRITE,
  244. count*sizeof(long),
  245. NULL,
  246. &error );
  247. if (error != CL_SUCCESS)
  248. fatal_CL(error, __LINE__);
  249. //==================================================50
  250. // lastKnodeD
  251. //==================================================50
  252. cl_mem lastKnodeD;
  253. lastKnodeD = clCreateBuffer(context,
  254. CL_MEM_READ_WRITE,
  255. count*sizeof(long),
  256. NULL,
  257. &error );
  258. if (error != CL_SUCCESS)
  259. fatal_CL(error, __LINE__);
  260. //==================================================50
  261. // offset_2D
  262. //==================================================50
  263. cl_mem offset_2D;
  264. offset_2D = clCreateBuffer(context,
  265. CL_MEM_READ_WRITE,
  266. count*sizeof(long),
  267. NULL,
  268. &error );
  269. if (error != CL_SUCCESS)
  270. fatal_CL(error, __LINE__);
  271. //==================================================50
  272. // startD
  273. //==================================================50
  274. cl_mem startD;
  275. startD = clCreateBuffer(context,
  276. CL_MEM_READ_WRITE,
  277. count*sizeof(int),
  278. NULL,
  279. &error );
  280. if (error != CL_SUCCESS)
  281. fatal_CL(error, __LINE__);
  282. //==================================================50
  283. // endD
  284. //==================================================50
  285. cl_mem endD;
  286. endD = clCreateBuffer( context,
  287. CL_MEM_READ_WRITE,
  288. count*sizeof(int),
  289. NULL,
  290. &error );
  291. if (error != CL_SUCCESS)
  292. fatal_CL(error, __LINE__);
  293. //==================================================50
  294. // END
  295. //==================================================50
  296. //====================================================================================================100
  297. // DEVICE IN/OUT
  298. //====================================================================================================100
  299. //==================================================50
  300. // ansDStart
  301. //==================================================50
  302. cl_mem ansDStart;
  303. ansDStart = clCreateBuffer( context,
  304. CL_MEM_READ_WRITE,
  305. count*sizeof(int),
  306. NULL,
  307. &error );
  308. if (error != CL_SUCCESS)
  309. fatal_CL(error, __LINE__);
  310. //==================================================50
  311. // ansDLength
  312. //==================================================50
  313. cl_mem ansDLength;
  314. ansDLength = clCreateBuffer( context,
  315. CL_MEM_READ_WRITE,
  316. count*sizeof(int),
  317. NULL,
  318. &error );
  319. if (error != CL_SUCCESS)
  320. fatal_CL(error, __LINE__);
  321. time2 = get_time();
  322. //==================================================50
  323. // END
  324. //==================================================50
  325. //====================================================================================================100
  326. // END
  327. //====================================================================================================100
  328. //======================================================================================================================================================150
  329. // GPU MEMORY COPY
  330. //======================================================================================================================================================150
  331. //====================================================================================================100
  332. // DEVICE IN
  333. //====================================================================================================100
  334. //==================================================50
  335. // knodesD
  336. //==================================================50
  337. error = clEnqueueWriteBuffer( command_queue, // command queue
  338. knodesD, // destination
  339. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  340. 0, // offset in destination to write to
  341. knodes_mem, // size to be copied
  342. knodes, // source
  343. 0, // # of events in the list of events to wait for
  344. NULL, // list of events to wait for
  345. NULL); // ID of this operation to be used by waiting operations
  346. if (error != CL_SUCCESS)
  347. fatal_CL(error, __LINE__);
  348. //==================================================50
  349. // currKnodeD
  350. //==================================================50
  351. error = clEnqueueWriteBuffer( command_queue, // command queue
  352. currKnodeD, // destination
  353. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  354. 0, // offset in destination to write to
  355. count*sizeof(long), // size to be copied
  356. currKnode, // source
  357. 0, // # of events in the list of events to wait for
  358. NULL, // list of events to wait for
  359. NULL); // ID of this operation to be used by waiting operations
  360. if (error != CL_SUCCESS)
  361. fatal_CL(error, __LINE__);
  362. //==================================================50
  363. // offsetD
  364. //==================================================50
  365. error = clEnqueueWriteBuffer( command_queue, // command queue
  366. offsetD, // destination
  367. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  368. 0, // offset in destination to write to
  369. count*sizeof(long), // size to be copied
  370. offset, // source
  371. 0, // # of events in the list of events to wait for
  372. NULL, // list of events to wait for
  373. NULL); // ID of this operation to be used by waiting operations
  374. if (error != CL_SUCCESS)
  375. fatal_CL(error, __LINE__);
  376. //==================================================50
  377. // lastKnodeD
  378. //==================================================50
  379. error = clEnqueueWriteBuffer( command_queue, // command queue
  380. lastKnodeD, // destination
  381. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  382. 0, // offset in destination to write to
  383. count*sizeof(long), // size to be copied
  384. lastKnode, // source
  385. 0, // # of events in the list of events to wait for
  386. NULL, // list of events to wait for
  387. NULL); // ID of this operation to be used by waiting operations
  388. if (error != CL_SUCCESS)
  389. fatal_CL(error, __LINE__);
  390. //==================================================50
  391. // offset_2D
  392. //==================================================50
  393. error = clEnqueueWriteBuffer( command_queue, // command queue
  394. offset_2D, // destination
  395. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  396. 0, // offset in destination to write to
  397. count*sizeof(long), // size to be copied
  398. offset_2, // source
  399. 0, // # of events in the list of events to wait for
  400. NULL, // list of events to wait for
  401. NULL); // ID of this operation to be used by waiting operations
  402. if (error != CL_SUCCESS)
  403. fatal_CL(error, __LINE__);
  404. //==================================================50
  405. // startD
  406. //==================================================50
  407. error = clEnqueueWriteBuffer( command_queue, // command queue
  408. startD, // destination
  409. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  410. 0, // offset in destination to write to
  411. count*sizeof(int), // size to be copied
  412. start, // source
  413. 0, // # of events in the list of events to wait for
  414. NULL, // list of events to wait for
  415. NULL); // ID of this operation to be used by waiting operations
  416. if (error != CL_SUCCESS)
  417. fatal_CL(error, __LINE__);
  418. //==================================================50
  419. // endD
  420. //==================================================50
  421. error = clEnqueueWriteBuffer( command_queue, // command queue
  422. endD, // destination
  423. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  424. 0, // offset in destination to write to
  425. count*sizeof(int), // size to be copied
  426. end, // source
  427. 0, // # of events in the list of events to wait for
  428. NULL, // list of events to wait for
  429. NULL); // ID of this operation to be used by waiting operations
  430. if (error != CL_SUCCESS)
  431. fatal_CL(error, __LINE__);
  432. //==================================================50
  433. // END
  434. //==================================================50
  435. //====================================================================================================100
  436. // DEVICE IN/OUT
  437. //====================================================================================================100
  438. //==================================================50
  439. // ansDStart
  440. //==================================================50
  441. error = clEnqueueWriteBuffer( command_queue, // command queue
  442. endD, // destination
  443. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  444. 0, // offset in destination to write to
  445. count*sizeof(int), // size to be copied
  446. end, // source
  447. 0, // # of events in the list of events to wait for
  448. NULL, // list of events to wait for
  449. NULL); // ID of this operation to be used by waiting operations
  450. if (error != CL_SUCCESS)
  451. fatal_CL(error, __LINE__);
  452. //==================================================50
  453. // ansDLength
  454. //==================================================50
  455. error = clEnqueueWriteBuffer( command_queue, // command queue
  456. ansDLength, // destination
  457. 1, // block the source from access until this copy operation complates (1=yes, 0=no)
  458. 0, // offset in destination to write to
  459. count*sizeof(int), // size to be copied
  460. reclength, // source
  461. 0, // # of events in the list of events to wait for
  462. NULL, // list of events to wait for
  463. NULL); // ID of this operation to be used by waiting operations
  464. if (error != CL_SUCCESS)
  465. fatal_CL(error, __LINE__);
  466. time3 = get_time();
  467. //==================================================50
  468. // END
  469. //==================================================50
  470. //======================================================================================================================================================150
  471. // KERNEL
  472. //======================================================================================================================================================150
  473. //====================================================================================================100
  474. // Execution Parameters
  475. //====================================================================================================100
  476. size_t local_work_size[1];
  477. local_work_size[0] = order < 1024 ? order : 1024;
  478. size_t global_work_size[1];
  479. global_work_size[0] = count * local_work_size[0];
  480. printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
  481. //====================================================================================================100
  482. // Kernel Arguments
  483. //====================================================================================================100
  484. clSetKernelArg( kernel,
  485. 0,
  486. sizeof(long),
  487. (void *) &maxheight);
  488. clSetKernelArg( kernel,
  489. 1,
  490. sizeof(cl_mem),
  491. (void *) &knodesD);
  492. clSetKernelArg( kernel,
  493. 2,
  494. sizeof(long),
  495. (void *) &knodes_elem);
  496. clSetKernelArg( kernel,
  497. 3,
  498. sizeof(cl_mem),
  499. (void *) &currKnodeD);
  500. clSetKernelArg( kernel,
  501. 4,
  502. sizeof(cl_mem),
  503. (void *) &offsetD);
  504. clSetKernelArg( kernel,
  505. 5,
  506. sizeof(cl_mem),
  507. (void *) &lastKnodeD);
  508. clSetKernelArg( kernel,
  509. 6,
  510. sizeof(cl_mem),
  511. (void *) &offset_2D);
  512. clSetKernelArg( kernel,
  513. 7,
  514. sizeof(cl_mem),
  515. (void *) &startD);
  516. clSetKernelArg( kernel,
  517. 8,
  518. sizeof(cl_mem),
  519. (void *) &endD);
  520. clSetKernelArg( kernel,
  521. 9,
  522. sizeof(cl_mem),
  523. (void *) &ansDStart);
  524. clSetKernelArg( kernel,
  525. 10,
  526. sizeof(cl_mem),
  527. (void *) &ansDLength);
  528. //====================================================================================================100
  529. // Kernel
  530. //====================================================================================================100
  531. error = clEnqueueNDRangeKernel( command_queue,
  532. kernel,
  533. 1,
  534. NULL,
  535. global_work_size,
  536. local_work_size,
  537. 0,
  538. NULL,
  539. NULL);
  540. if (error != CL_SUCCESS)
  541. fatal_CL(error, __LINE__);
  542. // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
  543. error = clFinish(command_queue);
  544. if (error != CL_SUCCESS)
  545. fatal_CL(error, __LINE__);
  546. time4 = get_time();
  547. //====================================================================================================100
  548. // END
  549. //====================================================================================================100
  550. //======================================================================================================================================================150
  551. // GPU MEMORY COPY (CONTD.)
  552. //======================================================================================================================================================150
  553. //====================================================================================================100
  554. // DEVICE IN/OUT
  555. //====================================================================================================100
  556. //==================================================50
  557. // ansDStart
  558. //==================================================50
  559. error = clEnqueueReadBuffer(command_queue, // The command queue.
  560. ansDStart, // The image on the device.
  561. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  562. 0, // Offset. None in this case.
  563. count*sizeof(int), // Size to copy.
  564. recstart, // The pointer to the image on the host.
  565. 0, // Number of events in wait list. Not used.
  566. NULL, // Event wait list. Not used.
  567. NULL); // Event object for determining status. Not used.
  568. if (error != CL_SUCCESS)
  569. fatal_CL(error, __LINE__);
  570. //==================================================50
  571. // ansDLength
  572. //==================================================50
  573. error = clEnqueueReadBuffer(command_queue, // The command queue.
  574. ansDLength, // The image on the device.
  575. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  576. 0, // Offset. None in this case.
  577. count*sizeof(int), // Size to copy.
  578. reclength, // The pointer to the image on the host.
  579. 0, // Number of events in wait list. Not used.
  580. NULL, // Event wait list. Not used.
  581. NULL); // Event object for determining status. Not used.
  582. if (error != CL_SUCCESS)
  583. fatal_CL(error, __LINE__);
  584. time5 = get_time();
  585. //==================================================50
  586. // END
  587. //==================================================50
  588. //====================================================================================================100
  589. // END
  590. //====================================================================================================100
  591. //======================================================================================================================================================150
  592. // GPU MEMORY DEALLOCATION
  593. //======================================================================================================================================================150
  594. // Release kernels...
  595. clReleaseKernel(kernel);
  596. // Now the program...
  597. clReleaseProgram(program);
  598. // Clean up the device memory...
  599. clReleaseMemObject(knodesD);
  600. clReleaseMemObject(currKnodeD);
  601. clReleaseMemObject(offsetD);
  602. clReleaseMemObject(lastKnodeD);
  603. clReleaseMemObject(offset_2D);
  604. clReleaseMemObject(startD);
  605. clReleaseMemObject(endD);
  606. clReleaseMemObject(ansDStart);
  607. clReleaseMemObject(ansDLength);
  608. // Flush the queue
  609. error = clFlush(command_queue);
  610. if (error != CL_SUCCESS)
  611. fatal_CL(error, __LINE__);
  612. // ...and finally, the queue and context.
  613. clReleaseCommandQueue(command_queue);
  614. // ???
  615. clReleaseContext(context);
  616. time6 = get_time();
  617. //======================================================================================================================================================150
  618. // DISPLAY TIMING
  619. //======================================================================================================================================================150
  620. printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
  621. printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
  622. printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
  623. printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
  624. printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
  625. printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
  626. printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
  627. printf("Total time:\n");
  628. printf("%.12f s\n", (float) (time6-time0) / 1000000);
  629. //======================================================================================================================================================150
  630. // END
  631. //======================================================================================================================================================150
  632. }
  633. //========================================================================================================================================================================================================200
  634. // END
  635. //========================================================================================================================================================================================================200
  636. // #ifdef __cplusplus
  637. // }
  638. // #endif