kernel_gpu_opencl_wrapper.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187
  1. //========================================================================================================================================================================================================200
  2. // DEFINE/INCLUDE
  3. //========================================================================================================================================================================================================200
  4. //======================================================================================================================================================150
  5. // MAIN FUNCTION HEADER
  6. //======================================================================================================================================================150
  7. #include "./../main.h" // (in the main program folder)
  8. //======================================================================================================================================================150
  9. // DEFINE
  10. //======================================================================================================================================================150
  11. //======================================================================================================================================================150
  12. // LIBRARIES
  13. //======================================================================================================================================================150
  14. #include <stdio.h> // (in path known to compiler) needed by printf
  15. #include <string.h> // (in path known to compiler) needed by strlen
  16. #include <CL/cl.h> // (in path specified to compiler) needed by OpenCL types and functions
  17. //======================================================================================================================================================150
  18. // UTILITIES
  19. //======================================================================================================================================================150
  20. #include "./../util/opencl/opencl.h" // (in directory) needed by device functions
  21. //======================================================================================================================================================150
  22. // KERNEL_GPU_CUDA_WRAPPER FUNCTION HEADER
  23. //======================================================================================================================================================150
  24. #include "./kernel_gpu_opencl_wrapper.h" // (in directory)
  25. //======================================================================================================================================================150
  26. // END
  27. //======================================================================================================================================================150
  28. //========================================================================================================================================================================================================200
  29. // KERNEL_GPU_CUDA_WRAPPER FUNCTION
  30. //========================================================================================================================================================================================================200
  31. void
  32. kernel_gpu_opencl_wrapper( fp* image, // input image
  33. int Nr, // IMAGE nbr of rows
  34. int Nc, // IMAGE nbr of cols
  35. long Ne, // IMAGE nbr of elem
  36. int niter, // nbr of iterations
  37. fp lambda, // update step size
  38. long NeROI, // ROI nbr of elements
  39. int* iN,
  40. int* iS,
  41. int* jE,
  42. int* jW,
  43. int iter, // primary loop
  44. int mem_size_i,
  45. int mem_size_j)
  46. {
  47. //======================================================================================================================================================150
  48. // GPU SETUP
  49. //======================================================================================================================================================150
  50. //====================================================================================================100
  51. // COMMON VARIABLES
  52. //====================================================================================================100
  53. // common variables
  54. cl_int error;
  55. //====================================================================================================100
  56. // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
  57. //====================================================================================================100
  58. // Get the number of available platforms
  59. cl_uint num_platforms;
  60. error = clGetPlatformIDs( 0,
  61. NULL,
  62. &num_platforms);
  63. if (error != CL_SUCCESS)
  64. fatal_CL(error, __LINE__);
  65. // Get the list of available platforms
  66. cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  67. error = clGetPlatformIDs( num_platforms,
  68. platforms,
  69. NULL);
  70. if (error != CL_SUCCESS)
  71. fatal_CL(error, __LINE__);
  72. // Select the 1st platform
  73. cl_platform_id platform = platforms[1];
  74. // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
  75. char pbuf[100];
  76. error = clGetPlatformInfo( platform,
  77. CL_PLATFORM_VENDOR,
  78. sizeof(pbuf),
  79. pbuf,
  80. NULL);
  81. if (error != CL_SUCCESS)
  82. fatal_CL(error, __LINE__);
  83. printf("Platform: %s\n", pbuf);
  84. //====================================================================================================100
  85. // CREATE CONTEXT FOR THE PLATFORM
  86. //====================================================================================================100
  87. // Create context properties for selected platform
  88. cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
  89. (cl_context_properties) platform,
  90. 0};
  91. // Create context for selected platform being GPU
  92. cl_context context;
  93. context = clCreateContextFromType( context_properties,
  94. CL_DEVICE_TYPE_ALL,
  95. NULL,
  96. NULL,
  97. &error);
  98. if (error != CL_SUCCESS)
  99. fatal_CL(error, __LINE__);
  100. //====================================================================================================100
  101. // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
  102. //====================================================================================================100
  103. // Get the number of devices (previousely selected for the context)
  104. size_t devices_size;
  105. error = clGetContextInfo( context,
  106. CL_CONTEXT_DEVICES,
  107. 0,
  108. NULL,
  109. &devices_size);
  110. if (error != CL_SUCCESS)
  111. fatal_CL(error, __LINE__);
  112. // Get the list of devices (previousely selected for the context)
  113. cl_device_id *devices = (cl_device_id *) malloc(devices_size);
  114. error = clGetContextInfo( context,
  115. CL_CONTEXT_DEVICES,
  116. devices_size,
  117. devices,
  118. NULL);
  119. if (error != CL_SUCCESS)
  120. fatal_CL(error, __LINE__);
  121. // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
  122. cl_device_id device;
  123. device = devices[0];
  124. // Get the name of the selected device (previousely selected for the context) and print it
  125. error = clGetDeviceInfo(device,
  126. CL_DEVICE_NAME,
  127. sizeof(pbuf),
  128. pbuf,
  129. NULL);
  130. if (error != CL_SUCCESS)
  131. fatal_CL(error, __LINE__);
  132. printf("Device: %s\n", pbuf);
  133. //====================================================================================================100
  134. // CREATE COMMAND QUEUE FOR THE DEVICE
  135. //====================================================================================================100
  136. // Create a command queue
  137. cl_command_queue command_queue;
  138. command_queue = clCreateCommandQueue( context,
  139. device,
  140. 0,
  141. &error);
  142. if (error != CL_SUCCESS)
  143. fatal_CL(error, __LINE__);
  144. //====================================================================================================100
  145. // CREATE PROGRAM, COMPILE IT
  146. //====================================================================================================100
  147. // Load kernel source code from file
  148. const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
  149. size_t sourceSize = strlen(source);
  150. // Create the program
  151. cl_program program = clCreateProgramWithSource( context,
  152. 1,
  153. &source,
  154. &sourceSize,
  155. &error);
  156. if (error != CL_SUCCESS)
  157. fatal_CL(error, __LINE__);
  158. char clOptions[150];
  159. // sprintf(clOptions,"-I../../src");
  160. sprintf(clOptions,"-I.");
  161. #ifdef RD_WG_SIZE
  162. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
  163. #endif
  164. #ifdef RD_WG_SIZE_0
  165. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
  166. #endif
  167. #ifdef RD_WG_SIZE_0_0
  168. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
  169. #endif
  170. // Compile the program
  171. error = clBuildProgram( program,
  172. 1,
  173. &device,
  174. clOptions,
  175. NULL,
  176. NULL);
  177. // Print warnings and errors from compilation
  178. static char log[65536];
  179. memset(log, 0, sizeof(log));
  180. clGetProgramBuildInfo( program,
  181. device,
  182. CL_PROGRAM_BUILD_LOG,
  183. sizeof(log)-1,
  184. log,
  185. NULL);
  186. printf("-----OpenCL Compiler Output-----\n");
  187. if (strstr(log,"warning:") || strstr(log, "error:"))
  188. printf("<<<<\n%s\n>>>>\n", log);
  189. printf("--------------------------------\n");
  190. if (error != CL_SUCCESS)
  191. fatal_CL(error, __LINE__);
  192. //====================================================================================================100
  193. // CREATE Kernels
  194. //====================================================================================================100
  195. // Extract kernel
  196. cl_kernel extract_kernel;
  197. extract_kernel = clCreateKernel(program,
  198. "extract_kernel",
  199. &error);
  200. if (error != CL_SUCCESS)
  201. fatal_CL(error, __LINE__);
  202. // Prepare kernel
  203. cl_kernel prepare_kernel;
  204. prepare_kernel = clCreateKernel(program,
  205. "prepare_kernel",
  206. &error);
  207. if (error != CL_SUCCESS)
  208. fatal_CL(error, __LINE__);
  209. // Reduce kernel
  210. cl_kernel reduce_kernel;
  211. reduce_kernel = clCreateKernel( program,
  212. "reduce_kernel",
  213. &error);
  214. if (error != CL_SUCCESS)
  215. fatal_CL(error, __LINE__);
  216. // SRAD kernel
  217. cl_kernel srad_kernel;
  218. srad_kernel = clCreateKernel( program,
  219. "srad_kernel",
  220. &error);
  221. if (error != CL_SUCCESS)
  222. fatal_CL(error, __LINE__);
  223. // SRAD2 kernel
  224. cl_kernel srad2_kernel;
  225. srad2_kernel = clCreateKernel( program,
  226. "srad2_kernel",
  227. &error);
  228. if (error != CL_SUCCESS)
  229. fatal_CL(error, __LINE__);
  230. // Compress kernel
  231. cl_kernel compress_kernel;
  232. compress_kernel = clCreateKernel( program,
  233. "compress_kernel",
  234. &error);
  235. if (error != CL_SUCCESS)
  236. fatal_CL(error, __LINE__);
  237. //====================================================================================================100
  238. // TRIGGERING INITIAL DRIVER OVERHEAD
  239. //====================================================================================================100
  240. // cudaThreadSynchronize(); // the above does it
  241. //======================================================================================================================================================150
  242. // GPU VARIABLES
  243. //======================================================================================================================================================150
  244. // CUDA kernel execution parameters
  245. int blocks_x;
  246. //======================================================================================================================================================150
  247. // ALLOCATE MEMORY IN GPU
  248. //======================================================================================================================================================150
  249. //====================================================================================================100
  250. // common memory size
  251. //====================================================================================================100
  252. int mem_size; // matrix memory size
  253. mem_size = sizeof(fp) * Ne; // get the size of float representation of input IMAGE
  254. //====================================================================================================100
  255. // allocate memory for entire IMAGE on DEVICE
  256. //====================================================================================================100
  257. cl_mem d_I;
  258. d_I = clCreateBuffer( context,
  259. CL_MEM_READ_WRITE,
  260. mem_size,
  261. NULL,
  262. &error );
  263. if (error != CL_SUCCESS)
  264. fatal_CL(error, __LINE__);
  265. //====================================================================================================100
  266. // allocate memory for coordinates on DEVICE
  267. //====================================================================================================100
  268. cl_mem d_iN;
  269. d_iN = clCreateBuffer( context,
  270. CL_MEM_READ_WRITE,
  271. mem_size_i,
  272. NULL,
  273. &error );
  274. if (error != CL_SUCCESS)
  275. fatal_CL(error, __LINE__);
  276. cl_mem d_iS;
  277. d_iS = clCreateBuffer( context,
  278. CL_MEM_READ_WRITE,
  279. mem_size_i,
  280. NULL,
  281. &error );
  282. if (error != CL_SUCCESS)
  283. fatal_CL(error, __LINE__);
  284. cl_mem d_jE;
  285. d_jE = clCreateBuffer( context,
  286. CL_MEM_READ_WRITE,
  287. mem_size_j,
  288. NULL,
  289. &error );
  290. if (error != CL_SUCCESS)
  291. fatal_CL(error, __LINE__);
  292. cl_mem d_jW;
  293. d_jW = clCreateBuffer( context,
  294. CL_MEM_READ_WRITE,
  295. mem_size_j,
  296. NULL,
  297. &error );
  298. if (error != CL_SUCCESS)
  299. fatal_CL(error, __LINE__);
  300. //====================================================================================================100
  301. // allocate memory for derivatives
  302. //====================================================================================================100
  303. cl_mem d_dN;
  304. d_dN = clCreateBuffer( context,
  305. CL_MEM_READ_WRITE,
  306. mem_size,
  307. NULL,
  308. &error );
  309. if (error != CL_SUCCESS)
  310. fatal_CL(error, __LINE__);
  311. cl_mem d_dS;
  312. d_dS = clCreateBuffer( context,
  313. CL_MEM_READ_WRITE,
  314. mem_size,
  315. NULL,
  316. &error );
  317. if (error != CL_SUCCESS)
  318. fatal_CL(error, __LINE__);
  319. cl_mem d_dW;
  320. d_dW = clCreateBuffer( context,
  321. CL_MEM_READ_WRITE,
  322. mem_size,
  323. NULL,
  324. &error );
  325. if (error != CL_SUCCESS)
  326. fatal_CL(error, __LINE__);
  327. cl_mem d_dE;
  328. d_dE = clCreateBuffer( context,
  329. CL_MEM_READ_WRITE,
  330. mem_size,
  331. NULL,
  332. &error );
  333. if (error != CL_SUCCESS)
  334. fatal_CL(error, __LINE__);
  335. //====================================================================================================100
  336. // allocate memory for coefficient on DEVICE
  337. //====================================================================================================100
  338. cl_mem d_c;
  339. d_c = clCreateBuffer( context,
  340. CL_MEM_READ_WRITE,
  341. mem_size,
  342. NULL,
  343. &error );
  344. if (error != CL_SUCCESS)
  345. fatal_CL(error, __LINE__);
  346. //====================================================================================================100
  347. // allocate memory for partial sums on DEVICE
  348. //====================================================================================================100
  349. cl_mem d_sums;
  350. d_sums = clCreateBuffer( context,
  351. CL_MEM_READ_WRITE,
  352. mem_size,
  353. NULL,
  354. &error );
  355. if (error != CL_SUCCESS)
  356. fatal_CL(error, __LINE__);
  357. cl_mem d_sums2;
  358. d_sums2 = clCreateBuffer( context,
  359. CL_MEM_READ_WRITE,
  360. mem_size,
  361. NULL,
  362. &error );
  363. if (error != CL_SUCCESS)
  364. fatal_CL(error, __LINE__);
  365. //====================================================================================================100
  366. // End
  367. //====================================================================================================100
  368. //======================================================================================================================================================150
  369. // COPY INPUT TO CPU
  370. //======================================================================================================================================================150
  371. //====================================================================================================100
  372. // Image
  373. //====================================================================================================100
  374. error = clEnqueueWriteBuffer( command_queue,
  375. d_I,
  376. 1,
  377. 0,
  378. mem_size,
  379. image,
  380. 0,
  381. 0,
  382. 0);
  383. if (error != CL_SUCCESS)
  384. fatal_CL(error, __LINE__);
  385. //====================================================================================================100
  386. // coordinates
  387. //====================================================================================================100
  388. error = clEnqueueWriteBuffer( command_queue,
  389. d_iN,
  390. 1,
  391. 0,
  392. mem_size_i,
  393. iN,
  394. 0,
  395. 0,
  396. 0);
  397. if (error != CL_SUCCESS)
  398. fatal_CL(error, __LINE__);
  399. error = clEnqueueWriteBuffer( command_queue,
  400. d_iS,
  401. 1,
  402. 0,
  403. mem_size_i,
  404. iS,
  405. 0,
  406. 0,
  407. 0);
  408. if (error != CL_SUCCESS)
  409. fatal_CL(error, __LINE__);
  410. error = clEnqueueWriteBuffer( command_queue,
  411. d_jE,
  412. 1,
  413. 0,
  414. mem_size_j,
  415. jE,
  416. 0,
  417. 0,
  418. 0);
  419. if (error != CL_SUCCESS)
  420. fatal_CL(error, __LINE__);
  421. error = clEnqueueWriteBuffer( command_queue,
  422. d_jW,
  423. 1,
  424. 0,
  425. mem_size_j,
  426. jW,
  427. 0,
  428. 0,
  429. 0);
  430. if (error != CL_SUCCESS)
  431. fatal_CL(error, __LINE__);
  432. //====================================================================================================100
  433. // End
  434. //====================================================================================================100
  435. //======================================================================================================================================================150
  436. // KERNEL EXECUTION PARAMETERS
  437. //======================================================================================================================================================150
  438. // threads
  439. size_t local_work_size[1];
  440. local_work_size[0] = NUMBER_THREADS;
  441. // workgroups
  442. int blocks_work_size;
  443. size_t global_work_size[1];
  444. blocks_x = Ne/(int)local_work_size[0];
  445. if (Ne % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid
  446. blocks_x = blocks_x + 1;
  447. }
  448. blocks_work_size = blocks_x;
  449. global_work_size[0] = blocks_work_size * local_work_size[0]; // define the number of blocks in the grid
  450. printf("max # of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
  451. //======================================================================================================================================================150
  452. // Extract Kernel - SCALE IMAGE DOWN FROM 0-255 TO 0-1 AND EXTRACT
  453. //======================================================================================================================================================150
  454. //====================================================================================================100
  455. // set arguments
  456. //====================================================================================================100
  457. error = clSetKernelArg( extract_kernel,
  458. 0,
  459. sizeof(long)*2,
  460. (void *) &Ne);
  461. if (error != CL_SUCCESS)
  462. fatal_CL(error, __LINE__);
  463. error = clSetKernelArg( extract_kernel,
  464. 1,
  465. sizeof(cl_mem),
  466. (void *) &d_I);
  467. if (error != CL_SUCCESS)
  468. fatal_CL(error, __LINE__);
  469. //====================================================================================================100
  470. // launch kernel
  471. //====================================================================================================100
  472. error = clEnqueueNDRangeKernel( command_queue,
  473. extract_kernel,
  474. 1,
  475. NULL,
  476. global_work_size,
  477. local_work_size,
  478. 0,
  479. NULL,
  480. NULL);
  481. if (error != CL_SUCCESS)
  482. fatal_CL(error, __LINE__);
  483. //====================================================================================================100
  484. // Synchronization - wait for all operations in the command queue so far to finish
  485. //====================================================================================================100
  486. // error = clFinish(command_queue);
  487. // if (error != CL_SUCCESS)
  488. // fatal_CL(error, __LINE__);
  489. //====================================================================================================100
  490. // End
  491. //====================================================================================================100
  492. //======================================================================================================================================================150
  493. // WHAT IS CONSTANT IN COMPUTATION LOOP
  494. //======================================================================================================================================================150
  495. //====================================================================================================100
  496. // Prepare Kernel
  497. //====================================================================================================100
  498. error = clSetKernelArg( prepare_kernel,
  499. 0,
  500. sizeof(long)*2,
  501. (void *) &Ne);
  502. if (error != CL_SUCCESS)
  503. fatal_CL(error, __LINE__);
  504. error = clSetKernelArg( prepare_kernel,
  505. 1,
  506. sizeof(cl_mem),
  507. (void *) &d_I);
  508. if (error != CL_SUCCESS)
  509. fatal_CL(error, __LINE__);
  510. error = clSetKernelArg( prepare_kernel,
  511. 2,
  512. sizeof(cl_mem),
  513. (void *) &d_sums);
  514. if (error != CL_SUCCESS)
  515. fatal_CL(error, __LINE__);
  516. error = clSetKernelArg( prepare_kernel,
  517. 3,
  518. sizeof(cl_mem),
  519. (void *) &d_sums2);
  520. if (error != CL_SUCCESS)
  521. fatal_CL(error, __LINE__);
  522. //====================================================================================================100
  523. // Reduce Kernel
  524. //====================================================================================================100
  525. int blocks2_x;
  526. int blocks2_work_size;
  527. size_t global_work_size2[1];
  528. long no;
  529. int mul;
  530. int mem_size_single = sizeof(fp) * 1;
  531. fp total;
  532. fp total2;
  533. fp meanROI;
  534. fp meanROI2;
  535. fp varROI;
  536. fp q0sqr;
  537. error = clSetKernelArg( reduce_kernel,
  538. 0,
  539. sizeof(long)*2,
  540. (void *) &Ne);
  541. if (error != CL_SUCCESS)
  542. fatal_CL(error, __LINE__);
  543. error = clSetKernelArg( reduce_kernel,
  544. 3,
  545. sizeof(cl_mem),
  546. (void *) &d_sums);
  547. if (error != CL_SUCCESS)
  548. fatal_CL(error, __LINE__);
  549. error = clSetKernelArg( reduce_kernel,
  550. 4,
  551. sizeof(cl_mem),
  552. (void *) &d_sums2);
  553. if (error != CL_SUCCESS)
  554. fatal_CL(error, __LINE__);
  555. //====================================================================================================100
  556. // SRAD Kernel
  557. //====================================================================================================100
  558. error = clSetKernelArg( srad_kernel,
  559. 0,
  560. sizeof(fp),
  561. (void *) &lambda);
  562. if (error != CL_SUCCESS)
  563. fatal_CL(error, __LINE__);
  564. error = clSetKernelArg( srad_kernel,
  565. 1,
  566. sizeof(int),
  567. (void *) &Nr);
  568. if (error != CL_SUCCESS)
  569. fatal_CL(error, __LINE__);
  570. error = clSetKernelArg( srad_kernel,
  571. 2,
  572. sizeof(int),
  573. (void *) &Nc);
  574. if (error != CL_SUCCESS)
  575. fatal_CL(error, __LINE__);
  576. error = clSetKernelArg( srad_kernel,
  577. 3,
  578. sizeof(long)*2,
  579. (void *) &Ne);
  580. if (error != CL_SUCCESS)
  581. fatal_CL(error, __LINE__);
  582. error = clSetKernelArg( srad_kernel,
  583. 4,
  584. sizeof(cl_mem),
  585. (void *) &d_iN);
  586. if (error != CL_SUCCESS)
  587. fatal_CL(error, __LINE__);
  588. error = clSetKernelArg( srad_kernel,
  589. 5,
  590. sizeof(cl_mem),
  591. (void *) &d_iS);
  592. if (error != CL_SUCCESS)
  593. fatal_CL(error, __LINE__);
  594. error = clSetKernelArg( srad_kernel,
  595. 6,
  596. sizeof(cl_mem),
  597. (void *) &d_jE);
  598. if (error != CL_SUCCESS)
  599. fatal_CL(error, __LINE__);
  600. error = clSetKernelArg( srad_kernel,
  601. 7,
  602. sizeof(cl_mem),
  603. (void *) &d_jW);
  604. if (error != CL_SUCCESS)
  605. fatal_CL(error, __LINE__);
  606. error = clSetKernelArg( srad_kernel,
  607. 8,
  608. sizeof(cl_mem),
  609. (void *) &d_dN);
  610. if (error != CL_SUCCESS)
  611. fatal_CL(error, __LINE__);
  612. error = clSetKernelArg( srad_kernel,
  613. 9,
  614. sizeof(cl_mem),
  615. (void *) &d_dS);
  616. if (error != CL_SUCCESS)
  617. fatal_CL(error, __LINE__);
  618. error = clSetKernelArg( srad_kernel,
  619. 10,
  620. sizeof(cl_mem),
  621. (void *) &d_dW);
  622. if (error != CL_SUCCESS)
  623. fatal_CL(error, __LINE__);
  624. error = clSetKernelArg( srad_kernel,
  625. 11,
  626. sizeof(cl_mem),
  627. (void *) &d_dE);
  628. if (error != CL_SUCCESS)
  629. fatal_CL(error, __LINE__);
  630. error = clSetKernelArg( srad_kernel,
  631. 13,
  632. sizeof(cl_mem),
  633. (void *) &d_c);
  634. if (error != CL_SUCCESS)
  635. fatal_CL(error, __LINE__);
  636. error = clSetKernelArg( srad_kernel,
  637. 14,
  638. sizeof(cl_mem),
  639. (void *) &d_I);
  640. if (error != CL_SUCCESS)
  641. fatal_CL(error, __LINE__);
  642. //====================================================================================================100
  643. // SRAD2 Kernel
  644. //====================================================================================================100
  645. error = clSetKernelArg( srad2_kernel,
  646. 0,
  647. sizeof(fp),
  648. (void *) &lambda);
  649. if (error != CL_SUCCESS)
  650. fatal_CL(error, __LINE__);
  651. error = clSetKernelArg( srad2_kernel,
  652. 1,
  653. sizeof(int),
  654. (void *) &Nr);
  655. if (error != CL_SUCCESS)
  656. fatal_CL(error, __LINE__);
  657. error = clSetKernelArg( srad2_kernel,
  658. 2,
  659. sizeof(int),
  660. (void *) &Nc);
  661. if (error != CL_SUCCESS)
  662. fatal_CL(error, __LINE__);
  663. error = clSetKernelArg( srad2_kernel,
  664. 3,
  665. sizeof(long)*2,
  666. (void *) &Ne);
  667. if (error != CL_SUCCESS)
  668. fatal_CL(error, __LINE__);
  669. error = clSetKernelArg( srad2_kernel,
  670. 4,
  671. sizeof(cl_mem),
  672. (void *) &d_iN);
  673. if (error != CL_SUCCESS)
  674. fatal_CL(error, __LINE__);
  675. error = clSetKernelArg( srad2_kernel,
  676. 5,
  677. sizeof(cl_mem),
  678. (void *) &d_iS);
  679. if (error != CL_SUCCESS)
  680. fatal_CL(error, __LINE__);
  681. error = clSetKernelArg( srad2_kernel,
  682. 6,
  683. sizeof(cl_mem),
  684. (void *) &d_jE);
  685. if (error != CL_SUCCESS)
  686. fatal_CL(error, __LINE__);
  687. error = clSetKernelArg( srad2_kernel,
  688. 7,
  689. sizeof(cl_mem),
  690. (void *) &d_jW);
  691. if (error != CL_SUCCESS)
  692. fatal_CL(error, __LINE__);
  693. error = clSetKernelArg( srad2_kernel,
  694. 8,
  695. sizeof(cl_mem),
  696. (void *) &d_dN);
  697. if (error != CL_SUCCESS)
  698. fatal_CL(error, __LINE__);
  699. error = clSetKernelArg( srad2_kernel,
  700. 9,
  701. sizeof(cl_mem),
  702. (void *) &d_dS);
  703. if (error != CL_SUCCESS)
  704. fatal_CL(error, __LINE__);
  705. error = clSetKernelArg( srad2_kernel,
  706. 10,
  707. sizeof(cl_mem),
  708. (void *) &d_dW);
  709. if (error != CL_SUCCESS)
  710. fatal_CL(error, __LINE__);
  711. error = clSetKernelArg( srad2_kernel,
  712. 11,
  713. sizeof(cl_mem),
  714. (void *) &d_dE);
  715. if (error != CL_SUCCESS)
  716. fatal_CL(error, __LINE__);
  717. error = clSetKernelArg( srad2_kernel,
  718. 12,
  719. sizeof(cl_mem),
  720. (void *) &d_c);
  721. if (error != CL_SUCCESS)
  722. fatal_CL(error, __LINE__);
  723. error = clSetKernelArg( srad2_kernel,
  724. 13,
  725. sizeof(cl_mem),
  726. (void *) &d_I);
  727. if (error != CL_SUCCESS)
  728. fatal_CL(error, __LINE__);
  729. //====================================================================================================100
  730. // End
  731. //====================================================================================================100
  732. //======================================================================================================================================================150
  733. // COMPUTATION
  734. //======================================================================================================================================================150
  735. printf("Iterations Progress: ");
  736. // execute main loop
  737. for (iter=0; iter<niter; iter++){ // do for the number of iterations input parameter
  738. printf("%d ", iter);
  739. fflush(NULL);
  740. //====================================================================================================100
  741. // Prepare kernel
  742. //====================================================================================================100
  743. // launch kernel
  744. error = clEnqueueNDRangeKernel( command_queue,
  745. prepare_kernel,
  746. 1,
  747. NULL,
  748. global_work_size,
  749. local_work_size,
  750. 0,
  751. NULL,
  752. NULL);
  753. if (error != CL_SUCCESS)
  754. fatal_CL(error, __LINE__);
  755. // synchronize
  756. // error = clFinish(command_queue);
  757. // if (error != CL_SUCCESS)
  758. // fatal_CL(error, __LINE__);
  759. //====================================================================================================100
  760. // Reduce Kernel - performs subsequent reductions of sums
  761. //====================================================================================================100
  762. // initial values
  763. blocks2_work_size = blocks_work_size; // original number of blocks
  764. global_work_size2[0] = global_work_size[0];
  765. no = Ne; // original number of sum elements
  766. mul = 1; // original multiplier
  767. // loop
  768. while(blocks2_work_size != 0){
  769. // set arguments that were uptaded in this loop
  770. error = clSetKernelArg( reduce_kernel,
  771. 1,
  772. sizeof(long)*2,
  773. (void *) &no);
  774. if (error != CL_SUCCESS)
  775. fatal_CL(error, __LINE__);
  776. error = clSetKernelArg( reduce_kernel,
  777. 2,
  778. sizeof(int),
  779. (void *) &mul);
  780. if (error != CL_SUCCESS)
  781. fatal_CL(error, __LINE__);
  782. error = clSetKernelArg( reduce_kernel,
  783. 5,
  784. sizeof(int),
  785. (void *) &blocks2_work_size);
  786. if (error != CL_SUCCESS)
  787. fatal_CL(error, __LINE__);
  788. // launch kernel
  789. error = clEnqueueNDRangeKernel( command_queue,
  790. reduce_kernel,
  791. 1,
  792. NULL,
  793. global_work_size2,
  794. local_work_size,
  795. 0,
  796. NULL,
  797. NULL);
  798. if (error != CL_SUCCESS)
  799. fatal_CL(error, __LINE__);
  800. // synchronize
  801. // error = clFinish(command_queue);
  802. // if (error != CL_SUCCESS)
  803. // fatal_CL(error, __LINE__);
  804. // update execution parameters
  805. no = blocks2_work_size; // get current number of elements
  806. if(blocks2_work_size == 1){
  807. blocks2_work_size = 0;
  808. }
  809. else{
  810. mul = mul * NUMBER_THREADS; // update the increment
  811. blocks_x = blocks2_work_size/(int)local_work_size[0]; // number of blocks
  812. if (blocks2_work_size % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid
  813. blocks_x = blocks_x + 1;
  814. }
  815. blocks2_work_size = blocks_x;
  816. global_work_size2[0] = blocks2_work_size * (int)local_work_size[0];
  817. }
  818. }
  819. // copy total sums to device
  820. error = clEnqueueReadBuffer(command_queue,
  821. d_sums,
  822. CL_TRUE,
  823. 0,
  824. mem_size_single,
  825. &total,
  826. 0,
  827. NULL,
  828. NULL);
  829. if (error != CL_SUCCESS)
  830. fatal_CL(error, __LINE__);
  831. error = clEnqueueReadBuffer(command_queue,
  832. d_sums2,
  833. CL_TRUE,
  834. 0,
  835. mem_size_single,
  836. &total2,
  837. 0,
  838. NULL,
  839. NULL);
  840. if (error != CL_SUCCESS)
  841. fatal_CL(error, __LINE__);
  842. //====================================================================================================100
  843. // calculate statistics
  844. //====================================================================================================100
  845. meanROI = total / (fp)(NeROI); // gets mean (average) value of element in ROI
  846. meanROI2 = meanROI * meanROI; //
  847. varROI = (total2 / (fp)(NeROI)) - meanROI2; // gets variance of ROI
  848. q0sqr = varROI / meanROI2; // gets standard deviation of ROI
  849. //====================================================================================================100
  850. // execute srad kernel
  851. //====================================================================================================100
  852. // set arguments that were uptaded in this loop
  853. error = clSetKernelArg( srad_kernel,
  854. 12,
  855. sizeof(fp),
  856. (void *) &q0sqr);
  857. if (error != CL_SUCCESS)
  858. fatal_CL(error, __LINE__);
  859. // launch kernel
  860. error = clEnqueueNDRangeKernel( command_queue,
  861. srad_kernel,
  862. 1,
  863. NULL,
  864. global_work_size,
  865. local_work_size,
  866. 0,
  867. NULL,
  868. NULL);
  869. if (error != CL_SUCCESS)
  870. fatal_CL(error, __LINE__);
  871. // synchronize
  872. // error = clFinish(command_queue);
  873. // if (error != CL_SUCCESS)
  874. // fatal_CL(error, __LINE__);
  875. //====================================================================================================100
  876. // execute srad2 kernel
  877. //====================================================================================================100
  878. // launch kernel
  879. error = clEnqueueNDRangeKernel( command_queue,
  880. srad2_kernel,
  881. 1,
  882. NULL,
  883. global_work_size,
  884. local_work_size,
  885. 0,
  886. NULL,
  887. NULL);
  888. if (error != CL_SUCCESS)
  889. fatal_CL(error, __LINE__);
  890. // synchronize
  891. // error = clFinish(command_queue);
  892. // if (error != CL_SUCCESS)
  893. // fatal_CL(error, __LINE__);
  894. //====================================================================================================100
  895. // End
  896. //====================================================================================================100
  897. }
  898. printf("\n");
  899. //======================================================================================================================================================150
  900. // Compress Kernel - SCALE IMAGE UP FROM 0-1 TO 0-255 AND COMPRESS
  901. //======================================================================================================================================================150
  902. //====================================================================================================100
  903. // set parameters
  904. //====================================================================================================100
  905. error = clSetKernelArg( compress_kernel,
  906. 0,
  907. sizeof(long)*2,
  908. (void *) &Ne);
  909. if (error != CL_SUCCESS)
  910. fatal_CL(error, __LINE__);
  911. error = clSetKernelArg( compress_kernel,
  912. 1,
  913. sizeof(cl_mem),
  914. (void *) &d_I);
  915. if (error != CL_SUCCESS)
  916. fatal_CL(error, __LINE__);
  917. //====================================================================================================100
  918. // launch kernel
  919. //====================================================================================================100
  920. error = clEnqueueNDRangeKernel( command_queue,
  921. compress_kernel,
  922. 1,
  923. NULL,
  924. global_work_size,
  925. local_work_size,
  926. 0,
  927. NULL,
  928. NULL);
  929. if (error != CL_SUCCESS)
  930. fatal_CL(error, __LINE__);
  931. //====================================================================================================100
  932. // synchronize
  933. //====================================================================================================100
  934. error = clFinish(command_queue);
  935. if (error != CL_SUCCESS)
  936. fatal_CL(error, __LINE__);
  937. //====================================================================================================100
  938. // End
  939. //====================================================================================================100
  940. //======================================================================================================================================================150
  941. // COPY RESULTS BACK TO CPU
  942. //======================================================================================================================================================150
  943. error = clEnqueueReadBuffer(command_queue,
  944. d_I,
  945. CL_TRUE,
  946. 0,
  947. mem_size,
  948. image,
  949. 0,
  950. NULL,
  951. NULL);
  952. if (error != CL_SUCCESS)
  953. fatal_CL(error, __LINE__);
  954. // int i;
  955. // for(i=0; i<100; i++){
  956. // printf("%f ", image[i]);
  957. // }
  958. //======================================================================================================================================================150
  959. // FREE MEMORY
  960. //======================================================================================================================================================150
  961. // OpenCL structures
  962. error = clReleaseKernel(extract_kernel);
  963. if (error != CL_SUCCESS)
  964. fatal_CL(error, __LINE__);
  965. error = clReleaseKernel(prepare_kernel);
  966. if (error != CL_SUCCESS)
  967. fatal_CL(error, __LINE__);
  968. error = clReleaseKernel(reduce_kernel);
  969. if (error != CL_SUCCESS)
  970. fatal_CL(error, __LINE__);
  971. error = clReleaseKernel(srad_kernel);
  972. if (error != CL_SUCCESS)
  973. fatal_CL(error, __LINE__);
  974. error = clReleaseKernel(srad2_kernel);
  975. if (error != CL_SUCCESS)
  976. fatal_CL(error, __LINE__);
  977. error = clReleaseKernel(compress_kernel);
  978. if (error != CL_SUCCESS)
  979. fatal_CL(error, __LINE__);
  980. error = clReleaseProgram(program);
  981. if (error != CL_SUCCESS)
  982. fatal_CL(error, __LINE__);
  983. // common_change
  984. error = clReleaseMemObject(d_I);
  985. if (error != CL_SUCCESS)
  986. fatal_CL(error, __LINE__);
  987. error = clReleaseMemObject(d_c);
  988. if (error != CL_SUCCESS)
  989. fatal_CL(error, __LINE__);
  990. error = clReleaseMemObject(d_iN);
  991. if (error != CL_SUCCESS)
  992. fatal_CL(error, __LINE__);
  993. error = clReleaseMemObject(d_iS);
  994. if (error != CL_SUCCESS)
  995. fatal_CL(error, __LINE__);
  996. error = clReleaseMemObject(d_jE);
  997. if (error != CL_SUCCESS)
  998. fatal_CL(error, __LINE__);
  999. error = clReleaseMemObject(d_jW);
  1000. if (error != CL_SUCCESS)
  1001. fatal_CL(error, __LINE__);
  1002. error = clReleaseMemObject(d_dN);
  1003. if (error != CL_SUCCESS)
  1004. fatal_CL(error, __LINE__);
  1005. error = clReleaseMemObject(d_dS);
  1006. if (error != CL_SUCCESS)
  1007. fatal_CL(error, __LINE__);
  1008. error = clReleaseMemObject(d_dE);
  1009. if (error != CL_SUCCESS)
  1010. fatal_CL(error, __LINE__);
  1011. error = clReleaseMemObject(d_dW);
  1012. if (error != CL_SUCCESS)
  1013. fatal_CL(error, __LINE__);
  1014. error = clReleaseMemObject(d_sums);
  1015. if (error != CL_SUCCESS)
  1016. fatal_CL(error, __LINE__);
  1017. error = clReleaseMemObject(d_sums2);
  1018. if (error != CL_SUCCESS)
  1019. fatal_CL(error, __LINE__);
  1020. // OpenCL structures
  1021. error = clFlush(command_queue);
  1022. if (error != CL_SUCCESS)
  1023. fatal_CL(error, __LINE__);
  1024. error = clReleaseCommandQueue(command_queue);
  1025. if (error != CL_SUCCESS)
  1026. fatal_CL(error, __LINE__);
  1027. error = clReleaseContext(context);
  1028. if (error != CL_SUCCESS)
  1029. fatal_CL(error, __LINE__);
  1030. //======================================================================================================================================================150
  1031. // End
  1032. //======================================================================================================================================================150
  1033. }
  1034. //========================================================================================================================================================================================================200
  1035. // End
  1036. //========================================================================================================================================================================================================200