kernel_gpu_opencl_wrapper.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193
  1. //========================================================================================================================================================================================================200
  2. // DEFINE/INCLUDE
  3. //========================================================================================================================================================================================================200
  4. //======================================================================================================================================================150
  5. // MAIN FUNCTION HEADER
  6. //======================================================================================================================================================150
  7. #include "./../main.h" // (in the main program folder)
  8. //======================================================================================================================================================150
  9. // DEFINE
  10. //======================================================================================================================================================150
  11. //======================================================================================================================================================150
  12. // LIBRARIES
  13. //======================================================================================================================================================150
  14. #include <stdio.h> // (in path known to compiler) needed by printf
  15. #include <string.h> // (in path known to compiler) needed by strlen
  16. #include <CL/cl.h> // (in path specified to compiler) needed by OpenCL types and functions
  17. //======================================================================================================================================================150
  18. // UTILITIES
  19. //======================================================================================================================================================150
  20. #include "./../util/opencl/opencl.h" // (in directory) needed by device functions
  21. //======================================================================================================================================================150
  22. // KERNEL_GPU_CUDA_WRAPPER FUNCTION HEADER
  23. //======================================================================================================================================================150
  24. #include "./kernel_gpu_opencl_wrapper.h" // (in directory)
  25. //======================================================================================================================================================150
  26. // END
  27. //======================================================================================================================================================150
  28. //========================================================================================================================================================================================================200
  29. // KERNEL_GPU_CUDA_WRAPPER FUNCTION
  30. //========================================================================================================================================================================================================200
  31. void
  32. kernel_gpu_opencl_wrapper(fp* image, // input image
  33. int Nr, // IMAGE nbr of rows
  34. int Nc, // IMAGE nbr of cols
  35. long Ne, // IMAGE nbr of elem
  36. int niter, // nbr of iterations
  37. fp lambda, // update step size
  38. long NeROI, // ROI nbr of elements
  39. int* iN,
  40. int* iS,
  41. int* jE,
  42. int* jW,
  43. int iter, // primary loop
  44. int mem_size_i,
  45. int mem_size_j,
  46. int platform_idx,
  47. int device_idx,
  48. int use_gpu)
  49. {
  50. //======================================================================================================================================================150
  51. // GPU SETUP
  52. //======================================================================================================================================================150
  53. //====================================================================================================100
  54. // COMMON VARIABLES
  55. //====================================================================================================100
  56. // common variables
  57. cl_int error;
  58. //====================================================================================================100
  59. // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
  60. //====================================================================================================100
  61. // Get the number of available platforms
  62. cl_uint num_platforms;
  63. error = clGetPlatformIDs( 0,
  64. NULL,
  65. &num_platforms);
  66. if (error != CL_SUCCESS)
  67. fatal_CL(error, __LINE__);
  68. // Get the list of available platforms
  69. cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  70. error = clGetPlatformIDs( num_platforms,
  71. platforms,
  72. NULL);
  73. if (error != CL_SUCCESS)
  74. fatal_CL(error, __LINE__);
  75. // Select the platform using the parameter
  76. cl_platform_id platform = platforms[platform_idx];
  77. // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
  78. char pbuf[100];
  79. error = clGetPlatformInfo( platform,
  80. CL_PLATFORM_VENDOR,
  81. sizeof(pbuf),
  82. pbuf,
  83. NULL);
  84. if (error != CL_SUCCESS)
  85. fatal_CL(error, __LINE__);
  86. printf("Platform: %s\n", pbuf);
  87. //====================================================================================================100
  88. // CREATE CONTEXT FOR THE PLATFORM
  89. //====================================================================================================100
  90. // Create context properties for selected platform
  91. cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
  92. (cl_context_properties) platform,
  93. 0};
  94. // Selector for the device type in accordance to what passed as parameter
  95. cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  96. // Create context for selected platform being GPU/CPU
  97. cl_context context;
  98. context = clCreateContextFromType( context_properties,
  99. device_type,
  100. NULL,
  101. NULL,
  102. &error);
  103. if (error != CL_SUCCESS)
  104. fatal_CL(error, __LINE__);
  105. //====================================================================================================100
  106. // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
  107. //====================================================================================================100
  108. // Get the number of devices (previousely selected for the context)
  109. size_t devices_size;
  110. error = clGetContextInfo( context,
  111. CL_CONTEXT_DEVICES,
  112. 0,
  113. NULL,
  114. &devices_size);
  115. if (error != CL_SUCCESS)
  116. fatal_CL(error, __LINE__);
  117. // Get the list of devices (previousely selected for the context)
  118. cl_device_id *devices = (cl_device_id *) malloc(devices_size);
  119. error = clGetContextInfo( context,
  120. CL_CONTEXT_DEVICES,
  121. devices_size,
  122. devices,
  123. NULL);
  124. if (error != CL_SUCCESS)
  125. fatal_CL(error, __LINE__);
  126. // Select the device in accordance to the paramter
  127. cl_device_id device;
  128. device = devices[device_idx];
  129. // Get the name of the selected device (previousely selected for the context) and print it
  130. error = clGetDeviceInfo(device,
  131. CL_DEVICE_NAME,
  132. sizeof(pbuf),
  133. pbuf,
  134. NULL);
  135. if (error != CL_SUCCESS)
  136. fatal_CL(error, __LINE__);
  137. printf("Device: %s\n", pbuf);
  138. //====================================================================================================100
  139. // CREATE COMMAND QUEUE FOR THE DEVICE
  140. //====================================================================================================100
  141. // Create a command queue
  142. cl_command_queue command_queue;
  143. command_queue = clCreateCommandQueue( context,
  144. device,
  145. 0,
  146. &error);
  147. if (error != CL_SUCCESS)
  148. fatal_CL(error, __LINE__);
  149. //====================================================================================================100
  150. // CREATE PROGRAM, COMPILE IT
  151. //====================================================================================================100
  152. // Load kernel source code from file
  153. const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
  154. size_t sourceSize = strlen(source);
  155. // Create the program
  156. cl_program program = clCreateProgramWithSource( context,
  157. 1,
  158. &source,
  159. &sourceSize,
  160. &error);
  161. if (error != CL_SUCCESS)
  162. fatal_CL(error, __LINE__);
  163. char clOptions[150];
  164. // sprintf(clOptions,"-I../../src");
  165. sprintf(clOptions,"-I.");
  166. #ifdef RD_WG_SIZE
  167. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
  168. #endif
  169. #ifdef RD_WG_SIZE_0
  170. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
  171. #endif
  172. #ifdef RD_WG_SIZE_0_0
  173. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
  174. #endif
  175. // Compile the program
  176. error = clBuildProgram( program,
  177. 1,
  178. &device,
  179. clOptions,
  180. NULL,
  181. NULL);
  182. // Print warnings and errors from compilation
  183. static char log[65536];
  184. memset(log, 0, sizeof(log));
  185. clGetProgramBuildInfo( program,
  186. device,
  187. CL_PROGRAM_BUILD_LOG,
  188. sizeof(log)-1,
  189. log,
  190. NULL);
  191. printf("-----OpenCL Compiler Output-----\n");
  192. if (strstr(log,"warning:") || strstr(log, "error:"))
  193. printf("<<<<\n%s\n>>>>\n", log);
  194. printf("--------------------------------\n");
  195. if (error != CL_SUCCESS)
  196. fatal_CL(error, __LINE__);
  197. //====================================================================================================100
  198. // CREATE Kernels
  199. //====================================================================================================100
  200. // Extract kernel
  201. cl_kernel extract_kernel;
  202. extract_kernel = clCreateKernel(program,
  203. "extract_kernel",
  204. &error);
  205. if (error != CL_SUCCESS)
  206. fatal_CL(error, __LINE__);
  207. // Prepare kernel
  208. cl_kernel prepare_kernel;
  209. prepare_kernel = clCreateKernel(program,
  210. "prepare_kernel",
  211. &error);
  212. if (error != CL_SUCCESS)
  213. fatal_CL(error, __LINE__);
  214. // Reduce kernel
  215. cl_kernel reduce_kernel;
  216. reduce_kernel = clCreateKernel( program,
  217. "reduce_kernel",
  218. &error);
  219. if (error != CL_SUCCESS)
  220. fatal_CL(error, __LINE__);
  221. // SRAD kernel
  222. cl_kernel srad_kernel;
  223. srad_kernel = clCreateKernel( program,
  224. "srad_kernel",
  225. &error);
  226. if (error != CL_SUCCESS)
  227. fatal_CL(error, __LINE__);
  228. // SRAD2 kernel
  229. cl_kernel srad2_kernel;
  230. srad2_kernel = clCreateKernel( program,
  231. "srad2_kernel",
  232. &error);
  233. if (error != CL_SUCCESS)
  234. fatal_CL(error, __LINE__);
  235. // Compress kernel
  236. cl_kernel compress_kernel;
  237. compress_kernel = clCreateKernel( program,
  238. "compress_kernel",
  239. &error);
  240. if (error != CL_SUCCESS)
  241. fatal_CL(error, __LINE__);
  242. //====================================================================================================100
  243. // TRIGGERING INITIAL DRIVER OVERHEAD
  244. //====================================================================================================100
  245. // cudaThreadSynchronize(); // the above does it
  246. //======================================================================================================================================================150
  247. // GPU VARIABLES
  248. //======================================================================================================================================================150
  249. // CUDA kernel execution parameters
  250. int blocks_x;
  251. //======================================================================================================================================================150
  252. // ALLOCATE MEMORY IN GPU
  253. //======================================================================================================================================================150
  254. //====================================================================================================100
  255. // common memory size
  256. //====================================================================================================100
  257. int mem_size; // matrix memory size
  258. mem_size = sizeof(fp) * Ne; // get the size of float representation of input IMAGE
  259. //====================================================================================================100
  260. // allocate memory for entire IMAGE on DEVICE
  261. //====================================================================================================100
  262. cl_mem d_I;
  263. d_I = clCreateBuffer( context,
  264. CL_MEM_READ_WRITE,
  265. mem_size,
  266. NULL,
  267. &error );
  268. if (error != CL_SUCCESS)
  269. fatal_CL(error, __LINE__);
  270. //====================================================================================================100
  271. // allocate memory for coordinates on DEVICE
  272. //====================================================================================================100
  273. cl_mem d_iN;
  274. d_iN = clCreateBuffer( context,
  275. CL_MEM_READ_WRITE,
  276. mem_size_i,
  277. NULL,
  278. &error );
  279. if (error != CL_SUCCESS)
  280. fatal_CL(error, __LINE__);
  281. cl_mem d_iS;
  282. d_iS = clCreateBuffer( context,
  283. CL_MEM_READ_WRITE,
  284. mem_size_i,
  285. NULL,
  286. &error );
  287. if (error != CL_SUCCESS)
  288. fatal_CL(error, __LINE__);
  289. cl_mem d_jE;
  290. d_jE = clCreateBuffer( context,
  291. CL_MEM_READ_WRITE,
  292. mem_size_j,
  293. NULL,
  294. &error );
  295. if (error != CL_SUCCESS)
  296. fatal_CL(error, __LINE__);
  297. cl_mem d_jW;
  298. d_jW = clCreateBuffer( context,
  299. CL_MEM_READ_WRITE,
  300. mem_size_j,
  301. NULL,
  302. &error );
  303. if (error != CL_SUCCESS)
  304. fatal_CL(error, __LINE__);
  305. //====================================================================================================100
  306. // allocate memory for derivatives
  307. //====================================================================================================100
  308. cl_mem d_dN;
  309. d_dN = clCreateBuffer( context,
  310. CL_MEM_READ_WRITE,
  311. mem_size,
  312. NULL,
  313. &error );
  314. if (error != CL_SUCCESS)
  315. fatal_CL(error, __LINE__);
  316. cl_mem d_dS;
  317. d_dS = clCreateBuffer( context,
  318. CL_MEM_READ_WRITE,
  319. mem_size,
  320. NULL,
  321. &error );
  322. if (error != CL_SUCCESS)
  323. fatal_CL(error, __LINE__);
  324. cl_mem d_dW;
  325. d_dW = clCreateBuffer( context,
  326. CL_MEM_READ_WRITE,
  327. mem_size,
  328. NULL,
  329. &error );
  330. if (error != CL_SUCCESS)
  331. fatal_CL(error, __LINE__);
  332. cl_mem d_dE;
  333. d_dE = clCreateBuffer( context,
  334. CL_MEM_READ_WRITE,
  335. mem_size,
  336. NULL,
  337. &error );
  338. if (error != CL_SUCCESS)
  339. fatal_CL(error, __LINE__);
  340. //====================================================================================================100
  341. // allocate memory for coefficient on DEVICE
  342. //====================================================================================================100
  343. cl_mem d_c;
  344. d_c = clCreateBuffer( context,
  345. CL_MEM_READ_WRITE,
  346. mem_size,
  347. NULL,
  348. &error );
  349. if (error != CL_SUCCESS)
  350. fatal_CL(error, __LINE__);
  351. //====================================================================================================100
  352. // allocate memory for partial sums on DEVICE
  353. //====================================================================================================100
  354. cl_mem d_sums;
  355. d_sums = clCreateBuffer( context,
  356. CL_MEM_READ_WRITE,
  357. mem_size,
  358. NULL,
  359. &error );
  360. if (error != CL_SUCCESS)
  361. fatal_CL(error, __LINE__);
  362. cl_mem d_sums2;
  363. d_sums2 = clCreateBuffer( context,
  364. CL_MEM_READ_WRITE,
  365. mem_size,
  366. NULL,
  367. &error );
  368. if (error != CL_SUCCESS)
  369. fatal_CL(error, __LINE__);
  370. //====================================================================================================100
  371. // End
  372. //====================================================================================================100
  373. //======================================================================================================================================================150
  374. // COPY INPUT TO CPU
  375. //======================================================================================================================================================150
  376. //====================================================================================================100
  377. // Image
  378. //====================================================================================================100
  379. error = clEnqueueWriteBuffer( command_queue,
  380. d_I,
  381. 1,
  382. 0,
  383. mem_size,
  384. image,
  385. 0,
  386. 0,
  387. 0);
  388. if (error != CL_SUCCESS)
  389. fatal_CL(error, __LINE__);
  390. //====================================================================================================100
  391. // coordinates
  392. //====================================================================================================100
  393. error = clEnqueueWriteBuffer( command_queue,
  394. d_iN,
  395. 1,
  396. 0,
  397. mem_size_i,
  398. iN,
  399. 0,
  400. 0,
  401. 0);
  402. if (error != CL_SUCCESS)
  403. fatal_CL(error, __LINE__);
  404. error = clEnqueueWriteBuffer( command_queue,
  405. d_iS,
  406. 1,
  407. 0,
  408. mem_size_i,
  409. iS,
  410. 0,
  411. 0,
  412. 0);
  413. if (error != CL_SUCCESS)
  414. fatal_CL(error, __LINE__);
  415. error = clEnqueueWriteBuffer( command_queue,
  416. d_jE,
  417. 1,
  418. 0,
  419. mem_size_j,
  420. jE,
  421. 0,
  422. 0,
  423. 0);
  424. if (error != CL_SUCCESS)
  425. fatal_CL(error, __LINE__);
  426. error = clEnqueueWriteBuffer( command_queue,
  427. d_jW,
  428. 1,
  429. 0,
  430. mem_size_j,
  431. jW,
  432. 0,
  433. 0,
  434. 0);
  435. if (error != CL_SUCCESS)
  436. fatal_CL(error, __LINE__);
  437. //====================================================================================================100
  438. // End
  439. //====================================================================================================100
  440. //======================================================================================================================================================150
  441. // KERNEL EXECUTION PARAMETERS
  442. //======================================================================================================================================================150
  443. // threads
  444. size_t local_work_size[1];
  445. local_work_size[0] = NUMBER_THREADS;
  446. // workgroups
  447. int blocks_work_size;
  448. size_t global_work_size[1];
  449. blocks_x = Ne/(int)local_work_size[0];
  450. if (Ne % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid
  451. blocks_x = blocks_x + 1;
  452. }
  453. blocks_work_size = blocks_x;
  454. global_work_size[0] = blocks_work_size * local_work_size[0]; // define the number of blocks in the grid
  455. printf("max # of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
  456. //======================================================================================================================================================150
  457. // Extract Kernel - SCALE IMAGE DOWN FROM 0-255 TO 0-1 AND EXTRACT
  458. //======================================================================================================================================================150
  459. //====================================================================================================100
  460. // set arguments
  461. //====================================================================================================100
  462. error = clSetKernelArg( extract_kernel,
  463. 0,
  464. sizeof(long)*2,
  465. (void *) &Ne);
  466. if (error != CL_SUCCESS)
  467. fatal_CL(error, __LINE__);
  468. error = clSetKernelArg( extract_kernel,
  469. 1,
  470. sizeof(cl_mem),
  471. (void *) &d_I);
  472. if (error != CL_SUCCESS)
  473. fatal_CL(error, __LINE__);
  474. //====================================================================================================100
  475. // launch kernel
  476. //====================================================================================================100
  477. error = clEnqueueNDRangeKernel( command_queue,
  478. extract_kernel,
  479. 1,
  480. NULL,
  481. global_work_size,
  482. local_work_size,
  483. 0,
  484. NULL,
  485. NULL);
  486. if (error != CL_SUCCESS)
  487. fatal_CL(error, __LINE__);
  488. //====================================================================================================100
  489. // Synchronization - wait for all operations in the command queue so far to finish
  490. //====================================================================================================100
  491. // error = clFinish(command_queue);
  492. // if (error != CL_SUCCESS)
  493. // fatal_CL(error, __LINE__);
  494. //====================================================================================================100
  495. // End
  496. //====================================================================================================100
  497. //======================================================================================================================================================150
  498. // WHAT IS CONSTANT IN COMPUTATION LOOP
  499. //======================================================================================================================================================150
  500. //====================================================================================================100
  501. // Prepare Kernel
  502. //====================================================================================================100
  503. error = clSetKernelArg( prepare_kernel,
  504. 0,
  505. sizeof(long)*2,
  506. (void *) &Ne);
  507. if (error != CL_SUCCESS)
  508. fatal_CL(error, __LINE__);
  509. error = clSetKernelArg( prepare_kernel,
  510. 1,
  511. sizeof(cl_mem),
  512. (void *) &d_I);
  513. if (error != CL_SUCCESS)
  514. fatal_CL(error, __LINE__);
  515. error = clSetKernelArg( prepare_kernel,
  516. 2,
  517. sizeof(cl_mem),
  518. (void *) &d_sums);
  519. if (error != CL_SUCCESS)
  520. fatal_CL(error, __LINE__);
  521. error = clSetKernelArg( prepare_kernel,
  522. 3,
  523. sizeof(cl_mem),
  524. (void *) &d_sums2);
  525. if (error != CL_SUCCESS)
  526. fatal_CL(error, __LINE__);
  527. //====================================================================================================100
  528. // Reduce Kernel
  529. //====================================================================================================100
  530. int blocks2_x;
  531. int blocks2_work_size;
  532. size_t global_work_size2[1];
  533. long no;
  534. int mul;
  535. int mem_size_single = sizeof(fp) * 1;
  536. fp total;
  537. fp total2;
  538. fp meanROI;
  539. fp meanROI2;
  540. fp varROI;
  541. fp q0sqr;
  542. error = clSetKernelArg( reduce_kernel,
  543. 0,
  544. sizeof(long)*2,
  545. (void *) &Ne);
  546. if (error != CL_SUCCESS)
  547. fatal_CL(error, __LINE__);
  548. error = clSetKernelArg( reduce_kernel,
  549. 3,
  550. sizeof(cl_mem),
  551. (void *) &d_sums);
  552. if (error != CL_SUCCESS)
  553. fatal_CL(error, __LINE__);
  554. error = clSetKernelArg( reduce_kernel,
  555. 4,
  556. sizeof(cl_mem),
  557. (void *) &d_sums2);
  558. if (error != CL_SUCCESS)
  559. fatal_CL(error, __LINE__);
  560. //====================================================================================================100
  561. // SRAD Kernel
  562. //====================================================================================================100
  563. error = clSetKernelArg( srad_kernel,
  564. 0,
  565. sizeof(fp),
  566. (void *) &lambda);
  567. if (error != CL_SUCCESS)
  568. fatal_CL(error, __LINE__);
  569. error = clSetKernelArg( srad_kernel,
  570. 1,
  571. sizeof(int),
  572. (void *) &Nr);
  573. if (error != CL_SUCCESS)
  574. fatal_CL(error, __LINE__);
  575. error = clSetKernelArg( srad_kernel,
  576. 2,
  577. sizeof(int),
  578. (void *) &Nc);
  579. if (error != CL_SUCCESS)
  580. fatal_CL(error, __LINE__);
  581. error = clSetKernelArg( srad_kernel,
  582. 3,
  583. sizeof(long)*2,
  584. (void *) &Ne);
  585. if (error != CL_SUCCESS)
  586. fatal_CL(error, __LINE__);
  587. error = clSetKernelArg( srad_kernel,
  588. 4,
  589. sizeof(cl_mem),
  590. (void *) &d_iN);
  591. if (error != CL_SUCCESS)
  592. fatal_CL(error, __LINE__);
  593. error = clSetKernelArg( srad_kernel,
  594. 5,
  595. sizeof(cl_mem),
  596. (void *) &d_iS);
  597. if (error != CL_SUCCESS)
  598. fatal_CL(error, __LINE__);
  599. error = clSetKernelArg( srad_kernel,
  600. 6,
  601. sizeof(cl_mem),
  602. (void *) &d_jE);
  603. if (error != CL_SUCCESS)
  604. fatal_CL(error, __LINE__);
  605. error = clSetKernelArg( srad_kernel,
  606. 7,
  607. sizeof(cl_mem),
  608. (void *) &d_jW);
  609. if (error != CL_SUCCESS)
  610. fatal_CL(error, __LINE__);
  611. error = clSetKernelArg( srad_kernel,
  612. 8,
  613. sizeof(cl_mem),
  614. (void *) &d_dN);
  615. if (error != CL_SUCCESS)
  616. fatal_CL(error, __LINE__);
  617. error = clSetKernelArg( srad_kernel,
  618. 9,
  619. sizeof(cl_mem),
  620. (void *) &d_dS);
  621. if (error != CL_SUCCESS)
  622. fatal_CL(error, __LINE__);
  623. error = clSetKernelArg( srad_kernel,
  624. 10,
  625. sizeof(cl_mem),
  626. (void *) &d_dW);
  627. if (error != CL_SUCCESS)
  628. fatal_CL(error, __LINE__);
  629. error = clSetKernelArg( srad_kernel,
  630. 11,
  631. sizeof(cl_mem),
  632. (void *) &d_dE);
  633. if (error != CL_SUCCESS)
  634. fatal_CL(error, __LINE__);
  635. error = clSetKernelArg( srad_kernel,
  636. 13,
  637. sizeof(cl_mem),
  638. (void *) &d_c);
  639. if (error != CL_SUCCESS)
  640. fatal_CL(error, __LINE__);
  641. error = clSetKernelArg( srad_kernel,
  642. 14,
  643. sizeof(cl_mem),
  644. (void *) &d_I);
  645. if (error != CL_SUCCESS)
  646. fatal_CL(error, __LINE__);
  647. //====================================================================================================100
  648. // SRAD2 Kernel
  649. //====================================================================================================100
  650. error = clSetKernelArg( srad2_kernel,
  651. 0,
  652. sizeof(fp),
  653. (void *) &lambda);
  654. if (error != CL_SUCCESS)
  655. fatal_CL(error, __LINE__);
  656. error = clSetKernelArg( srad2_kernel,
  657. 1,
  658. sizeof(int),
  659. (void *) &Nr);
  660. if (error != CL_SUCCESS)
  661. fatal_CL(error, __LINE__);
  662. error = clSetKernelArg( srad2_kernel,
  663. 2,
  664. sizeof(int),
  665. (void *) &Nc);
  666. if (error != CL_SUCCESS)
  667. fatal_CL(error, __LINE__);
  668. error = clSetKernelArg( srad2_kernel,
  669. 3,
  670. sizeof(long)*2,
  671. (void *) &Ne);
  672. if (error != CL_SUCCESS)
  673. fatal_CL(error, __LINE__);
  674. error = clSetKernelArg( srad2_kernel,
  675. 4,
  676. sizeof(cl_mem),
  677. (void *) &d_iN);
  678. if (error != CL_SUCCESS)
  679. fatal_CL(error, __LINE__);
  680. error = clSetKernelArg( srad2_kernel,
  681. 5,
  682. sizeof(cl_mem),
  683. (void *) &d_iS);
  684. if (error != CL_SUCCESS)
  685. fatal_CL(error, __LINE__);
  686. error = clSetKernelArg( srad2_kernel,
  687. 6,
  688. sizeof(cl_mem),
  689. (void *) &d_jE);
  690. if (error != CL_SUCCESS)
  691. fatal_CL(error, __LINE__);
  692. error = clSetKernelArg( srad2_kernel,
  693. 7,
  694. sizeof(cl_mem),
  695. (void *) &d_jW);
  696. if (error != CL_SUCCESS)
  697. fatal_CL(error, __LINE__);
  698. error = clSetKernelArg( srad2_kernel,
  699. 8,
  700. sizeof(cl_mem),
  701. (void *) &d_dN);
  702. if (error != CL_SUCCESS)
  703. fatal_CL(error, __LINE__);
  704. error = clSetKernelArg( srad2_kernel,
  705. 9,
  706. sizeof(cl_mem),
  707. (void *) &d_dS);
  708. if (error != CL_SUCCESS)
  709. fatal_CL(error, __LINE__);
  710. error = clSetKernelArg( srad2_kernel,
  711. 10,
  712. sizeof(cl_mem),
  713. (void *) &d_dW);
  714. if (error != CL_SUCCESS)
  715. fatal_CL(error, __LINE__);
  716. error = clSetKernelArg( srad2_kernel,
  717. 11,
  718. sizeof(cl_mem),
  719. (void *) &d_dE);
  720. if (error != CL_SUCCESS)
  721. fatal_CL(error, __LINE__);
  722. error = clSetKernelArg( srad2_kernel,
  723. 12,
  724. sizeof(cl_mem),
  725. (void *) &d_c);
  726. if (error != CL_SUCCESS)
  727. fatal_CL(error, __LINE__);
  728. error = clSetKernelArg( srad2_kernel,
  729. 13,
  730. sizeof(cl_mem),
  731. (void *) &d_I);
  732. if (error != CL_SUCCESS)
  733. fatal_CL(error, __LINE__);
  734. //====================================================================================================100
  735. // End
  736. //====================================================================================================100
  737. //======================================================================================================================================================150
  738. // COMPUTATION
  739. //======================================================================================================================================================150
  740. printf("Iterations Progress: ");
  741. // execute main loop
  742. for (iter=0; iter<niter; iter++){ // do for the number of iterations input parameter
  743. printf("%d ", iter);
  744. fflush(NULL);
  745. //====================================================================================================100
  746. // Prepare kernel
  747. //====================================================================================================100
  748. // launch kernel
  749. error = clEnqueueNDRangeKernel( command_queue,
  750. prepare_kernel,
  751. 1,
  752. NULL,
  753. global_work_size,
  754. local_work_size,
  755. 0,
  756. NULL,
  757. NULL);
  758. if (error != CL_SUCCESS)
  759. fatal_CL(error, __LINE__);
  760. // synchronize
  761. // error = clFinish(command_queue);
  762. // if (error != CL_SUCCESS)
  763. // fatal_CL(error, __LINE__);
  764. //====================================================================================================100
  765. // Reduce Kernel - performs subsequent reductions of sums
  766. //====================================================================================================100
  767. // initial values
  768. blocks2_work_size = blocks_work_size; // original number of blocks
  769. global_work_size2[0] = global_work_size[0];
  770. no = Ne; // original number of sum elements
  771. mul = 1; // original multiplier
  772. // loop
  773. while(blocks2_work_size != 0){
  774. // set arguments that were uptaded in this loop
  775. error = clSetKernelArg( reduce_kernel,
  776. 1,
  777. sizeof(long)*2,
  778. (void *) &no);
  779. if (error != CL_SUCCESS)
  780. fatal_CL(error, __LINE__);
  781. error = clSetKernelArg( reduce_kernel,
  782. 2,
  783. sizeof(int),
  784. (void *) &mul);
  785. if (error != CL_SUCCESS)
  786. fatal_CL(error, __LINE__);
  787. error = clSetKernelArg( reduce_kernel,
  788. 5,
  789. sizeof(int),
  790. (void *) &blocks2_work_size);
  791. if (error != CL_SUCCESS)
  792. fatal_CL(error, __LINE__);
  793. // launch kernel
  794. error = clEnqueueNDRangeKernel( command_queue,
  795. reduce_kernel,
  796. 1,
  797. NULL,
  798. global_work_size2,
  799. local_work_size,
  800. 0,
  801. NULL,
  802. NULL);
  803. if (error != CL_SUCCESS)
  804. fatal_CL(error, __LINE__);
  805. // synchronize
  806. // error = clFinish(command_queue);
  807. // if (error != CL_SUCCESS)
  808. // fatal_CL(error, __LINE__);
  809. // update execution parameters
  810. no = blocks2_work_size; // get current number of elements
  811. if(blocks2_work_size == 1){
  812. blocks2_work_size = 0;
  813. }
  814. else{
  815. mul = mul * NUMBER_THREADS; // update the increment
  816. blocks_x = blocks2_work_size/(int)local_work_size[0]; // number of blocks
  817. if (blocks2_work_size % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid
  818. blocks_x = blocks_x + 1;
  819. }
  820. blocks2_work_size = blocks_x;
  821. global_work_size2[0] = blocks2_work_size * (int)local_work_size[0];
  822. }
  823. }
  824. // copy total sums to device
  825. error = clEnqueueReadBuffer(command_queue,
  826. d_sums,
  827. CL_TRUE,
  828. 0,
  829. mem_size_single,
  830. &total,
  831. 0,
  832. NULL,
  833. NULL);
  834. if (error != CL_SUCCESS)
  835. fatal_CL(error, __LINE__);
  836. error = clEnqueueReadBuffer(command_queue,
  837. d_sums2,
  838. CL_TRUE,
  839. 0,
  840. mem_size_single,
  841. &total2,
  842. 0,
  843. NULL,
  844. NULL);
  845. if (error != CL_SUCCESS)
  846. fatal_CL(error, __LINE__);
  847. //====================================================================================================100
  848. // calculate statistics
  849. //====================================================================================================100
  850. meanROI = total / (fp)(NeROI); // gets mean (average) value of element in ROI
  851. meanROI2 = meanROI * meanROI; //
  852. varROI = (total2 / (fp)(NeROI)) - meanROI2; // gets variance of ROI
  853. q0sqr = varROI / meanROI2; // gets standard deviation of ROI
  854. //====================================================================================================100
  855. // execute srad kernel
  856. //====================================================================================================100
  857. // set arguments that were uptaded in this loop
  858. error = clSetKernelArg( srad_kernel,
  859. 12,
  860. sizeof(fp),
  861. (void *) &q0sqr);
  862. if (error != CL_SUCCESS)
  863. fatal_CL(error, __LINE__);
  864. // launch kernel
  865. error = clEnqueueNDRangeKernel( command_queue,
  866. srad_kernel,
  867. 1,
  868. NULL,
  869. global_work_size,
  870. local_work_size,
  871. 0,
  872. NULL,
  873. NULL);
  874. if (error != CL_SUCCESS)
  875. fatal_CL(error, __LINE__);
  876. // synchronize
  877. // error = clFinish(command_queue);
  878. // if (error != CL_SUCCESS)
  879. // fatal_CL(error, __LINE__);
  880. //====================================================================================================100
  881. // execute srad2 kernel
  882. //====================================================================================================100
  883. // launch kernel
  884. error = clEnqueueNDRangeKernel( command_queue,
  885. srad2_kernel,
  886. 1,
  887. NULL,
  888. global_work_size,
  889. local_work_size,
  890. 0,
  891. NULL,
  892. NULL);
  893. if (error != CL_SUCCESS)
  894. fatal_CL(error, __LINE__);
  895. // synchronize
  896. // error = clFinish(command_queue);
  897. // if (error != CL_SUCCESS)
  898. // fatal_CL(error, __LINE__);
  899. //====================================================================================================100
  900. // End
  901. //====================================================================================================100
  902. }
  903. printf("\n");
  904. //======================================================================================================================================================150
  905. // Compress Kernel - SCALE IMAGE UP FROM 0-1 TO 0-255 AND COMPRESS
  906. //======================================================================================================================================================150
  907. //====================================================================================================100
  908. // set parameters
  909. //====================================================================================================100
  910. error = clSetKernelArg( compress_kernel,
  911. 0,
  912. sizeof(long)*2,
  913. (void *) &Ne);
  914. if (error != CL_SUCCESS)
  915. fatal_CL(error, __LINE__);
  916. error = clSetKernelArg( compress_kernel,
  917. 1,
  918. sizeof(cl_mem),
  919. (void *) &d_I);
  920. if (error != CL_SUCCESS)
  921. fatal_CL(error, __LINE__);
  922. //====================================================================================================100
  923. // launch kernel
  924. //====================================================================================================100
  925. error = clEnqueueNDRangeKernel( command_queue,
  926. compress_kernel,
  927. 1,
  928. NULL,
  929. global_work_size,
  930. local_work_size,
  931. 0,
  932. NULL,
  933. NULL);
  934. if (error != CL_SUCCESS)
  935. fatal_CL(error, __LINE__);
  936. //====================================================================================================100
  937. // synchronize
  938. //====================================================================================================100
  939. error = clFinish(command_queue);
  940. if (error != CL_SUCCESS)
  941. fatal_CL(error, __LINE__);
  942. //====================================================================================================100
  943. // End
  944. //====================================================================================================100
  945. //======================================================================================================================================================150
  946. // COPY RESULTS BACK TO CPU
  947. //======================================================================================================================================================150
  948. error = clEnqueueReadBuffer(command_queue,
  949. d_I,
  950. CL_TRUE,
  951. 0,
  952. mem_size,
  953. image,
  954. 0,
  955. NULL,
  956. NULL);
  957. if (error != CL_SUCCESS)
  958. fatal_CL(error, __LINE__);
  959. // int i;
  960. // for(i=0; i<100; i++){
  961. // printf("%f ", image[i]);
  962. // }
  963. //======================================================================================================================================================150
  964. // FREE MEMORY
  965. //======================================================================================================================================================150
  966. // OpenCL structures
  967. error = clReleaseKernel(extract_kernel);
  968. if (error != CL_SUCCESS)
  969. fatal_CL(error, __LINE__);
  970. error = clReleaseKernel(prepare_kernel);
  971. if (error != CL_SUCCESS)
  972. fatal_CL(error, __LINE__);
  973. error = clReleaseKernel(reduce_kernel);
  974. if (error != CL_SUCCESS)
  975. fatal_CL(error, __LINE__);
  976. error = clReleaseKernel(srad_kernel);
  977. if (error != CL_SUCCESS)
  978. fatal_CL(error, __LINE__);
  979. error = clReleaseKernel(srad2_kernel);
  980. if (error != CL_SUCCESS)
  981. fatal_CL(error, __LINE__);
  982. error = clReleaseKernel(compress_kernel);
  983. if (error != CL_SUCCESS)
  984. fatal_CL(error, __LINE__);
  985. error = clReleaseProgram(program);
  986. if (error != CL_SUCCESS)
  987. fatal_CL(error, __LINE__);
  988. // common_change
  989. error = clReleaseMemObject(d_I);
  990. if (error != CL_SUCCESS)
  991. fatal_CL(error, __LINE__);
  992. error = clReleaseMemObject(d_c);
  993. if (error != CL_SUCCESS)
  994. fatal_CL(error, __LINE__);
  995. error = clReleaseMemObject(d_iN);
  996. if (error != CL_SUCCESS)
  997. fatal_CL(error, __LINE__);
  998. error = clReleaseMemObject(d_iS);
  999. if (error != CL_SUCCESS)
  1000. fatal_CL(error, __LINE__);
  1001. error = clReleaseMemObject(d_jE);
  1002. if (error != CL_SUCCESS)
  1003. fatal_CL(error, __LINE__);
  1004. error = clReleaseMemObject(d_jW);
  1005. if (error != CL_SUCCESS)
  1006. fatal_CL(error, __LINE__);
  1007. error = clReleaseMemObject(d_dN);
  1008. if (error != CL_SUCCESS)
  1009. fatal_CL(error, __LINE__);
  1010. error = clReleaseMemObject(d_dS);
  1011. if (error != CL_SUCCESS)
  1012. fatal_CL(error, __LINE__);
  1013. error = clReleaseMemObject(d_dE);
  1014. if (error != CL_SUCCESS)
  1015. fatal_CL(error, __LINE__);
  1016. error = clReleaseMemObject(d_dW);
  1017. if (error != CL_SUCCESS)
  1018. fatal_CL(error, __LINE__);
  1019. error = clReleaseMemObject(d_sums);
  1020. if (error != CL_SUCCESS)
  1021. fatal_CL(error, __LINE__);
  1022. error = clReleaseMemObject(d_sums2);
  1023. if (error != CL_SUCCESS)
  1024. fatal_CL(error, __LINE__);
  1025. // OpenCL structures
  1026. error = clFlush(command_queue);
  1027. if (error != CL_SUCCESS)
  1028. fatal_CL(error, __LINE__);
  1029. error = clReleaseCommandQueue(command_queue);
  1030. if (error != CL_SUCCESS)
  1031. fatal_CL(error, __LINE__);
  1032. error = clReleaseContext(context);
  1033. if (error != CL_SUCCESS)
  1034. fatal_CL(error, __LINE__);
  1035. //======================================================================================================================================================150
  1036. // End
  1037. //======================================================================================================================================================150
  1038. }
  1039. //========================================================================================================================================================================================================200
  1040. // End
  1041. //========================================================================================================================================================================================================200