main.cpp 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058
  1. #include <unistd.h>
  2. #include <error.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <fcntl.h>
  6. #include <errno.h>
  7. #include <string.h>
  8. #include <assert.h>
  9. #include <sys/time.h>
  10. #include <getopt.h>
  11. #include <iostream>
  12. #include <vector>
  13. #include <fstream>
  14. #include <sstream>
  15. #include <vector>
  16. #include "common.h"
  17. #include "components.h"
  18. #include "dwt.h"
  19. //using namespace std;
  20. #ifdef __APPLE__
  21. #include <OpenCL/cl.h>
  22. #else
  23. #include <CL/opencl.h>
  24. #endif
  25. #define THREADS 256
  26. struct dwt {
  27. char * srcFilename;
  28. char * outFilename;
  29. unsigned char *srcImg;
  30. int pixWidth;
  31. int pixHeight;
  32. int components;
  33. int dwtLvls;
  34. };
  35. cl_context context = 0;
  36. cl_command_queue commandQueue = 0;
  37. cl_program program = 0;
  38. cl_device_id cldevice = 0;
  39. cl_kernel kernel = 0;
  40. cl_kernel c_CopySrcToComponents = 0;
  41. cl_kernel c_CopySrcToComponent = 0;
  42. cl_kernel kl_fdwt53Kernel;
  43. cl_mem memObjects[3] = { 0, 0, 0 };
  44. cl_int errNum = 0;
  45. ///
  46. // functions for preparing create opencl program, contains CreateContext, CreateProgram, CreateCommandQueue, CreateMemBuffer, and Cleanup
  47. // Create an OpenCL context on the first available GPU platform.
  48. cl_context CreateContext(int platform_id, int use_gpu)
  49. {
  50. cl_context context = NULL;
  51. cl_uint platformIdCount = 0;
  52. cl_int errNum;
  53. cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  54. // get number of platforms
  55. clGetPlatformIDs (0, NULL, &platformIdCount);
  56. std::vector<cl_platform_id> platformIds(platformIdCount);
  57. clGetPlatformIDs (platformIdCount, platformIds.data(), NULL);
  58. // In this example, first platform is a CPU, the second one is a GPU. we just choose the first available device.
  59. cl_context_properties contextProperties[] =
  60. {
  61. CL_CONTEXT_PLATFORM,
  62. (cl_context_properties)platformIds[platform_id],
  63. 0
  64. };
  65. context = clCreateContextFromType(contextProperties, device_type,
  66. NULL, NULL, &errNum);
  67. if (errNum != CL_SUCCESS)
  68. {
  69. std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
  70. return NULL;
  71. }
  72. return context;
  73. }
  74. ///
  75. // Create a command queue on the first device available on the context
  76. //
  77. cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *cldevice, int device_id)
  78. {
  79. cl_int errNum;
  80. cl_device_id *cldevices;
  81. cl_command_queue commandQueue = NULL;
  82. size_t deviceBufferSize = -1;
  83. // First get the size of the devices buffer
  84. errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
  85. if (errNum != CL_SUCCESS)
  86. {
  87. std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
  88. return NULL;
  89. }
  90. if (deviceBufferSize <= 0)
  91. {
  92. std::cerr << "No devices available.";
  93. return NULL;
  94. }
  95. // Allocate memory for the devices buffer
  96. cldevices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
  97. errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, cldevices, NULL);
  98. if (errNum != CL_SUCCESS)
  99. {
  100. delete [] cldevices;
  101. std::cerr << "Failed to get device IDs";
  102. return NULL;
  103. }
  104. commandQueue = clCreateCommandQueue(context, cldevices[device_id], 0, NULL);
  105. if (commandQueue == NULL)
  106. {
  107. delete [] cldevices;
  108. std::cerr << "Failed to create commandQueue for device ";
  109. return NULL;
  110. }
  111. *cldevice = cldevices[device_id];
  112. delete [] cldevices;
  113. return commandQueue;
  114. }
  115. ///
  116. // Create an OpenCL program from the kernel source file
  117. //
  118. cl_program CreateProgram(cl_context context, cl_device_id cldevice, const char* fileName)
  119. {
  120. cl_int errNum;
  121. cl_program program;
  122. std::ifstream kernelFile(fileName, std::ios::in);
  123. if (!kernelFile.is_open())
  124. {
  125. std::cerr << "Failed to open file for reading: " << fileName << std::endl;
  126. return NULL;
  127. }
  128. std::ostringstream oss;
  129. oss << kernelFile.rdbuf();
  130. std::string srcStdStr = oss.str();
  131. const char *srcStr = srcStdStr.c_str();
  132. program = clCreateProgramWithSource(context, 1,
  133. (const char**)&srcStr,
  134. NULL, NULL);
  135. if (program == NULL)
  136. {
  137. std::cerr << "Failed to create CL program from source." << std::endl;
  138. return NULL;
  139. }
  140. errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  141. if (errNum != CL_SUCCESS)
  142. {
  143. // Determine the reason for the error
  144. char buildLog[16384];
  145. clGetProgramBuildInfo(program, cldevice, CL_PROGRAM_BUILD_LOG,
  146. sizeof(buildLog), buildLog, NULL);
  147. std::cerr << "Error in kernel: " << std::endl;
  148. std::cerr << buildLog;
  149. clReleaseProgram(program);
  150. return NULL;
  151. }
  152. return program;
  153. }
  154. ///
  155. // Cleanup any created OpenCL resources
  156. //
  157. void Cleanup(cl_context context, cl_command_queue commandQueue,
  158. cl_program program, cl_kernel kernel)
  159. {
  160. if (commandQueue != 0)
  161. clReleaseCommandQueue(commandQueue);
  162. if (kernel != 0)
  163. clReleaseKernel(kernel);
  164. if (program != 0)
  165. clReleaseProgram(program);
  166. if (context != 0)
  167. clReleaseContext(context);
  168. }
  169. ///
  170. // Load the input image.
  171. //
  172. int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
  173. {
  174. // printf("Loading ipnput: %s\n", srcFilename);
  175. char path[] = "../../data/dwt2d/";
  176. char *newSrc = NULL;
  177. if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
  178. {
  179. newSrc[0] = '\0';
  180. strcat(newSrc, path);
  181. strcat(newSrc, srcFilename);
  182. srcFilename= newSrc;
  183. }
  184. printf("Loading ipnput: %s\n", srcFilename);
  185. //read image
  186. int i = open(srcFilename, O_RDONLY, 0644);
  187. if (i == -1)
  188. {
  189. error(0,errno,"cannot access %s", srcFilename);
  190. return -1;
  191. }
  192. int ret = read(i, srcImg, inputSize);
  193. printf("precteno %d, inputsize %d\n", ret, inputSize);
  194. close(i);
  195. return 0;
  196. }
  197. ///
  198. //Show user how to use this program
  199. //
  200. void usage() {
  201. printf("dwt [otpions] src_img.rgb <out_img.dwt>\n\
  202. -d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n\
  203. -c, --components\t\tnumber of color components, default 3\n\
  204. -b, --depth\t\t\tbit depth, default 8\n\
  205. -l, --level\t\t\tDWT level, default 3\n\
  206. -D, --device\t\t\tcuda device\n\
  207. -f, --forward\t\t\tforward transform\n\
  208. -r, --reverse\t\t\treverse transform\n\
  209. -9, --97\t\t\t9/7 transform\n\
  210. -5, --53\t\t\t5/3 transform\n\
  211. -w --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n\
  212. -p --platform_id\t\t\tOCL platform id to use\n\
  213. -g --use_gpu\t\t\t1 to use gpu, 0 to use cpu\n\
  214. -i --device_id\t\t\tOCL devicde id to use\n");
  215. }
  216. ///
  217. // Check the type of error about opencl program
  218. //
  219. void fatal_CL(cl_int error, int line_no)
  220. {
  221. printf("At line %d: ", line_no);
  222. switch(error) {
  223. case CL_SUCCESS: printf("CL_SUCCESS\n"); break;
  224. case CL_DEVICE_NOT_FOUND: printf("CL_DEVICE_NOT_FOUND\n"); break;
  225. case CL_DEVICE_NOT_AVAILABLE: printf("CL_DEVICE_NOT_AVAILABLE\n"); break;
  226. case CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
  227. case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
  228. case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
  229. case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
  230. case CL_PROFILING_INFO_NOT_AVAILABLE: printf("CL_PROFILING_INFO_NOT_AVAILABLE\n"); break;
  231. case CL_MEM_COPY_OVERLAP: printf("CL_MEM_COPY_OVERLAP\n"); break;
  232. case CL_IMAGE_FORMAT_MISMATCH: printf("CL_IMAGE_FORMAT_MISMATCH\n"); break;
  233. case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("CL_IMAGE_FORMAT_NOT_SUPPORTED\n"); break;
  234. case CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
  235. case CL_MAP_FAILURE: printf("CL_MAP_FAILURE\n"); break;
  236. case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
  237. case CL_INVALID_DEVICE_TYPE: printf("CL_INVALID_DEVICE_TYPE\n"); break;
  238. case CL_INVALID_PLATFORM: printf("CL_INVALID_PLATFORM\n"); break;
  239. case CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
  240. case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
  241. case CL_INVALID_QUEUE_PROPERTIES: printf("CL_INVALID_QUEUE_PROPERTIES\n"); break;
  242. case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
  243. case CL_INVALID_HOST_PTR: printf("CL_INVALID_HOST_PTR\n"); break;
  244. case CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
  245. case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n"); break;
  246. case CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
  247. case CL_INVALID_SAMPLER: printf("CL_INVALID_SAMPLER\n"); break;
  248. case CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
  249. case CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
  250. case CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
  251. case CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
  252. case CL_INVALID_KERNEL_NAME: printf("CL_INVALID_KERNEL_NAME\n"); break;
  253. case CL_INVALID_KERNEL_DEFINITION: printf("CL_INVALID_KERNEL_DEFINITION\n"); break;
  254. case CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
  255. case CL_INVALID_ARG_INDEX: printf("CL_INVALID_ARG_INDEX\n"); break;
  256. case CL_INVALID_ARG_VALUE: printf("CL_INVALID_ARG_VALUE\n"); break;
  257. case CL_INVALID_ARG_SIZE: printf("CL_INVALID_ARG_SIZE\n"); break;
  258. case CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
  259. case CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
  260. case CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
  261. case CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
  262. case CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
  263. case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
  264. case CL_INVALID_EVENT: printf("CL_INVALID_EVENT\n"); break;
  265. case CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
  266. case CL_INVALID_GL_OBJECT: printf("CL_INVALID_GL_OBJECT\n"); break;
  267. case CL_INVALID_BUFFER_SIZE: printf("CL_INVALID_BUFFER_SIZE\n"); break;
  268. case CL_INVALID_MIP_LEVEL: printf("CL_INVALID_MIP_LEVEL\n"); break;
  269. case CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
  270. #ifdef CL_VERSION_1_1
  271. case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
  272. case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
  273. #endif
  274. default: printf("Invalid OpenCL error code\n");
  275. }
  276. }
  277. ///
  278. // Separate compoents of 8bit RGB source image
  279. //in file components.cu
  280. void rgbToComponents(cl_mem d_r, cl_mem d_g, cl_mem d_b, unsigned char * h_src, int width, int height)
  281. {
  282. int pixels = width * height;
  283. int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
  284. cl_mem cl_d_src;
  285. cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels*3, h_src, &errNum);
  286. // fatal_CL(errNum, __LINE__);
  287. size_t globalWorkSize[1] = { alignedSize/3};
  288. size_t localWorkSize[1] = { THREADS };
  289. errNum = clSetKernelArg(c_CopySrcToComponents, 0, sizeof(cl_mem), &d_r);
  290. errNum |= clSetKernelArg(c_CopySrcToComponents, 1, sizeof(cl_mem), &d_g);
  291. errNum |= clSetKernelArg(c_CopySrcToComponents, 2, sizeof(cl_mem), &d_b);
  292. errNum |= clSetKernelArg(c_CopySrcToComponents, 3, sizeof(cl_mem), &cl_d_src);
  293. errNum |= clSetKernelArg(c_CopySrcToComponents, 4, sizeof(int), &pixels);
  294. // fatal_CL(errNum, __LINE__);
  295. errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponents, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
  296. // fatal_CL(errNum, __LINE__);
  297. // Free Memory
  298. errNum = clReleaseMemObject(cl_d_src);
  299. // fatal_CL(errNum, __LINE__);
  300. }
  301. ///
  302. // Copy a 8bit source image data into a color compoment
  303. //in file components.cu
  304. void bwToComponent(cl_mem d_c, unsigned char * h_src, int width, int height)
  305. {
  306. cl_mem cl_d_src;
  307. int pixels = width*height;
  308. int alignedSize = DIVANDRND(pixels, THREADS) * THREADS;
  309. cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels, h_src, NULL);
  310. // fatal_CL(errNum, __LINE__);
  311. size_t globalWorkSize[1] = { alignedSize/9};
  312. size_t localWorkSize[1] = { THREADS };
  313. assert(alignedSize%(THREADS*3) == 0);
  314. errNum = clSetKernelArg(c_CopySrcToComponent, 0, sizeof(cl_mem), &d_c);
  315. errNum |= clSetKernelArg(c_CopySrcToComponent, 1, sizeof(cl_mem), &cl_d_src);
  316. errNum |= clSetKernelArg(c_CopySrcToComponent, 2, sizeof(int), &pixels);
  317. // fatal_CL(errNum, __LINE__);
  318. errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponent, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
  319. std::cout<<"in function bwToComponent errNum= "<<errNum<<"\n";
  320. // fatal_CL(errNum, __LINE__);
  321. std::cout<<"bwToComponent has finished\n";
  322. // Free Memory
  323. errNum = clReleaseMemObject(cl_d_src);
  324. // fatal_CL(errNum, __LINE__);
  325. }
  326. /// Only computes optimal number of sliding window steps, number of threadblocks and then lanches the 5/3 FDWT kernel.
  327. /// @tparam WIN_SX width of sliding window
  328. /// @tparam WIN_SY height of sliding window
  329. /// @param in input image
  330. /// @param out output buffer
  331. /// @param sx width of the input image
  332. /// @param sy height of the input image
  333. ///launchFDWT53Kerneld is in file
  334. void launchFDWT53Kernel (int WIN_SX, int WIN_SY, cl_mem in, cl_mem out, int sx, int sy)
  335. {
  336. // compute optimal number of steps of each sliding window
  337. // cuda_dwt called a function divRndUp from namespace cuda_gwt. this function takes n and d, "return (n / d) + ((n % d) ? 1 : 0);"
  338. //
  339. const int steps = ( sy/ (15 * WIN_SY)) + ((sy % (15 * WIN_SY)) ? 1 : 0);
  340. int gx = ( sx/ WIN_SX) + ((sx % WIN_SX) ? 1 : 0); //use function divRndUp(n, d){return (n / d) + ((n % d) ? 1 : 0);}
  341. int gy = ( sy/ (WIN_SY*steps)) + ((sy % (WIN_SY*steps)) ? 1 : 0);
  342. printf("sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
  343. // prepare grid size
  344. size_t globalWorkSize[2] = { gx*WIN_SX, gy*1};
  345. size_t localWorkSize[2] = { WIN_SX , 1};
  346. // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gx, gy, WIN_SX);
  347. errNum = clSetKernelArg(kl_fdwt53Kernel, 0, sizeof(cl_mem), &in);
  348. errNum |= clSetKernelArg(kl_fdwt53Kernel, 1, sizeof(cl_mem), &out);
  349. errNum |= clSetKernelArg(kl_fdwt53Kernel, 2, sizeof(int), &sx);
  350. errNum |= clSetKernelArg(kl_fdwt53Kernel, 3, sizeof(int), &sy);
  351. errNum |= clSetKernelArg(kl_fdwt53Kernel, 4, sizeof(int), &steps);
  352. errNum |= clSetKernelArg(kl_fdwt53Kernel, 5, sizeof(int), &WIN_SX);
  353. errNum |= clSetKernelArg(kl_fdwt53Kernel, 6, sizeof(int), &WIN_SY);
  354. // fatal_CL(errNum, __LINE__);
  355. errNum = clEnqueueNDRangeKernel(commandQueue, kl_fdwt53Kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
  356. // fatal_CL(errNum, __LINE__);
  357. printf("kl_fdwt53Kernel in launchFDW53Kernel has finished\n");
  358. }
  359. /// Simple cudaMemcpy wrapped in performance tester.
  360. /// param dest destination bufer
  361. /// param src source buffer
  362. /// param sx width of copied image
  363. /// param sy height of copied image
  364. ///from /cuda_gwt/common.h/namespace
  365. void memCopy (cl_mem dest, cl_mem src, const size_t sx, const size_t sy){
  366. errNum = clEnqueueCopyBuffer (commandQueue, src, dest, 0, 0, sx*sy*sizeof(int), 0, NULL, NULL);
  367. // fatal_CL (errNum, __LINE__);
  368. }
  369. /// Forward 5/3 2D DWT. See common rules (above) for more details.
  370. /// @param in Expected to be normalized into range [-128, 127].
  371. /// Will not be preserved (will be overwritten).
  372. /// @param out output buffer on GPU
  373. /// @param sizeX width of input image (in pixels)
  374. /// @param sizeY height of input image (in pixels)
  375. /// @param levels number of recursive DWT levels
  376. /// @backup use to test time
  377. //at the end of namespace dwt_cuda (line338)
  378. void fdwt53(cl_mem in, cl_mem out, int sizeX, int sizeY, int levels)
  379. {
  380. // select right width of kernel for the size of the image
  381. if(sizeX >= 960)
  382. {
  383. launchFDWT53Kernel(192, 8, in, out, sizeX, sizeY);
  384. }
  385. else if (sizeX >= 480)
  386. {
  387. launchFDWT53Kernel(128, 8, in, out, sizeX, sizeY);
  388. } else
  389. {
  390. launchFDWT53Kernel(64, 8, in, out, sizeX, sizeY);
  391. }
  392. // if this was not the last level, continue recursively with other levels
  393. if (levels > 1)
  394. {
  395. // copy output's LL band back into input buffer
  396. const int llSizeX = (sizeX / 2) + ((sizeX % 2) ? 1 :0);
  397. const int llSizeY = (sizeY / 2) + ((sizeY % 2) ? 1 :0);
  398. memCopy(in, out, llSizeX, llSizeY);
  399. // run remaining levels of FDWT
  400. fdwt53(in, out, llSizeX, llSizeY, levels - 1);
  401. }
  402. }
  403. ///
  404. // in dwt.cu
  405. int nStage2dDWT(cl_mem in, cl_mem out, cl_mem backup, int pixWidth, int pixHeight, int stages, bool forward)
  406. {
  407. printf("\n*** %d stages of 2D forward DWT:\n", stages);
  408. // create backup of input, because each test iteration overwrites it
  409. const int size = pixHeight * pixWidth * sizeof(int);
  410. // Measure time of individual levels.
  411. if (forward)
  412. fdwt53(in, out, pixWidth, pixHeight, stages );
  413. //else
  414. // rdwt(in, out, pixWidth, pixHeight, stages);
  415. // rdwt means rdwt53(can be found in file rdwt53.cu) which has not been defined
  416. return 0;
  417. }
  418. ///
  419. //in file dwt.cu
  420. void samplesToChar(unsigned char * dst, int * src, int samplesNum)
  421. {
  422. int i;
  423. for(i = 0; i < samplesNum; i++)
  424. {
  425. int r = src[i]+128;
  426. if (r > 255) r = 255;
  427. if (r < 0) r = 0;
  428. dst[i] = (unsigned char)r;
  429. }
  430. }
  431. ///
  432. //in file dwt.cu
  433. /// Write output linear orderd
  434. int writeLinear(cl_mem component, int pixWidth, int pixHeight, const char * filename, const char * suffix)
  435. {
  436. unsigned char * result;
  437. int *gpu_output;
  438. int i;
  439. int size;
  440. int samplesNum = pixWidth*pixHeight;
  441. size = samplesNum*sizeof(int);
  442. gpu_output = (int *)malloc(size);
  443. memset(gpu_output, 0, size);
  444. result = (unsigned char *)malloc(samplesNum);
  445. errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, gpu_output, 0, NULL, NULL);
  446. // fatal_CL(errNum, __LINE__);
  447. // T to char
  448. samplesToChar(result, gpu_output, samplesNum);
  449. // Write component
  450. char outfile[strlen(filename)+strlen(suffix)];
  451. strcpy(outfile, filename);
  452. strcpy(outfile+strlen(filename), suffix);
  453. i = open(outfile, O_CREAT|O_WRONLY, 0644);
  454. if (i == -1)
  455. {
  456. error(0,errno,"cannot access %s", outfile);
  457. return -1;
  458. }
  459. printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
  460. write(i, result, samplesNum);
  461. close(i);
  462. // Clean up
  463. free(gpu_output);
  464. free(result);
  465. return 0;
  466. }
  467. ///
  468. // Write output visual ordered
  469. //in file dwt.cu
  470. int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix)
  471. {
  472. struct band {
  473. int dimX;
  474. int dimY;
  475. };
  476. struct dimensions {
  477. struct band LL;
  478. struct band HL;
  479. struct band LH;
  480. struct band HH;
  481. };
  482. unsigned char * result;
  483. int *src;
  484. int *dst;
  485. int i,s;
  486. int size;
  487. int offset;
  488. int yOffset;
  489. int samplesNum = pixWidth*pixHeight;
  490. struct dimensions * bandDims;
  491. bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
  492. bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
  493. bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
  494. bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
  495. bandDims[0].HL.dimY = bandDims[0].LL.dimY;
  496. bandDims[0].LH.dimX = bandDims[0].LL.dimX;
  497. bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
  498. bandDims[0].HH.dimX = bandDims[0].HL.dimX;
  499. bandDims[0].HH.dimY = bandDims[0].LH.dimY;
  500. for (i = 1; i < stages; i++)
  501. {
  502. bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
  503. bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
  504. bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
  505. bandDims[i].HL.dimY = bandDims[i].LL.dimY;
  506. bandDims[i].LH.dimX = bandDims[i].LL.dimX;
  507. bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
  508. bandDims[i].HH.dimX = bandDims[i].HL.dimX;
  509. bandDims[i].HH.dimY = bandDims[i].LH.dimY;
  510. }
  511. #if 0
  512. printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
  513. for (i = 0; i < stages; i++)
  514. {
  515. printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
  516. printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
  517. printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
  518. printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
  519. }
  520. #endif
  521. size = samplesNum*sizeof(int);
  522. src = (int *)malloc(size);
  523. memset(src, 0, size);
  524. dst = (int *)malloc(size);
  525. memset(dst, 0, size);
  526. result = (unsigned char *)malloc(samplesNum);
  527. errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, src, 0, NULL, NULL);
  528. // fatal_CL(errNum, __LINE__);
  529. // LL Band
  530. size = bandDims[stages-1].LL.dimX * sizeof(int);
  531. for (i = 0; i < bandDims[stages-1].LL.dimY; i++)
  532. {
  533. memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
  534. }
  535. for (s = stages - 1; s >= 0; s--) {
  536. // HL Band
  537. size = bandDims[s].HL.dimX * sizeof(int);
  538. offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
  539. for (i = 0; i < bandDims[s].HL.dimY; i++)
  540. {
  541. memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
  542. src+offset+i*bandDims[s].HL.dimX,
  543. size);
  544. }
  545. // LH band
  546. size = bandDims[s].LH.dimX * sizeof(int);
  547. offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
  548. yOffset = bandDims[s].LL.dimY;
  549. for (i = 0; i < bandDims[s].HL.dimY; i++)
  550. {
  551. memcpy(dst+(yOffset+i)*pixWidth,
  552. src+offset+i*bandDims[s].LH.dimX,
  553. size);
  554. }
  555. //HH band
  556. size = bandDims[s].HH.dimX * sizeof(int);
  557. offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
  558. yOffset = bandDims[s].HL.dimY;
  559. for (i = 0; i < bandDims[s].HH.dimY; i++)
  560. {
  561. memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
  562. src+offset+i*bandDims[s].HH.dimX,
  563. size);
  564. }
  565. }
  566. // Write component
  567. samplesToChar(result, dst, samplesNum);
  568. char outfile[strlen(filename)+strlen(suffix)];
  569. strcpy(outfile, filename);
  570. strcpy(outfile+strlen(filename), suffix);
  571. i = open(outfile, O_CREAT|O_WRONLY, 0644);
  572. if (i == -1)
  573. {
  574. error(0,errno,"cannot access %s", outfile);
  575. return -1;
  576. }
  577. printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
  578. write(i, result, samplesNum);
  579. close(i);
  580. free(src);
  581. free(dst);
  582. free(result);
  583. free(bandDims);
  584. return 0;
  585. }
  586. ///
  587. // Process of DWT algorithm
  588. //
  589. template <typename T>
  590. void processDWT(struct dwt *d, int forward, int writeVisual)
  591. {
  592. int componentSize = d->pixWidth * d->pixHeight * sizeof(T);
  593. T *c_r_out, *c_g_out, *c_b_out, *backup, *c_r, *c_g, *c_b;
  594. // initialize to zeros
  595. T *temp = (T *)malloc(componentSize);
  596. memset(temp, 0, componentSize);
  597. cl_mem cl_c_r_out;
  598. cl_c_r_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  599. // fatal_CL(errNum, __LINE__);
  600. cl_mem cl_backup;
  601. cl_backup = clCreateBuffer(context, CL_MEM_READ_WRITE |CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  602. // fatal_CL(errNum, __LINE__);
  603. if (d->components == 3) {
  604. // Alloc two more buffers for G and B
  605. cl_mem cl_c_g_out;
  606. cl_c_g_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  607. // fatal_CL(errNum, __LINE__);
  608. cl_mem cl_c_b_out;
  609. cl_c_b_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  610. // fatal_CL(errNum, __LINE__);
  611. // Load components
  612. cl_mem cl_c_r, cl_c_g, cl_c_b;
  613. cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  614. // fatal_CL(errNum, __LINE__);
  615. cl_c_g = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  616. // fatal_CL(errNum, __LINE__);
  617. cl_c_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  618. // fatal_CL(errNum, __LINE__);
  619. rgbToComponents(cl_c_r, cl_c_g, cl_c_b, d->srcImg, d->pixWidth, d->pixHeight);
  620. //Compute DWT and always store int file
  621. nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  622. nStage2dDWT(cl_c_g, cl_c_g_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  623. nStage2dDWT(cl_c_b, cl_c_b_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  624. // ---------test----------
  625. /* T *h_r_out=(T*)malloc(componentSize);
  626. errNum = clEnqueueReadBuffer(commandQueue, cl_c_g_out, CL_TRUE, 0, componentSize, h_r_out, 0, NULL, NULL);
  627. fatal_CL(errNum, __LINE__);
  628. int ii;
  629. for(ii=0;ii<componentSize/sizeof(T);ii++) {
  630. fprintf(stderr, "%d ", (int)h_r_out[ii]);
  631. if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
  632. }
  633. */ // ---------test----------
  634. #ifdef OUTPUT
  635. // Store DWT to file
  636. if(writeVisual){
  637. writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
  638. writeNStage2DDWT(cl_c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
  639. writeNStage2DDWT(cl_c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
  640. } else {
  641. writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
  642. writeLinear(cl_c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
  643. writeLinear(cl_c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
  644. }
  645. #endif
  646. clReleaseMemObject(cl_c_r);
  647. clReleaseMemObject(cl_c_g);
  648. clReleaseMemObject(cl_c_b);
  649. clReleaseMemObject(cl_c_g_out);
  650. clReleaseMemObject(cl_c_b_out);
  651. } else if(d->components == 1) {
  652. // Load components
  653. cl_mem cl_c_r;
  654. cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  655. // fatal_CL(errNum, __LINE__);
  656. bwToComponent(cl_c_r, d->srcImg, d->pixWidth, d->pixHeight);
  657. // Compute DWT
  658. nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  659. //Store DWT to file
  660. if(writeVisual){
  661. writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
  662. } else {
  663. writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
  664. }
  665. clReleaseMemObject(cl_c_r);
  666. }
  667. free(temp);
  668. clReleaseMemObject(cl_c_r_out);
  669. }
  670. int main(int argc, char **argv)
  671. {
  672. int optindex = 0;
  673. signed char ch;
  674. struct option longopts[] =
  675. {
  676. {"dimension", required_argument, 0, 'd'}, //dimensions of src img
  677. {"components", required_argument, 0, 'c'}, //numger of components of src img
  678. {"depth", required_argument, 0, 'b'}, //bit depth of src img
  679. {"level", required_argument, 0, 'l'}, //level of dwt
  680. {"device", required_argument, 0, 'D'}, //cuda device
  681. {"forward", no_argument, 0, 'f'}, //forward transform
  682. {"reverse", no_argument, 0, 'r'}, //forward transform
  683. {"97", no_argument, 0, '9'}, //9/7 transform
  684. {"53", no_argument, 0, '5' }, //5/3transform
  685. {"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
  686. {"help", no_argument, 0, 'h'},
  687. {"platform_id", required_argument, 0, 'p'},
  688. {"use_gpu", required_argument, 0, 'g'},
  689. {"device_id", required_argument, 0, 'i'}
  690. };
  691. int pixWidth = 0; //<real pixWidth
  692. int pixHeight = 0; //<real pixHeight
  693. int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
  694. int bitDepth = 8;
  695. int dwtLvls = 3; //default numuber of DWT levels
  696. int device = 0;
  697. int forward = 1; //forward transform
  698. int dwt97 = 0; //1=dwt9/7, 0=dwt5/3 transform
  699. int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
  700. char * pos;
  701. int platform_id = 0;
  702. int device_id = 0;
  703. int use_gpu = 0;
  704. while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95whp:g:i:", longopts, &optindex)) != -1)
  705. {
  706. switch (ch) {
  707. case 'd':
  708. pixWidth = atoi(optarg);
  709. pos = strstr(optarg, "x");
  710. if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg)))
  711. {
  712. usage();
  713. return -1;
  714. }
  715. pixHeight = atoi(pos+1);
  716. break;
  717. case 'c':
  718. compCount = atoi(optarg);
  719. break;
  720. case 'b':
  721. bitDepth = atoi(optarg);
  722. break;
  723. case 'l':
  724. dwtLvls = atoi(optarg);
  725. break;
  726. case 'D':
  727. device = atoi(optarg);
  728. break;
  729. case 'f':
  730. forward = 1;
  731. break;
  732. case 'r':
  733. forward = 0;
  734. break;
  735. case '9':
  736. dwt97 = 1;
  737. break;
  738. case '5':
  739. dwt97 = 0;
  740. break;
  741. case 'w':
  742. writeVisual = 1;
  743. break;
  744. case 'p':
  745. platform_id = atoi(optarg);
  746. break;
  747. case 'g':
  748. use_gpu = atoi(optarg);
  749. break;
  750. case 'i':
  751. device_id = atoi(optarg);
  752. break;
  753. case 'h':
  754. usage();
  755. return 0;
  756. case '?':
  757. return -1;
  758. default :
  759. usage();
  760. return -1;
  761. }
  762. }
  763. argc -= optind;
  764. argv += optind;
  765. if (argc == 0)
  766. { // at least one filename is expected
  767. printf("Please supply src file name\n");
  768. usage();
  769. return -1;
  770. }
  771. if (pixWidth <= 0 || pixHeight <=0)
  772. {
  773. printf("Wrong or missing dimensions\n");
  774. usage();
  775. return -1;
  776. }
  777. if (forward == 0)
  778. {
  779. writeVisual = 0; //do not write visual when RDWT
  780. }
  781. //
  782. // device init
  783. // Create an OpenCL context on first available platform
  784. context = CreateContext(platform_id, use_gpu);
  785. if (context == NULL)
  786. {
  787. std::cerr << "Failed to create OpenCL context." << std::endl;
  788. return 1;
  789. }
  790. // Create a command-queue on the first device available
  791. // on the created context
  792. commandQueue = CreateCommandQueue(context, &cldevice, device_id);
  793. if (commandQueue == NULL)
  794. {
  795. Cleanup(context, commandQueue, program, kernel);
  796. return 1;
  797. }
  798. // Create OpenCL program from com_dwt.cl kernel source
  799. program = CreateProgram(context, cldevice, "com_dwt.cl");
  800. if (program == NULL)
  801. {
  802. printf("fail to create program!!\n");
  803. }
  804. // Create OpenCL kernel
  805. c_CopySrcToComponents = clCreateKernel(program, "c_CopySrcToComponents", NULL);
  806. if (c_CopySrcToComponents == NULL)
  807. {
  808. std::cerr << "Failed to create kernel" << std::endl;
  809. }
  810. c_CopySrcToComponent = clCreateKernel(program, "c_CopySrcToComponent", NULL);
  811. if (c_CopySrcToComponent == NULL)
  812. {
  813. std::cerr << "Failed to create kernel" << std::endl;
  814. }
  815. kl_fdwt53Kernel = clCreateKernel(program, "cl_fdwt53Kernel", NULL);
  816. if (kl_fdwt53Kernel == NULL)
  817. {
  818. std::cerr<<"Failed to create kernel\n";
  819. }
  820. //initialize struct dwt
  821. struct dwt *d;
  822. d = (struct dwt *)malloc(sizeof(struct dwt));
  823. d->srcImg = NULL;
  824. d->pixWidth = pixWidth;
  825. d->pixHeight = pixHeight;
  826. d->components = compCount;
  827. d->dwtLvls = dwtLvls;
  828. // file names
  829. d->srcFilename = (char *)malloc(strlen(argv[0]));
  830. strcpy(d->srcFilename, argv[0]);
  831. if (argc == 1)
  832. { // only one filename supplyed
  833. d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
  834. strcpy(d->outFilename, d->srcFilename);
  835. strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
  836. } else {
  837. d->outFilename = strdup(argv[1]);
  838. }
  839. //Input review
  840. printf("\nSource file:\t\t%s\n", d->srcFilename);
  841. printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
  842. printf(" Components count:\t%d\n", compCount);
  843. printf(" Bit depth:\t\t%d\n", bitDepth);
  844. printf(" DWT levels:\t\t%d\n", dwtLvls);
  845. printf(" Forward transform:\t%d\n", forward);
  846. printf(" 9/7 transform:\t\t%d\n", dwt97);
  847. //data sizes
  848. int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
  849. //load img source image
  850. d->srcImg = (unsigned char *) malloc (inputSize);
  851. if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
  852. return -1;
  853. // DWT
  854. // Create memory objects, Set arguments for kernel functions, Queue the kernel up for execution across the array, Read the output buffer back to the Host, Output the result buffer
  855. if (forward == 1)
  856. {
  857. if(dwt97 == 1 )
  858. processDWT<float>(d, forward, writeVisual);
  859. else // 5/3
  860. processDWT<int>(d, forward, writeVisual);
  861. }
  862. else
  863. { // reverse
  864. if(dwt97 == 1 )
  865. processDWT<float>(d, forward, writeVisual);
  866. else // 5/3
  867. processDWT<int>(d, forward, writeVisual);
  868. }
  869. Cleanup(context, commandQueue, program, kernel);
  870. clReleaseKernel(c_CopySrcToComponents);
  871. clReleaseKernel(c_CopySrcToComponent);
  872. return 0;
  873. }