main.cpp 34 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045
  1. #include <unistd.h>
  2. #include <error.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <fcntl.h>
  6. #include <errno.h>
  7. #include <string.h>
  8. #include <assert.h>
  9. #include <sys/time.h>
  10. #include <getopt.h>
  11. #include <iostream>
  12. #include <vector>
  13. #include <fstream>
  14. #include <sstream>
  15. #include <vector>
  16. #include "common.h"
  17. #include "components.h"
  18. #include "dwt.h"
  19. //using namespace std;
  20. #ifdef __APPLE__
  21. #include <OpenCL/cl.h>
  22. #else
  23. #include <CL/opencl.h>
  24. #endif
  25. #define THREADS 256
  26. struct dwt {
  27. char * srcFilename;
  28. char * outFilename;
  29. unsigned char *srcImg;
  30. int pixWidth;
  31. int pixHeight;
  32. int components;
  33. int dwtLvls;
  34. };
  35. cl_context context = 0;
  36. cl_command_queue commandQueue = 0;
  37. cl_program program = 0;
  38. cl_device_id cldevice = 0;
  39. cl_kernel kernel = 0;
  40. cl_kernel c_CopySrcToComponents = 0;
  41. cl_kernel c_CopySrcToComponent = 0;
  42. cl_kernel kl_fdwt53Kernel;
  43. cl_mem memObjects[3] = { 0, 0, 0 };
  44. cl_int errNum = 0;
  45. ///
  46. // functions for preparing create opencl program, contains CreateContext, CreateProgram, CreateCommandQueue, CreateMemBuffer, and Cleanup
  47. // Create an OpenCL context on the first available GPU platform.
  48. cl_context CreateContext()
  49. {
  50. cl_context context = NULL;
  51. cl_uint platformIdCount = 0;
  52. cl_int errNum;
  53. // get number of platforms
  54. clGetPlatformIDs (0, NULL, &platformIdCount);
  55. std::vector<cl_platform_id> platformIds(platformIdCount);
  56. clGetPlatformIDs (platformIdCount, platformIds.data(), NULL);
  57. // In this example, first platform is a CPU, the second one is a GPU. we just choose the first available device.
  58. cl_context_properties contextProperties[] =
  59. {
  60. CL_CONTEXT_PLATFORM,
  61. (cl_context_properties)platformIds[1],
  62. 0
  63. };
  64. context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
  65. NULL, NULL, &errNum);
  66. if (errNum != CL_SUCCESS)
  67. {
  68. std::cout << "Could not create GPU context, trying CPU..." << std::endl;
  69. context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
  70. NULL, NULL, &errNum);
  71. if (errNum != CL_SUCCESS)
  72. {
  73. std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
  74. return NULL;
  75. }
  76. }
  77. return context;
  78. }
  79. ///
  80. // Create a command queue on the first device available on the context
  81. //
  82. cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *cldevice)
  83. {
  84. cl_int errNum;
  85. cl_device_id *cldevices;
  86. cl_command_queue commandQueue = NULL;
  87. size_t deviceBufferSize = -1;
  88. // First get the size of the devices buffer
  89. errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
  90. if (errNum != CL_SUCCESS)
  91. {
  92. std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
  93. return NULL;
  94. }
  95. if (deviceBufferSize <= 0)
  96. {
  97. std::cerr << "No devices available.";
  98. return NULL;
  99. }
  100. // Allocate memory for the devices buffer
  101. cldevices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
  102. errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, cldevices, NULL);
  103. if (errNum != CL_SUCCESS)
  104. {
  105. delete [] cldevices;
  106. std::cerr << "Failed to get device IDs";
  107. return NULL;
  108. }
  109. commandQueue = clCreateCommandQueue(context, cldevices[0], 0, NULL);
  110. if (commandQueue == NULL)
  111. {
  112. delete [] cldevices;
  113. std::cerr << "Failed to create commandQueue for device ";
  114. return NULL;
  115. }
  116. *cldevice = cldevices[0];
  117. delete [] cldevices;
  118. return commandQueue;
  119. }
  120. ///
  121. // Create an OpenCL program from the kernel source file
  122. //
  123. cl_program CreateProgram(cl_context context, cl_device_id cldevice, const char* fileName)
  124. {
  125. cl_int errNum;
  126. cl_program program;
  127. std::ifstream kernelFile(fileName, std::ios::in);
  128. if (!kernelFile.is_open())
  129. {
  130. std::cerr << "Failed to open file for reading: " << fileName << std::endl;
  131. return NULL;
  132. }
  133. std::ostringstream oss;
  134. oss << kernelFile.rdbuf();
  135. std::string srcStdStr = oss.str();
  136. const char *srcStr = srcStdStr.c_str();
  137. program = clCreateProgramWithSource(context, 1,
  138. (const char**)&srcStr,
  139. NULL, NULL);
  140. if (program == NULL)
  141. {
  142. std::cerr << "Failed to create CL program from source." << std::endl;
  143. return NULL;
  144. }
  145. errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  146. if (errNum != CL_SUCCESS)
  147. {
  148. // Determine the reason for the error
  149. char buildLog[16384];
  150. clGetProgramBuildInfo(program, cldevice, CL_PROGRAM_BUILD_LOG,
  151. sizeof(buildLog), buildLog, NULL);
  152. std::cerr << "Error in kernel: " << std::endl;
  153. std::cerr << buildLog;
  154. clReleaseProgram(program);
  155. return NULL;
  156. }
  157. return program;
  158. }
  159. ///
  160. // Cleanup any created OpenCL resources
  161. //
  162. void Cleanup(cl_context context, cl_command_queue commandQueue,
  163. cl_program program, cl_kernel kernel)
  164. {
  165. if (commandQueue != 0)
  166. clReleaseCommandQueue(commandQueue);
  167. if (kernel != 0)
  168. clReleaseKernel(kernel);
  169. if (program != 0)
  170. clReleaseProgram(program);
  171. if (context != 0)
  172. clReleaseContext(context);
  173. }
  174. ///
  175. // Load the input image.
  176. //
  177. int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
  178. {
  179. // printf("Loading ipnput: %s\n", srcFilename);
  180. char path[] = "../../data/dwt2d/";
  181. char *newSrc = NULL;
  182. if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
  183. {
  184. newSrc[0] = '\0';
  185. strcat(newSrc, path);
  186. strcat(newSrc, srcFilename);
  187. srcFilename= newSrc;
  188. }
  189. printf("Loading ipnput: %s\n", srcFilename);
  190. //read image
  191. int i = open(srcFilename, O_RDONLY, 0644);
  192. if (i == -1)
  193. {
  194. error(0,errno,"cannot access %s", srcFilename);
  195. return -1;
  196. }
  197. int ret = read(i, srcImg, inputSize);
  198. printf("precteno %d, inputsize %d\n", ret, inputSize);
  199. close(i);
  200. return 0;
  201. }
  202. ///
  203. //Show user how to use this program
  204. //
  205. void usage() {
  206. printf("dwt [otpions] src_img.rgb <out_img.dwt>\n\
  207. -d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n\
  208. -c, --components\t\tnumber of color components, default 3\n\
  209. -b, --depth\t\t\tbit depth, default 8\n\
  210. -l, --level\t\t\tDWT level, default 3\n\
  211. -D, --device\t\t\tcuda device\n\
  212. -f, --forward\t\t\tforward transform\n\
  213. -r, --reverse\t\t\treverse transform\n\
  214. -9, --97\t\t\t9/7 transform\n\
  215. -5, --53\t\t\t5/3 transform\n\
  216. -w --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n");
  217. }
  218. ///
  219. // Check the type of error about opencl program
  220. //
  221. void fatal_CL(cl_int error, int line_no)
  222. {
  223. printf("At line %d: ", line_no);
  224. switch(error) {
  225. case CL_SUCCESS: printf("CL_SUCCESS\n"); break;
  226. case CL_DEVICE_NOT_FOUND: printf("CL_DEVICE_NOT_FOUND\n"); break;
  227. case CL_DEVICE_NOT_AVAILABLE: printf("CL_DEVICE_NOT_AVAILABLE\n"); break;
  228. case CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
  229. case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
  230. case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
  231. case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
  232. case CL_PROFILING_INFO_NOT_AVAILABLE: printf("CL_PROFILING_INFO_NOT_AVAILABLE\n"); break;
  233. case CL_MEM_COPY_OVERLAP: printf("CL_MEM_COPY_OVERLAP\n"); break;
  234. case CL_IMAGE_FORMAT_MISMATCH: printf("CL_IMAGE_FORMAT_MISMATCH\n"); break;
  235. case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("CL_IMAGE_FORMAT_NOT_SUPPORTED\n"); break;
  236. case CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
  237. case CL_MAP_FAILURE: printf("CL_MAP_FAILURE\n"); break;
  238. case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
  239. case CL_INVALID_DEVICE_TYPE: printf("CL_INVALID_DEVICE_TYPE\n"); break;
  240. case CL_INVALID_PLATFORM: printf("CL_INVALID_PLATFORM\n"); break;
  241. case CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
  242. case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
  243. case CL_INVALID_QUEUE_PROPERTIES: printf("CL_INVALID_QUEUE_PROPERTIES\n"); break;
  244. case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
  245. case CL_INVALID_HOST_PTR: printf("CL_INVALID_HOST_PTR\n"); break;
  246. case CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
  247. case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n"); break;
  248. case CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
  249. case CL_INVALID_SAMPLER: printf("CL_INVALID_SAMPLER\n"); break;
  250. case CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
  251. case CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
  252. case CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
  253. case CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
  254. case CL_INVALID_KERNEL_NAME: printf("CL_INVALID_KERNEL_NAME\n"); break;
  255. case CL_INVALID_KERNEL_DEFINITION: printf("CL_INVALID_KERNEL_DEFINITION\n"); break;
  256. case CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
  257. case CL_INVALID_ARG_INDEX: printf("CL_INVALID_ARG_INDEX\n"); break;
  258. case CL_INVALID_ARG_VALUE: printf("CL_INVALID_ARG_VALUE\n"); break;
  259. case CL_INVALID_ARG_SIZE: printf("CL_INVALID_ARG_SIZE\n"); break;
  260. case CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
  261. case CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
  262. case CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
  263. case CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
  264. case CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
  265. case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
  266. case CL_INVALID_EVENT: printf("CL_INVALID_EVENT\n"); break;
  267. case CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
  268. case CL_INVALID_GL_OBJECT: printf("CL_INVALID_GL_OBJECT\n"); break;
  269. case CL_INVALID_BUFFER_SIZE: printf("CL_INVALID_BUFFER_SIZE\n"); break;
  270. case CL_INVALID_MIP_LEVEL: printf("CL_INVALID_MIP_LEVEL\n"); break;
  271. case CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
  272. #ifdef CL_VERSION_1_1
  273. case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
  274. case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
  275. #endif
  276. default: printf("Invalid OpenCL error code\n");
  277. }
  278. }
  279. ///
  280. // Separate compoents of 8bit RGB source image
  281. //in file components.cu
  282. void rgbToComponents(cl_mem d_r, cl_mem d_g, cl_mem d_b, unsigned char * h_src, int width, int height)
  283. {
  284. int pixels = width * height;
  285. int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
  286. cl_mem cl_d_src;
  287. cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels*3, h_src, &errNum);
  288. // fatal_CL(errNum, __LINE__);
  289. size_t globalWorkSize[1] = { alignedSize/3};
  290. size_t localWorkSize[1] = { THREADS };
  291. errNum = clSetKernelArg(c_CopySrcToComponents, 0, sizeof(cl_mem), &d_r);
  292. errNum |= clSetKernelArg(c_CopySrcToComponents, 1, sizeof(cl_mem), &d_g);
  293. errNum |= clSetKernelArg(c_CopySrcToComponents, 2, sizeof(cl_mem), &d_b);
  294. errNum |= clSetKernelArg(c_CopySrcToComponents, 3, sizeof(cl_mem), &cl_d_src);
  295. errNum |= clSetKernelArg(c_CopySrcToComponents, 4, sizeof(int), &pixels);
  296. // fatal_CL(errNum, __LINE__);
  297. errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponents, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
  298. // fatal_CL(errNum, __LINE__);
  299. // Free Memory
  300. errNum = clReleaseMemObject(cl_d_src);
  301. // fatal_CL(errNum, __LINE__);
  302. }
  303. ///
  304. // Copy a 8bit source image data into a color compoment
  305. //in file components.cu
  306. void bwToComponent(cl_mem d_c, unsigned char * h_src, int width, int height)
  307. {
  308. cl_mem cl_d_src;
  309. int pixels = width*height;
  310. int alignedSize = DIVANDRND(pixels, THREADS) * THREADS;
  311. cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels, h_src, NULL);
  312. // fatal_CL(errNum, __LINE__);
  313. size_t globalWorkSize[1] = { alignedSize/9};
  314. size_t localWorkSize[1] = { THREADS };
  315. assert(alignedSize%(THREADS*3) == 0);
  316. errNum = clSetKernelArg(c_CopySrcToComponent, 0, sizeof(cl_mem), &d_c);
  317. errNum |= clSetKernelArg(c_CopySrcToComponent, 1, sizeof(cl_mem), &cl_d_src);
  318. errNum |= clSetKernelArg(c_CopySrcToComponent, 2, sizeof(int), &pixels);
  319. // fatal_CL(errNum, __LINE__);
  320. errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponent, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
  321. std::cout<<"in function bwToComponent errNum= "<<errNum<<"\n";
  322. // fatal_CL(errNum, __LINE__);
  323. std::cout<<"bwToComponent has finished\n";
  324. // Free Memory
  325. errNum = clReleaseMemObject(cl_d_src);
  326. // fatal_CL(errNum, __LINE__);
  327. }
  328. /// Only computes optimal number of sliding window steps, number of threadblocks and then lanches the 5/3 FDWT kernel.
  329. /// @tparam WIN_SX width of sliding window
  330. /// @tparam WIN_SY height of sliding window
  331. /// @param in input image
  332. /// @param out output buffer
  333. /// @param sx width of the input image
  334. /// @param sy height of the input image
  335. ///launchFDWT53Kerneld is in file
  336. void launchFDWT53Kernel (int WIN_SX, int WIN_SY, cl_mem in, cl_mem out, int sx, int sy)
  337. {
  338. // compute optimal number of steps of each sliding window
  339. // cuda_dwt called a function divRndUp from namespace cuda_gwt. this function takes n and d, "return (n / d) + ((n % d) ? 1 : 0);"
  340. //
  341. const int steps = ( sy/ (15 * WIN_SY)) + ((sy % (15 * WIN_SY)) ? 1 : 0);
  342. int gx = ( sx/ WIN_SX) + ((sx % WIN_SX) ? 1 : 0); //use function divRndUp(n, d){return (n / d) + ((n % d) ? 1 : 0);}
  343. int gy = ( sy/ (WIN_SY*steps)) + ((sy % (WIN_SY*steps)) ? 1 : 0);
  344. printf("sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
  345. // prepare grid size
  346. size_t globalWorkSize[2] = { gx*WIN_SX, gy*1};
  347. size_t localWorkSize[2] = { WIN_SX , 1};
  348. // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gx, gy, WIN_SX);
  349. errNum = clSetKernelArg(kl_fdwt53Kernel, 0, sizeof(cl_mem), &in);
  350. errNum |= clSetKernelArg(kl_fdwt53Kernel, 1, sizeof(cl_mem), &out);
  351. errNum |= clSetKernelArg(kl_fdwt53Kernel, 2, sizeof(int), &sx);
  352. errNum |= clSetKernelArg(kl_fdwt53Kernel, 3, sizeof(int), &sy);
  353. errNum |= clSetKernelArg(kl_fdwt53Kernel, 4, sizeof(int), &steps);
  354. errNum |= clSetKernelArg(kl_fdwt53Kernel, 5, sizeof(int), &WIN_SX);
  355. errNum |= clSetKernelArg(kl_fdwt53Kernel, 6, sizeof(int), &WIN_SY);
  356. // fatal_CL(errNum, __LINE__);
  357. errNum = clEnqueueNDRangeKernel(commandQueue, kl_fdwt53Kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
  358. // fatal_CL(errNum, __LINE__);
  359. printf("kl_fdwt53Kernel in launchFDW53Kernel has finished\n");
  360. }
  361. /// Simple cudaMemcpy wrapped in performance tester.
  362. /// param dest destination bufer
  363. /// param src source buffer
  364. /// param sx width of copied image
  365. /// param sy height of copied image
  366. ///from /cuda_gwt/common.h/namespace
  367. void memCopy (cl_mem dest, cl_mem src, const size_t sx, const size_t sy){
  368. errNum = clEnqueueCopyBuffer (commandQueue, src, dest, 0, 0, sx*sy*sizeof(int), 0, NULL, NULL);
  369. // fatal_CL (errNum, __LINE__);
  370. }
  371. /// Forward 5/3 2D DWT. See common rules (above) for more details.
  372. /// @param in Expected to be normalized into range [-128, 127].
  373. /// Will not be preserved (will be overwritten).
  374. /// @param out output buffer on GPU
  375. /// @param sizeX width of input image (in pixels)
  376. /// @param sizeY height of input image (in pixels)
  377. /// @param levels number of recursive DWT levels
  378. /// @backup use to test time
  379. //at the end of namespace dwt_cuda (line338)
  380. void fdwt53(cl_mem in, cl_mem out, int sizeX, int sizeY, int levels)
  381. {
  382. // select right width of kernel for the size of the image
  383. if(sizeX >= 960)
  384. {
  385. launchFDWT53Kernel(192, 8, in, out, sizeX, sizeY);
  386. }
  387. else if (sizeX >= 480)
  388. {
  389. launchFDWT53Kernel(128, 8, in, out, sizeX, sizeY);
  390. } else
  391. {
  392. launchFDWT53Kernel(64, 8, in, out, sizeX, sizeY);
  393. }
  394. // if this was not the last level, continue recursively with other levels
  395. if (levels > 1)
  396. {
  397. // copy output's LL band back into input buffer
  398. const int llSizeX = (sizeX / 2) + ((sizeX % 2) ? 1 :0);
  399. const int llSizeY = (sizeY / 2) + ((sizeY % 2) ? 1 :0);
  400. memCopy(in, out, llSizeX, llSizeY);
  401. // run remaining levels of FDWT
  402. fdwt53(in, out, llSizeX, llSizeY, levels - 1);
  403. }
  404. }
  405. ///
  406. // in dwt.cu
  407. int nStage2dDWT(cl_mem in, cl_mem out, cl_mem backup, int pixWidth, int pixHeight, int stages, bool forward)
  408. {
  409. printf("\n*** %d stages of 2D forward DWT:\n", stages);
  410. // create backup of input, because each test iteration overwrites it
  411. const int size = pixHeight * pixWidth * sizeof(int);
  412. // Measure time of individual levels.
  413. if (forward)
  414. fdwt53(in, out, pixWidth, pixHeight, stages );
  415. //else
  416. // rdwt(in, out, pixWidth, pixHeight, stages);
  417. // rdwt means rdwt53(can be found in file rdwt53.cu) which has not been defined
  418. return 0;
  419. }
  420. ///
  421. //in file dwt.cu
  422. void samplesToChar(unsigned char * dst, int * src, int samplesNum)
  423. {
  424. int i;
  425. for(i = 0; i < samplesNum; i++)
  426. {
  427. int r = src[i]+128;
  428. if (r > 255) r = 255;
  429. if (r < 0) r = 0;
  430. dst[i] = (unsigned char)r;
  431. }
  432. }
  433. ///
  434. //in file dwt.cu
  435. /// Write output linear orderd
  436. int writeLinear(cl_mem component, int pixWidth, int pixHeight, const char * filename, const char * suffix)
  437. {
  438. unsigned char * result;
  439. int *gpu_output;
  440. int i;
  441. int size;
  442. int samplesNum = pixWidth*pixHeight;
  443. size = samplesNum*sizeof(int);
  444. gpu_output = (int *)malloc(size);
  445. memset(gpu_output, 0, size);
  446. result = (unsigned char *)malloc(samplesNum);
  447. errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, gpu_output, 0, NULL, NULL);
  448. // fatal_CL(errNum, __LINE__);
  449. // T to char
  450. samplesToChar(result, gpu_output, samplesNum);
  451. // Write component
  452. char outfile[strlen(filename)+strlen(suffix)];
  453. strcpy(outfile, filename);
  454. strcpy(outfile+strlen(filename), suffix);
  455. i = open(outfile, O_CREAT|O_WRONLY, 0644);
  456. if (i == -1)
  457. {
  458. error(0,errno,"cannot access %s", outfile);
  459. return -1;
  460. }
  461. printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
  462. write(i, result, samplesNum);
  463. close(i);
  464. // Clean up
  465. free(gpu_output);
  466. free(result);
  467. return 0;
  468. }
  469. ///
  470. // Write output visual ordered
  471. //in file dwt.cu
  472. int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix)
  473. {
  474. struct band {
  475. int dimX;
  476. int dimY;
  477. };
  478. struct dimensions {
  479. struct band LL;
  480. struct band HL;
  481. struct band LH;
  482. struct band HH;
  483. };
  484. unsigned char * result;
  485. int *src;
  486. int *dst;
  487. int i,s;
  488. int size;
  489. int offset;
  490. int yOffset;
  491. int samplesNum = pixWidth*pixHeight;
  492. struct dimensions * bandDims;
  493. bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
  494. bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
  495. bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
  496. bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
  497. bandDims[0].HL.dimY = bandDims[0].LL.dimY;
  498. bandDims[0].LH.dimX = bandDims[0].LL.dimX;
  499. bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
  500. bandDims[0].HH.dimX = bandDims[0].HL.dimX;
  501. bandDims[0].HH.dimY = bandDims[0].LH.dimY;
  502. for (i = 1; i < stages; i++)
  503. {
  504. bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
  505. bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
  506. bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
  507. bandDims[i].HL.dimY = bandDims[i].LL.dimY;
  508. bandDims[i].LH.dimX = bandDims[i].LL.dimX;
  509. bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
  510. bandDims[i].HH.dimX = bandDims[i].HL.dimX;
  511. bandDims[i].HH.dimY = bandDims[i].LH.dimY;
  512. }
  513. #if 0
  514. printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
  515. for (i = 0; i < stages; i++)
  516. {
  517. printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
  518. printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
  519. printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
  520. printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
  521. }
  522. #endif
  523. size = samplesNum*sizeof(int);
  524. src = (int *)malloc(size);
  525. memset(src, 0, size);
  526. dst = (int *)malloc(size);
  527. memset(dst, 0, size);
  528. result = (unsigned char *)malloc(samplesNum);
  529. errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, src, 0, NULL, NULL);
  530. // fatal_CL(errNum, __LINE__);
  531. // LL Band
  532. size = bandDims[stages-1].LL.dimX * sizeof(int);
  533. for (i = 0; i < bandDims[stages-1].LL.dimY; i++)
  534. {
  535. memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
  536. }
  537. for (s = stages - 1; s >= 0; s--) {
  538. // HL Band
  539. size = bandDims[s].HL.dimX * sizeof(int);
  540. offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
  541. for (i = 0; i < bandDims[s].HL.dimY; i++)
  542. {
  543. memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
  544. src+offset+i*bandDims[s].HL.dimX,
  545. size);
  546. }
  547. // LH band
  548. size = bandDims[s].LH.dimX * sizeof(int);
  549. offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
  550. yOffset = bandDims[s].LL.dimY;
  551. for (i = 0; i < bandDims[s].HL.dimY; i++)
  552. {
  553. memcpy(dst+(yOffset+i)*pixWidth,
  554. src+offset+i*bandDims[s].LH.dimX,
  555. size);
  556. }
  557. //HH band
  558. size = bandDims[s].HH.dimX * sizeof(int);
  559. offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
  560. yOffset = bandDims[s].HL.dimY;
  561. for (i = 0; i < bandDims[s].HH.dimY; i++)
  562. {
  563. memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
  564. src+offset+i*bandDims[s].HH.dimX,
  565. size);
  566. }
  567. }
  568. // Write component
  569. samplesToChar(result, dst, samplesNum);
  570. char outfile[strlen(filename)+strlen(suffix)];
  571. strcpy(outfile, filename);
  572. strcpy(outfile+strlen(filename), suffix);
  573. i = open(outfile, O_CREAT|O_WRONLY, 0644);
  574. if (i == -1)
  575. {
  576. error(0,errno,"cannot access %s", outfile);
  577. return -1;
  578. }
  579. printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
  580. write(i, result, samplesNum);
  581. close(i);
  582. free(src);
  583. free(dst);
  584. free(result);
  585. free(bandDims);
  586. return 0;
  587. }
  588. ///
  589. // Process of DWT algorithm
  590. //
  591. template <typename T>
  592. void processDWT(struct dwt *d, int forward, int writeVisual)
  593. {
  594. int componentSize = d->pixWidth * d->pixHeight * sizeof(T);
  595. T *c_r_out, *c_g_out, *c_b_out, *backup, *c_r, *c_g, *c_b;
  596. // initialize to zeros
  597. T *temp = (T *)malloc(componentSize);
  598. memset(temp, 0, componentSize);
  599. cl_mem cl_c_r_out;
  600. cl_c_r_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  601. // fatal_CL(errNum, __LINE__);
  602. cl_mem cl_backup;
  603. cl_backup = clCreateBuffer(context, CL_MEM_READ_WRITE |CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  604. // fatal_CL(errNum, __LINE__);
  605. if (d->components == 3) {
  606. // Alloc two more buffers for G and B
  607. cl_mem cl_c_g_out;
  608. cl_c_g_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  609. // fatal_CL(errNum, __LINE__);
  610. cl_mem cl_c_b_out;
  611. cl_c_b_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
  612. // fatal_CL(errNum, __LINE__);
  613. // Load components
  614. cl_mem cl_c_r, cl_c_g, cl_c_b;
  615. cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  616. // fatal_CL(errNum, __LINE__);
  617. cl_c_g = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  618. // fatal_CL(errNum, __LINE__);
  619. cl_c_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  620. // fatal_CL(errNum, __LINE__);
  621. rgbToComponents(cl_c_r, cl_c_g, cl_c_b, d->srcImg, d->pixWidth, d->pixHeight);
  622. //Compute DWT and always store int file
  623. nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  624. nStage2dDWT(cl_c_g, cl_c_g_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  625. nStage2dDWT(cl_c_b, cl_c_b_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  626. // ---------test----------
  627. /* T *h_r_out=(T*)malloc(componentSize);
  628. errNum = clEnqueueReadBuffer(commandQueue, cl_c_g_out, CL_TRUE, 0, componentSize, h_r_out, 0, NULL, NULL);
  629. fatal_CL(errNum, __LINE__);
  630. int ii;
  631. for(ii=0;ii<componentSize/sizeof(T);ii++) {
  632. fprintf(stderr, "%d ", (int)h_r_out[ii]);
  633. if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
  634. }
  635. */ // ---------test----------
  636. #ifdef OUTPUT
  637. // Store DWT to file
  638. if(writeVisual){
  639. writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
  640. writeNStage2DDWT(cl_c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
  641. writeNStage2DDWT(cl_c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
  642. } else {
  643. writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
  644. writeLinear(cl_c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
  645. writeLinear(cl_c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
  646. }
  647. #endif
  648. clReleaseMemObject(cl_c_r);
  649. clReleaseMemObject(cl_c_g);
  650. clReleaseMemObject(cl_c_b);
  651. clReleaseMemObject(cl_c_g_out);
  652. clReleaseMemObject(cl_c_b_out);
  653. } else if(d->components == 1) {
  654. // Load components
  655. cl_mem cl_c_r;
  656. cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
  657. // fatal_CL(errNum, __LINE__);
  658. bwToComponent(cl_c_r, d->srcImg, d->pixWidth, d->pixHeight);
  659. // Compute DWT
  660. nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
  661. //Store DWT to file
  662. if(writeVisual){
  663. writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
  664. } else {
  665. writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
  666. }
  667. clReleaseMemObject(cl_c_r);
  668. }
  669. free(temp);
  670. clReleaseMemObject(cl_c_r_out);
  671. }
  672. int main(int argc, char **argv)
  673. {
  674. int optindex = 0;
  675. signed char ch;
  676. struct option longopts[] =
  677. {
  678. {"dimension", required_argument, 0, 'd'}, //dimensions of src img
  679. {"components", required_argument, 0, 'c'}, //numger of components of src img
  680. {"depth", required_argument, 0, 'b'}, //bit depth of src img
  681. {"level", required_argument, 0, 'l'}, //level of dwt
  682. {"device", required_argument, 0, 'D'}, //cuda device
  683. {"forward", no_argument, 0, 'f'}, //forward transform
  684. {"reverse", no_argument, 0, 'r'}, //forward transform
  685. {"97", no_argument, 0, '9'}, //9/7 transform
  686. {"53", no_argument, 0, '5' }, //5/3transform
  687. {"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
  688. {"help", no_argument, 0, 'h'}
  689. };
  690. int pixWidth = 0; //<real pixWidth
  691. int pixHeight = 0; //<real pixHeight
  692. int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
  693. int bitDepth = 8;
  694. int dwtLvls = 3; //default numuber of DWT levels
  695. int device = 0;
  696. int forward = 1; //forward transform
  697. int dwt97 = 0; //1=dwt9/7, 0=dwt5/3 transform
  698. int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
  699. char * pos;
  700. while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1)
  701. {
  702. switch (ch) {
  703. case 'd':
  704. pixWidth = atoi(optarg);
  705. pos = strstr(optarg, "x");
  706. if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg)))
  707. {
  708. usage();
  709. return -1;
  710. }
  711. pixHeight = atoi(pos+1);
  712. break;
  713. case 'c':
  714. compCount = atoi(optarg);
  715. break;
  716. case 'b':
  717. bitDepth = atoi(optarg);
  718. break;
  719. case 'l':
  720. dwtLvls = atoi(optarg);
  721. break;
  722. case 'D':
  723. device = atoi(optarg);
  724. break;
  725. case 'f':
  726. forward = 1;
  727. break;
  728. case 'r':
  729. forward = 0;
  730. break;
  731. case '9':
  732. dwt97 = 1;
  733. break;
  734. case '5':
  735. dwt97 = 0;
  736. break;
  737. case 'w':
  738. writeVisual = 1;
  739. break;
  740. case 'h':
  741. usage();
  742. return 0;
  743. case '?':
  744. return -1;
  745. default :
  746. usage();
  747. return -1;
  748. }
  749. }
  750. argc -= optind;
  751. argv += optind;
  752. if (argc == 0)
  753. { // at least one filename is expected
  754. printf("Please supply src file name\n");
  755. usage();
  756. return -1;
  757. }
  758. if (pixWidth <= 0 || pixHeight <=0)
  759. {
  760. printf("Wrong or missing dimensions\n");
  761. usage();
  762. return -1;
  763. }
  764. if (forward == 0)
  765. {
  766. writeVisual = 0; //do not write visual when RDWT
  767. }
  768. //
  769. // device init
  770. // Create an OpenCL context on first available platform
  771. context = CreateContext();
  772. if (context == NULL)
  773. {
  774. std::cerr << "Failed to create OpenCL context." << std::endl;
  775. return 1;
  776. }
  777. // Create a command-queue on the first device available
  778. // on the created context
  779. commandQueue = CreateCommandQueue(context, &cldevice);
  780. if (commandQueue == NULL)
  781. {
  782. Cleanup(context, commandQueue, program, kernel);
  783. return 1;
  784. }
  785. // Create OpenCL program from com_dwt.cl kernel source
  786. program = CreateProgram(context, cldevice, "com_dwt.cl");
  787. if (program == NULL)
  788. {
  789. printf("fail to create program!!\n");
  790. }
  791. // Create OpenCL kernel
  792. c_CopySrcToComponents = clCreateKernel(program, "c_CopySrcToComponents", NULL);
  793. if (c_CopySrcToComponents == NULL)
  794. {
  795. std::cerr << "Failed to create kernel" << std::endl;
  796. }
  797. c_CopySrcToComponent = clCreateKernel(program, "c_CopySrcToComponent", NULL);
  798. if (c_CopySrcToComponent == NULL)
  799. {
  800. std::cerr << "Failed to create kernel" << std::endl;
  801. }
  802. kl_fdwt53Kernel = clCreateKernel(program, "cl_fdwt53Kernel", NULL);
  803. if (kl_fdwt53Kernel == NULL)
  804. {
  805. std::cerr<<"Failed to create kernel\n";
  806. }
  807. //initialize struct dwt
  808. struct dwt *d;
  809. d = (struct dwt *)malloc(sizeof(struct dwt));
  810. d->srcImg = NULL;
  811. d->pixWidth = pixWidth;
  812. d->pixHeight = pixHeight;
  813. d->components = compCount;
  814. d->dwtLvls = dwtLvls;
  815. // file names
  816. d->srcFilename = (char *)malloc(strlen(argv[0]));
  817. strcpy(d->srcFilename, argv[0]);
  818. if (argc == 1)
  819. { // only one filename supplyed
  820. d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
  821. strcpy(d->outFilename, d->srcFilename);
  822. strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
  823. } else {
  824. d->outFilename = strdup(argv[1]);
  825. }
  826. //Input review
  827. printf("\nSource file:\t\t%s\n", d->srcFilename);
  828. printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
  829. printf(" Components count:\t%d\n", compCount);
  830. printf(" Bit depth:\t\t%d\n", bitDepth);
  831. printf(" DWT levels:\t\t%d\n", dwtLvls);
  832. printf(" Forward transform:\t%d\n", forward);
  833. printf(" 9/7 transform:\t\t%d\n", dwt97);
  834. //data sizes
  835. int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
  836. //load img source image
  837. d->srcImg = (unsigned char *) malloc (inputSize);
  838. if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
  839. return -1;
  840. // DWT
  841. // Create memory objects, Set arguments for kernel functions, Queue the kernel up for execution across the array, Read the output buffer back to the Host, Output the result buffer
  842. if (forward == 1)
  843. {
  844. if(dwt97 == 1 )
  845. processDWT<float>(d, forward, writeVisual);
  846. else // 5/3
  847. processDWT<int>(d, forward, writeVisual);
  848. }
  849. else
  850. { // reverse
  851. if(dwt97 == 1 )
  852. processDWT<float>(d, forward, writeVisual);
  853. else // 5/3
  854. processDWT<int>(d, forward, writeVisual);
  855. }
  856. Cleanup(context, commandQueue, program, kernel);
  857. clReleaseKernel(c_CopySrcToComponents);
  858. clReleaseKernel(c_CopySrcToComponent);
  859. return 0;
  860. }