12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058 |
- #include <unistd.h>
- #include <error.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <fcntl.h>
- #include <errno.h>
- #include <string.h>
- #include <assert.h>
- #include <sys/time.h>
- #include <getopt.h>
- #include <iostream>
- #include <vector>
- #include <fstream>
- #include <sstream>
- #include <vector>
- #include "common.h"
- #include "components.h"
- #include "dwt.h"
- //using namespace std;
- #ifdef __APPLE__
- #include <OpenCL/cl.h>
- #else
- #include <CL/opencl.h>
- #endif
- #define THREADS 256
- struct dwt {
- char * srcFilename;
- char * outFilename;
- unsigned char *srcImg;
- int pixWidth;
- int pixHeight;
- int components;
- int dwtLvls;
- };
- cl_context context = 0;
- cl_command_queue commandQueue = 0;
- cl_program program = 0;
- cl_device_id cldevice = 0;
- cl_kernel kernel = 0;
- cl_kernel c_CopySrcToComponents = 0;
- cl_kernel c_CopySrcToComponent = 0;
- cl_kernel kl_fdwt53Kernel;
- cl_mem memObjects[3] = { 0, 0, 0 };
- cl_int errNum = 0;
- ///
- // functions for preparing create opencl program, contains CreateContext, CreateProgram, CreateCommandQueue, CreateMemBuffer, and Cleanup
- // Create an OpenCL context on the first available GPU platform.
- cl_context CreateContext(int platform_id, int use_gpu)
- {
- cl_context context = NULL;
- cl_uint platformIdCount = 0;
- cl_int errNum;
- cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
- // get number of platforms
- clGetPlatformIDs (0, NULL, &platformIdCount);
- std::vector<cl_platform_id> platformIds(platformIdCount);
- clGetPlatformIDs (platformIdCount, platformIds.data(), NULL);
- // In this example, first platform is a CPU, the second one is a GPU. we just choose the first available device.
- cl_context_properties contextProperties[] =
- {
- CL_CONTEXT_PLATFORM,
- (cl_context_properties)platformIds[platform_id],
- 0
- };
- context = clCreateContextFromType(contextProperties, device_type,
- NULL, NULL, &errNum);
- if (errNum != CL_SUCCESS)
- {
- std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
- return NULL;
- }
- return context;
- }
- ///
- // Create a command queue on the first device available on the context
- //
- cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *cldevice, int device_id)
- {
- cl_int errNum;
- cl_device_id *cldevices;
- cl_command_queue commandQueue = NULL;
- size_t deviceBufferSize = -1;
- // First get the size of the devices buffer
- errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
- if (errNum != CL_SUCCESS)
- {
- std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
- return NULL;
- }
- if (deviceBufferSize <= 0)
- {
- std::cerr << "No devices available.";
- return NULL;
- }
- // Allocate memory for the devices buffer
- cldevices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
- errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, cldevices, NULL);
- if (errNum != CL_SUCCESS)
- {
- delete [] cldevices;
- std::cerr << "Failed to get device IDs";
- return NULL;
- }
- commandQueue = clCreateCommandQueue(context, cldevices[device_id], 0, NULL);
- if (commandQueue == NULL)
- {
- delete [] cldevices;
- std::cerr << "Failed to create commandQueue for device ";
- return NULL;
- }
- *cldevice = cldevices[device_id];
- delete [] cldevices;
- return commandQueue;
- }
- ///
- // Create an OpenCL program from the kernel source file
- //
- cl_program CreateProgram(cl_context context, cl_device_id cldevice, const char* fileName)
- {
- cl_int errNum;
- cl_program program;
- std::ifstream kernelFile(fileName, std::ios::in);
- if (!kernelFile.is_open())
- {
- std::cerr << "Failed to open file for reading: " << fileName << std::endl;
- return NULL;
- }
- std::ostringstream oss;
- oss << kernelFile.rdbuf();
- std::string srcStdStr = oss.str();
- const char *srcStr = srcStdStr.c_str();
- program = clCreateProgramWithSource(context, 1,
- (const char**)&srcStr,
- NULL, NULL);
- if (program == NULL)
- {
- std::cerr << "Failed to create CL program from source." << std::endl;
- return NULL;
- }
- errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
- if (errNum != CL_SUCCESS)
- {
- // Determine the reason for the error
- char buildLog[16384];
- clGetProgramBuildInfo(program, cldevice, CL_PROGRAM_BUILD_LOG,
- sizeof(buildLog), buildLog, NULL);
- std::cerr << "Error in kernel: " << std::endl;
- std::cerr << buildLog;
- clReleaseProgram(program);
- return NULL;
- }
- return program;
- }
- ///
- // Cleanup any created OpenCL resources
- //
- void Cleanup(cl_context context, cl_command_queue commandQueue,
- cl_program program, cl_kernel kernel)
- {
- if (commandQueue != 0)
- clReleaseCommandQueue(commandQueue);
- if (kernel != 0)
- clReleaseKernel(kernel);
- if (program != 0)
- clReleaseProgram(program);
- if (context != 0)
- clReleaseContext(context);
- }
- ///
- // Load the input image.
- //
- int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
- {
- // printf("Loading ipnput: %s\n", srcFilename);
- char path[] = "../../data/dwt2d/";
- char *newSrc = NULL;
- if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
- {
- newSrc[0] = '\0';
- strcat(newSrc, path);
- strcat(newSrc, srcFilename);
- srcFilename= newSrc;
- }
- printf("Loading ipnput: %s\n", srcFilename);
- //read image
- int i = open(srcFilename, O_RDONLY, 0644);
- if (i == -1)
- {
- error(0,errno,"cannot access %s", srcFilename);
- return -1;
- }
- int ret = read(i, srcImg, inputSize);
- printf("precteno %d, inputsize %d\n", ret, inputSize);
- close(i);
- return 0;
- }
- ///
- //Show user how to use this program
- //
- void usage() {
- printf("dwt [otpions] src_img.rgb <out_img.dwt>\n\
- -d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n\
- -c, --components\t\tnumber of color components, default 3\n\
- -b, --depth\t\t\tbit depth, default 8\n\
- -l, --level\t\t\tDWT level, default 3\n\
- -D, --device\t\t\tcuda device\n\
- -f, --forward\t\t\tforward transform\n\
- -r, --reverse\t\t\treverse transform\n\
- -9, --97\t\t\t9/7 transform\n\
- -5, --53\t\t\t5/3 transform\n\
- -w --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n\
- -p --platform_id\t\t\tOCL platform id to use\n\
- -g --use_gpu\t\t\t1 to use gpu, 0 to use cpu\n\
- -i --device_id\t\t\tOCL devicde id to use\n");
- }
- ///
- // Check the type of error about opencl program
- //
- void fatal_CL(cl_int error, int line_no)
- {
- printf("At line %d: ", line_no);
- switch(error) {
- case CL_SUCCESS: printf("CL_SUCCESS\n"); break;
- case CL_DEVICE_NOT_FOUND: printf("CL_DEVICE_NOT_FOUND\n"); break;
- case CL_DEVICE_NOT_AVAILABLE: printf("CL_DEVICE_NOT_AVAILABLE\n"); break;
- case CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
- case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
- case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
- case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
- case CL_PROFILING_INFO_NOT_AVAILABLE: printf("CL_PROFILING_INFO_NOT_AVAILABLE\n"); break;
- case CL_MEM_COPY_OVERLAP: printf("CL_MEM_COPY_OVERLAP\n"); break;
- case CL_IMAGE_FORMAT_MISMATCH: printf("CL_IMAGE_FORMAT_MISMATCH\n"); break;
- case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("CL_IMAGE_FORMAT_NOT_SUPPORTED\n"); break;
- case CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
- case CL_MAP_FAILURE: printf("CL_MAP_FAILURE\n"); break;
- case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
- case CL_INVALID_DEVICE_TYPE: printf("CL_INVALID_DEVICE_TYPE\n"); break;
- case CL_INVALID_PLATFORM: printf("CL_INVALID_PLATFORM\n"); break;
- case CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
- case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
- case CL_INVALID_QUEUE_PROPERTIES: printf("CL_INVALID_QUEUE_PROPERTIES\n"); break;
- case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
- case CL_INVALID_HOST_PTR: printf("CL_INVALID_HOST_PTR\n"); break;
- case CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
- case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n"); break;
- case CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
- case CL_INVALID_SAMPLER: printf("CL_INVALID_SAMPLER\n"); break;
- case CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
- case CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
- case CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
- case CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
- case CL_INVALID_KERNEL_NAME: printf("CL_INVALID_KERNEL_NAME\n"); break;
- case CL_INVALID_KERNEL_DEFINITION: printf("CL_INVALID_KERNEL_DEFINITION\n"); break;
- case CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
- case CL_INVALID_ARG_INDEX: printf("CL_INVALID_ARG_INDEX\n"); break;
- case CL_INVALID_ARG_VALUE: printf("CL_INVALID_ARG_VALUE\n"); break;
- case CL_INVALID_ARG_SIZE: printf("CL_INVALID_ARG_SIZE\n"); break;
- case CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
- case CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
- case CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
- case CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
- case CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
- case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
- case CL_INVALID_EVENT: printf("CL_INVALID_EVENT\n"); break;
- case CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
- case CL_INVALID_GL_OBJECT: printf("CL_INVALID_GL_OBJECT\n"); break;
- case CL_INVALID_BUFFER_SIZE: printf("CL_INVALID_BUFFER_SIZE\n"); break;
- case CL_INVALID_MIP_LEVEL: printf("CL_INVALID_MIP_LEVEL\n"); break;
- case CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
- #ifdef CL_VERSION_1_1
- case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
- case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
- #endif
- default: printf("Invalid OpenCL error code\n");
- }
- }
- ///
- // Separate compoents of 8bit RGB source image
- //in file components.cu
- void rgbToComponents(cl_mem d_r, cl_mem d_g, cl_mem d_b, unsigned char * h_src, int width, int height)
- {
- int pixels = width * height;
- int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
- cl_mem cl_d_src;
- cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels*3, h_src, &errNum);
- // fatal_CL(errNum, __LINE__);
- size_t globalWorkSize[1] = { alignedSize/3};
- size_t localWorkSize[1] = { THREADS };
- errNum = clSetKernelArg(c_CopySrcToComponents, 0, sizeof(cl_mem), &d_r);
- errNum |= clSetKernelArg(c_CopySrcToComponents, 1, sizeof(cl_mem), &d_g);
- errNum |= clSetKernelArg(c_CopySrcToComponents, 2, sizeof(cl_mem), &d_b);
- errNum |= clSetKernelArg(c_CopySrcToComponents, 3, sizeof(cl_mem), &cl_d_src);
- errNum |= clSetKernelArg(c_CopySrcToComponents, 4, sizeof(int), &pixels);
- // fatal_CL(errNum, __LINE__);
- errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponents, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
- // fatal_CL(errNum, __LINE__);
- // Free Memory
- errNum = clReleaseMemObject(cl_d_src);
- // fatal_CL(errNum, __LINE__);
- }
- ///
- // Copy a 8bit source image data into a color compoment
- //in file components.cu
- void bwToComponent(cl_mem d_c, unsigned char * h_src, int width, int height)
- {
- cl_mem cl_d_src;
- int pixels = width*height;
- int alignedSize = DIVANDRND(pixels, THREADS) * THREADS;
- cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels, h_src, NULL);
- // fatal_CL(errNum, __LINE__);
- size_t globalWorkSize[1] = { alignedSize/9};
- size_t localWorkSize[1] = { THREADS };
- assert(alignedSize%(THREADS*3) == 0);
- errNum = clSetKernelArg(c_CopySrcToComponent, 0, sizeof(cl_mem), &d_c);
- errNum |= clSetKernelArg(c_CopySrcToComponent, 1, sizeof(cl_mem), &cl_d_src);
- errNum |= clSetKernelArg(c_CopySrcToComponent, 2, sizeof(int), &pixels);
- // fatal_CL(errNum, __LINE__);
- errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponent, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
- std::cout<<"in function bwToComponent errNum= "<<errNum<<"\n";
- // fatal_CL(errNum, __LINE__);
- std::cout<<"bwToComponent has finished\n";
- // Free Memory
- errNum = clReleaseMemObject(cl_d_src);
- // fatal_CL(errNum, __LINE__);
- }
- /// Only computes optimal number of sliding window steps, number of threadblocks and then lanches the 5/3 FDWT kernel.
- /// @tparam WIN_SX width of sliding window
- /// @tparam WIN_SY height of sliding window
- /// @param in input image
- /// @param out output buffer
- /// @param sx width of the input image
- /// @param sy height of the input image
- ///launchFDWT53Kerneld is in file
- void launchFDWT53Kernel (int WIN_SX, int WIN_SY, cl_mem in, cl_mem out, int sx, int sy)
- {
- // compute optimal number of steps of each sliding window
- // cuda_dwt called a function divRndUp from namespace cuda_gwt. this function takes n and d, "return (n / d) + ((n % d) ? 1 : 0);"
- //
- const int steps = ( sy/ (15 * WIN_SY)) + ((sy % (15 * WIN_SY)) ? 1 : 0);
- int gx = ( sx/ WIN_SX) + ((sx % WIN_SX) ? 1 : 0); //use function divRndUp(n, d){return (n / d) + ((n % d) ? 1 : 0);}
- int gy = ( sy/ (WIN_SY*steps)) + ((sy % (WIN_SY*steps)) ? 1 : 0);
- printf("sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
- // prepare grid size
- size_t globalWorkSize[2] = { gx*WIN_SX, gy*1};
- size_t localWorkSize[2] = { WIN_SX , 1};
- // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gx, gy, WIN_SX);
- errNum = clSetKernelArg(kl_fdwt53Kernel, 0, sizeof(cl_mem), &in);
- errNum |= clSetKernelArg(kl_fdwt53Kernel, 1, sizeof(cl_mem), &out);
- errNum |= clSetKernelArg(kl_fdwt53Kernel, 2, sizeof(int), &sx);
- errNum |= clSetKernelArg(kl_fdwt53Kernel, 3, sizeof(int), &sy);
- errNum |= clSetKernelArg(kl_fdwt53Kernel, 4, sizeof(int), &steps);
- errNum |= clSetKernelArg(kl_fdwt53Kernel, 5, sizeof(int), &WIN_SX);
- errNum |= clSetKernelArg(kl_fdwt53Kernel, 6, sizeof(int), &WIN_SY);
- // fatal_CL(errNum, __LINE__);
- errNum = clEnqueueNDRangeKernel(commandQueue, kl_fdwt53Kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
- // fatal_CL(errNum, __LINE__);
- printf("kl_fdwt53Kernel in launchFDW53Kernel has finished\n");
- }
- /// Simple cudaMemcpy wrapped in performance tester.
- /// param dest destination bufer
- /// param src source buffer
- /// param sx width of copied image
- /// param sy height of copied image
- ///from /cuda_gwt/common.h/namespace
- void memCopy (cl_mem dest, cl_mem src, const size_t sx, const size_t sy){
- errNum = clEnqueueCopyBuffer (commandQueue, src, dest, 0, 0, sx*sy*sizeof(int), 0, NULL, NULL);
- // fatal_CL (errNum, __LINE__);
- }
- /// Forward 5/3 2D DWT. See common rules (above) for more details.
- /// @param in Expected to be normalized into range [-128, 127].
- /// Will not be preserved (will be overwritten).
- /// @param out output buffer on GPU
- /// @param sizeX width of input image (in pixels)
- /// @param sizeY height of input image (in pixels)
- /// @param levels number of recursive DWT levels
- /// @backup use to test time
- //at the end of namespace dwt_cuda (line338)
- void fdwt53(cl_mem in, cl_mem out, int sizeX, int sizeY, int levels)
- {
- // select right width of kernel for the size of the image
- if(sizeX >= 960)
- {
- launchFDWT53Kernel(192, 8, in, out, sizeX, sizeY);
- }
- else if (sizeX >= 480)
- {
- launchFDWT53Kernel(128, 8, in, out, sizeX, sizeY);
- } else
- {
- launchFDWT53Kernel(64, 8, in, out, sizeX, sizeY);
- }
- // if this was not the last level, continue recursively with other levels
- if (levels > 1)
- {
- // copy output's LL band back into input buffer
- const int llSizeX = (sizeX / 2) + ((sizeX % 2) ? 1 :0);
- const int llSizeY = (sizeY / 2) + ((sizeY % 2) ? 1 :0);
- memCopy(in, out, llSizeX, llSizeY);
- // run remaining levels of FDWT
- fdwt53(in, out, llSizeX, llSizeY, levels - 1);
- }
- }
- ///
- // in dwt.cu
- int nStage2dDWT(cl_mem in, cl_mem out, cl_mem backup, int pixWidth, int pixHeight, int stages, bool forward)
- {
- printf("\n*** %d stages of 2D forward DWT:\n", stages);
- // create backup of input, because each test iteration overwrites it
- const int size = pixHeight * pixWidth * sizeof(int);
- // Measure time of individual levels.
- if (forward)
- fdwt53(in, out, pixWidth, pixHeight, stages );
- //else
- // rdwt(in, out, pixWidth, pixHeight, stages);
- // rdwt means rdwt53(can be found in file rdwt53.cu) which has not been defined
- return 0;
- }
- ///
- //in file dwt.cu
- void samplesToChar(unsigned char * dst, int * src, int samplesNum)
- {
- int i;
- for(i = 0; i < samplesNum; i++)
- {
- int r = src[i]+128;
- if (r > 255) r = 255;
- if (r < 0) r = 0;
- dst[i] = (unsigned char)r;
- }
- }
- ///
- //in file dwt.cu
- /// Write output linear orderd
- int writeLinear(cl_mem component, int pixWidth, int pixHeight, const char * filename, const char * suffix)
- {
- unsigned char * result;
- int *gpu_output;
- int i;
- int size;
- int samplesNum = pixWidth*pixHeight;
- size = samplesNum*sizeof(int);
- gpu_output = (int *)malloc(size);
- memset(gpu_output, 0, size);
- result = (unsigned char *)malloc(samplesNum);
- errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, gpu_output, 0, NULL, NULL);
- // fatal_CL(errNum, __LINE__);
- // T to char
- samplesToChar(result, gpu_output, samplesNum);
- // Write component
- char outfile[strlen(filename)+strlen(suffix)];
- strcpy(outfile, filename);
- strcpy(outfile+strlen(filename), suffix);
- i = open(outfile, O_CREAT|O_WRONLY, 0644);
- if (i == -1)
- {
- error(0,errno,"cannot access %s", outfile);
- return -1;
- }
- printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
- write(i, result, samplesNum);
- close(i);
- // Clean up
- free(gpu_output);
- free(result);
- return 0;
- }
- ///
- // Write output visual ordered
- //in file dwt.cu
- int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix)
- {
- struct band {
- int dimX;
- int dimY;
- };
- struct dimensions {
- struct band LL;
- struct band HL;
- struct band LH;
- struct band HH;
- };
- unsigned char * result;
- int *src;
- int *dst;
- int i,s;
- int size;
- int offset;
- int yOffset;
- int samplesNum = pixWidth*pixHeight;
- struct dimensions * bandDims;
- bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
- bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
- bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
- bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
- bandDims[0].HL.dimY = bandDims[0].LL.dimY;
- bandDims[0].LH.dimX = bandDims[0].LL.dimX;
- bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
- bandDims[0].HH.dimX = bandDims[0].HL.dimX;
- bandDims[0].HH.dimY = bandDims[0].LH.dimY;
- for (i = 1; i < stages; i++)
- {
- bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
- bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
- bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
- bandDims[i].HL.dimY = bandDims[i].LL.dimY;
- bandDims[i].LH.dimX = bandDims[i].LL.dimX;
- bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
- bandDims[i].HH.dimX = bandDims[i].HL.dimX;
- bandDims[i].HH.dimY = bandDims[i].LH.dimY;
- }
- #if 0
- printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
- for (i = 0; i < stages; i++)
- {
- printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
- printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
- printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
- printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
- }
- #endif
- size = samplesNum*sizeof(int);
- src = (int *)malloc(size);
- memset(src, 0, size);
- dst = (int *)malloc(size);
- memset(dst, 0, size);
- result = (unsigned char *)malloc(samplesNum);
- errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, src, 0, NULL, NULL);
- // fatal_CL(errNum, __LINE__);
- // LL Band
- size = bandDims[stages-1].LL.dimX * sizeof(int);
- for (i = 0; i < bandDims[stages-1].LL.dimY; i++)
- {
- memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
- }
- for (s = stages - 1; s >= 0; s--) {
- // HL Band
- size = bandDims[s].HL.dimX * sizeof(int);
- offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
- for (i = 0; i < bandDims[s].HL.dimY; i++)
- {
- memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
- src+offset+i*bandDims[s].HL.dimX,
- size);
- }
- // LH band
- size = bandDims[s].LH.dimX * sizeof(int);
- offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
- yOffset = bandDims[s].LL.dimY;
- for (i = 0; i < bandDims[s].HL.dimY; i++)
- {
- memcpy(dst+(yOffset+i)*pixWidth,
- src+offset+i*bandDims[s].LH.dimX,
- size);
- }
- //HH band
- size = bandDims[s].HH.dimX * sizeof(int);
- offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
- yOffset = bandDims[s].HL.dimY;
- for (i = 0; i < bandDims[s].HH.dimY; i++)
- {
- memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
- src+offset+i*bandDims[s].HH.dimX,
- size);
- }
- }
- // Write component
- samplesToChar(result, dst, samplesNum);
- char outfile[strlen(filename)+strlen(suffix)];
- strcpy(outfile, filename);
- strcpy(outfile+strlen(filename), suffix);
- i = open(outfile, O_CREAT|O_WRONLY, 0644);
- if (i == -1)
- {
- error(0,errno,"cannot access %s", outfile);
- return -1;
- }
- printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
- write(i, result, samplesNum);
- close(i);
- free(src);
- free(dst);
- free(result);
- free(bandDims);
- return 0;
- }
- ///
- // Process of DWT algorithm
- //
- template <typename T>
- void processDWT(struct dwt *d, int forward, int writeVisual)
- {
- int componentSize = d->pixWidth * d->pixHeight * sizeof(T);
- T *c_r_out, *c_g_out, *c_b_out, *backup, *c_r, *c_g, *c_b;
- // initialize to zeros
- T *temp = (T *)malloc(componentSize);
- memset(temp, 0, componentSize);
- cl_mem cl_c_r_out;
- cl_c_r_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- cl_mem cl_backup;
- cl_backup = clCreateBuffer(context, CL_MEM_READ_WRITE |CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- if (d->components == 3) {
- // Alloc two more buffers for G and B
- cl_mem cl_c_g_out;
- cl_c_g_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- cl_mem cl_c_b_out;
- cl_c_b_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- // Load components
- cl_mem cl_c_r, cl_c_g, cl_c_b;
- cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- cl_c_g = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- cl_c_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- rgbToComponents(cl_c_r, cl_c_g, cl_c_b, d->srcImg, d->pixWidth, d->pixHeight);
- //Compute DWT and always store int file
- nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
- nStage2dDWT(cl_c_g, cl_c_g_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
- nStage2dDWT(cl_c_b, cl_c_b_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
- // ---------test----------
- /* T *h_r_out=(T*)malloc(componentSize);
- errNum = clEnqueueReadBuffer(commandQueue, cl_c_g_out, CL_TRUE, 0, componentSize, h_r_out, 0, NULL, NULL);
- fatal_CL(errNum, __LINE__);
- int ii;
- for(ii=0;ii<componentSize/sizeof(T);ii++) {
- fprintf(stderr, "%d ", (int)h_r_out[ii]);
- if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
- }
- */ // ---------test----------
- #ifdef OUTPUT
- // Store DWT to file
- if(writeVisual){
- writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
- writeNStage2DDWT(cl_c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
- writeNStage2DDWT(cl_c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
- } else {
- writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
- writeLinear(cl_c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
- writeLinear(cl_c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
- }
- #endif
- clReleaseMemObject(cl_c_r);
- clReleaseMemObject(cl_c_g);
- clReleaseMemObject(cl_c_b);
- clReleaseMemObject(cl_c_g_out);
- clReleaseMemObject(cl_c_b_out);
- } else if(d->components == 1) {
- // Load components
- cl_mem cl_c_r;
- cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
- // fatal_CL(errNum, __LINE__);
- bwToComponent(cl_c_r, d->srcImg, d->pixWidth, d->pixHeight);
- // Compute DWT
- nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
- //Store DWT to file
- if(writeVisual){
- writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
- } else {
- writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
- }
- clReleaseMemObject(cl_c_r);
- }
- free(temp);
- clReleaseMemObject(cl_c_r_out);
- }
- int main(int argc, char **argv)
- {
- int optindex = 0;
- signed char ch;
- struct option longopts[] =
- {
- {"dimension", required_argument, 0, 'd'}, //dimensions of src img
- {"components", required_argument, 0, 'c'}, //numger of components of src img
- {"depth", required_argument, 0, 'b'}, //bit depth of src img
- {"level", required_argument, 0, 'l'}, //level of dwt
- {"device", required_argument, 0, 'D'}, //cuda device
- {"forward", no_argument, 0, 'f'}, //forward transform
- {"reverse", no_argument, 0, 'r'}, //forward transform
- {"97", no_argument, 0, '9'}, //9/7 transform
- {"53", no_argument, 0, '5' }, //5/3transform
- {"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
- {"help", no_argument, 0, 'h'},
- {"platform_id", required_argument, 0, 'p'},
- {"use_gpu", required_argument, 0, 'g'},
- {"device_id", required_argument, 0, 'i'}
- };
- int pixWidth = 0; //<real pixWidth
- int pixHeight = 0; //<real pixHeight
- int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
- int bitDepth = 8;
- int dwtLvls = 3; //default numuber of DWT levels
- int device = 0;
- int forward = 1; //forward transform
- int dwt97 = 0; //1=dwt9/7, 0=dwt5/3 transform
- int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
- char * pos;
- int platform_id = 0;
- int device_id = 0;
- int use_gpu = 0;
- while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95whp:g:i:", longopts, &optindex)) != -1)
- {
- switch (ch) {
- case 'd':
- pixWidth = atoi(optarg);
- pos = strstr(optarg, "x");
- if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg)))
- {
- usage();
- return -1;
- }
- pixHeight = atoi(pos+1);
- break;
- case 'c':
- compCount = atoi(optarg);
- break;
- case 'b':
- bitDepth = atoi(optarg);
- break;
- case 'l':
- dwtLvls = atoi(optarg);
- break;
- case 'D':
- device = atoi(optarg);
- break;
- case 'f':
- forward = 1;
- break;
- case 'r':
- forward = 0;
- break;
- case '9':
- dwt97 = 1;
- break;
- case '5':
- dwt97 = 0;
- break;
- case 'w':
- writeVisual = 1;
- break;
- case 'p':
- platform_id = atoi(optarg);
- break;
- case 'g':
- use_gpu = atoi(optarg);
- break;
- case 'i':
- device_id = atoi(optarg);
- break;
- case 'h':
- usage();
- return 0;
- case '?':
- return -1;
- default :
- usage();
- return -1;
- }
- }
- argc -= optind;
- argv += optind;
- if (argc == 0)
- { // at least one filename is expected
- printf("Please supply src file name\n");
- usage();
- return -1;
- }
- if (pixWidth <= 0 || pixHeight <=0)
- {
- printf("Wrong or missing dimensions\n");
- usage();
- return -1;
- }
- if (forward == 0)
- {
- writeVisual = 0; //do not write visual when RDWT
- }
- //
- // device init
- // Create an OpenCL context on first available platform
- context = CreateContext(platform_id, use_gpu);
- if (context == NULL)
- {
- std::cerr << "Failed to create OpenCL context." << std::endl;
- return 1;
- }
- // Create a command-queue on the first device available
- // on the created context
- commandQueue = CreateCommandQueue(context, &cldevice, device_id);
- if (commandQueue == NULL)
- {
- Cleanup(context, commandQueue, program, kernel);
- return 1;
- }
- // Create OpenCL program from com_dwt.cl kernel source
- program = CreateProgram(context, cldevice, "com_dwt.cl");
- if (program == NULL)
- {
- printf("fail to create program!!\n");
- }
- // Create OpenCL kernel
- c_CopySrcToComponents = clCreateKernel(program, "c_CopySrcToComponents", NULL);
- if (c_CopySrcToComponents == NULL)
- {
- std::cerr << "Failed to create kernel" << std::endl;
- }
- c_CopySrcToComponent = clCreateKernel(program, "c_CopySrcToComponent", NULL);
- if (c_CopySrcToComponent == NULL)
- {
- std::cerr << "Failed to create kernel" << std::endl;
- }
- kl_fdwt53Kernel = clCreateKernel(program, "cl_fdwt53Kernel", NULL);
- if (kl_fdwt53Kernel == NULL)
- {
- std::cerr<<"Failed to create kernel\n";
- }
- //initialize struct dwt
- struct dwt *d;
- d = (struct dwt *)malloc(sizeof(struct dwt));
- d->srcImg = NULL;
- d->pixWidth = pixWidth;
- d->pixHeight = pixHeight;
- d->components = compCount;
- d->dwtLvls = dwtLvls;
- // file names
- d->srcFilename = (char *)malloc(strlen(argv[0]));
- strcpy(d->srcFilename, argv[0]);
- if (argc == 1)
- { // only one filename supplyed
- d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
- strcpy(d->outFilename, d->srcFilename);
- strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
- } else {
- d->outFilename = strdup(argv[1]);
- }
- //Input review
- printf("\nSource file:\t\t%s\n", d->srcFilename);
- printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
- printf(" Components count:\t%d\n", compCount);
- printf(" Bit depth:\t\t%d\n", bitDepth);
- printf(" DWT levels:\t\t%d\n", dwtLvls);
- printf(" Forward transform:\t%d\n", forward);
- printf(" 9/7 transform:\t\t%d\n", dwt97);
- //data sizes
- int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
- //load img source image
- d->srcImg = (unsigned char *) malloc (inputSize);
- if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
- return -1;
- // DWT
- // Create memory objects, Set arguments for kernel functions, Queue the kernel up for execution across the array, Read the output buffer back to the Host, Output the result buffer
- if (forward == 1)
- {
- if(dwt97 == 1 )
- processDWT<float>(d, forward, writeVisual);
- else // 5/3
- processDWT<int>(d, forward, writeVisual);
- }
- else
- { // reverse
- if(dwt97 == 1 )
- processDWT<float>(d, forward, writeVisual);
- else // 5/3
- processDWT<int>(d, forward, writeVisual);
- }
- Cleanup(context, commandQueue, program, kernel);
- clReleaseKernel(c_CopySrcToComponents);
- clReleaseKernel(c_CopySrcToComponent);
- return 0;
- }
|