Переглянути джерело

Implemented device selection for dwt2d

We now use parameters as the way to select OpenCL platform, device and
type instead of having the hardcoded in the code. In this way we can
easily change the type of the benchmark without recompiling it.

The parsing is done using the pre-existing structure of command line
arguments parsing, that uses the getopt_long helper function and is
quite elegant with respect to the other methods.
Andrea Gussoni 8 роки тому
батько
коміт
1bf63ab652
3 змінених файлів з 197 додано та 183 видалено
  1. 195 182
      opencl/dwt2d/main.cpp
  2. 1 1
      opencl/dwt2d/run-cpu
  3. 1 0
      opencl/dwt2d/run-gpu

+ 195 - 182
opencl/dwt2d/main.cpp

@@ -21,9 +21,9 @@
 #ifdef __APPLE__
 #include <OpenCL/cl.h>
 #else
-#include <CL/opencl.h> 
+#include <CL/opencl.h>
 #endif
- 
+
 #define THREADS 256
 
 struct dwt {
@@ -53,40 +53,35 @@ cl_int errNum = 0;
 
 ///
 // functions for preparing create opencl program, contains CreateContext, CreateProgram, CreateCommandQueue, CreateMemBuffer, and Cleanup
-// Create an OpenCL context on the first available GPU platform. 
-cl_context CreateContext()
+// Create an OpenCL context on the first available GPU platform.
+cl_context CreateContext(int platform_id, int use_gpu)
 {
     cl_context context = NULL;
     cl_uint platformIdCount = 0;
-    cl_int errNum;
+    cl_int errNum;
+    cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
 
     // get number of platforms
     clGetPlatformIDs (0, NULL, &platformIdCount);
 
     std::vector<cl_platform_id> platformIds(platformIdCount);
     clGetPlatformIDs (platformIdCount, platformIds.data(), NULL);
-	
-	// In this example, first platform is a CPU, the second one is a GPU. we just choose the first available device.  
+
+	// In this example, first platform is a CPU, the second one is a GPU. we just choose the first available device.
     cl_context_properties contextProperties[] =
     {
         CL_CONTEXT_PLATFORM,
-        (cl_context_properties)platformIds[1],
+        (cl_context_properties)platformIds[platform_id],
         0
     };
-    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
+    context = clCreateContextFromType(contextProperties, device_type,
                                       NULL, NULL, &errNum);
     if (errNum != CL_SUCCESS)
     {
-        std::cout << "Could not create GPU context, trying CPU..." << std::endl;
-        context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
-                                          NULL, NULL, &errNum);
-        if (errNum != CL_SUCCESS)
-        {
-            std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
-            return NULL;
-        }
+        std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
+        return NULL;
     }
-    
+
     return context;
 
 }
@@ -94,7 +89,7 @@ cl_context CreateContext()
 ///
 //  Create a command queue on the first device available on the context
 //
-cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *cldevice)
+cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *cldevice, int device_id)
 {
     cl_int errNum;
     cl_device_id *cldevices;
@@ -125,7 +120,7 @@ cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *cldevice)
         return NULL;
     }
 
-    commandQueue = clCreateCommandQueue(context, cldevices[0], 0, NULL);
+    commandQueue = clCreateCommandQueue(context, cldevices[device_id], 0, NULL);
     if (commandQueue == NULL)
     {
         delete [] cldevices;
@@ -133,7 +128,7 @@ cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *cldevice)
         return NULL;
     }
 
-    *cldevice = cldevices[0];
+    *cldevice = cldevices[device_id];
     delete [] cldevices;
     return commandQueue;
 }
@@ -213,7 +208,7 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
     // printf("Loading ipnput: %s\n", srcFilename);
     char path[] = "../../data/dwt2d/";
     char *newSrc = NULL;
-    
+
     if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
     {
         newSrc[0] = '\0';
@@ -225,8 +220,8 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
 
     //read image
     int i = open(srcFilename, O_RDONLY, 0644);
-    if (i == -1) 
-	{ 
+    if (i == -1)
+	{
         error(0,errno,"cannot access %s", srcFilename);
         return -1;
     }
@@ -251,7 +246,10 @@ void usage() {
   -r, --reverse\t\t\treverse transform\n\
   -9, --97\t\t\t9/7 transform\n\
   -5, --53\t\t\t5/3 transform\n\
-  -w  --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n");
+  -w  --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n\
+  -p  --platform_id\t\t\tOCL platform id to use\n\
+  -g  --use_gpu\t\t\t1 to use gpu, 0 to use cpu\n\
+  -i  --device_id\t\t\tOCL devicde id to use\n");
 }
 
 ///
@@ -331,7 +329,7 @@ void rgbToComponents(cl_mem d_r, cl_mem d_g, cl_mem d_b, unsigned char * h_src,
 {
     int pixels      = width * height;
     int alignedSize =  DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
-	
+
 	cl_mem cl_d_src;
 	cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels*3, h_src, &errNum);
 	// fatal_CL(errNum, __LINE__);
@@ -344,14 +342,14 @@ void rgbToComponents(cl_mem d_r, cl_mem d_g, cl_mem d_b, unsigned char * h_src,
 	errNum |= clSetKernelArg(c_CopySrcToComponents, 2, sizeof(cl_mem), &d_b);
 	errNum |= clSetKernelArg(c_CopySrcToComponents, 3, sizeof(cl_mem), &cl_d_src);
 	errNum |= clSetKernelArg(c_CopySrcToComponents, 4, sizeof(int), &pixels);
-	// fatal_CL(errNum, __LINE__);	
-	
+	// fatal_CL(errNum, __LINE__);
+
 	errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponents, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
 	// fatal_CL(errNum, __LINE__);
-	
-    // Free Memory 
-	errNum = clReleaseMemObject(cl_d_src);  
-	// fatal_CL(errNum, __LINE__);	
+
+    // Free Memory
+	errNum = clReleaseMemObject(cl_d_src);
+	// fatal_CL(errNum, __LINE__);
 }
 
 
@@ -364,27 +362,27 @@ void bwToComponent(cl_mem d_c, unsigned char * h_src, int width, int height)
 	cl_mem cl_d_src;
     int pixels      = width*height;
     int alignedSize =  DIVANDRND(pixels, THREADS) * THREADS;
-	
+
 	cl_d_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, pixels, h_src, NULL);
 	// fatal_CL(errNum, __LINE__);
-	
+
 	size_t globalWorkSize[1] = { alignedSize/9};
     size_t localWorkSize[1] = { THREADS };
 	assert(alignedSize%(THREADS*3) == 0);
-	
+
 	errNum  = clSetKernelArg(c_CopySrcToComponent, 0, sizeof(cl_mem), &d_c);
 	errNum |= clSetKernelArg(c_CopySrcToComponent, 1, sizeof(cl_mem), &cl_d_src);
 	errNum |= clSetKernelArg(c_CopySrcToComponent, 2, sizeof(int), &pixels);
-	// fatal_CL(errNum, __LINE__);	
-	
+	// fatal_CL(errNum, __LINE__);
+
 	errNum = clEnqueueNDRangeKernel(commandQueue, c_CopySrcToComponent, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
-	std::cout<<"in function bwToComponent errNum= "<<errNum<<"\n"; 
+	std::cout<<"in function bwToComponent errNum= "<<errNum<<"\n";
 	// fatal_CL(errNum, __LINE__);
 	std::cout<<"bwToComponent has finished\n";
-	
-    // Free Memory 
-	errNum = clReleaseMemObject(cl_d_src);  
-	// fatal_CL(errNum, __LINE__);	
+
+    // Free Memory
+	errNum = clReleaseMemObject(cl_d_src);
+	// fatal_CL(errNum, __LINE__);
 
 }
 
@@ -395,27 +393,27 @@ void bwToComponent(cl_mem d_c, unsigned char * h_src, int width, int height)
 /// @tparam WIN_SY  height of sliding window
 /// @param in       input image
 /// @param out      output buffer
-/// @param sx       width of the input image 
+/// @param sx       width of the input image
 /// @param sy       height of the input image
-///launchFDWT53Kerneld is in file 
+///launchFDWT53Kerneld is in file
 void launchFDWT53Kernel (int WIN_SX, int WIN_SY, cl_mem in, cl_mem out, int sx, int sy)
 {
     // compute optimal number of steps of each sliding window
 	// cuda_dwt called a function divRndUp from namespace cuda_gwt. this function takes n and d, "return (n / d) + ((n % d) ? 1 : 0);"
 	//
-	
-    const int steps = ( sy/ (15 * WIN_SY)) + ((sy % (15 * WIN_SY)) ? 1 : 0);	
-	
+
+    const int steps = ( sy/ (15 * WIN_SY)) + ((sy % (15 * WIN_SY)) ? 1 : 0);
+
 	int gx = ( sx/ WIN_SX) + ((sx %  WIN_SX) ? 1 : 0);  //use function divRndUp(n, d){return (n / d) + ((n % d) ? 1 : 0);}
 	int gy = ( sy/ (WIN_SY*steps)) + ((sy %  (WIN_SY*steps)) ? 1 : 0);
-	
+
 	printf("sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
-	
+
     // prepare grid size
 	size_t globalWorkSize[2] = { gx*WIN_SX, gy*1};
     size_t localWorkSize[2]  = { WIN_SX , 1};
     // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gx, gy, WIN_SX);
-	
+
 	errNum  = clSetKernelArg(kl_fdwt53Kernel, 0, sizeof(cl_mem), &in);
 	errNum |= clSetKernelArg(kl_fdwt53Kernel, 1, sizeof(cl_mem), &out);
 	errNum |= clSetKernelArg(kl_fdwt53Kernel, 2, sizeof(int), &sx);
@@ -424,11 +422,11 @@ void launchFDWT53Kernel (int WIN_SX, int WIN_SY, cl_mem in, cl_mem out, int sx,
 	errNum |= clSetKernelArg(kl_fdwt53Kernel, 5, sizeof(int), &WIN_SX);
 	errNum |= clSetKernelArg(kl_fdwt53Kernel, 6, sizeof(int), &WIN_SY);
 	// fatal_CL(errNum, __LINE__);
-	
+
 	errNum = clEnqueueNDRangeKernel(commandQueue, kl_fdwt53Kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
 	// fatal_CL(errNum, __LINE__);
 	printf("kl_fdwt53Kernel in launchFDW53Kernel has finished\n");
-	
+
 
 }
 
@@ -460,32 +458,32 @@ void memCopy (cl_mem dest,  cl_mem src, const size_t sx, const size_t sy){
 void fdwt53(cl_mem in, cl_mem out, int sizeX, int sizeY, int levels)
 {
     // select right width of kernel for the size of the image
-	
-    if(sizeX >= 960) 
+
+    if(sizeX >= 960)
 	{
       launchFDWT53Kernel(192, 8, in, out, sizeX, sizeY);
-    } 
-	else if (sizeX >= 480) 
+    }
+	else if (sizeX >= 480)
 	{
       launchFDWT53Kernel(128, 8, in, out, sizeX, sizeY);
-    } else 
+    } else
 	{
       launchFDWT53Kernel(64, 8, in, out, sizeX, sizeY);
 	}
-		
+
 	// if this was not the last level, continue recursively with other levels
 	if (levels > 1)
 	{
-		// copy output's LL band back into input buffer 
+		// copy output's LL band back into input buffer
 		const int llSizeX = (sizeX / 2) + ((sizeX % 2) ? 1 :0);
 		const int llSizeY = (sizeY / 2) + ((sizeY % 2) ? 1 :0);
 		memCopy(in, out, llSizeX, llSizeY);
-		
+
 		// run remaining levels of FDWT
 		fdwt53(in, out, llSizeX, llSizeY, levels - 1);
-	}	
+	}
 }
-	
+
 
 ///
 // in dwt.cu
@@ -493,17 +491,17 @@ int nStage2dDWT(cl_mem in, cl_mem out, cl_mem backup, int pixWidth, int pixHeigh
 {
     printf("\n*** %d stages of 2D forward DWT:\n", stages);
 
-    // create backup of input, because each test iteration overwrites it 
+    // create backup of input, because each test iteration overwrites it
     const int size = pixHeight * pixWidth * sizeof(int);
-	
-	// Measure time of individual levels. 
+
+	// Measure time of individual levels.
 	if (forward)
 		fdwt53(in, out, pixWidth, pixHeight, stages );
 	//else
 	//	rdwt(in, out, pixWidth, pixHeight, stages);
-	// rdwt means rdwt53(can be found in file rdwt53.cu) which has not been defined 
-	
-	
+	// rdwt means rdwt53(can be found in file rdwt53.cu) which has not been defined
+
+
 	return 0;
 }
 
@@ -519,7 +517,7 @@ void samplesToChar(unsigned char * dst, int * src, int samplesNum)
 	{
         int r = src[i]+128;
         if (r > 255) r = 255;
-        if (r < 0)   r = 0; 
+        if (r < 0)   r = 0;
         dst[i] = (unsigned char)r;
     }
 }
@@ -536,33 +534,33 @@ int writeLinear(cl_mem component, int pixWidth, int pixHeight, const char * file
     int i;
     int size;
     int samplesNum = pixWidth*pixHeight;
-	
+
 	size = samplesNum*sizeof(int);
 	gpu_output = (int *)malloc(size);
     memset(gpu_output, 0, size);
 	result = (unsigned char *)malloc(samplesNum);
-	
+
 	errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, gpu_output, 0, NULL, NULL);
-	// fatal_CL(errNum, __LINE__);	
-	
-	// T to char 
+	// fatal_CL(errNum, __LINE__);
+
+	// T to char
 	samplesToChar(result, gpu_output, samplesNum);
-	
-	// Write component 
+
+	// Write component
 	char outfile[strlen(filename)+strlen(suffix)];
     strcpy(outfile, filename);
     strcpy(outfile+strlen(filename), suffix);
     i = open(outfile, O_CREAT|O_WRONLY, 0644);
-	if (i == -1) 
-	{ 
+	if (i == -1)
+	{
         error(0,errno,"cannot access %s", outfile);
         return -1;
     }
 	printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
     write(i, result, samplesNum);
     close(i);
-	
-	// Clean up 
+
+	// Clean up
     free(gpu_output);
     free(result);
 
@@ -572,12 +570,12 @@ int writeLinear(cl_mem component, int pixWidth, int pixHeight, const char * file
 
 
 ///
-// Write output visual ordered 
+// Write output visual ordered
 //in file dwt.cu
 int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix)
 {
 	struct band {
-        int dimX; 
+        int dimX;
         int dimY;
     };
     struct dimensions {
@@ -596,7 +594,7 @@ int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages,
     int yOffset;
     int samplesNum = pixWidth*pixHeight;
     struct dimensions * bandDims;
-	
+
 	bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
 
     bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
@@ -607,8 +605,8 @@ int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages,
     bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
     bandDims[0].HH.dimX = bandDims[0].HL.dimX;
     bandDims[0].HH.dimY = bandDims[0].LH.dimY;
-	
-	for (i = 1; i < stages; i++) 
+
+	for (i = 1; i < stages; i++)
 	{
         bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
         bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
@@ -619,10 +617,10 @@ int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages,
         bandDims[i].HH.dimX = bandDims[i].HL.dimX;
         bandDims[i].HH.dimY = bandDims[i].LH.dimY;
     }
-	
+
 #if 0
     printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
-    for (i = 0; i < stages; i++) 
+    for (i = 0; i < stages; i++)
 	{
         printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
         printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
@@ -631,8 +629,8 @@ int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages,
     }
 #endif
 
-	size = samplesNum*sizeof(int);	
-	
+	size = samplesNum*sizeof(int);
+
 	src = (int *)malloc(size);
     memset(src, 0, size);
 	dst = (int *)malloc(size);
@@ -640,24 +638,24 @@ int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages,
 	result = (unsigned char *)malloc(samplesNum);
 
 	errNum = clEnqueueReadBuffer(commandQueue, component, CL_TRUE, 0, size, src, 0, NULL, NULL);
-	// fatal_CL(errNum, __LINE__);	
-	
+	// fatal_CL(errNum, __LINE__);
+
 
-	// LL Band 	
+	// LL Band
 	size = bandDims[stages-1].LL.dimX * sizeof(int);
-	for (i = 0; i < bandDims[stages-1].LL.dimY; i++) 
+	for (i = 0; i < bandDims[stages-1].LL.dimY; i++)
 	{
         memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
 	}
-    
+
     for (s = stages - 1; s >= 0; s--) {
         // HL Band
         size = bandDims[s].HL.dimX * sizeof(int);
         offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
-        for (i = 0; i < bandDims[s].HL.dimY; i++) 
+        for (i = 0; i < bandDims[s].HL.dimY; i++)
 		{
             memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
-                src+offset+i*bandDims[s].HL.dimX, 
+                src+offset+i*bandDims[s].HL.dimX,
                 size);
         }
 
@@ -665,48 +663,48 @@ int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages,
 		size = bandDims[s].LH.dimX * sizeof(int);
         offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
         yOffset = bandDims[s].LL.dimY;
-        for (i = 0; i < bandDims[s].HL.dimY; i++) 
+        for (i = 0; i < bandDims[s].HL.dimY; i++)
 		{
             memcpy(dst+(yOffset+i)*pixWidth,
-                src+offset+i*bandDims[s].LH.dimX, 
+                src+offset+i*bandDims[s].LH.dimX,
                 size);
         }
-	
+
 		//HH band
         size = bandDims[s].HH.dimX * sizeof(int);
         offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
         yOffset = bandDims[s].HL.dimY;
-        for (i = 0; i < bandDims[s].HH.dimY; i++) 
+        for (i = 0; i < bandDims[s].HH.dimY; i++)
 		{
             memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
-                src+offset+i*bandDims[s].HH.dimX, 
+                src+offset+i*bandDims[s].HH.dimX,
                 size);
         }
 	}
-	
+
     // Write component
-	samplesToChar(result, dst, samplesNum);	
-	
+	samplesToChar(result, dst, samplesNum);
+
 	char outfile[strlen(filename)+strlen(suffix)];
     strcpy(outfile, filename);
     strcpy(outfile+strlen(filename), suffix);
     i = open(outfile, O_CREAT|O_WRONLY, 0644);
-	
-    if (i == -1) 
+
+    if (i == -1)
 	{
         error(0,errno,"cannot access %s", outfile);
         return -1;
     }
-	
+
     printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
     write(i, result, samplesNum);
     close(i);
-	
+
 	free(src);
 	free(dst);
     free(result);
     free(bandDims);
-	
+
 	return 0;
 }
 
@@ -719,58 +717,58 @@ int writeNStage2DDWT(cl_mem component, int pixWidth, int pixHeight, int stages,
 template <typename T>
 void processDWT(struct dwt *d, int forward, int writeVisual)
 {
-	
+
 	int componentSize = d->pixWidth * d->pixHeight * sizeof(T);
-    
+
     T *c_r_out, *c_g_out, *c_b_out, *backup, *c_r, *c_g, *c_b;
-	
+
 	// initialize to zeros
 	T *temp = (T *)malloc(componentSize);
 	memset(temp, 0, componentSize);
-	
+
 	cl_mem cl_c_r_out;
 	cl_c_r_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
 	// fatal_CL(errNum, __LINE__);
-	
-	
-	
+
+
+
 	cl_mem cl_backup;
-	cl_backup  = clCreateBuffer(context, CL_MEM_READ_WRITE |CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);  
+	cl_backup  = clCreateBuffer(context, CL_MEM_READ_WRITE |CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
 	// fatal_CL(errNum, __LINE__);
-	
+
 	if (d->components == 3) {
-		// Alloc two more buffers for G and B 
+		// Alloc two more buffers for G and B
 		cl_mem cl_c_g_out;
-		cl_c_g_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);  
+		cl_c_g_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
 		// fatal_CL(errNum, __LINE__);
-		
+
 		cl_mem cl_c_b_out;
 		cl_c_b_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, componentSize, temp, &errNum);
-        // fatal_CL(errNum, __LINE__);	
-        
-        // Load components 
+        // fatal_CL(errNum, __LINE__);
+
+        // Load components
         cl_mem cl_c_r, cl_c_g, cl_c_b;
 		cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
-		// fatal_CL(errNum, __LINE__); 		
+		// fatal_CL(errNum, __LINE__);
 		cl_c_g = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
-		// fatal_CL(errNum, __LINE__);		
-		cl_c_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum); 
 		// fatal_CL(errNum, __LINE__);
-		
-		
+		cl_c_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
+		// fatal_CL(errNum, __LINE__);
+
+
         rgbToComponents(cl_c_r, cl_c_g, cl_c_b, d->srcImg, d->pixWidth, d->pixHeight);
-       
-        
+
+
         //Compute DWT and always store int file
-       
+
         nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
         nStage2dDWT(cl_c_g, cl_c_g_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
         nStage2dDWT(cl_c_b, cl_c_b_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-        
-        
+
+
         // ---------test----------
 /*      T *h_r_out=(T*)malloc(componentSize);
-		errNum = clEnqueueReadBuffer(commandQueue, cl_c_g_out, CL_TRUE, 0, componentSize, h_r_out, 0, NULL, NULL); 
+		errNum = clEnqueueReadBuffer(commandQueue, cl_c_g_out, CL_TRUE, 0, componentSize, h_r_out, 0, NULL, NULL);
 		fatal_CL(errNum, __LINE__);
         int ii;
 		for(ii=0;ii<componentSize/sizeof(T);ii++) {
@@ -779,48 +777,48 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
         }
 */        // ---------test----------
 
-#ifdef OUTPUT        
+#ifdef OUTPUT
         // Store DWT to file
         if(writeVisual){
             writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
             writeNStage2DDWT(cl_c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
             writeNStage2DDWT(cl_c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
-        
+
         } else {
             writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
             writeLinear(cl_c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
             writeLinear(cl_c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
         }
-#endif		
-		
+#endif
+
 		clReleaseMemObject(cl_c_r);
 		clReleaseMemObject(cl_c_g);
 		clReleaseMemObject(cl_c_b);
 		clReleaseMemObject(cl_c_g_out);
 		clReleaseMemObject(cl_c_b_out);
 
-	} else if(d->components == 1) { 
-        // Load components 
+	} else if(d->components == 1) {
+        // Load components
         cl_mem cl_c_r;
 		cl_c_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,componentSize, temp, &errNum);
-		// fatal_CL(errNum, __LINE__); 		
-        
+		// fatal_CL(errNum, __LINE__);
+
         bwToComponent(cl_c_r, d->srcImg, d->pixWidth, d->pixHeight);
 
         // Compute DWT
         nStage2dDWT(cl_c_r, cl_c_r_out, cl_backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-    
+
         //Store DWT to file
         if(writeVisual){
             writeNStage2DDWT(cl_c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
         } else {
             writeLinear(cl_c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
         }
-		
-		
+
+
 		clReleaseMemObject(cl_c_r);
 
-    } 
+    }
 
 	free(temp);
 	clReleaseMemObject(cl_c_r_out);
@@ -828,11 +826,11 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
 
 
 
-int main(int argc, char **argv) 
+int main(int argc, char **argv)
 {
     int optindex = 0;
     signed char ch;
-    struct option longopts[] = 
+    struct option longopts[] =
 	{
         {"dimension",   required_argument, 0, 'd'}, //dimensions of src img
         {"components",  required_argument, 0, 'c'}, //numger of components of src img
@@ -844,27 +842,33 @@ int main(int argc, char **argv)
         {"97",          no_argument,       0, '9'}, //9/7 transform
         {"53",          no_argument,       0, '5' }, //5/3transform
         {"write-visual",no_argument,       0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
-        {"help",        no_argument,       0, 'h'}  
+        {"help",        no_argument,       0, 'h'},
+        {"platform_id", required_argument, 0, 'p'},
+        {"use_gpu",     required_argument, 0, 'g'},
+        {"device_id",   required_argument, 0, 'i'}
     };
-    
+
     int pixWidth    = 0; //<real pixWidth
     int pixHeight   = 0; //<real pixHeight
     int compCount   = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
-    int bitDepth    = 8; 
+    int bitDepth    = 8;
     int dwtLvls     = 3; //default numuber of DWT levels
     int device      = 0;
     int forward     = 1; //forward transform
     int dwt97       = 0; //1=dwt9/7, 0=dwt5/3 transform
     int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
-    char * pos;
- 
-    while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) 
+    char * pos;
+    int platform_id = 0;
+    int device_id   = 0;
+    int use_gpu     = 0;
+
+    while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95whp:g:i:", longopts, &optindex)) != -1)
 	{
         switch (ch) {
         case 'd':
             pixWidth = atoi(optarg);
             pos = strstr(optarg, "x");
-            if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) 
+            if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg)))
 			{
                 usage();
                 return -1;
@@ -897,6 +901,15 @@ int main(int argc, char **argv)
             break;
         case 'w':
             writeVisual = 1;
+            break;
+        case 'p':
+            platform_id = atoi(optarg);
+            break;
+        case 'g':
+            use_gpu = atoi(optarg);
+            break;
+        case 'i':
+            device_id = atoi(optarg);
             break;
         case 'h':
             usage();
@@ -911,31 +924,31 @@ int main(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-    if (argc == 0) 
+    if (argc == 0)
 	{ // at least one filename is expected
         printf("Please supply src file name\n");
         usage();
         return -1;
     }
 
-    if (pixWidth <= 0 || pixHeight <=0) 
+    if (pixWidth <= 0 || pixHeight <=0)
 	{
         printf("Wrong or missing dimensions\n");
         usage();
         return -1;
     }
 
-    if (forward == 0) 
+    if (forward == 0)
 	{
         writeVisual = 0; //do not write visual when RDWT
     }
-	
-	
-	
+
+
+
 	//
 	// device init
 	// Create an OpenCL context on first available platform
-    context = CreateContext();
+    context = CreateContext(platform_id, use_gpu);
     if (context == NULL)
     {
         std::cerr << "Failed to create OpenCL context." << std::endl;
@@ -944,13 +957,13 @@ int main(int argc, char **argv)
 
     // Create a command-queue on the first device available
     // on the created context
-    commandQueue = CreateCommandQueue(context, &cldevice);
+    commandQueue = CreateCommandQueue(context, &cldevice, device_id);
     if (commandQueue == NULL)
     {
         Cleanup(context, commandQueue, program, kernel);
         return 1;
     }
-	
+
 	// Create OpenCL program from com_dwt.cl kernel source
 	program = CreateProgram(context, cldevice, "com_dwt.cl");
     if (program == NULL)
@@ -959,26 +972,26 @@ int main(int argc, char **argv)
     }
 
 	// Create OpenCL kernel
-	c_CopySrcToComponents = clCreateKernel(program, "c_CopySrcToComponents", NULL); 
+	c_CopySrcToComponents = clCreateKernel(program, "c_CopySrcToComponents", NULL);
 	if (c_CopySrcToComponents == NULL)
     {
         std::cerr << "Failed to create kernel" << std::endl;
     }
 
-	c_CopySrcToComponent = clCreateKernel(program, "c_CopySrcToComponent", NULL); 
+	c_CopySrcToComponent = clCreateKernel(program, "c_CopySrcToComponent", NULL);
 	if (c_CopySrcToComponent == NULL)
     {
         std::cerr << "Failed to create kernel" << std::endl;
     }
-	
-	kl_fdwt53Kernel = clCreateKernel(program, "cl_fdwt53Kernel", NULL); 
+
+	kl_fdwt53Kernel = clCreateKernel(program, "cl_fdwt53Kernel", NULL);
     if (kl_fdwt53Kernel == NULL)
 	{
 		std::cerr<<"Failed to create kernel\n";
 	}
-	
 
-	
+
+
 	//initialize struct dwt
 	struct dwt *d;
     d = (struct dwt *)malloc(sizeof(struct dwt));
@@ -987,11 +1000,11 @@ int main(int argc, char **argv)
     d->pixHeight = pixHeight;
     d->components = compCount;
     d->dwtLvls  = dwtLvls;
-	
+
 	// file names
     d->srcFilename = (char *)malloc(strlen(argv[0]));
     strcpy(d->srcFilename, argv[0]);
-    if (argc == 1) 
+    if (argc == 1)
 	{ // only one filename supplyed
         d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
         strcpy(d->outFilename, d->srcFilename);
@@ -1008,38 +1021,38 @@ int main(int argc, char **argv)
     printf(" DWT levels:\t\t%d\n", dwtLvls);
     printf(" Forward transform:\t%d\n", forward);
     printf(" 9/7 transform:\t\t%d\n", dwt97);
-    
+
     //data sizes
     int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
 
     //load img source image
 	d->srcImg = (unsigned char *) malloc (inputSize);
-	if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) 
+	if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
         return -1;
-		
+
 	 // DWT
 	// Create memory objects, Set arguments for kernel functions, Queue the kernel up for execution across the array, Read the output buffer back to the Host, Output the result buffer
-	
-    if (forward == 1) 
+
+    if (forward == 1)
 	{
         if(dwt97 == 1 )
             processDWT<float>(d, forward, writeVisual);
         else // 5/3
             processDWT<int>(d, forward, writeVisual);
     }
-    else 
+    else
 	{ // reverse
         if(dwt97 == 1 )
             processDWT<float>(d, forward, writeVisual);
         else // 5/3
             processDWT<int>(d, forward, writeVisual);
     }
-	
+
 
 	Cleanup(context, commandQueue, program, kernel);
 	clReleaseKernel(c_CopySrcToComponents);
 	clReleaseKernel(c_CopySrcToComponent);
-	
+
     return 0;
-	
+
 }

+ 1 - 1
opencl/dwt2d/run-cpu

@@ -1 +1 @@
-./dwt2d -d 1024x1024 -f -5 -l 333 ../../data/dwt2d/rgb.bmp
+./dwt2d -d 1024x1024 -f -5 -l 333 -p 1 -i 0 -g 0 ../../data/dwt2d/rgb.bmp

+ 1 - 0
opencl/dwt2d/run-gpu

@@ -0,0 +1 @@
+./dwt2d -d 1024x1024 -f -5 -l 333 -p 0 -i 0 -g 1 ../../data/dwt2d/rgb.bmp