浏览代码

Implemented device selection for backdrop

We now use parameters as the way to select OpenCL platform, device and
type instead of having the hardcoded in the code. In this way we can
easily change the type of the benchmark without recompiling it.

Reused part of the available routine to parse command line arguments
Andrea Gussoni 8 年之前
父节点
当前提交
89a204fd5a
共有 5 个文件被更改,包括 92 次插入62 次删除
  1. 3 3
      opencl/backprop/backprop.h
  2. 47 48
      opencl/backprop/backprop_ocl.cpp
  3. 40 10
      opencl/backprop/facetrain.c
  4. 1 1
      opencl/backprop/run-cpu
  5. 1 0
      opencl/backprop/run-gpu

+ 3 - 3
opencl/backprop/backprop.h

@@ -3,7 +3,7 @@
 
 
 #define BIGRND 0x7fffffff
 #define BIGRND 0x7fffffff
 #define THREADS 256
 #define THREADS 256
-#define WIDTH 16  // shared memory width  
+#define WIDTH 16  // shared memory width
 #define HEIGHT 16 // shared memory height
 #define HEIGHT 16 // shared memory height
 #define BLOCK_SIZE 16
 #define BLOCK_SIZE 16
 
 
@@ -52,10 +52,10 @@ void bpnn_save(BPNN *net, char *filename);
 //BPNN *bpnn_read();
 //BPNN *bpnn_read();
 BPNN *bpnn_read(char *filename);
 BPNN *bpnn_read(char *filename);
 void load(BPNN *net);
 void load(BPNN *net);
-int bpnn_train_kernel(BPNN *net, float *eo, float *eh);
+int bpnn_train_kernel(BPNN *net, float *eo, float *eh, int platform_num, int device_num, int use_gpu);
 void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
 void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
 void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
 void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
-void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err); 
+void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err);
 void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
 void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
 int setup(int argc, char** argv);
 int setup(int argc, char** argv);
 float **alloc_2d_dbl(int m, int n);
 float **alloc_2d_dbl(int m, int n);

+ 47 - 48
opencl/backprop/backprop_ocl.cpp

@@ -8,7 +8,7 @@
 
 
 #ifdef NV //NVIDIA
 #ifdef NV //NVIDIA
 	#include <oclUtils.h>
 	#include <oclUtils.h>
-#else 
+#else
 	#include <CL/cl.h>
 	#include <CL/cl.h>
 #endif
 #endif
 
 
@@ -22,7 +22,7 @@ static cl_device_type   device_type;
 static cl_device_id   * device_list;
 static cl_device_id   * device_list;
 static cl_int           num_devices;
 static cl_int           num_devices;
 
 
-static int initialize(int use_gpu)
+static int initialize(int platform_num, int device_num, int use_gpu)
 {
 {
 	cl_int result;
 	cl_int result;
 	size_t size;
 	size_t size;
@@ -32,11 +32,11 @@ static int initialize(int use_gpu)
 
 
 	// create OpenCL context
 	// create OpenCL context
 	clGetPlatformIDs(0, NULL, &platformCount);
 	clGetPlatformIDs(0, NULL, &platformCount);
-    	platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
-    	clGetPlatformIDs(platformCount, platforms_ids, NULL);
+  platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
+  clGetPlatformIDs(platformCount, platforms_ids, NULL);
 
 
 	if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
 	if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
-	cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[1], 0};
+	cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[platform_num], 0};
 	device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
 	device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
 	context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
 	context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
 	if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
 	if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
@@ -45,7 +45,7 @@ static int initialize(int use_gpu)
 	result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
 	result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
 	num_devices = (int) (size / sizeof(cl_device_id));
 	num_devices = (int) (size / sizeof(cl_device_id));
 	printf("num_devices = %d\n", num_devices);
 	printf("num_devices = %d\n", num_devices);
-	
+
 	if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
 	if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
 	device_list = new cl_device_id[num_devices];
 	device_list = new cl_device_id[num_devices];
 	//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
 	//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
@@ -54,7 +54,7 @@ static int initialize(int use_gpu)
 	if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
 	if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
 
 
 	// create command queue for the first device
 	// create command queue for the first device
-	cmd_queue = clCreateCommandQueue( context, device_list[0], 0, NULL );
+	cmd_queue = clCreateCommandQueue( context, device_list[device_num], 0, NULL );
 	if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
 	if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
 	return 0;
 	return 0;
 }
 }
@@ -89,38 +89,37 @@ unsigned int num_blocks = 0;
 // Program main
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 int
 int
-main( int argc, char** argv) 
+main( int argc, char** argv)
 {
 {
 	setup(argc, argv);
 	setup(argc, argv);
 }
 }
 
 
 
 
 
 
-int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
+int bpnn_train_kernel(BPNN *net, float *eo, float *eh, int platform_num, int device_num, int use_gpu)
 {
 {
 	int in, hid, out;
 	int in, hid, out;
 	float out_err, hid_err;
 	float out_err, hid_err;
-  
+
 	in = net->input_n;
 	in = net->input_n;
 	hid = net->hidden_n;
 	hid = net->hidden_n;
-	out = net->output_n;   
-   
+	out = net->output_n;
+
 	int sourcesize = 1024*1024;
 	int sourcesize = 1024*1024;
-	char * source = (char *)calloc(sourcesize, sizeof(char)); 
+	char * source = (char *)calloc(sourcesize, sizeof(char));
 	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
 	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
 
 
 	// read the kernel core source
 	// read the kernel core source
 	char * kernel_bp1  = "bpnn_layerforward_ocl";
 	char * kernel_bp1  = "bpnn_layerforward_ocl";
 	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
 	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
 	char * tempchar = "./backprop_kernel.cl";
 	char * tempchar = "./backprop_kernel.cl";
-	FILE * fp = fopen(tempchar, "rb"); 
+	FILE * fp = fopen(tempchar, "rb");
 	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
 	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
 	fread(source + strlen(source), sourcesize, 1, fp);
 	fread(source + strlen(source), sourcesize, 1, fp);
 	fclose(fp);
 	fclose(fp);
-	
-	int use_gpu = 0;
-	if(initialize(use_gpu)) return -1;
-	
+
+	if(initialize(platform_num, device_num, use_gpu)) return -1;
+
 	// compile kernel
 	// compile kernel
 	cl_int err = 0;
 	cl_int err = 0;
 	const char * slist[2] = { source, 0 };
 	const char * slist[2] = { source, 0 };
@@ -135,46 +134,46 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
 		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
 	}
 	}
 	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
 	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
-    	
+
 	cl_kernel kernel1;
 	cl_kernel kernel1;
 	cl_kernel kernel2;
 	cl_kernel kernel2;
-	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
-	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
+	kernel1 = clCreateKernel(prog, kernel_bp1, &err);
+	kernel2 = clCreateKernel(prog, kernel_bp2, &err);
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
 	clReleaseProgram(prog);
 	clReleaseProgram(prog);
-	
+
 	float *input_weights_one_dim;
 	float *input_weights_one_dim;
     float *input_weights_prev_one_dim;
     float *input_weights_prev_one_dim;
 	float * partial_sum;
 	float * partial_sum;
 	float sum;
 	float sum;
 	float num_blocks = in / BLOCK_SIZE;
 	float num_blocks = in / BLOCK_SIZE;
-	
+
 	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
 	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
 	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
 	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
 	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
 	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
-	
+
 	// set global and local workitems
 	// set global and local workitems
-	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
+	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
 	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
 	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
-	
+
 	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
 	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
 	// todo: fix mem allocation
 	// todo: fix mem allocation
 	int m = 0;
 	int m = 0;
-	for (int k = 0; k <= in; k++) {	
+	for (int k = 0; k <= in; k++) {
 		for (int j = 0; j <= hid; j++) {
 		for (int j = 0; j <= hid; j++) {
 		input_weights_one_dim[m] = net->input_weights[k][j];
 		input_weights_one_dim[m] = net->input_weights[k][j];
 		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
 		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
 	    m++;
 	    m++;
 		}
 		}
 	}
 	}
-	
+
 	cl_mem input_hidden_ocl;
 	cl_mem input_hidden_ocl;
 	cl_mem input_ocl;
 	cl_mem input_ocl;
 	cl_mem output_hidden_ocl;
 	cl_mem output_hidden_ocl;
 	cl_mem hidden_partial_sum;
 	cl_mem hidden_partial_sum;
 	cl_mem hidden_delta_ocl;
 	cl_mem hidden_delta_ocl;
 	cl_mem input_prev_weights_ocl;
 	cl_mem input_prev_weights_ocl;
-  
+
 	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
 	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
 	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
 	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
@@ -187,15 +186,15 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
 	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
 	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
-		
+
 	printf("Performing GPU computation\n");
 	printf("Performing GPU computation\n");
-	
+
 	//write buffers
 	//write buffers
 	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
 	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
 	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
 	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
- 
+
 	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
 	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
 	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
 	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
 	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
 	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
@@ -204,26 +203,26 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
 	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
 	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
 	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
 	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
 	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
-  
+
 	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, 0);
 	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
+
 	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
 	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }
+
 	for (int j = 1; j <= hid; j++) {
 	for (int j = 1; j <= hid; j++) {
 		sum = 0.0;
 		sum = 0.0;
-		for (int k = 0; k < num_blocks; k++) {	
+		for (int k = 0; k < num_blocks; k++) {
 		sum += partial_sum[k * hid + j-1] ;
 		sum += partial_sum[k * hid + j-1] ;
     }
     }
 		sum += net->input_weights[0][j];
 		sum += net->input_weights[0][j];
 		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
 		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
 	}
 	}
 
 
-	
+
 	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
 	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
 	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
 	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
-	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
+	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
 	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
 	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
 
 
 	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
 	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
@@ -232,28 +231,28 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
 	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
 	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
-  
+
 	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
 	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
 	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
 	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
 	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
 	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
 	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
 	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
 	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
 	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
 	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
 	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
-  
+
 	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
 	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
+
 	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
 	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }
 	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
 	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
+
 	clReleaseMemObject(input_ocl);
 	clReleaseMemObject(input_ocl);
 	clReleaseMemObject(output_hidden_ocl);
 	clReleaseMemObject(output_hidden_ocl);
 	clReleaseMemObject(input_hidden_ocl);
 	clReleaseMemObject(input_hidden_ocl);
 	clReleaseMemObject(hidden_partial_sum);
 	clReleaseMemObject(hidden_partial_sum);
 	clReleaseMemObject(input_prev_weights_ocl);
 	clReleaseMemObject(input_prev_weights_ocl);
-  
+
 	free(input_weights_prev_one_dim);
 	free(input_weights_prev_one_dim);
 	free(partial_sum);
 	free(partial_sum);
 	free(input_weights_one_dim);
 	free(input_weights_one_dim);

+ 40 - 10
opencl/backprop/facetrain.c

@@ -1,6 +1,7 @@
+#include <math.h>
 #include <stdio.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdlib.h>
-#include <math.h>
+#include <unistd.h>
 #include "backprop.h"
 #include "backprop.h"
 #include "omp.h"
 #include "omp.h"
 
 
@@ -8,6 +9,9 @@ extern char *strcpy();
 extern void exit();
 extern void exit();
 
 
 int layer_size = 0;
 int layer_size = 0;
+int platform_id = 0;
+int device_id = 0;
+int use_gpu = 0;
 
 
 void backprop_face()
 void backprop_face()
 {
 {
@@ -15,33 +19,59 @@ void backprop_face()
   int i;
   int i;
   float out_err, hid_err;
   float out_err, hid_err;
   net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
   net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
-  
+
   printf("Input layer size : %d\n", layer_size);
   printf("Input layer size : %d\n", layer_size);
   load(net);
   load(net);
   //entering the training kernel, only one iteration
   //entering the training kernel, only one iteration
   printf("Starting training kernel\n");
   printf("Starting training kernel\n");
-  bpnn_train_kernel(net, &out_err, &hid_err);
+  bpnn_train_kernel(net, &out_err, &hid_err, platform_id, device_id, use_gpu);
   bpnn_free(net);
   bpnn_free(net);
   printf("\nFinish the training for one iteration\n");
   printf("\nFinish the training for one iteration\n");
 }
 }
 
 
+void Usage(char *argv0){
+
+  char *help =
+  "\nUsage: %s [switches] \n\n"
+  "    -l               :layer size                     \n"
+  "    -p platform_id   :OCL platform to use [default=0]\n"
+  "    -d device_id     :OCL device to use   [default=0]\n"
+  "    -g use_gpu       :1 for GPU 0 for CPU [default=0]\n";
+  fprintf(stderr, help, argv0);
+  exit(-1);
+
+}
+
 int setup(int argc, char **argv)
 int setup(int argc, char **argv)
 {
 {
-	
+
   int seed;
   int seed;
 
 
-  if (argc!=2){
-  fprintf(stderr, "usage: backprop <num of input elements>\n");
-  exit(0);
+  int opt;
+  extern char *optarg;
+  while ((opt=getopt(argc,argv,"l:p:d:g:"))!= EOF) {
+      switch (opt) {
+          case 'p': platform_id = atoi(optarg);
+                    break;
+          case 'd': device_id = atoi(optarg);
+                    break;
+          case 'g': use_gpu = atoi(optarg);
+                    break;
+          case 'l': layer_size = atoi(optarg);
+                    break;
+          case '?': Usage(argv[0]);
+                    break;
+          default:  Usage(argv[0]);
+                    break;
+      }
   }
   }
-  layer_size = atoi(argv[1]);
+
   if (layer_size%16!=0){
   if (layer_size%16!=0){
   fprintf(stderr, "The number of input points must be divided by 16\n");
   fprintf(stderr, "The number of input points must be divided by 16\n");
   exit(0);
   exit(0);
   }
   }
-  
 
 
-  seed = 7;   
+  seed = 7;
   bpnn_initialize(seed);
   bpnn_initialize(seed);
   backprop_face();
   backprop_face();
 
 

+ 1 - 1
opencl/backprop/run-cpu

@@ -1 +1 @@
-./backprop 1048576
+./backprop -l 1048576 -p 1 -d 0 -g 0

+ 1 - 0
opencl/backprop/run-gpu

@@ -0,0 +1 @@
+./backprop -l 1048576 -p 0 -d 0 -g 1