Browse Source

Implemented device selection for backdrop

We now use parameters as the way to select OpenCL platform, device and
type instead of having the hardcoded in the code. In this way we can
easily change the type of the benchmark without recompiling it.

Reused part of the available routine to parse command line arguments
Andrea Gussoni 8 years ago
parent
commit
89a204fd5a

+ 3 - 3
opencl/backprop/backprop.h

@@ -3,7 +3,7 @@
 
 #define BIGRND 0x7fffffff
 #define THREADS 256
-#define WIDTH 16  // shared memory width  
+#define WIDTH 16  // shared memory width
 #define HEIGHT 16 // shared memory height
 #define BLOCK_SIZE 16
 
@@ -52,10 +52,10 @@ void bpnn_save(BPNN *net, char *filename);
 //BPNN *bpnn_read();
 BPNN *bpnn_read(char *filename);
 void load(BPNN *net);
-int bpnn_train_kernel(BPNN *net, float *eo, float *eh);
+int bpnn_train_kernel(BPNN *net, float *eo, float *eh, int platform_num, int device_num, int use_gpu);
 void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
 void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
-void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err); 
+void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err);
 void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
 int setup(int argc, char** argv);
 float **alloc_2d_dbl(int m, int n);

+ 47 - 48
opencl/backprop/backprop_ocl.cpp

@@ -8,7 +8,7 @@
 
 #ifdef NV //NVIDIA
 	#include <oclUtils.h>
-#else 
+#else
 	#include <CL/cl.h>
 #endif
 
@@ -22,7 +22,7 @@ static cl_device_type   device_type;
 static cl_device_id   * device_list;
 static cl_int           num_devices;
 
-static int initialize(int use_gpu)
+static int initialize(int platform_num, int device_num, int use_gpu)
 {
 	cl_int result;
 	size_t size;
@@ -32,11 +32,11 @@ static int initialize(int use_gpu)
 
 	// create OpenCL context
 	clGetPlatformIDs(0, NULL, &platformCount);
-    	platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
-    	clGetPlatformIDs(platformCount, platforms_ids, NULL);
+  platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
+  clGetPlatformIDs(platformCount, platforms_ids, NULL);
 
 	if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
-	cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[1], 0};
+	cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[platform_num], 0};
 	device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
 	context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
 	if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
@@ -45,7 +45,7 @@ static int initialize(int use_gpu)
 	result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
 	num_devices = (int) (size / sizeof(cl_device_id));
 	printf("num_devices = %d\n", num_devices);
-	
+
 	if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
 	device_list = new cl_device_id[num_devices];
 	//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
@@ -54,7 +54,7 @@ static int initialize(int use_gpu)
 	if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
 
 	// create command queue for the first device
-	cmd_queue = clCreateCommandQueue( context, device_list[0], 0, NULL );
+	cmd_queue = clCreateCommandQueue( context, device_list[device_num], 0, NULL );
 	if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
 	return 0;
 }
@@ -89,38 +89,37 @@ unsigned int num_blocks = 0;
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
 int
-main( int argc, char** argv) 
+main( int argc, char** argv)
 {
 	setup(argc, argv);
 }
 
 
 
-int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
+int bpnn_train_kernel(BPNN *net, float *eo, float *eh, int platform_num, int device_num, int use_gpu)
 {
 	int in, hid, out;
 	float out_err, hid_err;
-  
+
 	in = net->input_n;
 	hid = net->hidden_n;
-	out = net->output_n;   
-   
+	out = net->output_n;
+
 	int sourcesize = 1024*1024;
-	char * source = (char *)calloc(sourcesize, sizeof(char)); 
+	char * source = (char *)calloc(sourcesize, sizeof(char));
 	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
 
 	// read the kernel core source
 	char * kernel_bp1  = "bpnn_layerforward_ocl";
 	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
 	char * tempchar = "./backprop_kernel.cl";
-	FILE * fp = fopen(tempchar, "rb"); 
+	FILE * fp = fopen(tempchar, "rb");
 	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
 	fread(source + strlen(source), sourcesize, 1, fp);
 	fclose(fp);
-	
-	int use_gpu = 0;
-	if(initialize(use_gpu)) return -1;
-	
+
+	if(initialize(platform_num, device_num, use_gpu)) return -1;
+
 	// compile kernel
 	cl_int err = 0;
 	const char * slist[2] = { source, 0 };
@@ -135,46 +134,46 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
 	}
 	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
-    	
+
 	cl_kernel kernel1;
 	cl_kernel kernel2;
-	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
-	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
+	kernel1 = clCreateKernel(prog, kernel_bp1, &err);
+	kernel2 = clCreateKernel(prog, kernel_bp2, &err);
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
 	clReleaseProgram(prog);
-	
+
 	float *input_weights_one_dim;
     float *input_weights_prev_one_dim;
 	float * partial_sum;
 	float sum;
 	float num_blocks = in / BLOCK_SIZE;
-	
+
 	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
 	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
 	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
-	
+
 	// set global and local workitems
-	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
+	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
 	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
-	
+
 	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
 	// todo: fix mem allocation
 	int m = 0;
-	for (int k = 0; k <= in; k++) {	
+	for (int k = 0; k <= in; k++) {
 		for (int j = 0; j <= hid; j++) {
 		input_weights_one_dim[m] = net->input_weights[k][j];
 		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
 	    m++;
 		}
 	}
-	
+
 	cl_mem input_hidden_ocl;
 	cl_mem input_ocl;
 	cl_mem output_hidden_ocl;
 	cl_mem hidden_partial_sum;
 	cl_mem hidden_delta_ocl;
 	cl_mem input_prev_weights_ocl;
-  
+
 	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
 	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
@@ -187,15 +186,15 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
 	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
 	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
-		
+
 	printf("Performing GPU computation\n");
-	
+
 	//write buffers
 	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
 	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
- 
+
 	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
 	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
 	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
@@ -204,26 +203,26 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
 	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
 	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
-  
+
 	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
+
 	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }
+
 	for (int j = 1; j <= hid; j++) {
 		sum = 0.0;
-		for (int k = 0; k < num_blocks; k++) {	
+		for (int k = 0; k < num_blocks; k++) {
 		sum += partial_sum[k * hid + j-1] ;
     }
 		sum += net->input_weights[0][j];
 		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
 	}
 
-	
+
 	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
 	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
-	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
+	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
 	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
 
 	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
@@ -232,28 +231,28 @@ int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
 	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
 	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
-  
+
 	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
 	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
 	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
 	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
 	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
 	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
-  
+
 	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
+
 	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }
 	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
-  
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
+
 	clReleaseMemObject(input_ocl);
 	clReleaseMemObject(output_hidden_ocl);
 	clReleaseMemObject(input_hidden_ocl);
 	clReleaseMemObject(hidden_partial_sum);
 	clReleaseMemObject(input_prev_weights_ocl);
-  
+
 	free(input_weights_prev_one_dim);
 	free(partial_sum);
 	free(input_weights_one_dim);

+ 40 - 10
opencl/backprop/facetrain.c

@@ -1,6 +1,7 @@
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
+#include <unistd.h>
 #include "backprop.h"
 #include "omp.h"
 
@@ -8,6 +9,9 @@ extern char *strcpy();
 extern void exit();
 
 int layer_size = 0;
+int platform_id = 0;
+int device_id = 0;
+int use_gpu = 0;
 
 void backprop_face()
 {
@@ -15,33 +19,59 @@ void backprop_face()
   int i;
   float out_err, hid_err;
   net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
-  
+
   printf("Input layer size : %d\n", layer_size);
   load(net);
   //entering the training kernel, only one iteration
   printf("Starting training kernel\n");
-  bpnn_train_kernel(net, &out_err, &hid_err);
+  bpnn_train_kernel(net, &out_err, &hid_err, platform_id, device_id, use_gpu);
   bpnn_free(net);
   printf("\nFinish the training for one iteration\n");
 }
 
+void Usage(char *argv0){
+
+  char *help =
+  "\nUsage: %s [switches] \n\n"
+  "    -l               :layer size                     \n"
+  "    -p platform_id   :OCL platform to use [default=0]\n"
+  "    -d device_id     :OCL device to use   [default=0]\n"
+  "    -g use_gpu       :1 for GPU 0 for CPU [default=0]\n";
+  fprintf(stderr, help, argv0);
+  exit(-1);
+
+}
+
 int setup(int argc, char **argv)
 {
-	
+
   int seed;
 
-  if (argc!=2){
-  fprintf(stderr, "usage: backprop <num of input elements>\n");
-  exit(0);
+  int opt;
+  extern char *optarg;
+  while ((opt=getopt(argc,argv,"l:p:d:g:"))!= EOF) {
+      switch (opt) {
+          case 'p': platform_id = atoi(optarg);
+                    break;
+          case 'd': device_id = atoi(optarg);
+                    break;
+          case 'g': use_gpu = atoi(optarg);
+                    break;
+          case 'l': layer_size = atoi(optarg);
+                    break;
+          case '?': Usage(argv[0]);
+                    break;
+          default:  Usage(argv[0]);
+                    break;
+      }
   }
-  layer_size = atoi(argv[1]);
+
   if (layer_size%16!=0){
   fprintf(stderr, "The number of input points must be divided by 16\n");
   exit(0);
   }
-  
 
-  seed = 7;   
+  seed = 7;
   bpnn_initialize(seed);
   backprop_face();
 

+ 1 - 1
opencl/backprop/run-cpu

@@ -1 +1 @@
-./backprop 1048576
+./backprop -l 1048576 -p 1 -d 0 -g 0

+ 1 - 0
opencl/backprop/run-gpu

@@ -0,0 +1 @@
+./backprop -l 1048576 -p 0 -d 0 -g 1