Browse Source

Added run script, fix for cpu backdrop benchmark

Fixed the initialization of the OCL runtime in order to take in account
also the CPU platform of pocl
Andrea Gussoni 8 years ago
parent
commit
5a508b7881
2 changed files with 262 additions and 255 deletions
  1. 261 255
      opencl/backprop/backprop_ocl.cpp
  2. 1 0
      opencl/backprop/run-cpu

+ 261 - 255
opencl/backprop/backprop_ocl.cpp

@@ -1,255 +1,261 @@
-// includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <sys/time.h>
-#include "backprop.h"
-
-#ifdef NV //NVIDIA
-	#include <oclUtils.h>
-#else 
-	#include <CL/cl.h>
-#endif
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-// local variables
-static cl_context	    context;
-static cl_command_queue cmd_queue;
-static cl_device_type   device_type;
-static cl_device_id   * device_list;
-static cl_int           num_devices;
-
-static int initialize(int use_gpu)
-{
-	cl_int result;
-	size_t size;
-
-	// create OpenCL context
-	cl_platform_id platform_id;
-	if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
-	cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0};
-	device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
-	context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
-	if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
-
-	// get the list of GPUs
-	result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
-	num_devices = (int) (size / sizeof(cl_device_id));
-	printf("num_devices = %d\n", num_devices);
-	
-	if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
-	device_list = new cl_device_id[num_devices];
-	//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
-	if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
-	result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
-	if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
-
-	// create command queue for the first device
-	cmd_queue = clCreateCommandQueue( context, device_list[0], 0, NULL );
-	if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
-	return 0;
-}
-
-static int shutdown()
-{
-	// release resources
-	if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
-	if( context ) clReleaseContext( context );
-	if( device_list ) delete[] device_list;
-
-	// reset all variables
-	cmd_queue = 0;
-	context = 0;
-	device_list = 0;
-	num_devices = 0;
-	device_type = 0;
-
-	return 0;
-}
-
-double gettime() {
-  struct timeval t;
-  gettimeofday(&t,NULL);
-  return t.tv_sec+t.tv_usec*1e-6;
-}
-
-unsigned int num_threads = 0;
-unsigned int num_blocks = 0;
-
-////////////////////////////////////////////////////////////////////////////////
-// Program main
-////////////////////////////////////////////////////////////////////////////////
-int
-main( int argc, char** argv) 
-{
-	setup(argc, argv);
-}
-
-
-
-int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
-{
-	int in, hid, out;
-	float out_err, hid_err;
-  
-	in = net->input_n;
-	hid = net->hidden_n;
-	out = net->output_n;   
-   
-	int sourcesize = 1024*1024;
-	char * source = (char *)calloc(sourcesize, sizeof(char)); 
-	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
-
-	// read the kernel core source
-	char * kernel_bp1  = "bpnn_layerforward_ocl";
-	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
-	char * tempchar = "./backprop_kernel.cl";
-	FILE * fp = fopen(tempchar, "rb"); 
-	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
-	fread(source + strlen(source), sourcesize, 1, fp);
-	fclose(fp);
-	
-	int use_gpu = 1;
-	if(initialize(use_gpu)) return -1;
-	
-	// compile kernel
-	cl_int err = 0;
-	const char * slist[2] = { source, 0 };
-	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
-	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
-	{ // show warnings/errors
-		//static char log[65536]; memset(log, 0, sizeof(log));
-		//cl_device_id device_id = 0;
-		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
-		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
-		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
-	}
-	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
-    	
-	cl_kernel kernel1;
-	cl_kernel kernel2;
-	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
-	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
-	clReleaseProgram(prog);
-	
-	float *input_weights_one_dim;
-    float *input_weights_prev_one_dim;
-	float * partial_sum;
-	float sum;
-	float num_blocks = in / BLOCK_SIZE;
-	
-	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
-	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
-	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
-	
-	// set global and local workitems
-	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
-	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
-	
-	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
-	// todo: fix mem allocation
-	int m = 0;
-	for (int k = 0; k <= in; k++) {	
-		for (int j = 0; j <= hid; j++) {
-		input_weights_one_dim[m] = net->input_weights[k][j];
-		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
-	    m++;
-		}
-	}
-	
-	cl_mem input_hidden_ocl;
-	cl_mem input_ocl;
-	cl_mem output_hidden_ocl;
-	cl_mem hidden_partial_sum;
-	cl_mem hidden_delta_ocl;
-	cl_mem input_prev_weights_ocl;
-  
-	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
-	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
-	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
-	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
-	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
-	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
-	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
-		
-	printf("Performing GPU computation\n");
-	
-	//write buffers
-	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
-	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
- 
-	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
-	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
-	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
-	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
-	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
-	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
-	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
-	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
-  
-	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
-  
-	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
-  
-	for (int j = 1; j <= hid; j++) {
-		sum = 0.0;
-		for (int k = 0; k < num_blocks; k++) {	
-		sum += partial_sum[k * hid + j-1] ;
-    }
-		sum += net->input_weights[0][j];
-		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
-	}
-
-	
-	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
-	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
-	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
-	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
-
-	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
-	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
-	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
-  
-	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
-	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
-	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
-	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
-	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
-	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
-  
-	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
-  
-	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
-	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
-	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
-  
-	clReleaseMemObject(input_ocl);
-	clReleaseMemObject(output_hidden_ocl);
-	clReleaseMemObject(input_hidden_ocl);
-	clReleaseMemObject(hidden_partial_sum);
-	clReleaseMemObject(input_prev_weights_ocl);
-  
-	free(input_weights_prev_one_dim);
-	free(partial_sum);
-	free(input_weights_one_dim);
-
-}
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include "backprop.h"
+
+#ifdef NV //NVIDIA
+	#include <oclUtils.h>
+#else 
+	#include <CL/cl.h>
+#endif
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+// local variables
+static cl_context	    context;
+static cl_command_queue cmd_queue;
+static cl_device_type   device_type;
+static cl_device_id   * device_list;
+static cl_int           num_devices;
+
+static int initialize(int use_gpu)
+{
+	cl_int result;
+	size_t size;
+	// modify a bit the initialization phase to discover all the ocl platforms an not only the first one (needed to discover the cpu)
+	cl_uint platformCount;
+	cl_platform_id* platforms_ids;
+
+	// create OpenCL context
+	clGetPlatformIDs(0, NULL, &platformCount);
+    	platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
+    	clGetPlatformIDs(platformCount, platforms_ids, NULL);
+
+	if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
+	cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[1], 0};
+	device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
+	context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
+	if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
+
+	// get the list of GPUs
+	result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
+	num_devices = (int) (size / sizeof(cl_device_id));
+	printf("num_devices = %d\n", num_devices);
+	
+	if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
+	device_list = new cl_device_id[num_devices];
+	//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
+	if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
+	result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
+	if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
+
+	// create command queue for the first device
+	cmd_queue = clCreateCommandQueue( context, device_list[0], 0, NULL );
+	if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
+	return 0;
+}
+
+static int shutdown()
+{
+	// release resources
+	if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
+	if( context ) clReleaseContext( context );
+	if( device_list ) delete[] device_list;
+
+	// reset all variables
+	cmd_queue = 0;
+	context = 0;
+	device_list = 0;
+	num_devices = 0;
+	device_type = 0;
+
+	return 0;
+}
+
+double gettime() {
+  struct timeval t;
+  gettimeofday(&t,NULL);
+  return t.tv_sec+t.tv_usec*1e-6;
+}
+
+unsigned int num_threads = 0;
+unsigned int num_blocks = 0;
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int
+main( int argc, char** argv) 
+{
+	setup(argc, argv);
+}
+
+
+
+int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
+{
+	int in, hid, out;
+	float out_err, hid_err;
+  
+	in = net->input_n;
+	hid = net->hidden_n;
+	out = net->output_n;   
+   
+	int sourcesize = 1024*1024;
+	char * source = (char *)calloc(sourcesize, sizeof(char)); 
+	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
+
+	// read the kernel core source
+	char * kernel_bp1  = "bpnn_layerforward_ocl";
+	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
+	char * tempchar = "./backprop_kernel.cl";
+	FILE * fp = fopen(tempchar, "rb"); 
+	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
+	fread(source + strlen(source), sourcesize, 1, fp);
+	fclose(fp);
+	
+	int use_gpu = 0;
+	if(initialize(use_gpu)) return -1;
+	
+	// compile kernel
+	cl_int err = 0;
+	const char * slist[2] = { source, 0 };
+	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
+	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
+	{ // show warnings/errors
+		//static char log[65536]; memset(log, 0, sizeof(log));
+		//cl_device_id device_id = 0;
+		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
+		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
+		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
+	}
+	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
+    	
+	cl_kernel kernel1;
+	cl_kernel kernel2;
+	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
+	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
+	clReleaseProgram(prog);
+	
+	float *input_weights_one_dim;
+    float *input_weights_prev_one_dim;
+	float * partial_sum;
+	float sum;
+	float num_blocks = in / BLOCK_SIZE;
+	
+	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
+	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
+	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
+	
+	// set global and local workitems
+	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
+	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
+	
+	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
+	// todo: fix mem allocation
+	int m = 0;
+	for (int k = 0; k <= in; k++) {	
+		for (int j = 0; j <= hid; j++) {
+		input_weights_one_dim[m] = net->input_weights[k][j];
+		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
+	    m++;
+		}
+	}
+	
+	cl_mem input_hidden_ocl;
+	cl_mem input_ocl;
+	cl_mem output_hidden_ocl;
+	cl_mem hidden_partial_sum;
+	cl_mem hidden_delta_ocl;
+	cl_mem input_prev_weights_ocl;
+  
+	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
+	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
+	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
+	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
+	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
+	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
+		
+	printf("Performing GPU computation\n");
+	
+	//write buffers
+	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
+	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
+ 
+	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
+	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
+	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
+	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
+	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
+	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
+	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
+	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
+  
+	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
+  
+	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
+  
+	for (int j = 1; j <= hid; j++) {
+		sum = 0.0;
+		for (int k = 0; k < num_blocks; k++) {	
+		sum += partial_sum[k * hid + j-1] ;
+    }
+		sum += net->input_weights[0][j];
+		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
+	}
+
+	
+	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
+	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
+	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
+	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
+
+	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
+	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
+	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
+  
+	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
+	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
+	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
+	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
+	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
+	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
+  
+	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
+  
+	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
+	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
+  
+	clReleaseMemObject(input_ocl);
+	clReleaseMemObject(output_hidden_ocl);
+	clReleaseMemObject(input_hidden_ocl);
+	clReleaseMemObject(hidden_partial_sum);
+	clReleaseMemObject(input_prev_weights_ocl);
+  
+	free(input_weights_prev_one_dim);
+	free(partial_sum);
+	free(input_weights_one_dim);
+
+}

+ 1 - 0
opencl/backprop/run-cpu

@@ -0,0 +1 @@
+./backprop 65536