backprop_ocl.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. // includes, system
  2. #include <stdlib.h>
  3. #include <stdio.h>
  4. #include <string.h>
  5. #include <math.h>
  6. #include <sys/time.h>
  7. #include "backprop.h"
  8. #ifdef NV //NVIDIA
  9. #include <oclUtils.h>
  10. #else
  11. #include <CL/cl.h>
  12. #endif
  13. ////////////////////////////////////////////////////////////////////////////////
  14. // local variables
  15. static cl_context context;
  16. static cl_command_queue cmd_queue;
  17. static cl_device_type device_type;
  18. static cl_device_id * device_list;
  19. static cl_int num_devices;
  20. static int initialize(int platform_num, int device_num, int use_gpu)
  21. {
  22. cl_int result;
  23. size_t size;
  24. // modify a bit the initialization phase to discover all the ocl platforms an not only the first one (needed to discover the cpu)
  25. cl_uint platformCount;
  26. cl_platform_id* platforms_ids;
  27. // create OpenCL context
  28. clGetPlatformIDs(0, NULL, &platformCount);
  29. platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
  30. clGetPlatformIDs(platformCount, platforms_ids, NULL);
  31. if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
  32. cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[platform_num], 0};
  33. device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  34. context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
  35. if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
  36. // get the list of GPUs
  37. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
  38. num_devices = (int) (size / sizeof(cl_device_id));
  39. printf("num_devices = %d\n", num_devices);
  40. if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  41. device_list = new cl_device_id[num_devices];
  42. //device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
  43. if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
  44. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
  45. if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  46. // create command queue for the first device
  47. cmd_queue = clCreateCommandQueue( context, device_list[device_num], 0, NULL );
  48. if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
  49. return 0;
  50. }
  51. static int shutdown()
  52. {
  53. // release resources
  54. if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
  55. if( context ) clReleaseContext( context );
  56. if( device_list ) delete[] device_list;
  57. // reset all variables
  58. cmd_queue = 0;
  59. context = 0;
  60. device_list = 0;
  61. num_devices = 0;
  62. device_type = 0;
  63. return 0;
  64. }
  65. double gettime() {
  66. struct timeval t;
  67. gettimeofday(&t,NULL);
  68. return t.tv_sec+t.tv_usec*1e-6;
  69. }
  70. unsigned int num_threads = 0;
  71. unsigned int num_blocks = 0;
  72. ////////////////////////////////////////////////////////////////////////////////
  73. // Program main
  74. ////////////////////////////////////////////////////////////////////////////////
  75. int
  76. main( int argc, char** argv)
  77. {
  78. setup(argc, argv);
  79. }
  80. int bpnn_train_kernel(BPNN *net, float *eo, float *eh, int platform_num, int device_num, int use_gpu)
  81. {
  82. int in, hid, out;
  83. float out_err, hid_err;
  84. in = net->input_n;
  85. hid = net->hidden_n;
  86. out = net->output_n;
  87. int sourcesize = 1024*1024;
  88. char * source = (char *)calloc(sourcesize, sizeof(char));
  89. if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
  90. // read the kernel core source
  91. char * kernel_bp1 = "bpnn_layerforward_ocl";
  92. char * kernel_bp2 = "bpnn_adjust_weights_ocl";
  93. char * tempchar = "./backprop_kernel.cl";
  94. FILE * fp = fopen(tempchar, "rb");
  95. if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
  96. fread(source + strlen(source), sourcesize, 1, fp);
  97. fclose(fp);
  98. if(initialize(platform_num, device_num, use_gpu)) return -1;
  99. // compile kernel
  100. cl_int err = 0;
  101. const char * slist[2] = { source, 0 };
  102. cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
  103. if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
  104. err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
  105. { // show warnings/errors
  106. //static char log[65536]; memset(log, 0, sizeof(log));
  107. //cl_device_id device_id = 0;
  108. //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
  109. //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
  110. //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
  111. }
  112. if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
  113. cl_kernel kernel1;
  114. cl_kernel kernel2;
  115. kernel1 = clCreateKernel(prog, kernel_bp1, &err);
  116. kernel2 = clCreateKernel(prog, kernel_bp2, &err);
  117. if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
  118. clReleaseProgram(prog);
  119. float *input_weights_one_dim;
  120. float *input_weights_prev_one_dim;
  121. float * partial_sum;
  122. float sum;
  123. float num_blocks = in / BLOCK_SIZE;
  124. input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  125. input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  126. partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
  127. // set global and local workitems
  128. size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
  129. size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
  130. // this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
  131. // todo: fix mem allocation
  132. int m = 0;
  133. for (int k = 0; k <= in; k++) {
  134. for (int j = 0; j <= hid; j++) {
  135. input_weights_one_dim[m] = net->input_weights[k][j];
  136. input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
  137. m++;
  138. }
  139. }
  140. cl_mem input_hidden_ocl;
  141. cl_mem input_ocl;
  142. cl_mem output_hidden_ocl;
  143. cl_mem hidden_partial_sum;
  144. cl_mem hidden_delta_ocl;
  145. cl_mem input_prev_weights_ocl;
  146. input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
  147. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
  148. input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
  149. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
  150. output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
  151. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
  152. hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
  153. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
  154. hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
  155. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
  156. input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
  157. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
  158. printf("Performing GPU computation\n");
  159. //write buffers
  160. err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
  161. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
  162. err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  163. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
  164. clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
  165. clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
  166. clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
  167. clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
  168. clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL );
  169. clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL );
  170. clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
  171. clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
  172. err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, 0);
  173. if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  174. err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
  175. if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; }
  176. for (int j = 1; j <= hid; j++) {
  177. sum = 0.0;
  178. for (int k = 0; k < num_blocks; k++) {
  179. sum += partial_sum[k * hid + j-1] ;
  180. }
  181. sum += net->input_weights[0][j];
  182. net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
  183. }
  184. bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
  185. bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
  186. bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
  187. bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
  188. err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
  189. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
  190. err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
  191. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
  192. err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  193. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
  194. clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
  195. clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
  196. clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
  197. clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
  198. clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
  199. clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
  200. err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
  201. if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  202. err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
  203. if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; }
  204. err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  205. if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
  206. clReleaseMemObject(input_ocl);
  207. clReleaseMemObject(output_hidden_ocl);
  208. clReleaseMemObject(input_hidden_ocl);
  209. clReleaseMemObject(hidden_partial_sum);
  210. clReleaseMemObject(input_prev_weights_ocl);
  211. free(input_weights_prev_one_dim);
  212. free(partial_sum);
  213. free(input_weights_one_dim);
  214. }