lud.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. /*
  2. * =====================================================================================
  3. *
  4. * Filename: lud.cu
  5. *
  6. * Description: The main wrapper for the suite
  7. *
  8. * Version: 1.0
  9. * Created: 10/22/2009 08:40:34 PM
  10. * Revision: none
  11. * Compiler: gcc
  12. *
  13. * Author: Liang Wang (lw2aw), lw2aw@virginia.edu
  14. * Company: CS@UVa
  15. *
  16. * =====================================================================================
  17. */
  18. #include <stdio.h>
  19. #include <unistd.h>
  20. #include <getopt.h>
  21. #include <stdlib.h>
  22. #include <assert.h>
  23. #include "common.h"
  24. #include <sys/time.h>
  25. #include <CL/cl.h>
  26. #include <string.h>
  27. #include <string>
  28. #ifdef RD_WG_SIZE_0_0
  29. #define BLOCK_SIZE RD_WG_SIZE_0_0
  30. #elif defined(RD_WG_SIZE_0)
  31. #define BLOCK_SIZE RD_WG_SIZE_0
  32. #elif defined(RD_WG_SIZE)
  33. #define BLOCK_SIZE RD_WG_SIZE
  34. #else
  35. #define BLOCK_SIZE 16
  36. #endif
  37. double gettime() {
  38. struct timeval t;
  39. gettimeofday(&t,NULL);
  40. return t.tv_sec+t.tv_usec*1e-6;
  41. }
  42. static cl_context context;
  43. static cl_command_queue cmd_queue;
  44. static cl_device_type device_type;
  45. static cl_device_id * device_list;
  46. static cl_int num_devices;
  47. static int initialize(int use_gpu)
  48. {
  49. cl_int result;
  50. size_t size;
  51. // create OpenCL context
  52. cl_platform_id platform_id;
  53. if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
  54. cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0};
  55. device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  56. context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
  57. if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
  58. // get the list of GPUs
  59. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
  60. num_devices = (int) (size / sizeof(cl_device_id));
  61. printf("num_devices = %d\n", num_devices);
  62. if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  63. device_list = new cl_device_id[num_devices];
  64. if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
  65. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
  66. if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  67. // create command queue for the first device
  68. cmd_queue = clCreateCommandQueue( context, device_list[0], 0, NULL );
  69. if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
  70. return 0;
  71. }
  72. static int shutdown()
  73. {
  74. // release resources
  75. if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
  76. if( context ) clReleaseContext( context );
  77. if( device_list ) delete device_list;
  78. // reset all variables
  79. cmd_queue = 0;
  80. context = 0;
  81. device_list = 0;
  82. num_devices = 0;
  83. device_type = 0;
  84. return 0;
  85. }
  86. static int do_verify = 0;
  87. void lud_cuda(float *d_m, int matrix_dim);
  88. static struct option long_options[] = {
  89. /* name, has_arg, flag, val */
  90. {"input", 1, NULL, 'i'},
  91. {"size", 1, NULL, 's'},
  92. {"verify", 0, NULL, 'v'},
  93. {0,0,0,0}
  94. };
  95. int
  96. main ( int argc, char *argv[] )
  97. {
  98. printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
  99. int matrix_dim = 32; /* default matrix_dim */
  100. int opt, option_index=0;
  101. func_ret_t ret;
  102. const char *input_file = NULL;
  103. float *m, *mm;
  104. stopwatch sw;
  105. while ((opt = getopt_long(argc, argv, "::vs:i:",
  106. long_options, &option_index)) != -1 ) {
  107. switch(opt){
  108. case 'i':
  109. input_file = optarg;
  110. break;
  111. case 'v':
  112. do_verify = 1;
  113. break;
  114. case 's':
  115. matrix_dim = atoi(optarg);
  116. printf("Generate input matrix internally, size =%d\n", matrix_dim);
  117. // fprintf(stderr, "Currently not supported, use -i instead\n");
  118. // fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]);
  119. // exit(EXIT_FAILURE);
  120. break;
  121. case '?':
  122. fprintf(stderr, "invalid option\n");
  123. break;
  124. case ':':
  125. fprintf(stderr, "missing argument\n");
  126. break;
  127. default:
  128. fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n",
  129. argv[0]);
  130. exit(EXIT_FAILURE);
  131. }
  132. }
  133. if ( (optind < argc) || (optind == 1)) {
  134. fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]);
  135. exit(EXIT_FAILURE);
  136. }
  137. if (input_file) {
  138. printf("Reading matrix from file %s\n", input_file);
  139. ret = create_matrix_from_file(&m, input_file, &matrix_dim);
  140. if (ret != RET_SUCCESS) {
  141. m = NULL;
  142. fprintf(stderr, "error create matrix from file %s\n", input_file);
  143. exit(EXIT_FAILURE);
  144. }
  145. }
  146. else if (matrix_dim) {
  147. printf("Creating matrix internally size=%d\n", matrix_dim);
  148. ret = create_matrix(&m, matrix_dim);
  149. if (ret != RET_SUCCESS) {
  150. m = NULL;
  151. fprintf(stderr, "error create matrix internally size=%d\n", matrix_dim);
  152. exit(EXIT_FAILURE);
  153. }
  154. }
  155. else {
  156. printf("No input file specified!\n");
  157. exit(EXIT_FAILURE);
  158. }
  159. if (do_verify){
  160. printf("Before LUD\n");
  161. // print_matrix(m, matrix_dim);
  162. matrix_duplicate(m, &mm, matrix_dim);
  163. }
  164. int sourcesize = 1024*1024;
  165. char * source = (char *)calloc(sourcesize, sizeof(char));
  166. if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
  167. char * kernel_lud_diag = "lud_diagonal";
  168. char * kernel_lud_peri = "lud_perimeter";
  169. char * kernel_lud_inter = "lud_internal";
  170. FILE * fp = fopen("./lud_kernel.cl", "rb");
  171. if(!fp) { printf("ERROR: unable to open '%s'\n"); return -1; }
  172. fread(source + strlen(source), sourcesize, 1, fp);
  173. fclose(fp);
  174. // Use 1: GPU 0: CPU
  175. int use_gpu = 1;
  176. // OpenCL initialization
  177. if(initialize(use_gpu)) return -1;
  178. // compile kernel
  179. cl_int err = 0;
  180. const char * slist[2] = { source, 0 };
  181. cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
  182. if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
  183. char clOptions[110];
  184. // sprintf(clOptions,"-I../../src");
  185. sprintf(clOptions," ");
  186. #ifdef BLOCK_SIZE
  187. sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE);
  188. #endif
  189. err = clBuildProgram(prog, 0, NULL, clOptions, NULL, NULL);
  190. { // show warnings/errors
  191. //static char log[65536]; memset(log, 0, sizeof(log));
  192. //cl_device_id device_id = 0;
  193. //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
  194. //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
  195. //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
  196. }
  197. if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
  198. cl_kernel diagnal;
  199. cl_kernel perimeter;
  200. cl_kernel internal;
  201. diagnal = clCreateKernel(prog, kernel_lud_diag, &err);
  202. perimeter = clCreateKernel(prog, kernel_lud_peri, &err);
  203. internal = clCreateKernel(prog, kernel_lud_inter, &err);
  204. if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
  205. clReleaseProgram(prog);
  206. //size_t local_work[3] = { 1, 1, 1 };
  207. //size_t global_work[3] = {1, 1, 1 };
  208. cl_mem d_m;
  209. d_m = clCreateBuffer(context, CL_MEM_READ_WRITE, matrix_dim*matrix_dim * sizeof(float), NULL, &err );
  210. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1;}
  211. /* beginning of timing point */
  212. stopwatch_start(&sw);
  213. err = clEnqueueWriteBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
  214. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }
  215. int i=0;
  216. for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {
  217. clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
  218. clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  219. clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
  220. clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);
  221. size_t global_work1[3] = {BLOCK_SIZE, 1, 1};
  222. size_t local_work1[3] = {BLOCK_SIZE, 1, 1};
  223. err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
  224. if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  225. clSetKernelArg(perimeter, 0, sizeof(void *), (void*) &d_m);
  226. clSetKernelArg(perimeter, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  227. clSetKernelArg(perimeter, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  228. clSetKernelArg(perimeter, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  229. clSetKernelArg(perimeter, 4, sizeof(cl_int), (void*) &matrix_dim);
  230. clSetKernelArg(perimeter, 5, sizeof(cl_int), (void*) &i);
  231. size_t global_work2[3] = {BLOCK_SIZE * 2 * ((matrix_dim-i)/BLOCK_SIZE-1), 1, 1};
  232. size_t local_work2[3] = {BLOCK_SIZE * 2, 1, 1};
  233. err = clEnqueueNDRangeKernel(cmd_queue, perimeter, 2, NULL, global_work2, local_work2, 0, 0, 0);
  234. if(err != CL_SUCCESS) { printf("ERROR: perimeter clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  235. clSetKernelArg(internal, 0, sizeof(void *), (void*) &d_m);
  236. clSetKernelArg(internal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  237. clSetKernelArg(internal, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  238. clSetKernelArg(internal, 3, sizeof(cl_int), (void*) &matrix_dim);
  239. clSetKernelArg(internal, 4, sizeof(cl_int), (void*) &i);
  240. size_t global_work3[3] = {BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), 1};
  241. size_t local_work3[3] = {BLOCK_SIZE, BLOCK_SIZE, 1};
  242. err = clEnqueueNDRangeKernel(cmd_queue, internal, 2, NULL, global_work3, local_work3, 0, 0, 0);
  243. if(err != CL_SUCCESS) { printf("ERROR: internal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  244. }
  245. clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
  246. clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  247. clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
  248. clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);
  249. size_t global_work1[3] = {BLOCK_SIZE, 1, 1};
  250. size_t local_work1[3] = {BLOCK_SIZE, 1, 1};
  251. err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
  252. if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  253. err = clEnqueueReadBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
  254. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueReadBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }
  255. clFinish(cmd_queue);
  256. /* end of timing point */
  257. stopwatch_stop(&sw);
  258. printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));
  259. clReleaseMemObject(d_m);
  260. if (do_verify){
  261. printf("After LUD\n");
  262. // print_matrix(m, matrix_dim);
  263. printf(">>>Verify<<<<\n");
  264. lud_verify(mm, m, matrix_dim);
  265. free(mm);
  266. }
  267. free(m);
  268. if(shutdown()) return -1;
  269. }
  270. /* ---------- end of function main ---------- */