lud.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. /*
  2. * =====================================================================================
  3. *
  4. * Filename: lud.cu
  5. *
  6. * Description: The main wrapper for the suite
  7. *
  8. * Version: 1.0
  9. * Created: 10/22/2009 08:40:34 PM
  10. * Revision: none
  11. * Compiler: gcc
  12. *
  13. * Author: Liang Wang (lw2aw), lw2aw@virginia.edu
  14. * Company: CS@UVa
  15. *
  16. * =====================================================================================
  17. */
  18. #include <stdio.h>
  19. #include <unistd.h>
  20. #include <getopt.h>
  21. #include <stdlib.h>
  22. #include <assert.h>
  23. #include "common.h"
  24. #include <sys/time.h>
  25. #include <CL/cl.h>
  26. #include <string.h>
  27. #include <string>
  28. #ifdef RD_WG_SIZE_0_0
  29. #define BLOCK_SIZE RD_WG_SIZE_0_0
  30. #elif defined(RD_WG_SIZE_0)
  31. #define BLOCK_SIZE RD_WG_SIZE_0
  32. #elif defined(RD_WG_SIZE)
  33. #define BLOCK_SIZE RD_WG_SIZE
  34. #else
  35. #define BLOCK_SIZE 16
  36. #endif
  37. double gettime() {
  38. struct timeval t;
  39. gettimeofday(&t,NULL);
  40. return t.tv_sec+t.tv_usec*1e-6;
  41. }
  42. static cl_context context;
  43. static cl_command_queue cmd_queue;
  44. static cl_device_type device_type;
  45. static cl_device_id * device_list;
  46. static cl_int num_devices;
  47. static int initialize(int use_gpu)
  48. {
  49. cl_int result;
  50. size_t size;
  51. // modify a little bit the initialization of the platforms to handle also the case in which we have more than one platform on our system.
  52. cl_uint platformCount;
  53. cl_platform_id *platforms_ids;
  54. // create OpenCL context
  55. clGetPlatformIDs(0, NULL, &platformCount);
  56. platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
  57. if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
  58. cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[1], 0};
  59. device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  60. context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
  61. if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
  62. // get the list of GPUs
  63. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
  64. num_devices = (int) (size / sizeof(cl_device_id));
  65. printf("num_devices = %d\n", num_devices);
  66. if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  67. device_list = new cl_device_id[num_devices];
  68. if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
  69. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
  70. if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  71. // create command queue for the first device
  72. cmd_queue = clCreateCommandQueue( context, device_list[0], 0, NULL );
  73. if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
  74. return 0;
  75. }
  76. static int shutdown()
  77. {
  78. // release resources
  79. if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
  80. if( context ) clReleaseContext( context );
  81. if( device_list ) delete device_list;
  82. // reset all variables
  83. cmd_queue = 0;
  84. context = 0;
  85. device_list = 0;
  86. num_devices = 0;
  87. device_type = 0;
  88. return 0;
  89. }
  90. static int do_verify = 0;
  91. void lud_cuda(float *d_m, int matrix_dim);
  92. static struct option long_options[] = {
  93. /* name, has_arg, flag, val */
  94. {"input", 1, NULL, 'i'},
  95. {"size", 1, NULL, 's'},
  96. {"verify", 0, NULL, 'v'},
  97. {0,0,0,0}
  98. };
  99. int
  100. main ( int argc, char *argv[] )
  101. {
  102. printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
  103. int matrix_dim = 32; /* default matrix_dim */
  104. int opt, option_index=0;
  105. func_ret_t ret;
  106. const char *input_file = NULL;
  107. float *m, *mm;
  108. stopwatch sw;
  109. while ((opt = getopt_long(argc, argv, "::vs:i:",
  110. long_options, &option_index)) != -1 ) {
  111. switch(opt){
  112. case 'i':
  113. input_file = optarg;
  114. break;
  115. case 'v':
  116. do_verify = 1;
  117. break;
  118. case 's':
  119. matrix_dim = atoi(optarg);
  120. printf("Generate input matrix internally, size =%d\n", matrix_dim);
  121. // fprintf(stderr, "Currently not supported, use -i instead\n");
  122. // fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]);
  123. // exit(EXIT_FAILURE);
  124. break;
  125. case '?':
  126. fprintf(stderr, "invalid option\n");
  127. break;
  128. case ':':
  129. fprintf(stderr, "missing argument\n");
  130. break;
  131. default:
  132. fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n",
  133. argv[0]);
  134. exit(EXIT_FAILURE);
  135. }
  136. }
  137. if ( (optind < argc) || (optind == 1)) {
  138. fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]);
  139. exit(EXIT_FAILURE);
  140. }
  141. if (input_file) {
  142. printf("Reading matrix from file %s\n", input_file);
  143. ret = create_matrix_from_file(&m, input_file, &matrix_dim);
  144. if (ret != RET_SUCCESS) {
  145. m = NULL;
  146. fprintf(stderr, "error create matrix from file %s\n", input_file);
  147. exit(EXIT_FAILURE);
  148. }
  149. }
  150. else if (matrix_dim) {
  151. printf("Creating matrix internally size=%d\n", matrix_dim);
  152. ret = create_matrix(&m, matrix_dim);
  153. if (ret != RET_SUCCESS) {
  154. m = NULL;
  155. fprintf(stderr, "error create matrix internally size=%d\n", matrix_dim);
  156. exit(EXIT_FAILURE);
  157. }
  158. }
  159. else {
  160. printf("No input file specified!\n");
  161. exit(EXIT_FAILURE);
  162. }
  163. if (do_verify){
  164. printf("Before LUD\n");
  165. // print_matrix(m, matrix_dim);
  166. matrix_duplicate(m, &mm, matrix_dim);
  167. }
  168. int sourcesize = 1024*1024;
  169. char * source = (char *)calloc(sourcesize, sizeof(char));
  170. if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
  171. char * kernel_lud_diag = "lud_diagonal";
  172. char * kernel_lud_peri = "lud_perimeter";
  173. char * kernel_lud_inter = "lud_internal";
  174. FILE * fp = fopen("./lud_kernel.cl", "rb");
  175. if(!fp) { printf("ERROR: unable to open '%s'\n"); return -1; }
  176. fread(source + strlen(source), sourcesize, 1, fp);
  177. fclose(fp);
  178. // Use 1: GPU 0: CPU
  179. int use_gpu = 0;
  180. // OpenCL initialization
  181. if(initialize(use_gpu)) return -1;
  182. // compile kernel
  183. cl_int err = 0;
  184. const char * slist[2] = { source, 0 };
  185. cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
  186. if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
  187. char clOptions[110];
  188. // sprintf(clOptions,"-I../../src");
  189. sprintf(clOptions," ");
  190. #ifdef BLOCK_SIZE
  191. sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE);
  192. #endif
  193. err = clBuildProgram(prog, 0, NULL, clOptions, NULL, NULL);
  194. { // show warnings/errors
  195. //static char log[65536]; memset(log, 0, sizeof(log));
  196. //cl_device_id device_id = 0;
  197. //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
  198. //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
  199. //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
  200. }
  201. if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
  202. cl_kernel diagnal;
  203. cl_kernel perimeter;
  204. cl_kernel internal;
  205. diagnal = clCreateKernel(prog, kernel_lud_diag, &err);
  206. perimeter = clCreateKernel(prog, kernel_lud_peri, &err);
  207. internal = clCreateKernel(prog, kernel_lud_inter, &err);
  208. if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
  209. clReleaseProgram(prog);
  210. //size_t local_work[3] = { 1, 1, 1 };
  211. //size_t global_work[3] = {1, 1, 1 };
  212. cl_mem d_m;
  213. d_m = clCreateBuffer(context, CL_MEM_READ_WRITE, matrix_dim*matrix_dim * sizeof(float), NULL, &err );
  214. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1;}
  215. /* beginning of timing point */
  216. stopwatch_start(&sw);
  217. err = clEnqueueWriteBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
  218. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }
  219. int i=0;
  220. for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {
  221. clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
  222. clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  223. clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
  224. clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);
  225. size_t global_work1[3] = {BLOCK_SIZE, 1, 1};
  226. size_t local_work1[3] = {BLOCK_SIZE, 1, 1};
  227. err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
  228. if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  229. clSetKernelArg(perimeter, 0, sizeof(void *), (void*) &d_m);
  230. clSetKernelArg(perimeter, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  231. clSetKernelArg(perimeter, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  232. clSetKernelArg(perimeter, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  233. clSetKernelArg(perimeter, 4, sizeof(cl_int), (void*) &matrix_dim);
  234. clSetKernelArg(perimeter, 5, sizeof(cl_int), (void*) &i);
  235. size_t global_work2[3] = {BLOCK_SIZE * 2 * ((matrix_dim-i)/BLOCK_SIZE-1), 1, 1};
  236. size_t local_work2[3] = {BLOCK_SIZE * 2, 1, 1};
  237. err = clEnqueueNDRangeKernel(cmd_queue, perimeter, 2, NULL, global_work2, local_work2, 0, 0, 0);
  238. if(err != CL_SUCCESS) { printf("ERROR: perimeter clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  239. clSetKernelArg(internal, 0, sizeof(void *), (void*) &d_m);
  240. clSetKernelArg(internal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  241. clSetKernelArg(internal, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  242. clSetKernelArg(internal, 3, sizeof(cl_int), (void*) &matrix_dim);
  243. clSetKernelArg(internal, 4, sizeof(cl_int), (void*) &i);
  244. size_t global_work3[3] = {BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), 1};
  245. size_t local_work3[3] = {BLOCK_SIZE, BLOCK_SIZE, 1};
  246. err = clEnqueueNDRangeKernel(cmd_queue, internal, 2, NULL, global_work3, local_work3, 0, 0, 0);
  247. if(err != CL_SUCCESS) { printf("ERROR: internal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  248. }
  249. clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
  250. clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  251. clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
  252. clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);
  253. size_t global_work1[3] = {BLOCK_SIZE, 1, 1};
  254. size_t local_work1[3] = {BLOCK_SIZE, 1, 1};
  255. err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
  256. if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  257. err = clEnqueueReadBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
  258. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueReadBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }
  259. clFinish(cmd_queue);
  260. /* end of timing point */
  261. stopwatch_stop(&sw);
  262. printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));
  263. clReleaseMemObject(d_m);
  264. if (do_verify){
  265. printf("After LUD\n");
  266. // print_matrix(m, matrix_dim);
  267. printf(">>>Verify<<<<\n");
  268. lud_verify(mm, m, matrix_dim);
  269. free(mm);
  270. }
  271. free(m);
  272. if(shutdown()) return -1;
  273. }
  274. /* ---------- end of function main ---------- */