lud.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. /*
  2. * =====================================================================================
  3. *
  4. * Filename: lud.cu
  5. *
  6. * Description: The main wrapper for the suite
  7. *
  8. * Version: 1.0
  9. * Created: 10/22/2009 08:40:34 PM
  10. * Revision: none
  11. * Compiler: gcc
  12. *
  13. * Author: Liang Wang (lw2aw), lw2aw@virginia.edu
  14. * Company: CS@UVa
  15. *
  16. * =====================================================================================
  17. */
  18. #include <stdio.h>
  19. #include <unistd.h>
  20. #include <getopt.h>
  21. #include <stdlib.h>
  22. #include <assert.h>
  23. #include "common.h"
  24. #include <sys/time.h>
  25. #include <CL/cl.h>
  26. #include <string.h>
  27. #include <string>
  28. #ifdef RD_WG_SIZE_0_0
  29. #define BLOCK_SIZE RD_WG_SIZE_0_0
  30. #elif defined(RD_WG_SIZE_0)
  31. #define BLOCK_SIZE RD_WG_SIZE_0
  32. #elif defined(RD_WG_SIZE)
  33. #define BLOCK_SIZE RD_WG_SIZE
  34. #else
  35. #define BLOCK_SIZE 16
  36. #endif
  37. double gettime() {
  38. struct timeval t;
  39. gettimeofday(&t,NULL);
  40. return t.tv_sec+t.tv_usec*1e-6;
  41. }
  42. static cl_context context;
  43. static cl_command_queue cmd_queue;
  44. static cl_device_type device_type;
  45. static cl_device_id * device_list;
  46. static cl_int num_devices;
  47. static int initialize(int platform_id, int device_id, int use_gpu)
  48. {
  49. cl_int result;
  50. size_t size;
  51. // modify a little bit the initialization of the platforms to handle also the case in which we have more than one platform on our system.
  52. cl_uint platformCount;
  53. cl_platform_id *platforms_ids;
  54. // create OpenCL context
  55. clGetPlatformIDs(0, NULL, &platformCount);
  56. platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
  57. if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
  58. cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[platform_id], 0};
  59. device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  60. context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
  61. if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
  62. // get the list of GPUs/CPUs
  63. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
  64. num_devices = (int) (size / sizeof(cl_device_id));
  65. printf("num_devices = %d\n", num_devices);
  66. if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  67. device_list = new cl_device_id[num_devices];
  68. if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
  69. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
  70. if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  71. // create command queue for the device passed as parameter
  72. cmd_queue = clCreateCommandQueue( context, device_list[device_id], 0, NULL );
  73. if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
  74. return 0;
  75. }
  76. static int shutdown()
  77. {
  78. // release resources
  79. if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
  80. if( context ) clReleaseContext( context );
  81. if( device_list ) delete device_list;
  82. // reset all variables
  83. cmd_queue = 0;
  84. context = 0;
  85. device_list = 0;
  86. num_devices = 0;
  87. device_type = 0;
  88. return 0;
  89. }
  90. static int do_verify = 0;
  91. void lud_cuda(float *d_m, int matrix_dim);
  92. static struct option long_options[] = {
  93. /* name, has_arg, flag, val */
  94. {"input", 1, NULL, 'i'},
  95. {"size", 1, NULL, 's'},
  96. {"verify", 0, NULL, 'v'},
  97. {0,0,0,0}
  98. };
  99. int
  100. main ( int argc, char *argv[] )
  101. {
  102. printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
  103. int matrix_dim = 32; /* default matrix_dim */
  104. int opt, option_index=0;
  105. func_ret_t ret;
  106. const char *input_file = NULL;
  107. float *m, *mm;
  108. stopwatch sw;
  109. // Variables to store information on platform and device to use_gpu
  110. int platform_id = 0;
  111. int device_id = 0;
  112. int use_gpu = 0;
  113. while ((opt = getopt_long(argc, argv, "::vs:i:p:d:g:",
  114. long_options, &option_index)) != -1 ) {
  115. switch(opt){
  116. case 'i':
  117. input_file = optarg;
  118. break;
  119. case 'v':
  120. do_verify = 1;
  121. break;
  122. case 's':
  123. matrix_dim = atoi(optarg);
  124. printf("Generate input matrix internally, size =%d\n", matrix_dim);
  125. break;
  126. case 'p':
  127. platform_id = atoi(optarg);
  128. break;
  129. case 'd':
  130. device_id = atoi(optarg);
  131. break;
  132. case 'g':
  133. use_gpu = atoi(optarg);
  134. break;
  135. case '?':
  136. fprintf(stderr, "invalid option\n");
  137. break;
  138. case ':':
  139. fprintf(stderr, "missing argument\n");
  140. break;
  141. default:
  142. fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file] -p platform -d device -g use_gpu\n",
  143. argv[0]);
  144. exit(EXIT_FAILURE);
  145. }
  146. }
  147. if ( (optind < argc) || (optind == 1)) {
  148. fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]);
  149. exit(EXIT_FAILURE);
  150. }
  151. if (input_file) {
  152. printf("Reading matrix from file %s\n", input_file);
  153. ret = create_matrix_from_file(&m, input_file, &matrix_dim);
  154. if (ret != RET_SUCCESS) {
  155. m = NULL;
  156. fprintf(stderr, "error create matrix from file %s\n", input_file);
  157. exit(EXIT_FAILURE);
  158. }
  159. }
  160. else if (matrix_dim) {
  161. printf("Creating matrix internally size=%d\n", matrix_dim);
  162. ret = create_matrix(&m, matrix_dim);
  163. if (ret != RET_SUCCESS) {
  164. m = NULL;
  165. fprintf(stderr, "error create matrix internally size=%d\n", matrix_dim);
  166. exit(EXIT_FAILURE);
  167. }
  168. }
  169. else {
  170. printf("No input file specified!\n");
  171. exit(EXIT_FAILURE);
  172. }
  173. if (do_verify){
  174. printf("Before LUD\n");
  175. // print_matrix(m, matrix_dim);
  176. matrix_duplicate(m, &mm, matrix_dim);
  177. }
  178. int sourcesize = 1024*1024;
  179. char * source = (char *)calloc(sourcesize, sizeof(char));
  180. if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
  181. char * kernel_lud_diag = "lud_diagonal";
  182. char * kernel_lud_peri = "lud_perimeter";
  183. char * kernel_lud_inter = "lud_internal";
  184. FILE * fp = fopen("./lud_kernel.cl", "rb");
  185. if(!fp) { printf("ERROR: unable to open '%s'\n"); return -1; }
  186. fread(source + strlen(source), sourcesize, 1, fp);
  187. fclose(fp);
  188. // OpenCL initialization
  189. if(initialize(platform_id, device_id, use_gpu)) return -1;
  190. // compile kernel
  191. cl_int err = 0;
  192. const char * slist[2] = { source, 0 };
  193. cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
  194. if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
  195. char clOptions[110];
  196. // sprintf(clOptions,"-I../../src");
  197. sprintf(clOptions," ");
  198. #ifdef BLOCK_SIZE
  199. sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE);
  200. #endif
  201. err = clBuildProgram(prog, 0, NULL, clOptions, NULL, NULL);
  202. { // show warnings/errors
  203. //static char log[65536]; memset(log, 0, sizeof(log));
  204. //cl_device_id device_id = 0;
  205. //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
  206. //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
  207. //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
  208. }
  209. if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
  210. cl_kernel diagnal;
  211. cl_kernel perimeter;
  212. cl_kernel internal;
  213. diagnal = clCreateKernel(prog, kernel_lud_diag, &err);
  214. perimeter = clCreateKernel(prog, kernel_lud_peri, &err);
  215. internal = clCreateKernel(prog, kernel_lud_inter, &err);
  216. if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
  217. clReleaseProgram(prog);
  218. //size_t local_work[3] = { 1, 1, 1 };
  219. //size_t global_work[3] = {1, 1, 1 };
  220. cl_mem d_m;
  221. d_m = clCreateBuffer(context, CL_MEM_READ_WRITE, matrix_dim*matrix_dim * sizeof(float), NULL, &err );
  222. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1;}
  223. /* beginning of timing point */
  224. stopwatch_start(&sw);
  225. err = clEnqueueWriteBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
  226. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }
  227. int i=0;
  228. for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {
  229. clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
  230. clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  231. clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
  232. clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);
  233. size_t global_work1[3] = {BLOCK_SIZE, 1, 1};
  234. size_t local_work1[3] = {BLOCK_SIZE, 1, 1};
  235. err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
  236. if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  237. clSetKernelArg(perimeter, 0, sizeof(void *), (void*) &d_m);
  238. clSetKernelArg(perimeter, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  239. clSetKernelArg(perimeter, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  240. clSetKernelArg(perimeter, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  241. clSetKernelArg(perimeter, 4, sizeof(cl_int), (void*) &matrix_dim);
  242. clSetKernelArg(perimeter, 5, sizeof(cl_int), (void*) &i);
  243. size_t global_work2[3] = {BLOCK_SIZE * 2 * ((matrix_dim-i)/BLOCK_SIZE-1), 1, 1};
  244. size_t local_work2[3] = {BLOCK_SIZE * 2, 1, 1};
  245. err = clEnqueueNDRangeKernel(cmd_queue, perimeter, 2, NULL, global_work2, local_work2, 0, 0, 0);
  246. if(err != CL_SUCCESS) { printf("ERROR: perimeter clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  247. clSetKernelArg(internal, 0, sizeof(void *), (void*) &d_m);
  248. clSetKernelArg(internal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  249. clSetKernelArg(internal, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  250. clSetKernelArg(internal, 3, sizeof(cl_int), (void*) &matrix_dim);
  251. clSetKernelArg(internal, 4, sizeof(cl_int), (void*) &i);
  252. size_t global_work3[3] = {BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), 1};
  253. size_t local_work3[3] = {BLOCK_SIZE, BLOCK_SIZE, 1};
  254. err = clEnqueueNDRangeKernel(cmd_queue, internal, 2, NULL, global_work3, local_work3, 0, 0, 0);
  255. if(err != CL_SUCCESS) { printf("ERROR: internal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  256. }
  257. clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
  258. clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
  259. clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
  260. clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);
  261. size_t global_work1[3] = {BLOCK_SIZE, 1, 1};
  262. size_t local_work1[3] = {BLOCK_SIZE, 1, 1};
  263. err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
  264. if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  265. err = clEnqueueReadBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
  266. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueReadBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }
  267. clFinish(cmd_queue);
  268. /* end of timing point */
  269. stopwatch_stop(&sw);
  270. printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));
  271. clReleaseMemObject(d_m);
  272. if (do_verify){
  273. printf("After LUD\n");
  274. // print_matrix(m, matrix_dim);
  275. printf(">>>Verify<<<<\n");
  276. lud_verify(mm, m, matrix_dim);
  277. free(mm);
  278. }
  279. free(m);
  280. if(shutdown()) return -1;
  281. }
  282. /* ---------- end of function main ---------- */