gaussianElim.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. #ifndef __GAUSSIAN_ELIMINATION__
  2. #define __GAUSSIAN_ELIMINATION__
  3. #include "gaussianElim.h"
  4. #include <math.h>
  5. #ifdef RD_WG_SIZE_0_0
  6. #define BLOCK_SIZE_0 RD_WG_SIZE_0_0
  7. #elif defined(RD_WG_SIZE_0)
  8. #define BLOCK_SIZE_0 RD_WG_SIZE_0
  9. #elif defined(RD_WG_SIZE)
  10. #define BLOCK_SIZE_0 RD_WG_SIZE
  11. #else
  12. #define BLOCK_SIZE_0 0
  13. #endif
  14. //2D defines. Go from specific to general
  15. #ifdef RD_WG_SIZE_1_0
  16. #define BLOCK_SIZE_1_X RD_WG_SIZE_1_0
  17. #elif defined(RD_WG_SIZE_1)
  18. #define BLOCK_SIZE_1_X RD_WG_SIZE_1
  19. #elif defined(RD_WG_SIZE)
  20. #define BLOCK_SIZE_1_X RD_WG_SIZE
  21. #else
  22. #define BLOCK_SIZE_1_X 0
  23. #endif
  24. #ifdef RD_WG_SIZE_1_1
  25. #define BLOCK_SIZE_1_Y RD_WG_SIZE_1_1
  26. #elif defined(RD_WG_SIZE_1)
  27. #define BLOCK_SIZE_1_Y RD_WG_SIZE_1
  28. #elif defined(RD_WG_SIZE)
  29. #define BLOCK_SIZE_1_Y RD_WG_SIZE
  30. #else
  31. #define BLOCK_SIZE_1_Y 0
  32. #endif
  33. cl_context context=NULL;
  34. // create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
  35. void
  36. create_matrix(float *m, int size){
  37. int i,j;
  38. float lamda = -0.01;
  39. float coe[2*size-1];
  40. float coe_i =0.0;
  41. for (i=0; i < size; i++)
  42. {
  43. coe_i = 10*exp(lamda*i);
  44. j=size-1+i;
  45. coe[j]=coe_i;
  46. j=size-1-i;
  47. coe[j]=coe_i;
  48. }
  49. for (i=0; i < size; i++) {
  50. for (j=0; j < size; j++) {
  51. m[i*size+j]=coe[size-1-i+j];
  52. }
  53. }
  54. }
  55. int main(int argc, char *argv[]) {
  56. printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n", BLOCK_SIZE_0, BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y);
  57. float *a=NULL, *b=NULL, *finalVec=NULL;
  58. float *m=NULL;
  59. int size = -1;
  60. FILE *fp;
  61. // args
  62. char filename[200];
  63. int quiet=1,timing=0,platform=-1,device=-1,use_gpu=-1;
  64. // parse command line
  65. if (parseCommandline(argc, argv, filename,
  66. &quiet, &timing, &platform, &device, &size, &use_gpu)) {
  67. printUsage();
  68. return 0;
  69. }
  70. context = cl_init_context(platform,device,quiet);
  71. if(size < 1)
  72. {
  73. fp = fopen(filename, "r");
  74. fscanf(fp, "%d", &size);
  75. a = (float *) malloc(size * size * sizeof(float));
  76. InitMat(fp,size, a, size, size);
  77. b = (float *) malloc(size * sizeof(float));
  78. InitAry(fp, b, size);
  79. fclose(fp);
  80. }
  81. else
  82. {
  83. printf("create input internally before create, size = %d \n", size);
  84. a = (float *) malloc(size * size * sizeof(float));
  85. create_matrix(a, size);
  86. b = (float *) malloc(size * sizeof(float));
  87. for (int i =0; i< size; i++)
  88. b[i]=1.0;
  89. }
  90. if (!quiet) {
  91. printf("The input matrix a is:\n");
  92. PrintMat(a, size, size, size);
  93. printf("The input array b is:\n");
  94. PrintAry(b, size);
  95. }
  96. // create the solution matrix
  97. m = (float *) malloc(size * size * sizeof(float));
  98. // create a new vector to hold the final answer
  99. finalVec = (float *) malloc(size * sizeof(float));
  100. InitPerRun(size,m);
  101. //begin timing
  102. // printf("The result of array b is before run: \n");
  103. // PrintAry(b, size);
  104. // run kernels
  105. ForwardSub(context,a,b,m,size,timing);
  106. // printf("The result of array b is after run: \n");
  107. // PrintAry(b, size);
  108. //end timing
  109. if (!quiet) {
  110. printf("The result of matrix m is: \n");
  111. PrintMat(m, size, size, size);
  112. printf("The result of matrix a is: \n");
  113. PrintMat(a, size, size, size);
  114. printf("The result of array b is: \n");
  115. PrintAry(b, size);
  116. BackSub(a,b,finalVec,size);
  117. printf("The final solution is: \n");
  118. PrintAry(finalVec,size);
  119. }
  120. free(m);
  121. free(a);
  122. free(b);
  123. free(finalVec);
  124. cl_cleanup();
  125. //OpenClGaussianElimination(context,timing);
  126. return 0;
  127. }
  128. /*------------------------------------------------------
  129. ** ForwardSub() -- Forward substitution of Gaussian
  130. ** elimination.
  131. **------------------------------------------------------
  132. */
  133. void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing){
  134. // 1. set up kernels
  135. cl_kernel fan1_kernel,fan2_kernel;
  136. cl_int status=0;
  137. cl_program gaussianElim_program;
  138. cl_event writeEvent,kernelEvent,readEvent;
  139. float writeTime=0,readTime=0,kernelTime=0;
  140. float writeMB=0,readMB=0;
  141. gaussianElim_program = cl_compileProgram(
  142. (char *)"gaussianElim_kernels.cl",NULL);
  143. fan1_kernel = clCreateKernel(
  144. gaussianElim_program, "Fan1", &status);
  145. status = cl_errChk(status, (char *)"Error Creating Fan1 kernel",true);
  146. if(status)exit(1);
  147. fan2_kernel = clCreateKernel(
  148. gaussianElim_program, "Fan2", &status);
  149. status = cl_errChk(status, (char *)"Error Creating Fan2 kernel",true);
  150. if(status)exit(1);
  151. // 2. set up memory on device and send ipts data to device
  152. cl_mem a_dev, b_dev, m_dev;
  153. cl_int error=0;
  154. a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
  155. sizeof(float)*size*size, NULL, &error);
  156. b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
  157. sizeof(float)*size, NULL, &error);
  158. m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
  159. sizeof(float) * size * size, NULL, &error);
  160. cl_command_queue command_queue = cl_getCommandQueue();
  161. error = clEnqueueWriteBuffer(command_queue,
  162. a_dev,
  163. 1, // change to 0 for nonblocking write
  164. 0, // offset
  165. sizeof(float)*size*size,
  166. a,
  167. 0,
  168. NULL,
  169. &writeEvent);
  170. if (timing) writeTime+=eventTime(writeEvent,command_queue);
  171. clReleaseEvent(writeEvent);
  172. error = clEnqueueWriteBuffer(command_queue,
  173. b_dev,
  174. 1, // change to 0 for nonblocking write
  175. 0, // offset
  176. sizeof(float)*size,
  177. b,
  178. 0,
  179. NULL,
  180. &writeEvent);
  181. if (timing) writeTime+=eventTime(writeEvent,command_queue);
  182. clReleaseEvent(writeEvent);
  183. error = clEnqueueWriteBuffer(command_queue,
  184. m_dev,
  185. 1, // change to 0 for nonblocking write
  186. 0, // offset
  187. sizeof(float)*size*size,
  188. m,
  189. 0,
  190. NULL,
  191. &writeEvent);
  192. if (timing) writeTime+=eventTime(writeEvent,command_queue);
  193. clReleaseEvent(writeEvent);
  194. writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
  195. // 3. Determine block sizes
  196. size_t globalWorksizeFan1[1];
  197. size_t globalWorksizeFan2[2];
  198. size_t localWorksizeFan1Buf[1]={BLOCK_SIZE_0};
  199. size_t localWorksizeFan2Buf[2]={BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y};
  200. size_t *localWorksizeFan1=NULL;
  201. size_t *localWorksizeFan2=NULL;
  202. globalWorksizeFan1[0] = size;
  203. globalWorksizeFan2[0] = size;
  204. globalWorksizeFan2[1] = size;
  205. if(localWorksizeFan1Buf[0]){
  206. localWorksizeFan1=localWorksizeFan1Buf;
  207. globalWorksizeFan1[0]=(int)ceil(globalWorksizeFan1[0]/(double)localWorks\
  208. izeFan1Buf[0])*localWorksizeFan1Buf[0];
  209. }
  210. if(localWorksizeFan2Buf[0]){
  211. localWorksizeFan2=localWorksizeFan2Buf;
  212. globalWorksizeFan2[0]=(int)ceil(globalWorksizeFan2[0]/(double)localWorks\
  213. izeFan2Buf[0])*localWorksizeFan2Buf[0];
  214. globalWorksizeFan2[1]=(int)ceil(globalWorksizeFan2[1]/(double)localWorks\
  215. izeFan2Buf[1])*localWorksizeFan2Buf[1];
  216. }
  217. int t;
  218. // 4. Setup and Run kernels
  219. for (t=0; t<(size-1); t++) {
  220. // kernel args
  221. cl_int argchk;
  222. argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
  223. argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
  224. argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
  225. argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size);
  226. argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t);
  227. cl_errChk(argchk,"ERROR in Setting Fan1 kernel args",true);
  228. // launch kernel
  229. error = clEnqueueNDRangeKernel(
  230. command_queue, fan1_kernel, 1, 0,
  231. globalWorksizeFan1,localWorksizeFan1,
  232. 0, NULL, &kernelEvent);
  233. cl_errChk(error,"ERROR in Executing Fan1 Kernel",true);
  234. if (timing) {
  235. // printf("here1a\n");
  236. kernelTime+=eventTime(kernelEvent,command_queue);
  237. // printf("here1b\n");
  238. }
  239. clReleaseEvent(kernelEvent);
  240. //Fan1<<<dimGrid,dimBlock>>>(m_cuda,a_cuda,Size,t);
  241. //cudaThreadSynchronize();
  242. // kernel args
  243. argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
  244. argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
  245. argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
  246. argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size);
  247. argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t);
  248. cl_errChk(argchk,"ERROR in Setting Fan2 kernel args",true);
  249. // launch kernel
  250. error = clEnqueueNDRangeKernel(
  251. command_queue, fan2_kernel, 2, 0,
  252. globalWorksizeFan2,NULL,
  253. 0, NULL, &kernelEvent);
  254. cl_errChk(error,"ERROR in Executing Fan1 Kernel",true);
  255. if (timing) {
  256. // printf("here2a\n");
  257. kernelTime+=eventTime(kernelEvent,command_queue);
  258. // printf("here2b\n");
  259. }
  260. clReleaseEvent(kernelEvent);
  261. //Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t);
  262. //cudaThreadSynchronize();
  263. }
  264. // 5. transfer data off of device
  265. error = clEnqueueReadBuffer(command_queue,
  266. a_dev,
  267. 1, // change to 0 for nonblocking write
  268. 0, // offset
  269. sizeof(float) * size * size,
  270. a,
  271. 0,
  272. NULL,
  273. &readEvent);
  274. cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
  275. if (timing) readTime+=eventTime(readEvent,command_queue);
  276. clReleaseEvent(readEvent);
  277. error = clEnqueueReadBuffer(command_queue,
  278. b_dev,
  279. 1, // change to 0 for nonblocking write
  280. 0, // offset
  281. sizeof(float) * size,
  282. b,
  283. 0,
  284. NULL,
  285. &readEvent);
  286. cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
  287. if (timing) readTime+=eventTime(readEvent,command_queue);
  288. clReleaseEvent(readEvent);
  289. error = clEnqueueReadBuffer(command_queue,
  290. m_dev,
  291. 1, // change to 0 for nonblocking write
  292. 0, // offset
  293. sizeof(float) * size * size,
  294. m,
  295. 0,
  296. NULL,
  297. &readEvent);
  298. cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
  299. if (timing) readTime+=eventTime(readEvent,command_queue);
  300. clReleaseEvent(readEvent);
  301. readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
  302. if (timing) {
  303. printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) [size]\t\tTotal(s)\n");
  304. printf("%dx%d \t",size,size);
  305. printf("%f [%.2fMB]\t",writeTime,writeMB);
  306. printf("%f\t",kernelTime);
  307. printf("%f [%.2fMB]\t",readTime,readMB);
  308. printf("%f\n\n",writeTime+kernelTime+readTime);
  309. }
  310. }
  311. float eventTime(cl_event event,cl_command_queue command_queue){
  312. cl_int error=0;
  313. cl_ulong eventStart,eventEnd;
  314. clFinish(command_queue);
  315. error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
  316. sizeof(cl_ulong),&eventStart,NULL);
  317. cl_errChk(error,"ERROR in Event Profiling.",true);
  318. error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
  319. sizeof(cl_ulong),&eventEnd,NULL);
  320. cl_errChk(error,"ERROR in Event Profiling.",true);
  321. return (float)((eventEnd-eventStart)/1e9);
  322. }
  323. // Ke Wang add a function to generate input internally
  324. int parseCommandline(int argc, char *argv[], char* filename,
  325. int *q, int *t, int *p, int *d, int *size, int *use_gpu){
  326. int i;
  327. if (argc < 2) return 1; // error
  328. // strncpy(filename,argv[1],100);
  329. char flag;
  330. for(i=1;i<argc;i++) {
  331. if (argv[i][0]=='-') {// flag
  332. flag = argv[i][1];
  333. switch (flag) {
  334. case 's': // platform
  335. i++;
  336. *size = atoi(argv[i]);
  337. printf("Create matrix internally in parse, size = %d \n", *size);
  338. break;
  339. case 'f': // platform
  340. i++;
  341. strncpy(filename,argv[i],100);
  342. printf("Read file from %s \n", filename);
  343. break;
  344. case 'h': // help
  345. return 1;
  346. break;
  347. case 'q': // quiet
  348. *q = 1;
  349. break;
  350. case 't': // timing
  351. *t = 1;
  352. break;
  353. case 'p': // platform
  354. i++;
  355. *p = atoi(argv[i]);
  356. break;
  357. case 'd': // device
  358. i++;
  359. *d = atoi(argv[i]);
  360. break;
  361. case 'g': // device
  362. i++;
  363. *use_gpu = atoi(argv[i]);
  364. break;
  365. }
  366. }
  367. }
  368. if ((*d >= 0 && *p<0) || (*p>=0 && *d<0)) // both p and d must be specified if either are specified
  369. return 1;
  370. return 0;
  371. }
  372. void printUsage(){
  373. printf("Gaussian Elimination Usage\n");
  374. printf("\n");
  375. printf("gaussianElimination [filename] [-hqt] [-p [int] -d [int]]\n");
  376. printf("\n");
  377. printf("example:\n");
  378. printf("$ ./gaussianElimination matrix4.txt\n");
  379. printf("\n");
  380. printf("filename the filename that holds the matrix data\n");
  381. printf("\n");
  382. printf("-h Display the help file\n");
  383. printf("-q Quiet mode. Suppress all text output.\n");
  384. printf("-t Print timing information.\n");
  385. printf("\n");
  386. printf("-p [int] Choose the platform (must choose both platform and device)\n");
  387. printf("-d [int] Choose the device (must choose both platform and device)\n");
  388. printf("-g [int] 1 for gpu and 0 for cpu\n");
  389. printf("\n");
  390. printf("\n");
  391. printf("Notes: 1. The filename is required as the first parameter.\n");
  392. printf(" 2. If you declare either the device or the platform,\n");
  393. printf(" you must declare both.\n\n");
  394. }
  395. /*------------------------------------------------------
  396. ** InitPerRun() -- Initialize the contents of the
  397. ** multipier matrix **m
  398. **------------------------------------------------------
  399. */
  400. void InitPerRun(int size,float *m)
  401. {
  402. int i;
  403. for (i=0; i<size*size; i++)
  404. *(m+i) = 0.0;
  405. }
  406. void BackSub(float *a, float *b, float *finalVec, int size)
  407. {
  408. // solve "bottom up"
  409. int i,j;
  410. for(i=0;i<size;i++){
  411. finalVec[size-i-1]=b[size-i-1];
  412. for(j=0;j<i;j++)
  413. {
  414. finalVec[size-i-1]-=*(a+size*(size-i-1)+(size-j-1)) * finalVec[size-j-1];
  415. }
  416. finalVec[size-i-1]=finalVec[size-i-1]/ *(a+size*(size-i-1)+(size-i-1));
  417. }
  418. }
  419. void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol)
  420. {
  421. int i, j;
  422. for (i=0; i<nrow; i++) {
  423. for (j=0; j<ncol; j++) {
  424. fscanf(fp, "%f", ary+size*i+j);
  425. }
  426. }
  427. }
  428. /*------------------------------------------------------
  429. ** InitAry() -- Initialize the array (vector) by reading
  430. ** data from the data file
  431. **------------------------------------------------------
  432. */
  433. void InitAry(FILE *fp, float *ary, int ary_size)
  434. {
  435. int i;
  436. for (i=0; i<ary_size; i++) {
  437. fscanf(fp, "%f", &ary[i]);
  438. }
  439. }
  440. /*------------------------------------------------------
  441. ** PrintMat() -- Print the contents of the matrix
  442. **------------------------------------------------------
  443. */
  444. void PrintMat(float *ary, int size, int nrow, int ncol)
  445. {
  446. int i, j;
  447. for (i=0; i<nrow; i++) {
  448. for (j=0; j<ncol; j++) {
  449. printf("%8.2e ", *(ary+size*i+j));
  450. }
  451. printf("\n");
  452. }
  453. printf("\n");
  454. }
  455. /*------------------------------------------------------
  456. ** PrintAry() -- Print the contents of the array (vector)
  457. **------------------------------------------------------
  458. */
  459. void PrintAry(float *ary, int ary_size)
  460. {
  461. int i;
  462. for (i=0; i<ary_size; i++) {
  463. printf("%.2e ", ary[i]);
  464. }
  465. printf("\n\n");
  466. }
  467. #endif