main.cpp 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. /***********************************************************************
  2. * PathFinder uses dynamic programming to find a path on a 2-D grid from
  3. * the bottom row to the top row with the smallest accumulated weights,
  4. * where each step of the path moves straight ahead or diagonally ahead.
  5. * It iterates row by row, each node picks a neighboring node in the
  6. * previous row that has the smallest accumulated weight, and adds its
  7. * own weight to the sum.
  8. *
  9. * This kernel uses the technique of ghost zone optimization
  10. ***********************************************************************/
  11. // Other header files.
  12. #include <stdio.h>
  13. #include <stdlib.h>
  14. #include <time.h>
  15. #include <assert.h>
  16. #include <iostream>
  17. #include "OpenCL.h"
  18. using namespace std;
  // Halo width along one direction when advancing to the next iteration.
  #define HALO 1
  #define STR_SIZE 256
  #define DEVICE 0
  // Fixed seed handed to srand() so the generated grid is reproducible.
  #define M_SEED 9
  // Uncomment to print the generated grid and the computed rows.
  //#define BENCH_PRINT
  // True when min <= x <= max (both bounds inclusive).
  #define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
  // Clamps the lvalue x into [min, max] in place. Every argument and the
  // whole expansion are parenthesized so the macro composes safely inside
  // larger expressions. NOTE: x may be evaluated more than once.
  #define CLAMP_RANGE(x, min, max) ((x) = ((x)<(min)) ? (min) : (((x)>(max)) ? (max) : (x)))
  // Smaller of a and b. NOTE: the selected argument is evaluated twice,
  // so do not pass expressions with side effects.
  #define MIN(a, b) ((a)<=(b) ? (a) : (b))
  // Program variables, filled in by init().
  // Grid dimensions: number of rows and number of columns.
  int rows = 0, cols = 0;
  // Element count placeholder. It used to be initialized with rows * cols,
  // but that expression ran before the dimensions were known and therefore
  // always produced 0; the zero is now spelled out explicitly.
  int Ne = 0;
  // Contiguous storage for the whole grid (rows * cols weights).
  int* data = NULL;
  // Row pointers into data: wall[n] addresses the n-th row.
  int** wall = NULL;
  // Host buffer (cols entries) that receives the final accumulated weights.
  int* result = NULL;
  // Number of rows the kernel advances per launch.
  int pyramid_height = 0;
  35. void init(int argc, char** argv)
  36. {
  37. if (argc == 4)
  38. {
  39. cols = atoi(argv[1]);
  40. rows = atoi(argv[2]);
  41. pyramid_height = atoi(argv[3]);
  42. }
  43. else
  44. {
  45. printf("Usage: dynproc row_len col_len pyramid_height\n");
  46. exit(0);
  47. }
  48. data = new int[rows * cols];
  49. wall = new int*[rows];
  50. for (int n = 0; n < rows; n++)
  51. {
  52. // wall[n] is set to be the nth row of the data array.
  53. wall[n] = data + cols * n;
  54. }
  55. result = new int[cols];
  56. int seed = M_SEED;
  57. srand(seed);
  58. for (int i = 0; i < rows; i++)
  59. {
  60. for (int j = 0; j < cols; j++)
  61. {
  62. wall[i][j] = rand() % 10;
  63. }
  64. }
  65. #ifdef BENCH_PRINT
  66. for (int i = 0; i < rows; i++)
  67. {
  68. for (int j = 0; j < cols; j++)
  69. {
  70. printf("%d ", wall[i][j]);
  71. }
  72. printf("\n");
  73. }
  74. #endif
  75. }
  /*
   * Write "error: <s>" followed by a newline to stderr.
   * Despite its name this function only reports the message; it returns
   * normally and does not terminate the program.
   */
  void fatal(char *s)
  {
      FILE* sink = stderr;
      (void) fprintf(sink, "error: %s\n", s);
  }
  80. int main(int argc, char** argv)
  81. {
  82. init(argc, argv);
  83. // Pyramid parameters.
  84. int borderCols = (pyramid_height) * HALO;
  85. // int smallBlockCol = ?????? - (pyramid_height) * HALO * 2;
  86. // int blockCols = cols / smallBlockCol + ((cols % smallBlockCol == 0) ? 0 : 1);
  87. /* printf("pyramidHeight: %d\ngridSize: [%d]\nborder:[%d]\nblockSize: %d\nblockGrid:[%d]\ntargetBlock:[%d]\n",
  88. pyramid_height, cols, borderCols, NUMBER_THREADS, blockCols, smallBlockCol); */
  89. int size = rows * cols;
  90. // Create and initialize the OpenCL object.
  91. OpenCL cl(1); // 1 means to display output (debugging mode).
  92. cl.init(0); // 1 means to use GPU. 0 means use CPU.
  93. cl.gwSize(rows * cols);
  94. // Create and build the kernel.
  95. string kn = "dynproc_kernel"; // the kernel name, for future use.
  96. cl.createKernel(kn);
  97. // Allocate device memory.
  98. cl_mem d_gpuWall = clCreateBuffer(cl.ctxt(),
  99. CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
  100. sizeof(cl_int)*(size-cols),
  101. (data + cols),
  102. NULL);
  103. cl_mem d_gpuResult[2];
  104. d_gpuResult[0] = clCreateBuffer(cl.ctxt(),
  105. CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
  106. sizeof(cl_int)*cols,
  107. data,
  108. NULL);
  109. d_gpuResult[1] = clCreateBuffer(cl.ctxt(),
  110. CL_MEM_READ_WRITE,
  111. sizeof(cl_int)*cols,
  112. NULL,
  113. NULL);
  114. cl_int* h_outputBuffer = (cl_int*)malloc(16384*sizeof(cl_int));
  115. for (int i = 0; i < 16384; i++)
  116. {
  117. h_outputBuffer[i] = 0;
  118. }
  119. cl_mem d_outputBuffer = clCreateBuffer(cl.ctxt(),
  120. CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
  121. sizeof(cl_int)*16384,
  122. h_outputBuffer,
  123. NULL);
  124. int src = 1, final_ret = 0;
  125. for (int t = 0; t < rows - 1; t += pyramid_height)
  126. {
  127. int temp = src;
  128. src = final_ret;
  129. final_ret = temp;
  130. // Calculate this for the kernel argument...
  131. int arg0 = MIN(pyramid_height, rows-t-1);
  132. int theHalo = HALO;
  133. // Set the kernel arguments.
  134. clSetKernelArg(cl.kernel(kn), 0, sizeof(cl_int), (void*) &arg0);
  135. clSetKernelArg(cl.kernel(kn), 1, sizeof(cl_mem), (void*) &d_gpuWall);
  136. clSetKernelArg(cl.kernel(kn), 2, sizeof(cl_mem), (void*) &d_gpuResult[src]);
  137. clSetKernelArg(cl.kernel(kn), 3, sizeof(cl_mem), (void*) &d_gpuResult[final_ret]);
  138. clSetKernelArg(cl.kernel(kn), 4, sizeof(cl_int), (void*) &cols);
  139. clSetKernelArg(cl.kernel(kn), 5, sizeof(cl_int), (void*) &rows);
  140. clSetKernelArg(cl.kernel(kn), 6, sizeof(cl_int), (void*) &t);
  141. clSetKernelArg(cl.kernel(kn), 7, sizeof(cl_int), (void*) &borderCols);
  142. clSetKernelArg(cl.kernel(kn), 8, sizeof(cl_int), (void*) &theHalo);
  143. clSetKernelArg(cl.kernel(kn), 9, sizeof(cl_int) * (cl.localSize()), 0);
  144. clSetKernelArg(cl.kernel(kn), 10, sizeof(cl_int) * (cl.localSize()), 0);
  145. clSetKernelArg(cl.kernel(kn), 11, sizeof(cl_mem), (void*) &d_outputBuffer);
  146. cl.launch(kn);
  147. }
  148. // Copy results back to host.
  149. clEnqueueReadBuffer(cl.q(), // The command queue.
  150. d_gpuResult[final_ret], // The result on the device.
  151. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  152. 0, // Offset. None in this case.
  153. sizeof(cl_int)*cols, // Size to copy.
  154. result, // The pointer to the memory on the host.
  155. 0, // Number of events in wait list. Not used.
  156. NULL, // Event wait list. Not used.
  157. NULL); // Event object for determining status. Not used.
  158. // Copy string buffer used for debugging from device to host.
  159. clEnqueueReadBuffer(cl.q(), // The command queue.
  160. d_outputBuffer, // Debug buffer on the device.
  161. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  162. 0, // Offset. None in this case.
  163. sizeof(cl_char)*16384, // Size to copy.
  164. h_outputBuffer, // The pointer to the memory on the host.
  165. 0, // Number of events in wait list. Not used.
  166. NULL, // Event wait list. Not used.
  167. NULL); // Event object for determining status. Not used.
  168. // Tack a null terminator at the end of the string.
  169. h_outputBuffer[16383] = '\0';
  170. #ifdef BENCH_PRINT
  171. for (int i = 0; i < cols; i++)
  172. printf("%d ", data[i]);
  173. printf("\n");
  174. for (int i = 0; i < cols; i++)
  175. printf("%d ", result[i]);
  176. printf("\n");
  177. #endif
  178. // Memory cleanup here.
  179. delete[] data;
  180. delete[] wall;
  181. delete[] result;
  182. return EXIT_SUCCESS;
  183. }