main.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /***********************************************************************
  2. * PathFinder uses dynamic programming to find a path on a 2-D grid from
  3. * the bottom row to the top row with the smallest accumulated weights,
  4. * where each step of the path moves straight ahead or diagonally ahead.
  5. * It iterates row by row, each node picks a neighboring node in the
  6. * previous row that has the smallest accumulated weight, and adds its
  7. * own weight to the sum.
  8. *
  9. * This kernel uses the technique of ghost zone optimization
  10. ***********************************************************************/
  11. // Other header files.
  #include <assert.h>
  #include <errno.h>
  #include <iostream>
  #include <limits.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
  #include <unistd.h>
  #include "OpenCL.h"
  19. using namespace std;
  // Halo width along one direction when advancing to the next iteration.
  #define HALO 1
  #define STR_SIZE 256
  #define DEVICE 0
  // Seed handed to srand() so the generated grid is the same on every run.
  #define M_SEED 9
  // Uncomment to print the generated grid and the computed rows.
  //#define BENCH_PRINT
  // True when min <= x <= max. Arguments are evaluated more than once.
  #define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
  // Clamps the lvalue x into [min, max] in place and yields the new value.
  // Every argument and the whole expansion are parenthesized so the macro can
  // be embedded in a larger expression without the surrounding operators
  // leaking into the assigned value. Arguments are evaluated more than once.
  #define CLAMP_RANGE(x, min, max) ((x) = ((x)<(min)) ? (min) : (((x)>(max)) ? (max) : (x)))
  // Smaller of a and b (a wins on ties). Arguments are evaluated more than once.
  #define MIN(a, b) ((a)<=(b) ? (a) : (b))
  // Program variables: file-scope state shared by init() and main().
  // Grid dimensions, set from the -r and -c options in init().
  int rows, cols;
  // Element count. It used to be initialised with `rows * cols`, but that
  // expression ran while both dimensions were still zero, so it always
  // produced 0 and was never refreshed afterwards. The value is now written
  // out explicitly; nothing in the visible code reads Ne.
  int Ne = 0;
  // Flat array of rows * cols weights, allocated in init().
  int* data;
  // wall[n] points at the start of row n inside data.
  int** wall;
  // Host array of cols ints that main() fills with the final result row.
  int* result;
  // Value of the -h option; main() advances this many rows per kernel launch.
  int pyramid_height;
  36. void init(int argc, char** argv, int *platform_idx, int *device_idx, int *use_gpu)
  37. {
  38. // Rewritten parameters parsing for selecting platform and device and old
  39. // parameters
  40. // The lengths of the two sequences should be able to divided by 16.
  41. // And at current stage max_rows needs to equal max_cols
  42. int opt;
  43. extern char *optarg;
  44. while ((opt = getopt(argc, argv, "r:c:h:p:d:g:")) != -1 ) {
  45. switch(opt){
  46. case 'r':
  47. rows = atoi(optarg);
  48. break;
  49. case 'c':
  50. cols = atoi(optarg);
  51. break;
  52. case 'h':
  53. pyramid_height = atoi(optarg);
  54. break;
  55. case 'p':
  56. *platform_idx = atoi(optarg);
  57. break;
  58. case 'd':
  59. *device_idx = atoi(optarg);
  60. break;
  61. case 'g':
  62. *use_gpu = atoi(optarg);
  63. break;
  64. case ':':
  65. fprintf(stderr, "missing argument\n");
  66. break;
  67. default:
  68. fprintf(stderr, "Usage: %s -r <row_len> -c <col_len> -h <pyramid_height> -p <platform> -d <device> -g <use_gpu>\n",
  69. argv[0]);
  70. exit(EXIT_FAILURE);
  71. }
  72. }
  73. data = new int[rows * cols];
  74. wall = new int*[rows];
  75. for (int n = 0; n < rows; n++)
  76. {
  77. // wall[n] is set to be the nth row of the data array.
  78. wall[n] = data + cols * n;
  79. }
  80. result = new int[cols];
  81. int seed = M_SEED;
  82. srand(seed);
  83. for (int i = 0; i < rows; i++)
  84. {
  85. for (int j = 0; j < cols; j++)
  86. {
  87. wall[i][j] = rand() % 10;
  88. }
  89. }
  90. #ifdef BENCH_PRINT
  91. for (int i = 0; i < rows; i++)
  92. {
  93. for (int j = 0; j < cols; j++)
  94. {
  95. printf("%d ", wall[i][j]);
  96. }
  97. printf("\n");
  98. }
  99. #endif
  100. }
  /*
   * Prints "error: <s>" followed by a newline on stderr.
   *
   * Despite its name this function returns to the caller; it does not
   * terminate the program.
   */
  void fatal(char *s)
  {
      // Handing a null pointer to %s is undefined behaviour, so print a
      // placeholder instead.
      fprintf(stderr, "error: %s\n", s ? s : "(null)");
  }
  105. int main(int argc, char** argv)
  106. {
  107. // Variables to store information on platform and device to use_gpu
  108. int platform_idx = 0;
  109. int device_idx = 0;
  110. int use_gpu = 0;
  111. init(argc, argv, &platform_idx, &device_idx, &use_gpu);
  112. // Pyramid parameters.
  113. int borderCols = (pyramid_height) * HALO;
  114. // int smallBlockCol = ?????? - (pyramid_height) * HALO * 2;
  115. // int blockCols = cols / smallBlockCol + ((cols % smallBlockCol == 0) ? 0 : 1);
  116. /* printf("pyramidHeight: %d\ngridSize: [%d]\nborder:[%d]\nblockSize: %d\nblockGrid:[%d]\ntargetBlock:[%d]\n",
  117. pyramid_height, cols, borderCols, NUMBER_THREADS, blockCols, smallBlockCol); */
  118. int size = rows * cols;
  119. // Create and initialize the OpenCL object.
  120. OpenCL cl(1); // 1 means to display output (debugging mode).
  121. cl.init(platform_idx, device_idx, use_gpu); // 1 means to use GPU. 0 means use CPU.
  122. cl.gwSize(rows * cols);
  123. // Create and build the kernel.
  124. string kn = "dynproc_kernel"; // the kernel name, for future use.
  125. cl.createKernel(kn, device_idx);
  126. // Allocate device memory.
  127. cl_mem d_gpuWall = clCreateBuffer(cl.ctxt(),
  128. CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
  129. sizeof(cl_int)*(size-cols),
  130. (data + cols),
  131. NULL);
  132. cl_mem d_gpuResult[2];
  133. d_gpuResult[0] = clCreateBuffer(cl.ctxt(),
  134. CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
  135. sizeof(cl_int)*cols,
  136. data,
  137. NULL);
  138. d_gpuResult[1] = clCreateBuffer(cl.ctxt(),
  139. CL_MEM_READ_WRITE,
  140. sizeof(cl_int)*cols,
  141. NULL,
  142. NULL);
  143. cl_int* h_outputBuffer = (cl_int*)malloc(16384*sizeof(cl_int));
  144. for (int i = 0; i < 16384; i++)
  145. {
  146. h_outputBuffer[i] = 0;
  147. }
  148. cl_mem d_outputBuffer = clCreateBuffer(cl.ctxt(),
  149. CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
  150. sizeof(cl_int)*16384,
  151. h_outputBuffer,
  152. NULL);
  153. int src = 1, final_ret = 0;
  154. for (int t = 0; t < rows - 1; t += pyramid_height)
  155. {
  156. int temp = src;
  157. src = final_ret;
  158. final_ret = temp;
  159. // Calculate this for the kernel argument...
  160. int arg0 = MIN(pyramid_height, rows-t-1);
  161. int theHalo = HALO;
  162. // Set the kernel arguments.
  163. clSetKernelArg(cl.kernel(kn), 0, sizeof(cl_int), (void*) &arg0);
  164. clSetKernelArg(cl.kernel(kn), 1, sizeof(cl_mem), (void*) &d_gpuWall);
  165. clSetKernelArg(cl.kernel(kn), 2, sizeof(cl_mem), (void*) &d_gpuResult[src]);
  166. clSetKernelArg(cl.kernel(kn), 3, sizeof(cl_mem), (void*) &d_gpuResult[final_ret]);
  167. clSetKernelArg(cl.kernel(kn), 4, sizeof(cl_int), (void*) &cols);
  168. clSetKernelArg(cl.kernel(kn), 5, sizeof(cl_int), (void*) &rows);
  169. clSetKernelArg(cl.kernel(kn), 6, sizeof(cl_int), (void*) &t);
  170. clSetKernelArg(cl.kernel(kn), 7, sizeof(cl_int), (void*) &borderCols);
  171. clSetKernelArg(cl.kernel(kn), 8, sizeof(cl_int), (void*) &theHalo);
  172. clSetKernelArg(cl.kernel(kn), 9, sizeof(cl_int) * (cl.localSize()), 0);
  173. clSetKernelArg(cl.kernel(kn), 10, sizeof(cl_int) * (cl.localSize()), 0);
  174. clSetKernelArg(cl.kernel(kn), 11, sizeof(cl_mem), (void*) &d_outputBuffer);
  175. cl.launch(kn);
  176. }
  177. // Copy results back to host.
  178. clEnqueueReadBuffer(cl.q(), // The command queue.
  179. d_gpuResult[final_ret], // The result on the device.
  180. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  181. 0, // Offset. None in this case.
  182. sizeof(cl_int)*cols, // Size to copy.
  183. result, // The pointer to the memory on the host.
  184. 0, // Number of events in wait list. Not used.
  185. NULL, // Event wait list. Not used.
  186. NULL); // Event object for determining status. Not used.
  187. // Copy string buffer used for debugging from device to host.
  188. clEnqueueReadBuffer(cl.q(), // The command queue.
  189. d_outputBuffer, // Debug buffer on the device.
  190. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
  191. 0, // Offset. None in this case.
  192. sizeof(cl_char)*16384, // Size to copy.
  193. h_outputBuffer, // The pointer to the memory on the host.
  194. 0, // Number of events in wait list. Not used.
  195. NULL, // Event wait list. Not used.
  196. NULL); // Event object for determining status. Not used.
  197. // Tack a null terminator at the end of the string.
  198. h_outputBuffer[16383] = '\0';
  199. #ifdef BENCH_PRINT
  200. for (int i = 0; i < cols; i++)
  201. printf("%d ", data[i]);
  202. printf("\n");
  203. for (int i = 0; i < cols; i++)
  204. printf("%d ", result[i]);
  205. printf("\n");
  206. #endif
  207. // Memory cleanup here.
  208. delete[] data;
  209. delete[] wall;
  210. delete[] result;
  211. return EXIT_SUCCESS;
  212. }