123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247 |
- /***********************************************************************
- * PathFinder uses dynamic programming to find a path on a 2-D grid from
- * the bottom row to the top row with the smallest accumulated weights,
- * where each step of the path moves straight ahead or diagonally ahead.
- * It iterates row by row, each node picks a neighboring node in the
- * previous row that has the smallest accumulated weight, and adds its
- * own weight to the sum.
- *
- * This kernel uses the technique of ghost zone optimization
- ***********************************************************************/
- // Other header files.
- #include <assert.h>
- #include <iostream>
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
- #include <unistd.h>
- #include "OpenCL.h"
- using namespace std;
- // halo width along one direction when advancing to the next iteration
- #define HALO 1
- #define STR_SIZE 256
- #define DEVICE 0
- #define M_SEED 9
- //#define BENCH_PRINT
- #define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
- #define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x )
- #define MIN(a, b) ((a)<=(b) ? (a) : (b))
- // Program variables.
- int rows, cols;
- int Ne = rows * cols;
- int* data;
- int** wall;
- int* result;
- int pyramid_height;
- void init(int argc, char** argv, int *platform_idx, int *device_idx, int *use_gpu)
- {
-
- // Rewritten parameters parsing for selecting platform and device and old
- // parameters
-
- // The lengths of the two sequences should be able to divided by 16.
- // And at current stage max_rows needs to equal max_cols
- int opt;
- extern char *optarg;
- while ((opt = getopt(argc, argv, "r:c:h:p:d:g:")) != -1 ) {
- switch(opt){
- case 'r':
- rows = atoi(optarg);
- break;
- case 'c':
- cols = atoi(optarg);
- break;
- case 'h':
- pyramid_height = atoi(optarg);
- break;
- case 'p':
- *platform_idx = atoi(optarg);
- break;
- case 'd':
- *device_idx = atoi(optarg);
- break;
- case 'g':
- *use_gpu = atoi(optarg);
- break;
- case ':':
- fprintf(stderr, "missing argument\n");
- break;
- default:
- fprintf(stderr, "Usage: %s -r <row_len> -c <col_len> -h <pyramid_height> -p <platform> -d <device> -g <use_gpu>\n",
- argv[0]);
- exit(EXIT_FAILURE);
- }
- }
-
- data = new int[rows * cols];
- wall = new int*[rows];
- for (int n = 0; n < rows; n++)
- {
- // wall[n] is set to be the nth row of the data array.
- wall[n] = data + cols * n;
- }
- result = new int[cols];
- int seed = M_SEED;
- srand(seed);
- for (int i = 0; i < rows; i++)
- {
- for (int j = 0; j < cols; j++)
- {
- wall[i][j] = rand() % 10;
- }
- }
- #ifdef BENCH_PRINT
- for (int i = 0; i < rows; i++)
- {
- for (int j = 0; j < cols; j++)
- {
- printf("%d ", wall[i][j]);
- }
- printf("\n");
- }
- #endif
- }
- void fatal(char *s)
- {
- fprintf(stderr, "error: %s\n", s);
- }
- int main(int argc, char** argv)
- {
-
- // Variables to store information on platform and device to use_gpu
- int platform_idx = 0;
- int device_idx = 0;
- int use_gpu = 0;
-
- init(argc, argv, &platform_idx, &device_idx, &use_gpu);
-
- // Pyramid parameters.
- int borderCols = (pyramid_height) * HALO;
- // int smallBlockCol = ?????? - (pyramid_height) * HALO * 2;
- // int blockCols = cols / smallBlockCol + ((cols % smallBlockCol == 0) ? 0 : 1);
-
- /* printf("pyramidHeight: %d\ngridSize: [%d]\nborder:[%d]\nblockSize: %d\nblockGrid:[%d]\ntargetBlock:[%d]\n",
- pyramid_height, cols, borderCols, NUMBER_THREADS, blockCols, smallBlockCol); */
- int size = rows * cols;
- // Create and initialize the OpenCL object.
- OpenCL cl(1); // 1 means to display output (debugging mode).
- cl.init(platform_idx, device_idx, use_gpu); // 1 means to use GPU. 0 means use CPU.
- cl.gwSize(rows * cols);
- // Create and build the kernel.
- string kn = "dynproc_kernel"; // the kernel name, for future use.
- cl.createKernel(kn, device_idx);
- // Allocate device memory.
- cl_mem d_gpuWall = clCreateBuffer(cl.ctxt(),
- CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
- sizeof(cl_int)*(size-cols),
- (data + cols),
- NULL);
- cl_mem d_gpuResult[2];
- d_gpuResult[0] = clCreateBuffer(cl.ctxt(),
- CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
- sizeof(cl_int)*cols,
- data,
- NULL);
- d_gpuResult[1] = clCreateBuffer(cl.ctxt(),
- CL_MEM_READ_WRITE,
- sizeof(cl_int)*cols,
- NULL,
- NULL);
- cl_int* h_outputBuffer = (cl_int*)malloc(16384*sizeof(cl_int));
- for (int i = 0; i < 16384; i++)
- {
- h_outputBuffer[i] = 0;
- }
- cl_mem d_outputBuffer = clCreateBuffer(cl.ctxt(),
- CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
- sizeof(cl_int)*16384,
- h_outputBuffer,
- NULL);
- int src = 1, final_ret = 0;
- for (int t = 0; t < rows - 1; t += pyramid_height)
- {
- int temp = src;
- src = final_ret;
- final_ret = temp;
- // Calculate this for the kernel argument...
- int arg0 = MIN(pyramid_height, rows-t-1);
- int theHalo = HALO;
- // Set the kernel arguments.
- clSetKernelArg(cl.kernel(kn), 0, sizeof(cl_int), (void*) &arg0);
- clSetKernelArg(cl.kernel(kn), 1, sizeof(cl_mem), (void*) &d_gpuWall);
- clSetKernelArg(cl.kernel(kn), 2, sizeof(cl_mem), (void*) &d_gpuResult[src]);
- clSetKernelArg(cl.kernel(kn), 3, sizeof(cl_mem), (void*) &d_gpuResult[final_ret]);
- clSetKernelArg(cl.kernel(kn), 4, sizeof(cl_int), (void*) &cols);
- clSetKernelArg(cl.kernel(kn), 5, sizeof(cl_int), (void*) &rows);
- clSetKernelArg(cl.kernel(kn), 6, sizeof(cl_int), (void*) &t);
- clSetKernelArg(cl.kernel(kn), 7, sizeof(cl_int), (void*) &borderCols);
- clSetKernelArg(cl.kernel(kn), 8, sizeof(cl_int), (void*) &theHalo);
- clSetKernelArg(cl.kernel(kn), 9, sizeof(cl_int) * (cl.localSize()), 0);
- clSetKernelArg(cl.kernel(kn), 10, sizeof(cl_int) * (cl.localSize()), 0);
- clSetKernelArg(cl.kernel(kn), 11, sizeof(cl_mem), (void*) &d_outputBuffer);
- cl.launch(kn);
- }
- // Copy results back to host.
- clEnqueueReadBuffer(cl.q(), // The command queue.
- d_gpuResult[final_ret], // The result on the device.
- CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
- 0, // Offset. None in this case.
- sizeof(cl_int)*cols, // Size to copy.
- result, // The pointer to the memory on the host.
- 0, // Number of events in wait list. Not used.
- NULL, // Event wait list. Not used.
- NULL); // Event object for determining status. Not used.
- // Copy string buffer used for debugging from device to host.
- clEnqueueReadBuffer(cl.q(), // The command queue.
- d_outputBuffer, // Debug buffer on the device.
- CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
- 0, // Offset. None in this case.
- sizeof(cl_char)*16384, // Size to copy.
- h_outputBuffer, // The pointer to the memory on the host.
- 0, // Number of events in wait list. Not used.
- NULL, // Event wait list. Not used.
- NULL); // Event object for determining status. Not used.
-
- // Tack a null terminator at the end of the string.
- h_outputBuffer[16383] = '\0';
-
- #ifdef BENCH_PRINT
- for (int i = 0; i < cols; i++)
- printf("%d ", data[i]);
- printf("\n");
- for (int i = 0; i < cols; i++)
- printf("%d ", result[i]);
- printf("\n");
- #endif
- // Memory cleanup here.
- delete[] data;
- delete[] wall;
- delete[] result;
-
- return EXIT_SUCCESS;
- }
|