123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546 |
- #ifdef __cplusplus
- extern "C" {
- #endif
- //========================================================================================================================================================================================================200
- // DEFINE/INCLUDE
- //========================================================================================================================================================================================================200
- //======================================================================================================================================================150
- // LIBRARIES
- //======================================================================================================================================================150
- #include <string.h>
- #include <CL/cl.h> // (in library path provided to compiler) needed by OpenCL types and functions
- //======================================================================================================================================================150
- // MAIN FUNCTION HEADER
- //======================================================================================================================================================150
- #include "./../main.h" // (in the main program folder) needed to recognized input parameters
- //======================================================================================================================================================150
- // UTILITIES
- //======================================================================================================================================================150
- #include "./../util/opencl/opencl.h" // (in library path specified to compiler) needed by for device functions
- #include "./../util/timer/timer.h" // (in library path specified to compiler) needed by timer
- //======================================================================================================================================================150
- // KERNEL_GPU_OPENCL_WRAPPER FUNCTION HEADER
- //======================================================================================================================================================150
- #include "./kernel_gpu_opencl_wrapper.h" // (in the current directory)
- //========================================================================================================================================================================================================200
- // KERNEL_GPU_OPENCL_WRAPPER FUNCTION
- //========================================================================================================================================================================================================200
- void
- kernel_gpu_opencl_wrapper(par_str par_cpu,
- dim_str dim_cpu,
- box_str* box_cpu,
- FOUR_VECTOR* rv_cpu,
- fp* qv_cpu,
- FOUR_VECTOR* fv_cpu,
- int platform_id,
- int device_id,
- int use_gpu)
- {
- //======================================================================================================================================================150
- // CPU VARIABLES
- //======================================================================================================================================================150
- // timer
- long long time0;
- long long time1;
- long long time2;
- long long time3;
- long long time4;
- long long time5;
- long long time6;
- time0 = get_time();
- //======================================================================================================================================================150
- // GPU SETUP
- //======================================================================================================================================================150
- //====================================================================================================100
- // COMMON VARIABLES
- //====================================================================================================100
- // common variables
- cl_int error;
- //====================================================================================================100
- // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
- //====================================================================================================100
- // Get the number of available platforms
- cl_uint num_platforms;
- error = clGetPlatformIDs( 0,
- NULL,
- &num_platforms);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // Get the list of available platforms
- cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
- error = clGetPlatformIDs( num_platforms,
- platforms,
- NULL);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // Select the platform in accordance to platform_id passed as parameter
- cl_platform_id platform = platforms[platform_id];
- // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
- char pbuf[100];
- error = clGetPlatformInfo( platform,
- CL_PLATFORM_VENDOR,
- sizeof(pbuf),
- pbuf,
- NULL);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- printf("Platform: %s\n", pbuf);
- //====================================================================================================100
- // CREATE CONTEXT FOR THE PLATFORM
- //====================================================================================================100
- // Selector for the device type in accordance to what passed as parameter
- cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
-
- // Create context properties for selected platform
- cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
- (cl_context_properties) platform,
- 0};
- // Create context for selected platform being CPU/GPU
- cl_context context;
- context = clCreateContextFromType(context_properties,
- device_type,
- NULL,
- NULL,
- &error);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //====================================================================================================100
- // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
- //====================================================================================================100
- // Get the number of devices (previousely selected for the context)
- size_t devices_size;
- error = clGetContextInfo( context,
- CL_CONTEXT_DEVICES,
- 0,
- NULL,
- &devices_size);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // Get the list of devices (previousely selected for the context)
- cl_device_id *devices = (cl_device_id *) malloc(devices_size);
- error = clGetContextInfo( context,
- CL_CONTEXT_DEVICES,
- devices_size,
- devices,
- NULL);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // Select the device passed as parameter
- cl_device_id device;
- device = devices[device_id];
- // Get the name of the selected device (previousely selected for the context) and print it
- error = clGetDeviceInfo(device,
- CL_DEVICE_NAME,
- sizeof(pbuf),
- pbuf,
- NULL);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- printf("Device: %s\n", pbuf);
- //====================================================================================================100
- // CREATE COMMAND QUEUE FOR THE DEVICE
- //====================================================================================================100
- // Create a command queue
- cl_command_queue command_queue;
- command_queue = clCreateCommandQueue( context,
- device,
- 0,
- &error);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //====================================================================================================100
- // CRATE PROGRAM, COMPILE IT
- //====================================================================================================100
- // Load kernel source code from file
- const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
- size_t sourceSize = strlen(source);
- // Create the program
- cl_program program = clCreateProgramWithSource( context,
- 1,
- &source,
- &sourceSize,
- &error);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // parameterized kernel dimension
- char clOptions[110];
- // sprintf(clOptions,"-I../../src");
- sprintf(clOptions,"-I.");
- #ifdef RD_WG_SIZE
- sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
- #endif
- #ifdef RD_WG_SIZE_0
- sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
- #endif
- #ifdef RD_WG_SIZE_0_0
- sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
- #endif
- // Compile the program
- error = clBuildProgram( program,
- 1,
- &device,
- clOptions,
- NULL,
- NULL);
- // Print warnings and errors from compilation
- static char log[65536];
- memset(log, 0, sizeof(log));
- clGetProgramBuildInfo( program,
- device,
- CL_PROGRAM_BUILD_LOG,
- sizeof(log)-1,
- log,
- NULL);
- if (strstr(log,"warning:") || strstr(log, "error:"))
- printf("<<<<\n%s\n>>>>\n", log);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // Create kernel
- cl_kernel kernel;
- kernel = clCreateKernel(program,
- "kernel_gpu_opencl",
- &error);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //====================================================================================================100
- // INITIAL DRIVER OVERHEAD
- //====================================================================================================100
- // cudaThreadSynchronize();
- //====================================================================================================100
- // EXECUTION PARAMETERS
- //====================================================================================================100
- size_t local_work_size[1];
- local_work_size[0] = NUMBER_THREADS;
- size_t global_work_size[1];
- global_work_size[0] = dim_cpu.number_boxes * local_work_size[0];
- printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", global_work_size[0]/local_work_size[0], local_work_size[0]);
- time1 = get_time();
- //======================================================================================================================================================150
- // GPU MEMORY (MALLOC)
- //======================================================================================================================================================150
- //====================================================================================================100
- // GPU MEMORY COPY IN
- //====================================================================================================100
- //==================================================50
- // boxes
- //==================================================50
- cl_mem d_box_gpu;
- d_box_gpu = clCreateBuffer( context,
- CL_MEM_READ_WRITE,
- dim_cpu.box_mem,
- NULL,
- &error );
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //==================================================50
- // rv
- //==================================================50
- cl_mem d_rv_gpu;
- d_rv_gpu = clCreateBuffer( context,
- CL_MEM_READ_WRITE,
- dim_cpu.space_mem,
- NULL,
- &error );
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //==================================================50
- // qv
- //==================================================50
- cl_mem d_qv_gpu;
- d_qv_gpu = clCreateBuffer( context,
- CL_MEM_READ_WRITE,
- dim_cpu.space_mem2,
- NULL,
- &error );
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //====================================================================================================100
- // GPU MEMORY COPY (IN & OUT)
- //====================================================================================================100
- //==================================================50
- // fv
- //==================================================50
- cl_mem d_fv_gpu;
- d_fv_gpu = clCreateBuffer( context,
- CL_MEM_READ_WRITE,
- dim_cpu.space_mem,
- NULL,
- &error );
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- time2 = get_time();
- //======================================================================================================================================================150
- // GPU MEMORY COPY IN
- //======================================================================================================================================================150
- //====================================================================================================100
- // GPU MEMORY COPY IN
- //====================================================================================================100
- //==================================================50
- // boxes
- //==================================================50
- error = clEnqueueWriteBuffer( command_queue, // command queue
- d_box_gpu, // destination
- 1, // block the source from access until this copy operation complates (1=yes, 0=no)
- 0, // offset in destination to write to
- dim_cpu.box_mem, // size to be copied
- box_cpu, // source
- 0, // # of events in the list of events to wait for
- NULL, // list of events to wait for
- NULL); // ID of this operation to be used by waiting operations
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //==================================================50
- // rv
- //==================================================50
- error = clEnqueueWriteBuffer( command_queue,
- d_rv_gpu,
- 1,
- 0,
- dim_cpu.space_mem,
- rv_cpu,
- 0,
- 0,
- 0);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //==================================================50
- // qv
- //==================================================50
- error = clEnqueueWriteBuffer( command_queue,
- d_qv_gpu,
- 1,
- 0,
- dim_cpu.space_mem2,
- qv_cpu,
- 0,
- 0,
- 0);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- //====================================================================================================100
- // GPU MEMORY COPY (IN & OUT)
- //====================================================================================================100
- //==================================================50
- // fv
- //==================================================50
- error = clEnqueueWriteBuffer( command_queue,
- d_fv_gpu,
- 1,
- 0,
- dim_cpu.space_mem,
- fv_cpu,
- 0,
- 0,
- 0);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- time3 = get_time();
- //======================================================================================================================================================150
- // KERNEL
- //======================================================================================================================================================150
- // ???
- clSetKernelArg( kernel,
- 0,
- sizeof(par_str),
- (void *) &par_cpu);
- clSetKernelArg( kernel,
- 1,
- sizeof(dim_str),
- (void *) &dim_cpu);
- clSetKernelArg( kernel,
- 2,
- sizeof(cl_mem),
- (void *) &d_box_gpu);
- clSetKernelArg( kernel,
- 3,
- sizeof(cl_mem),
- (void *) &d_rv_gpu);
- clSetKernelArg( kernel,
- 4,
- sizeof(cl_mem),
- (void *) &d_qv_gpu);
- clSetKernelArg( kernel,
- 5,
- sizeof(cl_mem),
- (void *) &d_fv_gpu);
- // launch kernel - all boxes
- error = clEnqueueNDRangeKernel( command_queue,
- kernel,
- 1,
- NULL,
- global_work_size,
- local_work_size,
- 0,
- NULL,
- NULL);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
- error = clFinish(command_queue);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- time4 = get_time();
- //======================================================================================================================================================150
- // GPU MEMORY COPY OUT
- //======================================================================================================================================================150
- //====================================================================================================100
- // GPU MEMORY COPY (IN & OUT)
- //====================================================================================================100
- //==================================================50
- // fv
- //==================================================50
- error = clEnqueueReadBuffer(command_queue, // The command queue.
- d_fv_gpu, // The image on the device.
- CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
- 0, // Offset. None in this case.
- dim_cpu.space_mem, // Size to copy.
- fv_cpu, // The pointer to the image on the host.
- 0, // Number of events in wait list. Not used.
- NULL, // Event wait list. Not used.
- NULL); // Event object for determining status. Not used.
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // (enable for testing purposes only - prints some range of output, make sure not to initialize input in main.c with random numbers for comparison across runs)
- // int g;
- // int offset = 395;
- // for(g=0; g<10; g++){
- // printf("%f, %f, %f, %f\n", fv_cpu[offset+g].v, fv_cpu[offset+g].x, fv_cpu[offset+g].y, fv_cpu[offset+g].z);
- // }
- time5 = get_time();
- //======================================================================================================================================================150
- // GPU MEMORY DEALLOCATION
- //======================================================================================================================================================150
- // Release kernels...
- clReleaseKernel(kernel);
- // Now the program...
- clReleaseProgram(program);
- // Clean up the device memory...
- clReleaseMemObject(d_rv_gpu);
- clReleaseMemObject(d_qv_gpu);
- clReleaseMemObject(d_fv_gpu);
- clReleaseMemObject(d_box_gpu);
- // Flush the queue
- error = clFlush(command_queue);
- if (error != CL_SUCCESS)
- fatal_CL(error, __LINE__);
- // ...and finally, the queue and context.
- clReleaseCommandQueue(command_queue);
- // ???
- clReleaseContext(context);
- time6 = get_time();
- //======================================================================================================================================================150
- // DISPLAY TIMING
- //======================================================================================================================================================150
- printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
- printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
- printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
- printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
- printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
- printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
- printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
- printf("Total time:\n");
- printf("%.12f s\n", (float) (time6-time0) / 1000000);
- }
- //========================================================================================================================================================================================================200
- // END KERNEL_GPU_OPENCL_WRAPPER FUNCTION
- //========================================================================================================================================================================================================200
- #ifdef __cplusplus
- }
- #endif
|