#include "find_ellipse.h" #include "track_ellipse_opencl.h" #include "OpenCL_helper_library.h" // Host and device arrays to hold device pointers to input matrices int *host_I_offsets; cl_mem device_I_offsets; // Host and device arrays to hold sizes of input matrices int *host_m_array, *host_n_array; cl_mem device_m_array, device_n_array; // Host and device arrays to hold matrices for all cells // (so we can copy to and from the device in a single transfer) float *host_I_all; cl_mem device_I_all, device_IMGVF_all; size_t total_mem_size; // The number of work items per work group const size_t local_work_size = 128; cl_bool compiled = FALSE; cl_kernel IMGVF_kernel; // Host function that launches an OpenCL kernel to compute the MGVF matrices for the specified cells void IMGVF_OpenCL(MAT **I, MAT **IMGVF, double vx, double vy, double e, int max_iterations, double cutoff, int num_cells) { cl_int error; // Initialize the data on the GPU IMGVF_OpenCL_init(I, num_cells); if (! compiled) { // Load the kernel source from the file const char *source = load_kernel_source("track_ellipse_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); check_error(error, __FILE__, __LINE__); error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); check_error(error, __FILE__, __LINE__); // Create the IMGVF kernels IMGVF_kernel = clCreateKernel(program, "IMGVF_kernel", &error); check_error(error, __FILE__, __LINE__); // Record that compiling has already completed compiled = TRUE; } // Setup execution parameters size_t num_work_groups = num_cells; size_t global_work_size = num_work_groups * local_work_size; // Convert double-precision parameters to single-precision float vx_float = (float) vx; float vy_float = (float) vy; float e_float = (float) e; float cutoff_float = (float) cutoff; // Set the kernel arguments clSetKernelArg(IMGVF_kernel, 0, sizeof(cl_mem), (void *) &device_IMGVF_all); clSetKernelArg(IMGVF_kernel, 1, sizeof(cl_mem), (void *) &device_I_all); clSetKernelArg(IMGVF_kernel, 2, sizeof(cl_mem), (void *) &device_I_offsets); clSetKernelArg(IMGVF_kernel, 3, sizeof(cl_mem), (void *) &device_m_array); clSetKernelArg(IMGVF_kernel, 4, sizeof(cl_mem), (void *) &device_n_array); clSetKernelArg(IMGVF_kernel, 5, sizeof(cl_float), (void *) &vx_float); clSetKernelArg(IMGVF_kernel, 6, sizeof(cl_float), (void *) &vy_float); clSetKernelArg(IMGVF_kernel, 7, sizeof(cl_float), (void *) &e_float); clSetKernelArg(IMGVF_kernel, 8, sizeof(cl_int), (void *) &max_iterations); clSetKernelArg(IMGVF_kernel, 9, sizeof(cl_float), (void *) &cutoff_float); // Compute the MGVF on the GPU error = clEnqueueNDRangeKernel(command_queue, IMGVF_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); // Check for kernel errors error = clFinish(command_queue); check_error(error, __FILE__, __LINE__); // Copy back the final results from the GPU IMGVF_OpenCL_cleanup(IMGVF, num_cells); } // Initializes data on the GPU for the MGVF kernel void IMGVF_OpenCL_init(MAT **IE, int num_cells) { cl_int error; // Allocate array of offsets to each cell's image size_t mem_size = sizeof(int) * num_cells; host_I_offsets = (int *) malloc(mem_size); device_I_offsets = clCreateBuffer(context, CL_MEM_READ_ONLY, mem_size, NULL, &error); check_error(error, __FILE__, __LINE__); // Allocate arrays to hold the dimensions of each cell's image host_m_array = (int *) malloc(mem_size); host_n_array = (int *) malloc(mem_size); device_m_array = clCreateBuffer(context, CL_MEM_READ_ONLY, mem_size, NULL, &error); check_error(error, __FILE__, __LINE__); device_n_array = clCreateBuffer(context, CL_MEM_READ_ONLY, mem_size, NULL, &error); check_error(error, __FILE__, __LINE__); // Figure out the size of all of the matrices combined int i, j, cell_num; size_t total_size = 0; for (cell_num = 0; cell_num < num_cells; cell_num++) { MAT *I = IE[cell_num]; size_t size = I->m * I->n; total_size += size; } total_mem_size = total_size * sizeof(float); // Allocate host memory just once for all cells host_I_all = (float *) malloc(total_mem_size); // Allocate device memory just once for all cells device_I_all = clCreateBuffer(context, CL_MEM_READ_ONLY, total_mem_size, NULL, &error); check_error(error, __FILE__, __LINE__); device_IMGVF_all = clCreateBuffer(context, CL_MEM_READ_WRITE, total_mem_size, NULL, &error); check_error(error, __FILE__, __LINE__); // Copy each initial matrix into the allocated host memory int offset = 0; for (cell_num = 0; cell_num < num_cells; cell_num++) { MAT *I = IE[cell_num]; // Determine the size of the matrix int m = I->m, n = I->n; int size = m * n; // Store memory dimensions host_m_array[cell_num] = m; host_n_array[cell_num] = n; // Store offsets to this cell's image host_I_offsets[cell_num] = offset; // Copy matrix I (which is also the initial IMGVF matrix) into the overall array for (i = 0; i < m; i++) for (j = 0; j < n; j++) host_I_all[offset + (i * n) + j] = (float) m_get_val(I, i, j); offset += size; } // Copy I matrices (which are also the initial IMGVF matrices) to device error = clEnqueueWriteBuffer(command_queue, device_I_all, CL_TRUE, 0, total_mem_size, (void *) host_I_all, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); error = clEnqueueWriteBuffer(command_queue, device_IMGVF_all, CL_TRUE, 0, total_mem_size, (void *) host_I_all, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); // Copy offsets array to device error = clEnqueueWriteBuffer(command_queue, device_I_offsets, CL_TRUE, 0, mem_size, (void *) host_I_offsets, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); // Copy memory dimension arrays to device error = clEnqueueWriteBuffer(command_queue, device_m_array, CL_TRUE, 0, mem_size, (void *) host_m_array, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); error = clEnqueueWriteBuffer(command_queue, device_n_array, CL_TRUE, 0, mem_size, (void *) host_n_array, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); } // Copies the results of the MGVF kernel back to the host void IMGVF_OpenCL_cleanup(MAT **IMGVF_out_array, int num_cells) { cl_int error; // Copy the result matrices from the device to the host float *host_IMGVF_all = (cl_float *) clEnqueueMapBuffer(command_queue, device_IMGVF_all, CL_TRUE, CL_MAP_READ, 0, total_mem_size, 0, NULL, NULL, &error); check_error(error, __FILE__, __LINE__); // Copy each result matrix into its appropriate host matrix int cell_num, offset = 0; for (cell_num = 0; cell_num < num_cells; cell_num++) { MAT *IMGVF_out = IMGVF_out_array[cell_num]; // Determine the size of the matrix int m = IMGVF_out->m, n = IMGVF_out->n, i, j; // Pack the result into the matrix for (i = 0; i < m; i++) for (j = 0; j < n; j++) m_set_val(IMGVF_out, i, j, (double) host_IMGVF_all[offset + (i * n) + j]); offset += (m * n); } // Unmap results buffer error = clEnqueueUnmapMemObject(command_queue, device_IMGVF_all, (void *) host_IMGVF_all, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); // Free device memory clReleaseMemObject(device_m_array); clReleaseMemObject(device_n_array); clReleaseMemObject(device_IMGVF_all); clReleaseMemObject(device_I_all); clReleaseMemObject(device_I_offsets); // Free host memory free(host_m_array); free(host_n_array); free(host_I_all); free(host_I_offsets); }