//========================================================================================================================================================================================================200 // DEFINE/INCLUDE //========================================================================================================================================================================================================200 //======================================================================================================================================================150 // LIBRARIES //======================================================================================================================================================150 #include // (in directory specified to compiler) needed by OpenCL types and functions //======================================================================================================================================================150 // MAIN FUNCTION HEADER //======================================================================================================================================================150 #include "./../main.h" // (in main directory) needed to recognized input parameters //======================================================================================================================================================150 // DEFINE //======================================================================================================================================================150 //======================================================================================================================================================150 // UTILITIES //======================================================================================================================================================150 #include "./../util/opencl/opencl.h" // (in directory) needed by device functions #include "./../util/avi/avilib.h" // (in directory) needed by avi functions #include "./../util/avi/avimod.h" // (in directory) needed by avi functions //======================================================================================================================================================150 // KERNEL_GPU_CUDA_WRAPPER FUNCTION HEADER //======================================================================================================================================================150 #include "./kernel_gpu_opencl_wrapper.h" // (in directory) //======================================================================================================================================================150 // END //======================================================================================================================================================150 //========================================================================================================================================================================================================200 // KERNEL_GPU_CUDA_WRAPPER FUNCTION //========================================================================================================================================================================================================200 void kernel_gpu_opencl_wrapper( params_common common, int* endoRow, int* endoCol, int* tEndoRowLoc, int* tEndoColLoc, int* epiRow, int* epiCol, int* tEpiRowLoc, int* tEpiColLoc, avi_t* frames) { //======================================================================================================================================================150 // CPU VARIABLES //======================================================================================================================================================150 // common variables int i; //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); // # of platforms if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("# of platforms %d\n", num_platforms); // Get list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get names of platforms and print them cl_char pbuf[100]; int plat_count; cl_platform_id platform; for(plat_count = 0; plat_count < num_platforms; plat_count++){ platform = platforms[plat_count]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("\tPlatform %d: %s\n", plat_count, pbuf); } // Select platform int plat = 1; platform = platforms[plat]; printf("Selecting platform %d\n", plat); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_CPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); // number of bytes (devices * sizeof(cl_device_id)) if (error != CL_SUCCESS) fatal_CL(error, __LINE__); int num_devices = devices_size / sizeof(cl_device_id); printf("# of devices %d\n", num_devices); // Get the list of devices (previousely selected for the context) cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get names of devices and print them cl_char dbuf[100]; int devi_count; cl_device_id device; for(devi_count = 0; devi_count < num_devices; devi_count++){ device = devices[devi_count]; error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(dbuf), dbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("\tDevice %d: %s\n", devi_count, dbuf); } // Select device (previousely selected for the context) (if there are multiple devices, choose the first one) int devi = 0; device = devices[devi]; printf("Selecting device %d\n", devi); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[150]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions,"-I."); #ifdef RD_WG_SIZE sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE); #endif #ifdef RD_WG_SIZE_0 sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0); #endif #ifdef RD_WG_SIZE_0_0 sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0); #endif // Compile the program error = clBuildProgram( program, 1, &device, clOptions, NULL, NULL); // Print warnings and errors from compilation static cl_char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create kernel cl_kernel kernel; kernel = clCreateKernel(program, "kernel_gpu_opencl", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // TRIGGERING INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); // the above does it //======================================================================================================================================================150 // GPU MEMORY ALLOCATION //======================================================================================================================================================150 //====================================================================================================100 // Common (COPY IN) //====================================================================================================100 cl_mem d_common; d_common = clCreateBuffer( context, CL_MEM_READ_WRITE, common.common_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // Frame (COPY IN) //====================================================================================================100 // common cl_mem d_frame; d_frame = clCreateBuffer( context, CL_MEM_READ_WRITE, common.frame_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // Inputs (COPY IN) //====================================================================================================100 //==================================================50 // endo points //==================================================50 // common cl_mem d_endoRow; d_endoRow = clCreateBuffer( context, CL_MEM_READ_WRITE, common.endo_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_endoCol; d_endoCol = clCreateBuffer( context, CL_MEM_READ_WRITE, common.endo_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_tEndoRowLoc; d_tEndoRowLoc = clCreateBuffer( context, CL_MEM_READ_WRITE, common.endo_mem * common.no_frames, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_tEndoColLoc; d_tEndoColLoc = clCreateBuffer( context, CL_MEM_READ_WRITE, common.endo_mem * common.no_frames, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // epi points //==================================================50 // common cl_mem d_epiRow; d_epiRow = clCreateBuffer( context, CL_MEM_READ_WRITE, common.epi_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_epiCol; d_epiCol = clCreateBuffer( context, CL_MEM_READ_WRITE, common.epi_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_tEpiRowLoc; d_tEpiRowLoc = clCreateBuffer( context, CL_MEM_READ_WRITE, common.epi_mem * common.no_frames, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_tEpiColLoc; d_tEpiColLoc = clCreateBuffer( context, CL_MEM_READ_WRITE, common.epi_mem * common.no_frames, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // Array of Templates for All Points (COPY IN) //====================================================================================================100 //==================================================50 // template sizes //==================================================50 // common printf("tSize is %d, sSize is %d\n", common.tSize, common.sSize); common.in_rows = common.tSize + 1 + common.tSize; common.in_cols = common.in_rows; common.in_elem = common.in_rows * common.in_cols; common.in_mem = sizeof(fp) * common.in_elem; //==================================================50 // endo points templates //==================================================50 // common cl_mem d_endoT; d_endoT = clCreateBuffer( context, CL_MEM_READ_WRITE, common.in_mem * common.endoPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // epi points templates //==================================================50 // common cl_mem d_epiT; d_epiT = clCreateBuffer( context, CL_MEM_READ_WRITE, common.in_mem * common.epiPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // AREA AROUND POINT FROM FRAME (LOCAL) //====================================================================================================100 // common common.in2_rows = common.sSize + 1 + common.sSize; common.in2_cols = common.in2_rows; common.in2_elem = common.in2_rows * common.in2_cols; common.in2_mem = sizeof(fp) * common.in2_elem; // unique cl_mem d_in2; d_in2 = clCreateBuffer( context, CL_MEM_READ_WRITE, common.in2_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CONVOLUTION (LOCAL) //====================================================================================================100 // common common.conv_rows = common.in_rows + common.in2_rows - 1; // number of rows in I common.conv_cols = common.in_cols + common.in2_cols - 1; // number of columns in I common.conv_elem = common.conv_rows * common.conv_cols; // number of elements common.conv_mem = sizeof(fp) * common.conv_elem; common.ioffset = 0; common.joffset = 0; // unique cl_mem d_conv; d_conv = clCreateBuffer(context, CL_MEM_READ_WRITE, common.conv_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CUMULATIVE SUM (LOCAL) //====================================================================================================100 //==================================================50 // PADDING OF ARRAY, VERTICAL CUMULATIVE SUM //==================================================50 // common common.in2_pad_add_rows = common.in_rows; common.in2_pad_add_cols = common.in_cols; common.in2_pad_cumv_rows = common.in2_rows + 2*common.in2_pad_add_rows; common.in2_pad_cumv_cols = common.in2_cols + 2*common.in2_pad_add_cols; common.in2_pad_cumv_elem = common.in2_pad_cumv_rows * common.in2_pad_cumv_cols; common.in2_pad_cumv_mem = sizeof(fp) * common.in2_pad_cumv_elem; // unique cl_mem d_in2_pad_cumv; d_in2_pad_cumv = clCreateBuffer(context, CL_MEM_READ_WRITE, common.in2_pad_cumv_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // SELECTION //==================================================50 // common common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1) common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1; common.in2_pad_cumv_sel_collow = 1; common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols; common.in2_pad_cumv_sel_rows = common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1; common.in2_pad_cumv_sel_cols = common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1; common.in2_pad_cumv_sel_elem = common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols; common.in2_pad_cumv_sel_mem = sizeof(fp) * common.in2_pad_cumv_sel_elem; // unique cl_mem d_in2_pad_cumv_sel; d_in2_pad_cumv_sel = clCreateBuffer(context, CL_MEM_READ_WRITE, common.in2_pad_cumv_sel_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM //==================================================50 // common common.in2_pad_cumv_sel2_rowlow = 1; common.in2_pad_cumv_sel2_rowhig = common.in2_pad_cumv_rows - common.in_rows - 1; common.in2_pad_cumv_sel2_collow = 1; common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols; common.in2_sub_cumh_rows = common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1; common.in2_sub_cumh_cols = common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1; common.in2_sub_cumh_elem = common.in2_sub_cumh_rows * common.in2_sub_cumh_cols; common.in2_sub_cumh_mem = sizeof(fp) * common.in2_sub_cumh_elem; // unique cl_mem d_in2_sub_cumh; d_in2_sub_cumh = clCreateBuffer(context, CL_MEM_READ_WRITE, common.in2_sub_cumh_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // SELECTION //==================================================50 // common common.in2_sub_cumh_sel_rowlow = 1; common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows; common.in2_sub_cumh_sel_collow = 1 + common.in_cols; common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1; common.in2_sub_cumh_sel_rows = common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1; common.in2_sub_cumh_sel_cols = common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1; common.in2_sub_cumh_sel_elem = common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols; common.in2_sub_cumh_sel_mem = sizeof(fp) * common.in2_sub_cumh_sel_elem; // unique cl_mem d_in2_sub_cumh_sel; d_in2_sub_cumh_sel = clCreateBuffer(context, CL_MEM_READ_WRITE, common.in2_sub_cumh_sel_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // SELECTION 2, SUBTRACTION //==================================================50 // common common.in2_sub_cumh_sel2_rowlow = 1; common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows; common.in2_sub_cumh_sel2_collow = 1; common.in2_sub_cumh_sel2_colhig = common.in2_sub_cumh_cols - common.in_cols - 1; common.in2_sub2_rows = common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1; common.in2_sub2_cols = common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1; common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols; common.in2_sub2_mem = sizeof(fp) * common.in2_sub2_elem; // unique cl_mem d_in2_sub2; d_in2_sub2 = clCreateBuffer(context, CL_MEM_READ_WRITE, common.in2_sub2_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CUMULATIVE SUM 2 (LOCAL) //====================================================================================================100 //==================================================50 // MULTIPLICATION //==================================================50 // common common.in2_sqr_rows = common.in2_rows; common.in2_sqr_cols = common.in2_cols; common.in2_sqr_elem = common.in2_elem; common.in2_sqr_mem = common.in2_mem; // unique cl_mem d_in2_sqr; d_in2_sqr = clCreateBuffer(context, CL_MEM_READ_WRITE, common.in2_sqr_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // SELECTION 2, SUBTRACTION //==================================================50 // common common.in2_sqr_sub2_rows = common.in2_sub2_rows; common.in2_sqr_sub2_cols = common.in2_sub2_cols; common.in2_sqr_sub2_elem = common.in2_sub2_elem; common.in2_sqr_sub2_mem = common.in2_sub2_mem; // unique cl_mem d_in2_sqr_sub2; d_in2_sqr_sub2 = clCreateBuffer(context, CL_MEM_READ_WRITE, common.in2_sqr_sub2_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // FINAL (LOCAL) //====================================================================================================100 // common common.in_sqr_rows = common.in_rows; common.in_sqr_cols = common.in_cols; common.in_sqr_elem = common.in_elem; common.in_sqr_mem = common.in_mem; // unique cl_mem d_in_sqr; d_in_sqr = clCreateBuffer( context, CL_MEM_READ_WRITE, common.in_sqr_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // TEMPLATE MASK CREATE (LOCAL) //====================================================================================================100 // common common.tMask_rows = common.in_rows + (common.sSize+1+common.sSize) - 1; common.tMask_cols = common.tMask_rows; common.tMask_elem = common.tMask_rows * common.tMask_cols; common.tMask_mem = sizeof(fp) * common.tMask_elem; // unique cl_mem d_tMask; d_tMask = clCreateBuffer( context, CL_MEM_READ_WRITE, common.tMask_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // POINT MASK INITIALIZE (LOCAL) //====================================================================================================100 // common common.mask_rows = common.maxMove; common.mask_cols = common.mask_rows; common.mask_elem = common.mask_rows * common.mask_cols; common.mask_mem = sizeof(fp) * common.mask_elem; //====================================================================================================100 // MASK CONVOLUTION (LOCAL) //====================================================================================================100 // common common.mask_conv_rows = common.tMask_rows; // number of rows in I common.mask_conv_cols = common.tMask_cols; // number of columns in I common.mask_conv_elem = common.mask_conv_rows * common.mask_conv_cols; // number of elements common.mask_conv_mem = sizeof(fp) * common.mask_conv_elem; common.mask_conv_ioffset = (common.mask_rows-1)/2; if((common.mask_rows-1) % 2 > 0.5){ common.mask_conv_ioffset = common.mask_conv_ioffset + 1; } common.mask_conv_joffset = (common.mask_cols-1)/2; if((common.mask_cols-1) % 2 > 0.5){ common.mask_conv_joffset = common.mask_conv_joffset + 1; } // unique cl_mem d_mask_conv; d_mask_conv = clCreateBuffer( context, CL_MEM_READ_WRITE, common.mask_conv_mem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY //======================================================================================================================================================150 //====================================================================================================100 // Inputs //====================================================================================================100 //==================================================50 // endo points //==================================================50 error = clEnqueueWriteBuffer( command_queue, d_endoRow, 1, 0, common.endo_mem, endoRow, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_endoCol, 1, 0, common.endo_mem, endoCol, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // epi points //==================================================50 error = clEnqueueWriteBuffer( command_queue, d_epiRow, 1, 0, common.epi_mem, epiRow, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_epiCol, 1, 0, common.epi_mem, epiCol, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // KERNEL //======================================================================================================================================================150 //====================================================================================================100 // EXECUTION PARAMETERS //====================================================================================================100 // All kernels operations within kernel use same max size of threads. Size of block size is set to the size appropriate for max size operation (on padded matrix). Other use subsets of that. size_t local_work_size[1]; local_work_size[0] = NUMBER_THREADS; size_t global_work_size[1]; global_work_size[0] = common.allPoints * local_work_size[0]; printf("# of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]); //====================================================================================================100 // COPY ARGUMENTS //====================================================================================================100 error = clEnqueueWriteBuffer( command_queue, d_common, 1, 0, common.common_mem, &common, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // set kernel arguments //====================================================================================================100 // structures error = clSetKernelArg( kernel, 0, sizeof(params_common), (void *) &common); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // common error = clSetKernelArg( kernel, 3, sizeof(cl_mem), (void *) &d_endoRow); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 4, sizeof(cl_mem), (void *) &d_endoCol); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 5, sizeof(cl_mem), (void *) &d_tEndoRowLoc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 6, sizeof(cl_mem), (void *) &d_tEndoColLoc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 7, sizeof(cl_mem), (void *) &d_epiRow); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 8, sizeof(cl_mem), (void *) &d_epiCol); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 9, sizeof(cl_mem), (void *) &d_tEpiRowLoc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 10, sizeof(cl_mem), (void *) &d_tEpiColLoc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // common_unique error = clSetKernelArg( kernel, 11, sizeof(cl_mem), (void *) &d_endoT); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 12, sizeof(cl_mem), (void *) &d_epiT); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 13, sizeof(cl_mem), (void *) &d_in2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 14, sizeof(cl_mem), (void *) &d_conv); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 15, sizeof(cl_mem), (void *) &d_in2_pad_cumv); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 16, sizeof(cl_mem), (void *) &d_in2_pad_cumv_sel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 17, sizeof(cl_mem), (void *) &d_in2_sub_cumh); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 18, sizeof(cl_mem), (void *) &d_in2_sub_cumh_sel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 19, sizeof(cl_mem), (void *) &d_in2_sub2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 20, sizeof(cl_mem), (void *) &d_in2_sqr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 21, sizeof(cl_mem), (void *) &d_in2_sqr_sub2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 22, sizeof(cl_mem), (void *) &d_in_sqr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 23, sizeof(cl_mem), (void *) &d_tMask); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 24, sizeof(cl_mem), (void *) &d_mask_conv); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // // local // // int local_size_one; // // local_size_one = common.in_rows; // error = clSetKernelArg( kernel, // 25, // // sizeof(fp) * local_size_one, // size 51 // sizeof(fp) * common.in_elem, // NULL); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); // error = clSetKernelArg( kernel, // 26, // // sizeof(fp) * local_size_one, // size 51 // sizeof(fp) * common.in_cols, // NULL); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); // // int local_size_two; // // local_size_two = common.in_rows + common.in2_rows - 1; // error = clSetKernelArg( kernel, // 27, // // sizeof(fp) * local_size_two, // size 51+81-1=131 // sizeof(fp) * common.in_sqr_rows, // NULL); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); // error = clSetKernelArg( kernel, // 28, // // sizeof(fp) * local_size_two, // size 51+81-1=131 // sizeof(fp) * common.mask_conv_rows, // NULL); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); // // int local_size_three; // // local_size_three = common.in_rows * common.in_rows; // error = clSetKernelArg( kernel, // 29, // // sizeof(fp) * local_size_three, // size 51*51=2601 // sizeof(int) * common.mask_conv_rows, // NULL); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); // int local_size; // local_size = (common.in_elem + common.in_cols + common.in_sqr_rows + common.mask_conv_rows) * 4 + common.mask_conv_rows * 2; // printf("size of used local memory/workgroup = %dB (ensure that device can handle)\n", local_size); cl_mem d_in_mod_temp; d_in_mod_temp = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(fp) * common.in_elem * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem in_partial_sum; in_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(fp) * common.in_cols * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem in_sqr_partial_sum; in_sqr_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(fp) * common.in_sqr_rows * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem par_max_val; par_max_val = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(fp) * common.mask_conv_rows * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem par_max_coo; par_max_coo = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int) * common.mask_conv_rows * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem in_final_sum; in_final_sum = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(fp) * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem in_sqr_final_sum; in_sqr_final_sum = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(fp) * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem denomT; denomT = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(fp) * common.allPoints, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 25, sizeof(cl_mem), (void *) &d_in_mod_temp); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 26, sizeof(cl_mem), (void *) &in_partial_sum); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 27, sizeof(cl_mem), (void *) &in_sqr_partial_sum); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 28, sizeof(cl_mem), (void *) &par_max_val); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 29, sizeof(cl_mem), (void *) &par_max_coo); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 30, sizeof(cl_mem), (void *) &in_final_sum); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 31, sizeof(cl_mem), (void *) &in_sqr_final_sum); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 32, sizeof(cl_mem), (void *) &denomT); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_checksum; d_checksum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(fp) * CHECK, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( kernel, 33, sizeof(cl_mem), (void *) &d_checksum); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // PRINT FRAME PROGRESS START //====================================================================================================100 printf("frame progress: "); fflush(NULL); //====================================================================================================100 // LAUNCH //====================================================================================================100 // variables fp* frame; int frame_no; for(frame_no=0; frame_no