kernel_gpu_opencl_wrapper.c 49 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467
  1. //========================================================================================================================================================================================================200
  2. // DEFINE/INCLUDE
  3. //========================================================================================================================================================================================================200
  4. //======================================================================================================================================================150
  5. // LIBRARIES
  6. //======================================================================================================================================================150
  7. #include <CL/cl.h> // (in directory specified to compiler) needed by OpenCL types and functions
  8. //======================================================================================================================================================150
  9. // MAIN FUNCTION HEADER
  10. //======================================================================================================================================================150
  11. #include "./../main.h" // (in main directory) needed to recognized input parameters
  12. //======================================================================================================================================================150
  13. // DEFINE
  14. //======================================================================================================================================================150
  15. //======================================================================================================================================================150
  16. // UTILITIES
  17. //======================================================================================================================================================150
  18. #include "./../util/opencl/opencl.h" // (in directory) needed by device functions
  19. #include "./../util/avi/avilib.h" // (in directory) needed by avi functions
  20. #include "./../util/avi/avimod.h" // (in directory) needed by avi functions
  21. //======================================================================================================================================================150
  22. // KERNEL_GPU_CUDA_WRAPPER FUNCTION HEADER
  23. //======================================================================================================================================================150
  24. #include "./kernel_gpu_opencl_wrapper.h" // (in directory)
  25. //======================================================================================================================================================150
  26. // END
  27. //======================================================================================================================================================150
  28. //========================================================================================================================================================================================================200
  29. // KERNEL_GPU_CUDA_WRAPPER FUNCTION
  30. //========================================================================================================================================================================================================200
  31. void
  32. kernel_gpu_opencl_wrapper( params_common common,
  33. int* endoRow,
  34. int* endoCol,
  35. int* tEndoRowLoc,
  36. int* tEndoColLoc,
  37. int* epiRow,
  38. int* epiCol,
  39. int* tEpiRowLoc,
  40. int* tEpiColLoc,
  41. avi_t* frames,
  42. int platform_id,
  43. int device_id,
  44. int use_gpu)
  45. {
  46. //======================================================================================================================================================150
  47. // CPU VARIABLES
  48. //======================================================================================================================================================150
  49. // common variables
  50. int i;
  51. //======================================================================================================================================================150
  52. // GPU SETUP
  53. //======================================================================================================================================================150
  54. //====================================================================================================100
  55. // COMMON VARIABLES
  56. //====================================================================================================100
  57. // common variables
  58. int error;
  59. //====================================================================================================100
  60. // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
  61. //====================================================================================================100
  62. // Get number of available platforms
  63. cl_uint num_platforms;
  64. error = clGetPlatformIDs( 0,
  65. NULL,
  66. &num_platforms); // # of platforms
  67. if (error != CL_SUCCESS)
  68. fatal_CL(error, __LINE__);
  69. printf("# of platforms %d\n", num_platforms);
  70. // Get list of available platforms
  71. cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  72. error = clGetPlatformIDs( num_platforms,
  73. platforms,
  74. NULL);
  75. if (error != CL_SUCCESS)
  76. fatal_CL(error, __LINE__);
  77. // Get names of platforms and print them
  78. cl_char pbuf[100];
  79. int plat_count;
  80. cl_platform_id platform;
  81. for(plat_count = 0; plat_count < num_platforms; plat_count++){
  82. platform = platforms[plat_count];
  83. error = clGetPlatformInfo( platform,
  84. CL_PLATFORM_VENDOR,
  85. sizeof(pbuf),
  86. pbuf,
  87. NULL);
  88. if (error != CL_SUCCESS)
  89. fatal_CL(error, __LINE__);
  90. printf("\tPlatform %d: %s\n", plat_count, pbuf);
  91. }
  92. // Select platform on the basis of the passed as function argument
  93. platform = platforms[platform_id];
  94. printf("Selecting platform %d\n", platform_id);
  95. //====================================================================================================100
  96. // CREATE CONTEXT FOR THE PLATFORM
  97. //====================================================================================================100
  98. // Create context properties for selected platform
  99. cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
  100. (cl_context_properties) platform,
  101. 0};
  102. // Create the device_type selector in the basis of the argument passed (use_gpu)
  103. cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  104. // Create context for selected platform being GPU or CPU
  105. cl_context context;
  106. context = clCreateContextFromType( context_properties,
  107. device_type,
  108. NULL,
  109. NULL,
  110. &error);
  111. if (error != CL_SUCCESS)
  112. fatal_CL(error, __LINE__);
  113. //====================================================================================================100
  114. // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
  115. //====================================================================================================100
  116. // Get number of devices (previousely selected for the context)
  117. size_t devices_size;
  118. error = clGetContextInfo( context,
  119. CL_CONTEXT_DEVICES,
  120. 0,
  121. NULL,
  122. &devices_size); // number of bytes (devices * sizeof(cl_device_id))
  123. if (error != CL_SUCCESS)
  124. fatal_CL(error, __LINE__);
  125. int num_devices = devices_size / sizeof(cl_device_id);
  126. printf("# of devices %d\n", num_devices);
  127. // Get the list of devices (previousely selected for the context)
  128. cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  129. error = clGetContextInfo( context,
  130. CL_CONTEXT_DEVICES,
  131. devices_size,
  132. devices,
  133. NULL);
  134. if (error != CL_SUCCESS)
  135. fatal_CL(error, __LINE__);
  136. // Get names of devices and print them
  137. cl_char dbuf[100];
  138. int devi_count;
  139. cl_device_id device;
  140. for(devi_count = 0; devi_count < num_devices; devi_count++){
  141. device = devices[devi_count];
  142. error = clGetDeviceInfo(device,
  143. CL_DEVICE_NAME,
  144. sizeof(dbuf),
  145. dbuf,
  146. NULL);
  147. if (error != CL_SUCCESS)
  148. fatal_CL(error, __LINE__);
  149. printf("\tDevice %d: %s\n", devi_count, dbuf);
  150. }
  151. // Select device on the basis of the argument passed as function argument
  152. device = devices[device_id];
  153. printf("Selecting device %d\n", device_id);
  154. //====================================================================================================100
  155. // CREATE COMMAND QUEUE FOR THE DEVICE
  156. //====================================================================================================100
  157. // Create a command queue
  158. cl_command_queue command_queue;
  159. command_queue = clCreateCommandQueue( context,
  160. device,
  161. 0,
  162. &error);
  163. if (error != CL_SUCCESS)
  164. fatal_CL(error, __LINE__);
  165. //====================================================================================================100
  166. // CREATE PROGRAM, COMPILE IT
  167. //====================================================================================================100
  168. // Load kernel source code from file
  169. const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
  170. size_t sourceSize = strlen(source);
  171. // Create the program
  172. cl_program program = clCreateProgramWithSource( context,
  173. 1,
  174. &source,
  175. &sourceSize,
  176. &error);
  177. if (error != CL_SUCCESS)
  178. fatal_CL(error, __LINE__);
  179. char clOptions[150];
  180. // sprintf(clOptions,"-I../../src");
  181. sprintf(clOptions,"-I.");
  182. #ifdef RD_WG_SIZE
  183. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
  184. #endif
  185. #ifdef RD_WG_SIZE_0
  186. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
  187. #endif
  188. #ifdef RD_WG_SIZE_0_0
  189. sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
  190. #endif
  191. // Compile the program
  192. error = clBuildProgram( program,
  193. 1,
  194. &device,
  195. clOptions,
  196. NULL,
  197. NULL);
  198. // Print warnings and errors from compilation
  199. static cl_char log[65536];
  200. memset(log, 0, sizeof(log));
  201. clGetProgramBuildInfo( program,
  202. device,
  203. CL_PROGRAM_BUILD_LOG,
  204. sizeof(log)-1,
  205. log,
  206. NULL);
  207. printf("-----OpenCL Compiler Output-----\n");
  208. if (strstr(log,"warning:") || strstr(log, "error:"))
  209. printf("<<<<\n%s\n>>>>\n", log);
  210. printf("--------------------------------\n");
  211. if (error != CL_SUCCESS)
  212. fatal_CL(error, __LINE__);
  213. // Create kernel
  214. cl_kernel kernel;
  215. kernel = clCreateKernel(program,
  216. "kernel_gpu_opencl",
  217. &error);
  218. if (error != CL_SUCCESS)
  219. fatal_CL(error, __LINE__);
  220. //====================================================================================================100
  221. // TRIGGERING INITIAL DRIVER OVERHEAD
  222. //====================================================================================================100
  223. // cudaThreadSynchronize(); // the above does it
  224. //======================================================================================================================================================150
  225. // GPU MEMORY ALLOCATION
  226. //======================================================================================================================================================150
  227. //====================================================================================================100
  228. // Common (COPY IN)
  229. //====================================================================================================100
  230. cl_mem d_common;
  231. d_common = clCreateBuffer( context,
  232. CL_MEM_READ_WRITE,
  233. common.common_mem,
  234. NULL,
  235. &error );
  236. if (error != CL_SUCCESS)
  237. fatal_CL(error, __LINE__);
  238. //====================================================================================================100
  239. // Frame (COPY IN)
  240. //====================================================================================================100
  241. // common
  242. cl_mem d_frame;
  243. d_frame = clCreateBuffer( context,
  244. CL_MEM_READ_WRITE,
  245. common.frame_mem,
  246. NULL,
  247. &error );
  248. if (error != CL_SUCCESS)
  249. fatal_CL(error, __LINE__);
  250. //====================================================================================================100
  251. // Inputs (COPY IN)
  252. //====================================================================================================100
  253. //==================================================50
  254. // endo points
  255. //==================================================50
  256. // common
  257. cl_mem d_endoRow;
  258. d_endoRow = clCreateBuffer( context,
  259. CL_MEM_READ_WRITE,
  260. common.endo_mem,
  261. NULL,
  262. &error );
  263. if (error != CL_SUCCESS)
  264. fatal_CL(error, __LINE__);
  265. cl_mem d_endoCol;
  266. d_endoCol = clCreateBuffer( context,
  267. CL_MEM_READ_WRITE,
  268. common.endo_mem,
  269. NULL,
  270. &error );
  271. if (error != CL_SUCCESS)
  272. fatal_CL(error, __LINE__);
  273. cl_mem d_tEndoRowLoc;
  274. d_tEndoRowLoc = clCreateBuffer( context,
  275. CL_MEM_READ_WRITE,
  276. common.endo_mem * common.no_frames,
  277. NULL,
  278. &error );
  279. if (error != CL_SUCCESS)
  280. fatal_CL(error, __LINE__);
  281. cl_mem d_tEndoColLoc;
  282. d_tEndoColLoc = clCreateBuffer( context,
  283. CL_MEM_READ_WRITE,
  284. common.endo_mem * common.no_frames,
  285. NULL,
  286. &error );
  287. if (error != CL_SUCCESS)
  288. fatal_CL(error, __LINE__);
  289. //==================================================50
  290. // epi points
  291. //==================================================50
  292. // common
  293. cl_mem d_epiRow;
  294. d_epiRow = clCreateBuffer( context,
  295. CL_MEM_READ_WRITE,
  296. common.epi_mem,
  297. NULL,
  298. &error );
  299. if (error != CL_SUCCESS)
  300. fatal_CL(error, __LINE__);
  301. cl_mem d_epiCol;
  302. d_epiCol = clCreateBuffer( context,
  303. CL_MEM_READ_WRITE,
  304. common.epi_mem,
  305. NULL,
  306. &error );
  307. if (error != CL_SUCCESS)
  308. fatal_CL(error, __LINE__);
  309. cl_mem d_tEpiRowLoc;
  310. d_tEpiRowLoc = clCreateBuffer( context,
  311. CL_MEM_READ_WRITE,
  312. common.epi_mem * common.no_frames,
  313. NULL,
  314. &error );
  315. if (error != CL_SUCCESS)
  316. fatal_CL(error, __LINE__);
  317. cl_mem d_tEpiColLoc;
  318. d_tEpiColLoc = clCreateBuffer( context,
  319. CL_MEM_READ_WRITE,
  320. common.epi_mem * common.no_frames,
  321. NULL,
  322. &error );
  323. if (error != CL_SUCCESS)
  324. fatal_CL(error, __LINE__);
  325. //====================================================================================================100
  326. // Array of Templates for All Points (COPY IN)
  327. //====================================================================================================100
  328. //==================================================50
  329. // template sizes
  330. //==================================================50
  331. // common
  332. printf("tSize is %d, sSize is %d\n", common.tSize, common.sSize);
  333. common.in_rows = common.tSize + 1 + common.tSize;
  334. common.in_cols = common.in_rows;
  335. common.in_elem = common.in_rows * common.in_cols;
  336. common.in_mem = sizeof(fp) * common.in_elem;
  337. //==================================================50
  338. // endo points templates
  339. //==================================================50
  340. // common
  341. cl_mem d_endoT;
  342. d_endoT = clCreateBuffer( context,
  343. CL_MEM_READ_WRITE,
  344. common.in_mem * common.endoPoints,
  345. NULL,
  346. &error );
  347. if (error != CL_SUCCESS)
  348. fatal_CL(error, __LINE__);
  349. //==================================================50
  350. // epi points templates
  351. //==================================================50
  352. // common
  353. cl_mem d_epiT;
  354. d_epiT = clCreateBuffer( context,
  355. CL_MEM_READ_WRITE,
  356. common.in_mem * common.epiPoints,
  357. NULL,
  358. &error );
  359. if (error != CL_SUCCESS)
  360. fatal_CL(error, __LINE__);
  361. //====================================================================================================100
  362. // AREA AROUND POINT FROM FRAME (LOCAL)
  363. //====================================================================================================100
  364. // common
  365. common.in2_rows = common.sSize + 1 + common.sSize;
  366. common.in2_cols = common.in2_rows;
  367. common.in2_elem = common.in2_rows * common.in2_cols;
  368. common.in2_mem = sizeof(fp) * common.in2_elem;
  369. // unique
  370. cl_mem d_in2;
  371. d_in2 = clCreateBuffer( context,
  372. CL_MEM_READ_WRITE,
  373. common.in2_mem * common.allPoints,
  374. NULL,
  375. &error );
  376. if (error != CL_SUCCESS)
  377. fatal_CL(error, __LINE__);
  378. //====================================================================================================100
  379. // CONVOLUTION (LOCAL)
  380. //====================================================================================================100
  381. // common
  382. common.conv_rows = common.in_rows + common.in2_rows - 1; // number of rows in I
  383. common.conv_cols = common.in_cols + common.in2_cols - 1; // number of columns in I
  384. common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
  385. common.conv_mem = sizeof(fp) * common.conv_elem;
  386. common.ioffset = 0;
  387. common.joffset = 0;
  388. // unique
  389. cl_mem d_conv;
  390. d_conv = clCreateBuffer(context,
  391. CL_MEM_READ_WRITE,
  392. common.conv_mem * common.allPoints,
  393. NULL,
  394. &error );
  395. if (error != CL_SUCCESS)
  396. fatal_CL(error, __LINE__);
  397. //====================================================================================================100
  398. // CUMULATIVE SUM (LOCAL)
  399. //====================================================================================================100
  400. //==================================================50
  401. // PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
  402. //==================================================50
  403. // common
  404. common.in2_pad_add_rows = common.in_rows;
  405. common.in2_pad_add_cols = common.in_cols;
  406. common.in2_pad_cumv_rows = common.in2_rows + 2*common.in2_pad_add_rows;
  407. common.in2_pad_cumv_cols = common.in2_cols + 2*common.in2_pad_add_cols;
  408. common.in2_pad_cumv_elem = common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
  409. common.in2_pad_cumv_mem = sizeof(fp) * common.in2_pad_cumv_elem;
  410. // unique
  411. cl_mem d_in2_pad_cumv;
  412. d_in2_pad_cumv = clCreateBuffer(context,
  413. CL_MEM_READ_WRITE,
  414. common.in2_pad_cumv_mem * common.allPoints,
  415. NULL,
  416. &error );
  417. if (error != CL_SUCCESS)
  418. fatal_CL(error, __LINE__);
  419. //==================================================50
  420. // SELECTION
  421. //==================================================50
  422. // common
  423. common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
  424. common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
  425. common.in2_pad_cumv_sel_collow = 1;
  426. common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
  427. common.in2_pad_cumv_sel_rows = common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
  428. common.in2_pad_cumv_sel_cols = common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
  429. common.in2_pad_cumv_sel_elem = common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
  430. common.in2_pad_cumv_sel_mem = sizeof(fp) * common.in2_pad_cumv_sel_elem;
  431. // unique
  432. cl_mem d_in2_pad_cumv_sel;
  433. d_in2_pad_cumv_sel = clCreateBuffer(context,
  434. CL_MEM_READ_WRITE,
  435. common.in2_pad_cumv_sel_mem * common.allPoints,
  436. NULL,
  437. &error );
  438. if (error != CL_SUCCESS)
  439. fatal_CL(error, __LINE__);
  440. //==================================================50
  441. // SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
  442. //==================================================50
  443. // common
  444. common.in2_pad_cumv_sel2_rowlow = 1;
  445. common.in2_pad_cumv_sel2_rowhig = common.in2_pad_cumv_rows - common.in_rows - 1;
  446. common.in2_pad_cumv_sel2_collow = 1;
  447. common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
  448. common.in2_sub_cumh_rows = common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
  449. common.in2_sub_cumh_cols = common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
  450. common.in2_sub_cumh_elem = common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
  451. common.in2_sub_cumh_mem = sizeof(fp) * common.in2_sub_cumh_elem;
  452. // unique
  453. cl_mem d_in2_sub_cumh;
  454. d_in2_sub_cumh = clCreateBuffer(context,
  455. CL_MEM_READ_WRITE,
  456. common.in2_sub_cumh_mem * common.allPoints,
  457. NULL,
  458. &error );
  459. if (error != CL_SUCCESS)
  460. fatal_CL(error, __LINE__);
  461. //==================================================50
  462. // SELECTION
  463. //==================================================50
  464. // common
  465. common.in2_sub_cumh_sel_rowlow = 1;
  466. common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
  467. common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
  468. common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
  469. common.in2_sub_cumh_sel_rows = common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
  470. common.in2_sub_cumh_sel_cols = common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
  471. common.in2_sub_cumh_sel_elem = common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
  472. common.in2_sub_cumh_sel_mem = sizeof(fp) * common.in2_sub_cumh_sel_elem;
  473. // unique
  474. cl_mem d_in2_sub_cumh_sel;
  475. d_in2_sub_cumh_sel = clCreateBuffer(context,
  476. CL_MEM_READ_WRITE,
  477. common.in2_sub_cumh_sel_mem * common.allPoints,
  478. NULL,
  479. &error );
  480. if (error != CL_SUCCESS)
  481. fatal_CL(error, __LINE__);
  482. //==================================================50
  483. // SELECTION 2, SUBTRACTION
  484. //==================================================50
  485. // common
  486. common.in2_sub_cumh_sel2_rowlow = 1;
  487. common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
  488. common.in2_sub_cumh_sel2_collow = 1;
  489. common.in2_sub_cumh_sel2_colhig = common.in2_sub_cumh_cols - common.in_cols - 1;
  490. common.in2_sub2_rows = common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
  491. common.in2_sub2_cols = common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
  492. common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
  493. common.in2_sub2_mem = sizeof(fp) * common.in2_sub2_elem;
  494. // unique
  495. cl_mem d_in2_sub2;
  496. d_in2_sub2 = clCreateBuffer(context,
  497. CL_MEM_READ_WRITE,
  498. common.in2_sub2_mem * common.allPoints,
  499. NULL,
  500. &error );
  501. if (error != CL_SUCCESS)
  502. fatal_CL(error, __LINE__);
  503. //====================================================================================================100
  504. // CUMULATIVE SUM 2 (LOCAL)
  505. //====================================================================================================100
  506. //==================================================50
  507. // MULTIPLICATION
  508. //==================================================50
  509. // common
  510. common.in2_sqr_rows = common.in2_rows;
  511. common.in2_sqr_cols = common.in2_cols;
  512. common.in2_sqr_elem = common.in2_elem;
  513. common.in2_sqr_mem = common.in2_mem;
  514. // unique
  515. cl_mem d_in2_sqr;
  516. d_in2_sqr = clCreateBuffer(context,
  517. CL_MEM_READ_WRITE,
  518. common.in2_sqr_mem * common.allPoints,
  519. NULL,
  520. &error );
  521. if (error != CL_SUCCESS)
  522. fatal_CL(error, __LINE__);
  523. //==================================================50
  524. // SELECTION 2, SUBTRACTION
  525. //==================================================50
  526. // common
  527. common.in2_sqr_sub2_rows = common.in2_sub2_rows;
  528. common.in2_sqr_sub2_cols = common.in2_sub2_cols;
  529. common.in2_sqr_sub2_elem = common.in2_sub2_elem;
  530. common.in2_sqr_sub2_mem = common.in2_sub2_mem;
  531. // unique
  532. cl_mem d_in2_sqr_sub2;
  533. d_in2_sqr_sub2 = clCreateBuffer(context,
  534. CL_MEM_READ_WRITE,
  535. common.in2_sqr_sub2_mem * common.allPoints,
  536. NULL,
  537. &error );
  538. if (error != CL_SUCCESS)
  539. fatal_CL(error, __LINE__);
  540. //====================================================================================================100
  541. // FINAL (LOCAL)
  542. //====================================================================================================100
  543. // common
  544. common.in_sqr_rows = common.in_rows;
  545. common.in_sqr_cols = common.in_cols;
  546. common.in_sqr_elem = common.in_elem;
  547. common.in_sqr_mem = common.in_mem;
  548. // unique
  549. cl_mem d_in_sqr;
  550. d_in_sqr = clCreateBuffer( context,
  551. CL_MEM_READ_WRITE,
  552. common.in_sqr_mem * common.allPoints,
  553. NULL,
  554. &error );
  555. if (error != CL_SUCCESS)
  556. fatal_CL(error, __LINE__);
  557. //====================================================================================================100
  558. // TEMPLATE MASK CREATE (LOCAL)
  559. //====================================================================================================100
  560. // common
  561. common.tMask_rows = common.in_rows + (common.sSize+1+common.sSize) - 1;
  562. common.tMask_cols = common.tMask_rows;
  563. common.tMask_elem = common.tMask_rows * common.tMask_cols;
  564. common.tMask_mem = sizeof(fp) * common.tMask_elem;
  565. // unique
  566. cl_mem d_tMask;
  567. d_tMask = clCreateBuffer( context,
  568. CL_MEM_READ_WRITE,
  569. common.tMask_mem * common.allPoints,
  570. NULL,
  571. &error );
  572. if (error != CL_SUCCESS)
  573. fatal_CL(error, __LINE__);
  574. //====================================================================================================100
  575. // POINT MASK INITIALIZE (LOCAL)
  576. //====================================================================================================100
  577. // common
  578. common.mask_rows = common.maxMove;
  579. common.mask_cols = common.mask_rows;
  580. common.mask_elem = common.mask_rows * common.mask_cols;
  581. common.mask_mem = sizeof(fp) * common.mask_elem;
  582. //====================================================================================================100
  583. // MASK CONVOLUTION (LOCAL)
  584. //====================================================================================================100
  585. // common
  586. common.mask_conv_rows = common.tMask_rows; // number of rows in I
  587. common.mask_conv_cols = common.tMask_cols; // number of columns in I
  588. common.mask_conv_elem = common.mask_conv_rows * common.mask_conv_cols; // number of elements
  589. common.mask_conv_mem = sizeof(fp) * common.mask_conv_elem;
  590. common.mask_conv_ioffset = (common.mask_rows-1)/2;
  591. if((common.mask_rows-1) % 2 > 0.5){
  592. common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
  593. }
  594. common.mask_conv_joffset = (common.mask_cols-1)/2;
  595. if((common.mask_cols-1) % 2 > 0.5){
  596. common.mask_conv_joffset = common.mask_conv_joffset + 1;
  597. }
  598. // unique
  599. cl_mem d_mask_conv;
  600. d_mask_conv = clCreateBuffer( context,
  601. CL_MEM_READ_WRITE,
  602. common.mask_conv_mem * common.allPoints,
  603. NULL,
  604. &error );
  605. if (error != CL_SUCCESS)
  606. fatal_CL(error, __LINE__);
  607. //====================================================================================================100
  608. // END
  609. //====================================================================================================100
  610. //======================================================================================================================================================150
  611. // GPU MEMORY COPY
  612. //======================================================================================================================================================150
  613. //====================================================================================================100
  614. // Inputs
  615. //====================================================================================================100
  616. //==================================================50
  617. // endo points
  618. //==================================================50
  619. error = clEnqueueWriteBuffer( command_queue,
  620. d_endoRow,
  621. 1,
  622. 0,
  623. common.endo_mem,
  624. endoRow,
  625. 0,
  626. 0,
  627. 0);
  628. if (error != CL_SUCCESS)
  629. fatal_CL(error, __LINE__);
  630. error = clEnqueueWriteBuffer( command_queue,
  631. d_endoCol,
  632. 1,
  633. 0,
  634. common.endo_mem,
  635. endoCol,
  636. 0,
  637. 0,
  638. 0);
  639. if (error != CL_SUCCESS)
  640. fatal_CL(error, __LINE__);
  641. //==================================================50
  642. // epi points
  643. //==================================================50
  644. error = clEnqueueWriteBuffer( command_queue,
  645. d_epiRow,
  646. 1,
  647. 0,
  648. common.epi_mem,
  649. epiRow,
  650. 0,
  651. 0,
  652. 0);
  653. if (error != CL_SUCCESS)
  654. fatal_CL(error, __LINE__);
  655. error = clEnqueueWriteBuffer( command_queue,
  656. d_epiCol,
  657. 1,
  658. 0,
  659. common.epi_mem,
  660. epiCol,
  661. 0,
  662. 0,
  663. 0);
  664. if (error != CL_SUCCESS)
  665. fatal_CL(error, __LINE__);
  666. //==================================================50
  667. // END
  668. //==================================================50
  669. //====================================================================================================100
  670. // END
  671. //====================================================================================================100
  672. //======================================================================================================================================================150
  673. // KERNEL
  674. //======================================================================================================================================================150
  675. //====================================================================================================100
  676. // EXECUTION PARAMETERS
  677. //====================================================================================================100
  678. // All kernels operations within kernel use same max size of threads. Size of block size is set to the size appropriate for max size operation (on padded matrix). Other use subsets of that.
  679. size_t local_work_size[1];
  680. local_work_size[0] = NUMBER_THREADS;
  681. size_t global_work_size[1];
  682. global_work_size[0] = common.allPoints * local_work_size[0];
  683. printf("# of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
  684. //====================================================================================================100
  685. // COPY ARGUMENTS
  686. //====================================================================================================100
  687. error = clEnqueueWriteBuffer( command_queue,
  688. d_common,
  689. 1,
  690. 0,
  691. common.common_mem,
  692. &common,
  693. 0,
  694. 0,
  695. 0);
  696. if (error != CL_SUCCESS)
  697. fatal_CL(error, __LINE__);
  698. //====================================================================================================100
  699. // set kernel arguments
  700. //====================================================================================================100
  701. // structures
  702. error = clSetKernelArg( kernel,
  703. 0,
  704. sizeof(params_common),
  705. (void *) &common);
  706. if (error != CL_SUCCESS)
  707. fatal_CL(error, __LINE__);
  708. // common
  709. error = clSetKernelArg( kernel,
  710. 3,
  711. sizeof(cl_mem),
  712. (void *) &d_endoRow);
  713. if (error != CL_SUCCESS)
  714. fatal_CL(error, __LINE__);
  715. error = clSetKernelArg( kernel,
  716. 4,
  717. sizeof(cl_mem),
  718. (void *) &d_endoCol);
  719. if (error != CL_SUCCESS)
  720. fatal_CL(error, __LINE__);
  721. error = clSetKernelArg( kernel,
  722. 5,
  723. sizeof(cl_mem),
  724. (void *) &d_tEndoRowLoc);
  725. if (error != CL_SUCCESS)
  726. fatal_CL(error, __LINE__);
  727. error = clSetKernelArg( kernel,
  728. 6,
  729. sizeof(cl_mem),
  730. (void *) &d_tEndoColLoc);
  731. if (error != CL_SUCCESS)
  732. fatal_CL(error, __LINE__);
  733. error = clSetKernelArg( kernel,
  734. 7,
  735. sizeof(cl_mem),
  736. (void *) &d_epiRow);
  737. if (error != CL_SUCCESS)
  738. fatal_CL(error, __LINE__);
  739. error = clSetKernelArg( kernel,
  740. 8,
  741. sizeof(cl_mem),
  742. (void *) &d_epiCol);
  743. if (error != CL_SUCCESS)
  744. fatal_CL(error, __LINE__);
  745. error = clSetKernelArg( kernel,
  746. 9,
  747. sizeof(cl_mem),
  748. (void *) &d_tEpiRowLoc);
  749. if (error != CL_SUCCESS)
  750. fatal_CL(error, __LINE__);
  751. error = clSetKernelArg( kernel,
  752. 10,
  753. sizeof(cl_mem),
  754. (void *) &d_tEpiColLoc);
  755. if (error != CL_SUCCESS)
  756. fatal_CL(error, __LINE__);
  757. // common_unique
  758. error = clSetKernelArg( kernel,
  759. 11,
  760. sizeof(cl_mem),
  761. (void *) &d_endoT);
  762. if (error != CL_SUCCESS)
  763. fatal_CL(error, __LINE__);
  764. error = clSetKernelArg( kernel,
  765. 12,
  766. sizeof(cl_mem),
  767. (void *) &d_epiT);
  768. if (error != CL_SUCCESS)
  769. fatal_CL(error, __LINE__);
  770. error = clSetKernelArg( kernel,
  771. 13,
  772. sizeof(cl_mem),
  773. (void *) &d_in2);
  774. if (error != CL_SUCCESS)
  775. fatal_CL(error, __LINE__);
  776. error = clSetKernelArg( kernel,
  777. 14,
  778. sizeof(cl_mem),
  779. (void *) &d_conv);
  780. if (error != CL_SUCCESS)
  781. fatal_CL(error, __LINE__);
  782. error = clSetKernelArg( kernel,
  783. 15,
  784. sizeof(cl_mem),
  785. (void *) &d_in2_pad_cumv);
  786. if (error != CL_SUCCESS)
  787. fatal_CL(error, __LINE__);
  788. error = clSetKernelArg( kernel,
  789. 16,
  790. sizeof(cl_mem),
  791. (void *) &d_in2_pad_cumv_sel);
  792. if (error != CL_SUCCESS)
  793. fatal_CL(error, __LINE__);
  794. error = clSetKernelArg( kernel,
  795. 17,
  796. sizeof(cl_mem),
  797. (void *) &d_in2_sub_cumh);
  798. if (error != CL_SUCCESS)
  799. fatal_CL(error, __LINE__);
  800. error = clSetKernelArg( kernel,
  801. 18,
  802. sizeof(cl_mem),
  803. (void *) &d_in2_sub_cumh_sel);
  804. if (error != CL_SUCCESS)
  805. fatal_CL(error, __LINE__);
  806. error = clSetKernelArg( kernel,
  807. 19,
  808. sizeof(cl_mem),
  809. (void *) &d_in2_sub2);
  810. if (error != CL_SUCCESS)
  811. fatal_CL(error, __LINE__);
  812. error = clSetKernelArg( kernel,
  813. 20,
  814. sizeof(cl_mem),
  815. (void *) &d_in2_sqr);
  816. if (error != CL_SUCCESS)
  817. fatal_CL(error, __LINE__);
  818. error = clSetKernelArg( kernel,
  819. 21,
  820. sizeof(cl_mem),
  821. (void *) &d_in2_sqr_sub2);
  822. if (error != CL_SUCCESS)
  823. fatal_CL(error, __LINE__);
  824. error = clSetKernelArg( kernel,
  825. 22,
  826. sizeof(cl_mem),
  827. (void *) &d_in_sqr);
  828. if (error != CL_SUCCESS)
  829. fatal_CL(error, __LINE__);
  830. error = clSetKernelArg( kernel,
  831. 23,
  832. sizeof(cl_mem),
  833. (void *) &d_tMask);
  834. if (error != CL_SUCCESS)
  835. fatal_CL(error, __LINE__);
  836. error = clSetKernelArg( kernel,
  837. 24,
  838. sizeof(cl_mem),
  839. (void *) &d_mask_conv);
  840. if (error != CL_SUCCESS)
  841. fatal_CL(error, __LINE__);
  842. // // local
  843. // // int local_size_one;
  844. // // local_size_one = common.in_rows;
  845. // error = clSetKernelArg( kernel,
  846. // 25,
  847. // // sizeof(fp) * local_size_one, // size 51
  848. // sizeof(fp) * common.in_elem,
  849. // NULL);
  850. // if (error != CL_SUCCESS)
  851. // fatal_CL(error, __LINE__);
  852. // error = clSetKernelArg( kernel,
  853. // 26,
  854. // // sizeof(fp) * local_size_one, // size 51
  855. // sizeof(fp) * common.in_cols,
  856. // NULL);
  857. // if (error != CL_SUCCESS)
  858. // fatal_CL(error, __LINE__);
  859. // // int local_size_two;
  860. // // local_size_two = common.in_rows + common.in2_rows - 1;
  861. // error = clSetKernelArg( kernel,
  862. // 27,
  863. // // sizeof(fp) * local_size_two, // size 51+81-1=131
  864. // sizeof(fp) * common.in_sqr_rows,
  865. // NULL);
  866. // if (error != CL_SUCCESS)
  867. // fatal_CL(error, __LINE__);
  868. // error = clSetKernelArg( kernel,
  869. // 28,
  870. // // sizeof(fp) * local_size_two, // size 51+81-1=131
  871. // sizeof(fp) * common.mask_conv_rows,
  872. // NULL);
  873. // if (error != CL_SUCCESS)
  874. // fatal_CL(error, __LINE__);
  875. // // int local_size_three;
  876. // // local_size_three = common.in_rows * common.in_rows;
  877. // error = clSetKernelArg( kernel,
  878. // 29,
  879. // // sizeof(fp) * local_size_three, // size 51*51=2601
  880. // sizeof(int) * common.mask_conv_rows,
  881. // NULL);
  882. // if (error != CL_SUCCESS)
  883. // fatal_CL(error, __LINE__);
  884. // int local_size;
  885. // local_size = (common.in_elem + common.in_cols + common.in_sqr_rows + common.mask_conv_rows) * 4 + common.mask_conv_rows * 2;
  886. // printf("size of used local memory/workgroup = %dB (ensure that device can handle)\n", local_size);
  887. cl_mem d_in_mod_temp;
  888. d_in_mod_temp = clCreateBuffer( context,
  889. CL_MEM_READ_WRITE,
  890. sizeof(fp) * common.in_elem * common.allPoints,
  891. NULL,
  892. &error );
  893. if (error != CL_SUCCESS)
  894. fatal_CL(error, __LINE__);
  895. cl_mem in_partial_sum;
  896. in_partial_sum = clCreateBuffer(context,
  897. CL_MEM_READ_WRITE,
  898. sizeof(fp) * common.in_cols * common.allPoints,
  899. NULL,
  900. &error );
  901. if (error != CL_SUCCESS)
  902. fatal_CL(error, __LINE__);
  903. cl_mem in_sqr_partial_sum;
  904. in_sqr_partial_sum = clCreateBuffer(context,
  905. CL_MEM_READ_WRITE,
  906. sizeof(fp) * common.in_sqr_rows * common.allPoints,
  907. NULL,
  908. &error );
  909. if (error != CL_SUCCESS)
  910. fatal_CL(error, __LINE__);
  911. cl_mem par_max_val;
  912. par_max_val = clCreateBuffer( context,
  913. CL_MEM_READ_WRITE,
  914. sizeof(fp) * common.mask_conv_rows * common.allPoints,
  915. NULL,
  916. &error );
  917. if (error != CL_SUCCESS)
  918. fatal_CL(error, __LINE__);
  919. cl_mem par_max_coo;
  920. par_max_coo = clCreateBuffer( context,
  921. CL_MEM_READ_WRITE,
  922. sizeof(int) * common.mask_conv_rows * common.allPoints,
  923. NULL,
  924. &error );
  925. if (error != CL_SUCCESS)
  926. fatal_CL(error, __LINE__);
  927. cl_mem in_final_sum;
  928. in_final_sum = clCreateBuffer( context,
  929. CL_MEM_READ_WRITE,
  930. sizeof(fp) * common.allPoints,
  931. NULL,
  932. &error );
  933. if (error != CL_SUCCESS)
  934. fatal_CL(error, __LINE__);
  935. cl_mem in_sqr_final_sum;
  936. in_sqr_final_sum = clCreateBuffer( context,
  937. CL_MEM_READ_WRITE,
  938. sizeof(fp) * common.allPoints,
  939. NULL,
  940. &error );
  941. if (error != CL_SUCCESS)
  942. fatal_CL(error, __LINE__);
  943. cl_mem denomT;
  944. denomT = clCreateBuffer(context,
  945. CL_MEM_READ_WRITE,
  946. sizeof(fp) * common.allPoints,
  947. NULL,
  948. &error );
  949. if (error != CL_SUCCESS)
  950. fatal_CL(error, __LINE__);
  951. error = clSetKernelArg( kernel,
  952. 25,
  953. sizeof(cl_mem),
  954. (void *) &d_in_mod_temp);
  955. if (error != CL_SUCCESS)
  956. fatal_CL(error, __LINE__);
  957. error = clSetKernelArg( kernel,
  958. 26,
  959. sizeof(cl_mem),
  960. (void *) &in_partial_sum);
  961. if (error != CL_SUCCESS)
  962. fatal_CL(error, __LINE__);
  963. error = clSetKernelArg( kernel,
  964. 27,
  965. sizeof(cl_mem),
  966. (void *) &in_sqr_partial_sum);
  967. if (error != CL_SUCCESS)
  968. fatal_CL(error, __LINE__);
  969. error = clSetKernelArg( kernel,
  970. 28,
  971. sizeof(cl_mem),
  972. (void *) &par_max_val);
  973. if (error != CL_SUCCESS)
  974. fatal_CL(error, __LINE__);
  975. error = clSetKernelArg( kernel,
  976. 29,
  977. sizeof(cl_mem),
  978. (void *) &par_max_coo);
  979. if (error != CL_SUCCESS)
  980. fatal_CL(error, __LINE__);
  981. error = clSetKernelArg( kernel,
  982. 30,
  983. sizeof(cl_mem),
  984. (void *) &in_final_sum);
  985. if (error != CL_SUCCESS)
  986. fatal_CL(error, __LINE__);
  987. error = clSetKernelArg( kernel,
  988. 31,
  989. sizeof(cl_mem),
  990. (void *) &in_sqr_final_sum);
  991. if (error != CL_SUCCESS)
  992. fatal_CL(error, __LINE__);
  993. error = clSetKernelArg( kernel,
  994. 32,
  995. sizeof(cl_mem),
  996. (void *) &denomT);
  997. if (error != CL_SUCCESS)
  998. fatal_CL(error, __LINE__);
  999. cl_mem d_checksum;
  1000. d_checksum = clCreateBuffer(context,
  1001. CL_MEM_READ_WRITE,
  1002. sizeof(fp) * CHECK,
  1003. NULL,
  1004. &error );
  1005. if (error != CL_SUCCESS)
  1006. fatal_CL(error, __LINE__);
  1007. error = clSetKernelArg( kernel,
  1008. 33,
  1009. sizeof(cl_mem),
  1010. (void *) &d_checksum);
  1011. if (error != CL_SUCCESS)
  1012. fatal_CL(error, __LINE__);
  1013. //====================================================================================================100
  1014. // PRINT FRAME PROGRESS START
  1015. //====================================================================================================100
  1016. printf("frame progress: ");
  1017. fflush(NULL);
  1018. //====================================================================================================100
  1019. // LAUNCH
  1020. //====================================================================================================100
  1021. // variables
  1022. fp* frame;
  1023. int frame_no;
  1024. for(frame_no=0; frame_no<common.frames_processed; frame_no++){
  1025. //==================================================50
  1026. // get and write current frame to GPU buffer
  1027. //==================================================50
  1028. // Extract a cropped version of the first frame from the video file
  1029. frame = get_frame( frames, // pointer to video file
  1030. frame_no, // number of frame that needs to be returned
  1031. 0, // cropped?
  1032. 0, // scaled?
  1033. 1); // converted
  1034. // copy frame to GPU memory
  1035. error = clEnqueueWriteBuffer( command_queue,
  1036. d_frame,
  1037. 1,
  1038. 0,
  1039. common.frame_mem,
  1040. frame,
  1041. 0,
  1042. 0,
  1043. 0);
  1044. if (error != CL_SUCCESS)
  1045. fatal_CL(error, __LINE__);
  1046. //==================================================50
  1047. // kernel arguments that change inside this loop
  1048. //==================================================50
  1049. // common_change
  1050. error = clSetKernelArg( kernel,
  1051. 1,
  1052. sizeof(cl_mem),
  1053. (void *) &d_frame);
  1054. if (error != CL_SUCCESS)
  1055. fatal_CL(error, __LINE__);
  1056. error = clSetKernelArg( kernel,
  1057. 2,
  1058. sizeof(int),
  1059. (void *) &frame_no);
  1060. if (error != CL_SUCCESS)
  1061. fatal_CL(error, __LINE__);
  1062. //==================================================50
  1063. // launch kernel
  1064. //==================================================50
  1065. error = clEnqueueNDRangeKernel( command_queue,
  1066. kernel,
  1067. 1,
  1068. NULL,
  1069. global_work_size,
  1070. local_work_size,
  1071. 0,
  1072. NULL,
  1073. NULL);
  1074. if (error != CL_SUCCESS)
  1075. fatal_CL(error, __LINE__);
  1076. //==================================================50
  1077. // finish iteration
  1078. //==================================================50
  1079. // Synchronization, wait for all operations in the command queue so far to finish
  1080. error = clFinish(command_queue);
  1081. if (error != CL_SUCCESS)
  1082. fatal_CL(error, __LINE__);
  1083. // free frame after each loop iteration, since AVI library allocates memory for every frame fetched
  1084. free(frame);
  1085. //==================================================50
  1086. // print frame progress
  1087. //==================================================50
  1088. // print frame progress
  1089. printf("%d ", frame_no);
  1090. fflush(NULL);
  1091. //==================================================50
  1092. // DISPLAY CHECKSUM (TESTING)
  1093. //==================================================50
  1094. #ifdef TEST_CHECKSUM
  1095. fp* checksum;
  1096. checksum = (fp*)malloc(sizeof(fp) * CHECK);
  1097. error = clEnqueueReadBuffer(command_queue,
  1098. d_checksum,
  1099. CL_TRUE,
  1100. 0,
  1101. sizeof(fp)*CHECK,
  1102. checksum,
  1103. 0,
  1104. NULL,
  1105. NULL);
  1106. if (error != CL_SUCCESS)
  1107. fatal_CL(error, __LINE__);
  1108. printf("CHECKSUM:\n");
  1109. for(i=0; i<CHECK; i++){
  1110. printf("%f ", checksum[i]);
  1111. }
  1112. printf("\n\n");
  1113. #endif
  1114. //==================================================50
  1115. // End
  1116. //==================================================50
  1117. }
  1118. //====================================================================================================100
  1119. // PRINT FRAME PROGRESS END
  1120. //====================================================================================================100
  1121. printf("\n");
  1122. fflush(NULL);
  1123. //======================================================================================================================================================150
  1124. // OUTPUT
  1125. //======================================================================================================================================================150
  1126. //====================================================================================================100
  1127. // endo points
  1128. //====================================================================================================100
  1129. error = clEnqueueReadBuffer(command_queue,
  1130. d_tEndoRowLoc,
  1131. CL_TRUE,
  1132. 0,
  1133. common.endo_mem * common.no_frames,
  1134. tEndoRowLoc,
  1135. 0,
  1136. NULL,
  1137. NULL);
  1138. if (error != CL_SUCCESS)
  1139. fatal_CL(error, __LINE__);
  1140. // for testing of the output
  1141. #ifdef TEST_OUTPUT
  1142. int j;
  1143. for(i=0; i<common.frames_processed; i++){
  1144. printf("%d: ", i);
  1145. for(j=0; j<common.endoPoints; j++){
  1146. printf("%d ", tEndoRowLoc[j*common.no_frames+i]);
  1147. }
  1148. printf("\n\n");
  1149. }
  1150. #endif
  1151. error = clEnqueueReadBuffer(command_queue,
  1152. d_tEndoColLoc,
  1153. CL_TRUE,
  1154. 0,
  1155. common.endo_mem * common.no_frames,
  1156. tEndoColLoc,
  1157. 0,
  1158. NULL,
  1159. NULL);
  1160. if (error != CL_SUCCESS)
  1161. fatal_CL(error, __LINE__);
  1162. //====================================================================================================100
  1163. // epi points
  1164. //====================================================================================================100
  1165. error = clEnqueueReadBuffer(command_queue,
  1166. d_tEpiRowLoc,
  1167. CL_TRUE,
  1168. 0,
  1169. common.epi_mem * common.no_frames,
  1170. tEpiRowLoc,
  1171. 0,
  1172. NULL,
  1173. NULL);
  1174. if (error != CL_SUCCESS)
  1175. fatal_CL(error, __LINE__);;
  1176. error = clEnqueueReadBuffer(command_queue,
  1177. d_tEpiColLoc,
  1178. CL_TRUE,
  1179. 0,
  1180. common.epi_mem * common.no_frames,
  1181. tEpiColLoc,
  1182. 0,
  1183. NULL,
  1184. NULL);
  1185. if (error != CL_SUCCESS)
  1186. fatal_CL(error, __LINE__);
  1187. //======================================================================================================================================================150
  1188. // DEALLOCATION
  1189. //======================================================================================================================================================150
  1190. // OpenCL structures
  1191. error = clReleaseKernel(kernel);
  1192. if (error != CL_SUCCESS)
  1193. fatal_CL(error, __LINE__);
  1194. error = clReleaseProgram(program);
  1195. if (error != CL_SUCCESS)
  1196. fatal_CL(error, __LINE__);
  1197. // common_change
  1198. error = clReleaseMemObject(d_frame);
  1199. if (error != CL_SUCCESS)
  1200. fatal_CL(error, __LINE__);
  1201. // common
  1202. error = clReleaseMemObject(d_endoRow);
  1203. if (error != CL_SUCCESS)
  1204. fatal_CL(error, __LINE__);
  1205. error = clReleaseMemObject(d_endoCol);
  1206. if (error != CL_SUCCESS)
  1207. fatal_CL(error, __LINE__);
  1208. error = clReleaseMemObject(d_tEndoRowLoc);
  1209. if (error != CL_SUCCESS)
  1210. fatal_CL(error, __LINE__);
  1211. error = clReleaseMemObject(d_tEndoColLoc);
  1212. if (error != CL_SUCCESS)
  1213. fatal_CL(error, __LINE__);
  1214. error = clReleaseMemObject(d_epiRow);
  1215. if (error != CL_SUCCESS)
  1216. fatal_CL(error, __LINE__);
  1217. error = clReleaseMemObject(d_epiCol);
  1218. if (error != CL_SUCCESS)
  1219. fatal_CL(error, __LINE__);
  1220. error = clReleaseMemObject(d_tEpiRowLoc);
  1221. if (error != CL_SUCCESS)
  1222. fatal_CL(error, __LINE__);
  1223. error = clReleaseMemObject(d_tEpiColLoc);
  1224. if (error != CL_SUCCESS)
  1225. fatal_CL(error, __LINE__);
  1226. // common_unique
  1227. error = clReleaseMemObject(d_endoT);
  1228. if (error != CL_SUCCESS)
  1229. fatal_CL(error, __LINE__);
  1230. error = clReleaseMemObject(d_epiT);
  1231. if (error != CL_SUCCESS)
  1232. fatal_CL(error, __LINE__);
  1233. error = clReleaseMemObject(d_in2);
  1234. if (error != CL_SUCCESS)
  1235. fatal_CL(error, __LINE__);
  1236. error = clReleaseMemObject(d_conv);
  1237. if (error != CL_SUCCESS)
  1238. fatal_CL(error, __LINE__);
  1239. error = clReleaseMemObject(d_in2_pad_cumv);
  1240. if (error != CL_SUCCESS)
  1241. fatal_CL(error, __LINE__);
  1242. error = clReleaseMemObject(d_in2_pad_cumv_sel);
  1243. if (error != CL_SUCCESS)
  1244. fatal_CL(error, __LINE__);
  1245. error = clReleaseMemObject(d_in2_sub_cumh);
  1246. if (error != CL_SUCCESS)
  1247. fatal_CL(error, __LINE__);
  1248. error = clReleaseMemObject(d_in2_sub_cumh_sel);
  1249. if (error != CL_SUCCESS)
  1250. fatal_CL(error, __LINE__);
  1251. error = clReleaseMemObject(d_in2_sub2);
  1252. if (error != CL_SUCCESS)
  1253. fatal_CL(error, __LINE__);
  1254. error = clReleaseMemObject(d_in2_sqr);
  1255. if (error != CL_SUCCESS)
  1256. fatal_CL(error, __LINE__);
  1257. error = clReleaseMemObject(d_in2_sqr_sub2);
  1258. if (error != CL_SUCCESS)
  1259. fatal_CL(error, __LINE__);
  1260. error = clReleaseMemObject(d_in_sqr);
  1261. if (error != CL_SUCCESS)
  1262. fatal_CL(error, __LINE__);
  1263. error = clReleaseMemObject(d_tMask);
  1264. if (error != CL_SUCCESS)
  1265. fatal_CL(error, __LINE__);
  1266. error = clReleaseMemObject(d_mask_conv);
  1267. if (error != CL_SUCCESS)
  1268. fatal_CL(error, __LINE__);
  1269. // OpenCL structures
  1270. error = clFlush(command_queue);
  1271. if (error != CL_SUCCESS)
  1272. fatal_CL(error, __LINE__);
  1273. error = clReleaseCommandQueue(command_queue);
  1274. if (error != CL_SUCCESS)
  1275. fatal_CL(error, __LINE__);
  1276. error = clReleaseContext(context);
  1277. if (error != CL_SUCCESS)
  1278. fatal_CL(error, __LINE__);
  1279. //======================================================================================================================================================150
  1280. // End
  1281. //======================================================================================================================================================150
  1282. }
  1283. //========================================================================================================================================================================================================200
  1284. // END
  1285. //========================================================================================================================================================================================================200