com_dwt.cl 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707
  1. #define THREADS 256
  2. #define BOUNDARY_X 2
  3. int divRndUp(int n,
  4. int d)
  5. {
  6. return (n / d) + ((n % d) ? 1 : 0);
  7. }
  8. /* Store 3 RGB float components */
  9. /*void storeComponents(__global float *d_r, __global float *d_g, __global float *d_b, __global const float r, __global const float g, __global const float b, int pos)
  10. {
  11. d_r[pos] = (r/255.0f) - 0.5f;
  12. d_g[pos] = (g/255.0f) - 0.5f;
  13. d_b[pos] = (b/255.0f) - 0.5f;
  14. }
  15. */
  16. // Store 3 RGB intege components
  17. void storeComponents(__global int *d_r,
  18. __global int *d_g,
  19. __global int *d_b,
  20. int r,
  21. int g,
  22. int b,
  23. int pos)
  24. {
  25. d_r[pos] = r - 128;
  26. d_g[pos] = g - 128;
  27. d_b[pos] = b - 128;
  28. }
  29. /* Store float component */
  30. /*__kernel void storeComponent(__global float *d_c, __global const float c, int pos)
  31. {
  32. d_c[pos] = (c/255.0f) - 0.5f;
  33. }
  34. */
  35. // Store integer component
  36. void storeComponent(__global int *d_c,
  37. const int c,
  38. int pos)
  39. {
  40. d_c[pos] = c - 128;
  41. }
  42. // Copy img src data into three separated component buffers
  43. __kernel void c_CopySrcToComponents (__global int *d_r,
  44. __global int *d_g,
  45. __global int *d_b,
  46. __global unsigned char * cl_d_src,
  47. int pixels)
  48. {
  49. int x = get_local_id(0);
  50. int gX= get_local_size(0) * get_group_id(0);
  51. __local unsigned char sData[THREADS*3];
  52. // Copy data to shared mem by 4bytes
  53. // other checks are not necessary, since
  54. // cl_d_src buffer is aligned to sharedDataSize
  55. sData[3 * x + 0] = cl_d_src [gX * 3 + 3 * x + 0];
  56. sData[3 * x + 1] = cl_d_src [gX * 3 + 3 * x + 1];
  57. sData[3 * x + 2] = cl_d_src [gX * 3 + 3 * x + 2];
  58. barrier(CLK_LOCAL_MEM_FENCE);
  59. int r, g, b;
  60. int offset = x*3;
  61. r = (int)(sData[offset]);
  62. g = (int)(sData[offset+1]);
  63. b = (int)(sData[offset+2]);
  64. int globalOutputPosition = gX + x;
  65. if (globalOutputPosition < pixels)
  66. {
  67. storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
  68. }
  69. }
  70. // Copy img src data into three separated component buffers
  71. __kernel void c_CopySrcToComponent (__global int *d_c,
  72. __global unsigned char * cl_d_src,
  73. int pixels)
  74. {
  75. int x = get_local_id(0);
  76. int gX = get_local_size(0) * get_group_id(0);
  77. __local unsigned char sData[THREADS];
  78. sData[ x ] = cl_d_src [gX + x];
  79. barrier(CLK_LOCAL_MEM_FENCE);
  80. int c;
  81. c = (int) (sData[x]);
  82. int globalOutputPosition = gX + x;
  83. if (globalOutputPosition < pixels)
  84. {
  85. storeComponent(d_c, c, globalOutputPosition);
  86. }
  87. }
  88. static void mirror( int *d,
  89. const int sizeD)
  90. {
  91. if ((*d )>= sizeD)
  92. {
  93. (*d) = 2 * sizeD -2 - (*d);
  94. } else if((*d) < 0)
  95. {
  96. (*d) = -(*d) ;
  97. }
  98. }
  99. struct VerticalDWTPixelIO
  100. {
  101. bool CHECKED;
  102. int end, stride;
  103. };
  104. int initialize_PixelIO(struct VerticalDWTPixelIO *pIO,
  105. bool CHECK,
  106. const int sizeX,
  107. const int sizeY,
  108. int firstX,
  109. int firstY)
  110. {
  111. pIO->CHECKED = CHECK;
  112. pIO->end = pIO->CHECKED ? (sizeY * sizeX + firstX) : 0 ;
  113. pIO->stride = sizeX;
  114. return firstX + sizeX * firstY;
  115. }
  116. struct VerticalDWTPixelLoader
  117. {
  118. bool CHECKED;
  119. int last;
  120. };
  121. void init_PixelLoader(struct VerticalDWTPixelLoader *loader,
  122. const int sizeX,
  123. const int sizeY,
  124. int firstX,
  125. const int firstY,
  126. struct VerticalDWTPixelIO *pIO,
  127. bool CHECK )
  128. {
  129. mirror (&firstX, sizeX);
  130. loader->last = initialize_PixelIO (pIO, CHECK, sizeX, sizeY, firstX, firstY) - sizeX;
  131. }
  132. void clear_PixelLoader(struct VerticalDWTPixelLoader *pLoader,
  133. struct VerticalDWTPixelIO *pIO)
  134. {
  135. pLoader->last = 0;
  136. pIO->end = 0 ;
  137. pIO->stride = 0 ;
  138. }
  139. int loadFrom(struct VerticalDWTPixelLoader *pLoader,
  140. __global const int * const input,
  141. struct VerticalDWTPixelIO *pIO,
  142. int CHECK)
  143. {
  144. pLoader->last += pIO->stride;
  145. if(CHECK && (pLoader->last == pIO->end))
  146. {
  147. pLoader->last -= 2 * pIO->stride;
  148. pIO->stride = 0 - pIO->stride;
  149. }
  150. return input[pLoader->last];
  151. }
  152. struct VerticalDWTBandIO
  153. {
  154. bool CHECKED;
  155. int end;
  156. int strideHighToLow;
  157. int strideLowToHigh;
  158. };
  159. int initialize_BandIO(struct VerticalDWTBandIO *bandIO,
  160. const int imageSizeX,
  161. const int imageSizeY,
  162. int firstX,
  163. int firstY)
  164. {
  165. int columnOffset = firstX / 2;
  166. int verticalStride;
  167. if(firstX & 1)
  168. {
  169. verticalStride = imageSizeX / 2;
  170. columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
  171. bandIO->strideLowToHigh = (imageSizeX * imageSizeY) / 2;
  172. }
  173. else
  174. {
  175. verticalStride = imageSizeX / 2 + (imageSizeX & 1);
  176. bandIO->strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
  177. }
  178. bandIO->strideHighToLow = verticalStride - bandIO->strideLowToHigh;
  179. if (bandIO->CHECKED)
  180. {
  181. bandIO->end = columnOffset + (imageSizeY / 2) * verticalStride + (imageSizeY & 1) * bandIO->strideLowToHigh ;
  182. }
  183. else
  184. {
  185. bandIO->end = 0;
  186. }
  187. return columnOffset + (firstY / 2) * verticalStride // right row
  188. + (firstY & 1) * bandIO->strideLowToHigh;
  189. }
  190. struct VerticalDWTBandLoader
  191. {
  192. bool CHECKED;
  193. int last;
  194. };
  195. struct VerticalDWTBandWriter
  196. {
  197. bool CHECKED;
  198. int next;
  199. };
  200. int saveAndUpdate(struct VerticalDWTBandWriter *writer,
  201. bool CHECK,
  202. struct VerticalDWTBandIO *bandIO,
  203. __global int * const output,
  204. __local int *item,
  205. int *stride)
  206. {
  207. writer->CHECKED = CHECK;
  208. if((!writer->CHECKED) || (writer->next != bandIO->end) )
  209. {
  210. output[writer->next] = *item;
  211. writer->next += *stride;
  212. }
  213. return writer->next;
  214. }
  215. void clear_BandWriter(struct VerticalDWTBandWriter *writer,
  216. struct VerticalDWTBandIO *bandIO)
  217. {
  218. bandIO->end = 0;
  219. bandIO->strideHighToLow = 0;
  220. bandIO->strideLowToHigh = 0;
  221. writer->next = 0 ;
  222. }
  223. void init_BandWriter(struct VerticalDWTBandWriter *writer,
  224. struct VerticalDWTBandIO *bandIO,
  225. const int imageSizeX,
  226. const int imageSizeY,
  227. const int firstX,
  228. const int firstY)
  229. {
  230. if (firstX < imageSizeX)
  231. {
  232. writer->next = initialize_BandIO (bandIO, imageSizeX, imageSizeY, firstX, firstY);
  233. }
  234. else
  235. {
  236. clear_BandWriter (writer , bandIO) ;
  237. }
  238. }
  239. int writeLowInto(struct VerticalDWTBandWriter *writer,
  240. struct VerticalDWTBandIO *bandIO,
  241. __global int * const output,
  242. __local int *primary)
  243. {
  244. return saveAndUpdate(writer, writer->CHECKED, bandIO, output, primary, &(bandIO->strideLowToHigh));
  245. }
  246. int writeHighInto(struct VerticalDWTBandWriter *writer, struct VerticalDWTBandIO *bandIO, __global int * const output, __local int *other)
  247. {
  248. return saveAndUpdate(writer, writer->CHECKED, bandIO, output, other, &(bandIO->strideHighToLow));
  249. }
  250. //TransformBuffer is contained in cuda_gwt/transform_buffer.h
  251. struct TransformBuffer
  252. {
  253. int SIZE_X, SIZE_Y;
  254. int VERTICAL_STRIDE;
  255. int SHM_BANKS, BUFFER_SIZE, PADDING, ODD_OFFSET;
  256. /// buffer for both even and odd columns
  257. int data[2182]; //data[2 * BUFFER_SIZE + PADDING]
  258. };
  259. void horizontalStep (__local struct TransformBuffer *buffer,
  260. const int count,
  261. const int prevOffset,
  262. const int midOffset,
  263. const int nextOffset,
  264. int flag)
  265. {
  266. const int STEPS = count / buffer->SIZE_X;
  267. const int finalCount = count % buffer->SIZE_X;
  268. const int finalOffset = count - finalCount;
  269. for(int i = 0; i< STEPS; i++)
  270. {
  271. const int previous = buffer->data[prevOffset + i * buffer->SIZE_X + get_local_id(0)] ;
  272. const int next = buffer->data[nextOffset + i * buffer->SIZE_X + get_local_id(0)];
  273. __local int * center = & (buffer->data[midOffset + i * buffer->SIZE_X + get_local_id(0)]);
  274. if (flag == 0)
  275. {
  276. *center -= (previous + next) /2; //Forward53Predict()
  277. } else if (flag == 1)
  278. {
  279. *center += (previous + next + 2) /4; //Forward53Update()
  280. }
  281. }
  282. if(get_local_id(0) < finalCount) {
  283. const int previous = buffer->data[prevOffset + finalOffset + get_local_id(0)];
  284. const int next = buffer->data[nextOffset + finalOffset + get_local_id(0)];
  285. __local int * center = & (buffer->data[midOffset + finalOffset + get_local_id(0)]);
  286. if (flag == 0)
  287. {
  288. *center -= (previous + next) /2; //Forward53Predict()
  289. } else if (flag == 1)
  290. {
  291. *center += (previous + next + 2) /4; //Forward53Update()
  292. }
  293. }
  294. }
  295. void forEachHorizontalOdd(__local struct TransformBuffer *buffer,
  296. const int firstLine,
  297. const int numLines,
  298. int flag)
  299. {
  300. const int count = numLines * buffer->VERTICAL_STRIDE - 1 ;
  301. const int prevOffset = firstLine * buffer->VERTICAL_STRIDE ;
  302. const int centerOffset = prevOffset + buffer->ODD_OFFSET ;
  303. const int nextOffset = prevOffset + 1;
  304. horizontalStep (buffer, count, prevOffset, centerOffset, nextOffset, flag);
  305. }
  306. void forEachHorizontalEven(__local struct TransformBuffer *buffer,
  307. const int firstLine,
  308. const int numLines,
  309. int flag)
  310. {
  311. const int count = numLines * buffer->VERTICAL_STRIDE - 1 ;
  312. const int centerOffset = firstLine * buffer->VERTICAL_STRIDE + 1;
  313. const int prevOffset = firstLine * buffer->VERTICAL_STRIDE + buffer->ODD_OFFSET;
  314. const int nextOffset = prevOffset + 1;
  315. horizontalStep (buffer, count, prevOffset, centerOffset, nextOffset, flag);
  316. }
  317. void forEachVerticalOdd (__local struct TransformBuffer *buffer,
  318. const int columnOffset,
  319. int flag)
  320. {
  321. int steps = (buffer->SIZE_Y - 1) / 2;
  322. for (int i = 0; i < steps; i++)
  323. {
  324. int row = i * 2 + 1;
  325. int prev = buffer->data[columnOffset+ (row - 1) * buffer->VERTICAL_STRIDE];
  326. int next = buffer->data[columnOffset+ (row + 1) * buffer->VERTICAL_STRIDE];
  327. if (flag == 0)
  328. {
  329. buffer->data[columnOffset + row * buffer->VERTICAL_STRIDE] -= (prev + next) /2;
  330. }
  331. else if (flag == 1)
  332. {
  333. //buffer->data[columnOffset + row * buffer->VERTICAL_STRIDE] += (prev + next + 2) /4;
  334. }
  335. }
  336. }
  337. void forEachVerticalEven (__local struct TransformBuffer *buffer,
  338. const int columnOffset,
  339. int flag)
  340. {
  341. int i ;
  342. if(buffer->SIZE_Y > 3)
  343. {
  344. int steps = (int)( buffer->SIZE_Y / 2) -1 ;
  345. for(i = 0; i < steps; i++)
  346. {
  347. int row = 2 + i * 2;
  348. int prev = buffer->data[columnOffset+ (row - 1) * buffer->VERTICAL_STRIDE];
  349. int next = buffer->data[columnOffset + (row + 1) * buffer->VERTICAL_STRIDE];
  350. if (flag == 0)
  351. {
  352. //buffer->data[columnOffset + row * buffer->VERTICAL_STRIDE] -= (prev + next) /2;
  353. }
  354. else if (flag == 1)
  355. {
  356. buffer->data[columnOffset + row * buffer->VERTICAL_STRIDE] += (prev + next + 2)/4; //real one
  357. }
  358. }
  359. }
  360. }
  361. struct FDWT53Column
  362. {
  363. bool CHECKED_LOADER;
  364. // loader for the column
  365. struct VerticalDWTPixelLoader loader;
  366. /// offset of the column in shared buffer
  367. int offset;
  368. // backup of first 3 loaded pixels (not transformed)
  369. int pixel0, pixel1, pixel2;
  370. };
  371. void clear_FDWT53Column(struct FDWT53Column *st_FDWT53Column,
  372. struct VerticalDWTPixelIO *pIO)
  373. {
  374. st_FDWT53Column->offset = 0;
  375. st_FDWT53Column->pixel0 = 0;
  376. st_FDWT53Column->pixel1 = 0;
  377. st_FDWT53Column->pixel2 = 0;
  378. clear_PixelLoader(&(st_FDWT53Column->loader), pIO);
  379. }
  380. struct FDWT53 {
  381. int WIN_SIZE_X, WIN_SIZE_Y;
  382. struct FDWT53Column column;
  383. /// Type of shared memory buffer for 5/3 FDWT transforms.
  384. /// Actual shared buffer used for forward 5/3 DWT.
  385. struct TransformBuffer buffer;
  386. /// Difference between indices of two vertical neighbors in buffer.
  387. int STRIDE;
  388. };
  389. //in from transform_buffer.h
  390. int getColumnOffset(int columnIndex,
  391. __local struct TransformBuffer * buffer)
  392. {
  393. columnIndex += BOUNDARY_X;
  394. return columnIndex / 2 // select right column
  395. + (columnIndex & 1) * buffer->ODD_OFFSET; // select odd or even buffer
  396. }
  397. void initColumn(__local struct FDWT53 * fdwt53,
  398. struct FDWT53Column *column,
  399. bool CHECKED,
  400. __global const int * const input,
  401. const int sizeX,
  402. const int sizeY,
  403. const int colIndex,
  404. const int firstY,
  405. struct VerticalDWTPixelIO *pIO)
  406. {
  407. column->CHECKED_LOADER = CHECKED;
  408. column->offset = getColumnOffset(colIndex, &fdwt53->buffer);
  409. const int firstX = get_group_id(0) * fdwt53->WIN_SIZE_X + colIndex;
  410. if(get_group_id(1) == 0)
  411. {
  412. // topmost block - apply mirroring rules when loading first 3 rows
  413. init_PixelLoader(&(column->loader), sizeX, sizeY, firstX, firstY, pIO, CHECKED);
  414. column->pixel2 = loadFrom(&(column->loader),input, pIO, CHECKED); // loaded pixel #0
  415. column->pixel1 = loadFrom(&(column->loader),input, pIO, CHECKED); // loaded pixel #1
  416. column->pixel0 = loadFrom(&(column->loader),input, pIO, CHECKED); // loaded pixel #2
  417. init_PixelLoader(&(column->loader), sizeX, sizeY, firstX, firstY + 1, pIO, CHECKED);
  418. }
  419. else
  420. {
  421. init_PixelLoader(&(column->loader), sizeX, sizeY, firstX, firstY - 2, pIO, CHECKED);
  422. column->pixel0 = loadFrom(&(column->loader),input, pIO, CHECKED); // loaded pixel #0
  423. column->pixel1 = loadFrom(&(column->loader),input, pIO, CHECKED); // loaded pixel #1
  424. column->pixel2 = loadFrom(&(column->loader),input, pIO, CHECKED); // loaded pixel #2
  425. }
  426. }
  427. void loadAndVerticallyTransform (__local struct FDWT53 *fdwt53,
  428. struct FDWT53Column *column,
  429. bool CHECKED,
  430. __global const int * const input,
  431. struct VerticalDWTPixelIO *pIO)
  432. {
  433. fdwt53->buffer.data[column->offset + 0 * fdwt53->STRIDE] = column->pixel0;
  434. fdwt53->buffer.data[column->offset + 1 * fdwt53->STRIDE] = column->pixel1;
  435. fdwt53->buffer.data[column->offset + 2 * fdwt53->STRIDE] = column->pixel2;
  436. for (int i = 3; i < (3 + fdwt53->WIN_SIZE_Y); i++)
  437. {
  438. fdwt53->buffer.data[column->offset + i * fdwt53->STRIDE] = loadFrom(&(column->loader),input, pIO, CHECKED);
  439. }
  440. column->pixel0 = fdwt53->buffer.data [column->offset + ( fdwt53->WIN_SIZE_Y + 0 ) * fdwt53->STRIDE] ;
  441. column->pixel1 = fdwt53->buffer.data [column->offset + ( fdwt53->WIN_SIZE_Y + 1 ) * fdwt53->STRIDE] ;
  442. column->pixel2 = fdwt53->buffer.data [column->offset + ( fdwt53->WIN_SIZE_Y + 2 ) * fdwt53->STRIDE] ;
  443. int flag = 0 ;
  444. forEachVerticalOdd (&fdwt53->buffer, column->offset, flag);
  445. flag = 1 ;
  446. forEachVerticalEven(&fdwt53->buffer, column->offset, flag);
  447. }
  448. void transform(__local struct FDWT53 *fdwt53,
  449. bool CHECK_LOADS,
  450. bool CHECK_WRITES,
  451. __global const int * const in,
  452. __global int * out,
  453. const int sizeX,
  454. const int sizeY,
  455. const int winSteps)
  456. {
  457. // info about one main and one boundary columns processed by this thread
  458. struct FDWT53Column column; column.CHECKED_LOADER = CHECK_LOADS;
  459. struct VerticalDWTPixelIO pIO;
  460. struct FDWT53Column boundaryColumn; boundaryColumn.CHECKED_LOADER = CHECK_LOADS;
  461. struct VerticalDWTPixelIO pIO_b;
  462. // Initialize all column info: initialize loaders, compute offset of
  463. // column in shared buffer and initialize loader of column.
  464. const int firstY = get_group_id(1) * fdwt53->WIN_SIZE_Y * winSteps;
  465. initColumn(fdwt53, &column, CHECK_LOADS, in, sizeX, sizeY, get_local_id(0), firstY, &pIO);
  466. // first 3 threads initialize boundary columns, others do not use them
  467. clear_FDWT53Column(&boundaryColumn, &pIO_b);
  468. if (get_local_id(0) < 3) {
  469. // index of boundary column (relative x-axis coordinate of the column)
  470. const int colId = get_local_id(0) + ((get_local_id(0)== 0) ? fdwt53->WIN_SIZE_X : -3);
  471. // initialize the column
  472. initColumn (fdwt53, &boundaryColumn, CHECK_LOADS, in, sizeX, sizeY, colId, firstY, &pIO_b);
  473. }
  474. // index of column which will be written into output by this thread
  475. const int outColumnIndex = (get_local_id(0) * 2) - (fdwt53->WIN_SIZE_X - 1) * (get_local_id(0) / ( fdwt53->WIN_SIZE_X / 2));
  476. // offset of column which will be written by this thread into output
  477. const int outColumnOffset = getColumnOffset(outColumnIndex, &(fdwt53->buffer));
  478. // initialize output writer for this thread
  479. const int outputFirstX = get_group_id(0) * fdwt53->WIN_SIZE_X +outColumnIndex;
  480. struct VerticalDWTBandWriter writer; writer.CHECKED = CHECK_WRITES;
  481. struct VerticalDWTBandIO bandIO; bandIO.CHECKED = CHECK_WRITES;
  482. init_BandWriter(&writer, &bandIO, sizeX, sizeY, outputFirstX, firstY);
  483. // Sliding window iterations:
  484. // Each iteration assumes that first 3 pixels of each column are loaded.
  485. for(int w = 0; w < winSteps; w++)
  486. {
  487. loadAndVerticallyTransform(fdwt53, &column, CHECK_LOADS, in, &pIO);
  488. if (get_local_id(0) < 3)
  489. {
  490. loadAndVerticallyTransform(fdwt53, &boundaryColumn, CHECK_LOADS, in, &pIO_b);
  491. }
  492. barrier(CLK_LOCAL_MEM_FENCE);
  493. int flag = 0; //flag = 0 execute Forward53Predict, flag = 1 execute Forward53Update
  494. forEachHorizontalOdd(&(fdwt53->buffer), 2, fdwt53->WIN_SIZE_Y, flag);
  495. barrier(CLK_LOCAL_MEM_FENCE);
  496. flag = 1;
  497. forEachHorizontalEven(&(fdwt53->buffer), 2, fdwt53->WIN_SIZE_Y, flag);
  498. barrier(CLK_LOCAL_MEM_FENCE);
  499. for(int r = 2; r < (2+fdwt53->WIN_SIZE_Y); r+= 2)
  500. {
  501. writeLowInto(&writer, &bandIO, out, &(fdwt53->buffer.data[outColumnOffset + r * fdwt53->buffer.VERTICAL_STRIDE]));
  502. writeHighInto(&writer, &bandIO, out, &(fdwt53->buffer.data[outColumnOffset + (r+1) * fdwt53->buffer.VERTICAL_STRIDE]));
  503. }
  504. barrier(CLK_LOCAL_MEM_FENCE);
  505. }
  506. }
  507. // Forward 5/3 DWT predict operation.
  508. void Forward53Predict (const int p,
  509. __global int * c,
  510. const int n)
  511. {
  512. *c -= (p + n) /2;
  513. }
  514. // Forward 5/3 DWT update operation.
  515. void Forward53Update (const int p,
  516. __global int * c,
  517. const int n)
  518. {
  519. *c += (p + n + 2) /4;
  520. }
  521. __kernel void cl_fdwt53Kernel(__global const int * const in,
  522. __global int * out,
  523. const int sx,
  524. const int sy,
  525. const int steps,
  526. int WIN_SIZE_X,
  527. int WIN_SIZE_Y)
  528. {
  529. __local struct FDWT53 fdwt53;
  530. fdwt53.WIN_SIZE_X = WIN_SIZE_X;
  531. fdwt53.WIN_SIZE_Y = WIN_SIZE_Y;
  532. //initialize
  533. //Lingjie Zhang modified on 11/02/2015
  534. //for(int i = 0; i < sizeof(fdwt53.buffer)/sizeof(int); i++){
  535. for(int i = 0; i < sizeof(fdwt53.buffer.data)/sizeof(int); i++){
  536. fdwt53.buffer.data[i] = 0;
  537. }
  538. //end of Lingjie Zhang modification
  539. fdwt53.buffer.SIZE_X = fdwt53.WIN_SIZE_X;
  540. fdwt53.buffer.SIZE_Y = fdwt53.WIN_SIZE_Y + 3;
  541. fdwt53.buffer.VERTICAL_STRIDE = BOUNDARY_X + (fdwt53.buffer.SIZE_X / 2);//BOUNDARY = 2
  542. fdwt53.buffer.SHM_BANKS = 32; // SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16)
  543. fdwt53.buffer.BUFFER_SIZE = fdwt53.buffer.VERTICAL_STRIDE * fdwt53.buffer.SIZE_Y;
  544. fdwt53.buffer.PADDING = fdwt53.buffer.SHM_BANKS - ((fdwt53.buffer.BUFFER_SIZE + fdwt53.buffer.SHM_BANKS / 2) % fdwt53.buffer.SHM_BANKS) ;
  545. fdwt53.buffer.ODD_OFFSET = fdwt53.buffer.BUFFER_SIZE + fdwt53.buffer.PADDING ;
  546. fdwt53.STRIDE = fdwt53.buffer.VERTICAL_STRIDE ;
  547. const int maxX = (get_group_id(0) + 1) * WIN_SIZE_X + 1;
  548. const int maxY = (get_group_id(1) + 1) * WIN_SIZE_Y * steps + 1;
  549. const bool atRightBoudary = maxX >= sx;
  550. const bool atBottomBoudary = maxY >= sy;
  551. // Select specialized version of code according to distance of this
  552. // threadblock's pixels from image boundary.
  553. if(atBottomBoudary)
  554. {
  555. // near bottom boundary => check both writing and reading
  556. transform(&fdwt53, true, true, in, out, sx, sy, steps);
  557. }
  558. else if(atRightBoudary)
  559. {
  560. // near right boundary only => check writing only
  561. transform(&fdwt53, false, true, in, out, sx, sy, steps);
  562. }
  563. else
  564. {
  565. // no nearby boundary => check nothing
  566. transform(&fdwt53, false, false, in, out, sx, sy, steps);
  567. }
  568. }