lud_kernel.cl 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. // #define BLOCK_SIZE 16
  2. __kernel void
  3. lud_diagonal(__global float *m,
  4. __local float *shadow,
  5. int matrix_dim,
  6. int offset)
  7. {
  8. int i,j;
  9. int tx = get_local_id(0);
  10. int array_offset = offset*matrix_dim+offset;
  11. for(i=0; i < BLOCK_SIZE; i++){
  12. shadow[i * BLOCK_SIZE + tx]=m[array_offset + tx];
  13. array_offset += matrix_dim;
  14. }
  15. barrier(CLK_LOCAL_MEM_FENCE);
  16. for(i=0; i < BLOCK_SIZE-1; i++) {
  17. if (tx>i){
  18. for(j=0; j < i; j++)
  19. shadow[tx * BLOCK_SIZE + i] -= shadow[tx * BLOCK_SIZE + j] * shadow[j * BLOCK_SIZE + i];
  20. shadow[tx * BLOCK_SIZE + i] /= shadow[i * BLOCK_SIZE + i];
  21. }
  22. barrier(CLK_LOCAL_MEM_FENCE);
  23. if (tx>i){
  24. for(j=0; j < i+1; j++)
  25. shadow[(i+1) * BLOCK_SIZE + tx] -= shadow[(i+1) * BLOCK_SIZE + j]*shadow[j * BLOCK_SIZE + tx];
  26. }
  27. barrier(CLK_LOCAL_MEM_FENCE);
  28. }
  29. array_offset = (offset+1)*matrix_dim+offset;
  30. for(i=1; i < BLOCK_SIZE; i++){
  31. m[array_offset+tx]=shadow[i * BLOCK_SIZE + tx];
  32. array_offset += matrix_dim;
  33. }
  34. }
  35. __kernel void
  36. lud_perimeter(__global float *m,
  37. __local float *dia,
  38. __local float *peri_row,
  39. __local float *peri_col,
  40. int matrix_dim,
  41. int offset)
  42. {
  43. int i,j, array_offset;
  44. int idx;
  45. int bx = get_group_id(0);
  46. int tx = get_local_id(0);
  47. if (tx < BLOCK_SIZE) {
  48. idx = tx;
  49. array_offset = offset*matrix_dim+offset;
  50. for (i=0; i < BLOCK_SIZE/2; i++){
  51. dia[i * BLOCK_SIZE + idx]=m[array_offset+idx];
  52. array_offset += matrix_dim;
  53. }
  54. array_offset = offset*matrix_dim+offset;
  55. for (i=0; i < BLOCK_SIZE; i++) {
  56. peri_row[i * BLOCK_SIZE+ idx]=m[array_offset+(bx+1)*BLOCK_SIZE+idx];
  57. array_offset += matrix_dim;
  58. }
  59. } else {
  60. idx = tx-BLOCK_SIZE;
  61. array_offset = (offset+BLOCK_SIZE/2)*matrix_dim+offset;
  62. for (i=BLOCK_SIZE/2; i < BLOCK_SIZE; i++){
  63. dia[i * BLOCK_SIZE + idx]=m[array_offset+idx];
  64. array_offset += matrix_dim;
  65. }
  66. array_offset = (offset+(bx+1)*BLOCK_SIZE)*matrix_dim+offset;
  67. for (i=0; i < BLOCK_SIZE; i++) {
  68. peri_col[i * BLOCK_SIZE + idx] = m[array_offset+idx];
  69. array_offset += matrix_dim;
  70. }
  71. }
  72. barrier(CLK_LOCAL_MEM_FENCE);
  73. if (tx < BLOCK_SIZE) { //peri-row
  74. idx=tx;
  75. for(i=1; i < BLOCK_SIZE; i++){
  76. for (j=0; j < i; j++)
  77. peri_row[i * BLOCK_SIZE + idx]-=dia[i * BLOCK_SIZE+ j]*peri_row[j * BLOCK_SIZE + idx];
  78. }
  79. } else { //peri-col
  80. idx=tx - BLOCK_SIZE;
  81. for(i=0; i < BLOCK_SIZE; i++){
  82. for(j=0; j < i; j++)
  83. peri_col[idx * BLOCK_SIZE + i]-=peri_col[idx * BLOCK_SIZE+ j]*dia[j * BLOCK_SIZE + i];
  84. peri_col[idx * BLOCK_SIZE + i] /= dia[i * BLOCK_SIZE+ i];
  85. }
  86. }
  87. barrier(CLK_LOCAL_MEM_FENCE);
  88. if (tx < BLOCK_SIZE) { //peri-row
  89. idx=tx;
  90. array_offset = (offset+1)*matrix_dim+offset;
  91. for(i=1; i < BLOCK_SIZE; i++){
  92. m[array_offset+(bx+1)*BLOCK_SIZE+idx] = peri_row[i*BLOCK_SIZE+idx];
  93. array_offset += matrix_dim;
  94. }
  95. } else { //peri-col
  96. idx=tx - BLOCK_SIZE;
  97. array_offset = (offset+(bx+1)*BLOCK_SIZE)*matrix_dim+offset;
  98. for(i=0; i < BLOCK_SIZE; i++){
  99. m[array_offset+idx] = peri_col[i*BLOCK_SIZE+idx];
  100. array_offset += matrix_dim;
  101. }
  102. }
  103. }
  104. __kernel void
  105. lud_internal(__global float *m,
  106. __local float *peri_row,
  107. __local float *peri_col,
  108. int matrix_dim,
  109. int offset)
  110. {
  111. int bx = get_group_id(0);
  112. int by = get_group_id(1);
  113. int tx = get_local_id(0);
  114. int ty = get_local_id(1);
  115. int i;
  116. float sum;
  117. int global_row_id = offset + (by+1)*BLOCK_SIZE;
  118. int global_col_id = offset + (bx+1)*BLOCK_SIZE;
  119. peri_row[ty * BLOCK_SIZE + tx] = m[(offset+ty)*matrix_dim+global_col_id+tx];
  120. peri_col[ty * BLOCK_SIZE + tx] = m[(global_row_id+ty)*matrix_dim+offset+tx];
  121. barrier(CLK_LOCAL_MEM_FENCE);
  122. sum = 0;
  123. for (i=0; i < BLOCK_SIZE; i++)
  124. sum += peri_col[ty * BLOCK_SIZE + i] * peri_row[i * BLOCK_SIZE + tx];
  125. m[(global_row_id+ty)*matrix_dim+global_col_id+tx] -= sum;
  126. }