streamcluster_cl.h 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. /***********************************************
  2. streamcluster_cl.h
  3. : parallelized code of streamcluster
  4. - original code from PARSEC Benchmark Suite
  5. - parallelization with OpenCL API has been applied by
  6. Jianbin Fang - j.fang@tudelft.nl
  7. Delft University of Technology
  8. Faculty of Electrical Engineering, Mathematics and Computer Science
  9. Department of Software Technology
  10. Parallel and Distributed Systems Group
  11. on 15/03/2010
  12. ***********************************************/
  13. #define THREADS_PER_BLOCK 256
  14. #define MAXBLOCKS 65536
  15. //#define PROFILE_TMP
  16. #define __CL_ENABLE_EXCEPTIONS
  17. #include "CLHelper.h"
  /*
   * Per-point record staged on the host (p_h) and uploaded to the device
   * buffer p_d by pgain(). It carries the scalar fields of a point only;
   * the coordinates travel separately through coord_h / coord_d.
   * NOTE(review): the field order and types presumably have to mirror the
   * struct declared in the OpenCL kernel source -- confirm before changing.
   */
  typedef struct Point_Struct {
    float     weight;  /* weight of the point */
    long long assign;  /* number of point where this one is assigned */
    float     cost;    /* cost of that assignment, weight*distance */
  } Point_Struct;
  23. /* host memory analogous to device memory */
  24. float *work_mem_h;
  25. float *coord_h;
  26. float *gl_lower;
  27. Point_Struct *p_h;
  28. /* device memory */
  29. cl_mem work_mem_d;
  30. cl_mem coord_d;
  31. cl_mem center_table_d;
  32. cl_mem switch_membership_d;
  33. cl_mem p_d;
  34. static int c; // counters
  /*
   * Print `message` followed by a newline on stdout, then terminate the
   * process with exit status 1. Never returns.
   */
  void quit(char *message){
    puts(message);  /* puts() appends the trailing newline itself */
    exit(1);
  }
  39. //free memory
  40. void freeDevMem(){
  41. try{
  42. _clFree(work_mem_d);
  43. _clFree(center_table_d);
  44. _clFree(switch_membership_d);
  45. _clFree(p_d);
  46. _clFree(coord_d);
  47. /*if(work_mem_h!=NULL)
  48. free(work_mem_h);*/
  49. _clFreeHost(1, work_mem_h);
  50. if(coord_h!=NULL)
  51. free(coord_h);
  52. if(gl_lower!=NULL)
  53. free(gl_lower);
  54. if(p_h!=NULL)
  55. free(p_h);
  56. }
  57. catch(string msg){
  58. quit(&(msg[0]));
  59. }
  60. }
  61. //allocate device memory together
  62. void allocDevMem(int num, int dim, int kmax){
  63. try{
  64. work_mem_d = _clMalloc(kmax * num * sizeof(float));
  65. center_table_d = _clMalloc(num * sizeof(int));
  66. switch_membership_d = _clMalloc(num * sizeof(char));
  67. p_d = _clMalloc(num * sizeof(Point));
  68. coord_d = _clMalloc(num * dim * sizeof(float));
  69. }
  70. catch(string msg){
  71. quit(&(msg[0]));
  72. }
  73. }
  74. float pgain(long long x, Points *points, float z, long int *numcenters, int kmax, bool *is_center, int *center_table, char *switch_membership,
  75. double *serial, double *cpu_gpu_memcpy, double *memcpy_back, double *gpu_malloc, double *kernel){
  76. float gl_cost = 0;
  77. try{
  78. #ifdef PROFILE_TMP
  79. double t1 = gettime();
  80. #endif
  81. int K = *numcenters ; // number of centers
  82. int num = points->num; // number of points
  83. int dim = points->dim; // number of dimension
  84. kmax++;
  85. /***** build center index table 1*****/
  86. int count = 0;
  87. for( int i=0; i<num; i++){
  88. if( is_center[i] )
  89. center_table[i] = count++;
  90. }
  91. #ifdef PROFILE_TMP
  92. double t2 = gettime();
  93. *serial += t2 - t1;
  94. #endif
  95. /***** initial memory allocation and preparation for transfer : execute once -1 *****/
  96. if( c == 0 ) {
  97. #ifdef PROFILE_TMP
  98. double t3 = gettime();
  99. #endif
  100. coord_h = (float*) malloc( num * dim * sizeof(float)); // coordinates (host)
  101. gl_lower = (float*) malloc( kmax * sizeof(float) );
  102. work_mem_h = (float*)_clMallocHost(kmax*num*sizeof(float));
  103. p_h = (Point_Struct*)malloc(num*sizeof(Point_Struct)); //by cambine: not compatibal with original Point
  104. // prepare mapping for point coordinates
  105. //--cambine: what's the use of point coordinates? for computing distance.
  106. for(int i=0; i<dim; i++){
  107. for(int j=0; j<num; j++)
  108. coord_h[ (num*i)+j ] = points->p[j].coord[i];
  109. }
  110. #ifdef PROFILE_TMP
  111. double t4 = gettime();
  112. *serial += t4 - t3;
  113. #endif
  114. allocDevMem(num, dim, kmax);
  115. #ifdef PROFILE_TMP
  116. double t5 = gettime();
  117. *gpu_malloc += t5 - t4;
  118. #endif
  119. // copy coordinate to device memory
  120. _clMemcpyH2D(coord_d, coord_h, num*dim*sizeof(float));
  121. #ifdef PROFILE_TMP
  122. double t6 = gettime();
  123. *cpu_gpu_memcpy += t6 - t5;
  124. #endif
  125. }
  126. #ifdef PROFILE_TMP
  127. double t100 = gettime();
  128. #endif
  129. for(int i=0; i<num; i++){
  130. p_h[i].weight = ((points->p)[i]).weight;
  131. p_h[i].assign = ((points->p)[i]).assign;
  132. p_h[i].cost = ((points->p)[i]).cost;
  133. }
  134. #ifdef PROFILE_TMP
  135. double t101 = gettime();
  136. *serial += t101 - t100;
  137. #endif
  138. #ifdef PROFILE_TMP
  139. double t7 = gettime();
  140. #endif
  141. /***** memory transfer from host to device *****/
  142. /* copy to device memory */
  143. _clMemcpyH2D(center_table_d, center_table, num*sizeof(int));
  144. _clMemcpyH2D(p_d, p_h, num * sizeof(Point_Struct));
  145. #ifdef PROFILE_TMP
  146. double t8 = gettime();
  147. *cpu_gpu_memcpy += t8 - t7;
  148. #endif
  149. /***** kernel execution *****/
  150. /* Determine the number of thread blocks in the x- and y-dimension */
  151. size_t smSize = dim * sizeof(float);
  152. #ifdef PROFILE_TMP
  153. double t9 = gettime();
  154. #endif
  155. _clMemset(switch_membership_d, 0, num*sizeof(char));
  156. _clMemset(work_mem_d, 0, (K+1)*num*sizeof(float));
  157. //--cambine: set kernel argument
  158. int kernel_id = 0;
  159. int arg_idx = 0;
  160. kernel_id = 1;
  161. arg_idx = 0;
  162. _clSetArgs(kernel_id, arg_idx++, p_d);
  163. _clSetArgs(kernel_id, arg_idx++, coord_d);
  164. _clSetArgs(kernel_id, arg_idx++, work_mem_d);
  165. _clSetArgs(kernel_id, arg_idx++, center_table_d);
  166. _clSetArgs(kernel_id, arg_idx++, switch_membership_d);
  167. _clSetArgs(kernel_id, arg_idx++, 0, smSize);
  168. _clSetArgs(kernel_id, arg_idx++, &num, sizeof(int));
  169. _clSetArgs(kernel_id, arg_idx++, &dim, sizeof(int));
  170. _clSetArgs(kernel_id, arg_idx++, &x, sizeof(long));
  171. _clSetArgs(kernel_id, arg_idx++, &K, sizeof(int));
  172. _clInvokeKernel(kernel_id, num, THREADS_PER_BLOCK); //--cambine: start kernel here
  173. _clFinish();
  174. #ifdef PROFILE_TMP
  175. double t10 = gettime();
  176. *kernel += t10 - t9;
  177. #endif
  178. /***** copy back to host for CPU side work *****/
  179. _clMemcpyD2H(switch_membership, switch_membership_d, num * sizeof(char));
  180. _clMemcpyD2H(work_mem_h, work_mem_d, (K+1) *num*sizeof(float));
  181. #ifdef PROFILE_TMP
  182. double t11 = gettime();
  183. *memcpy_back += t11 - t10;
  184. #endif
  185. /****** cpu side work *****/
  186. int numclose = 0;
  187. gl_cost = z;
  188. /* compute the number of centers to close if we are to open i */
  189. for(int i=0; i < num; i++){ //--cambine:??
  190. if( is_center[i] ) {
  191. float low = z;
  192. //printf("i=%d ", i);
  193. for( int j = 0; j < num; j++ )
  194. low += work_mem_h[ j*(K+1) + center_table[i] ];
  195. //printf("low=%f\n", low);
  196. gl_lower[center_table[i]] = low;
  197. if ( low > 0 ) {
  198. numclose++;
  199. work_mem_h[i*(K+1)+K] -= low;
  200. }
  201. }
  202. gl_cost += work_mem_h[i*(K+1)+K];
  203. }
  204. /* if opening a center at x saves cost (i.e. cost is negative) do so
  205. otherwise, do nothing */
  206. if ( gl_cost < 0 ) {
  207. for(int i=0; i<num; i++){
  208. bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;
  209. if ( (switch_membership[i]=='1') || close_center ) {
  210. points->p[i].cost = points->p[i].weight * dist(points->p[i], points->p[x], points->dim);
  211. points->p[i].assign = x;
  212. }
  213. }
  214. for(int i=0; i<num; i++){
  215. if( is_center[i] && gl_lower[center_table[i]] > 0 )
  216. is_center[i] = false;
  217. }
  218. is_center[x] = true;
  219. *numcenters = *numcenters +1 - numclose;
  220. }
  221. else
  222. gl_cost = 0; // the value we'
  223. #ifdef PROFILE_TMP
  224. double t12 = gettime();
  225. *serial += t12 - t11;
  226. #endif
  227. c++;
  228. }
  229. catch(string msg){
  230. printf("--cambine:%s\n", msg.c_str());
  231. freeDevMem();
  232. _clRelease();
  233. exit(-1);
  234. }
  235. catch(...){
  236. printf("--cambine: unknow reasons in pgain\n");
  237. }
  238. /*FILE *fp = fopen("data_opencl.txt", "a");
  239. fprintf(fp,"%d, %f\n", c, gl_cost);
  240. fclose(fp);*/
  241. //printf("%d, %f\n", c, gl_cost);
  242. //exit(-1);
  243. return -gl_cost;
  244. }