/*********************************************** streamcluster_cl.h : parallelized code of streamcluster - original code from PARSEC Benchmark Suite - parallelization with OpenCL API has been applied by Jianbin Fang - j.fang@tudelft.nl Delft University of Technology Faculty of Electrical Engineering, Mathematics and Computer Science Department of Software Technology Parallel and Distributed Systems Group on 15/03/2010 ***********************************************/ #define THREADS_PER_BLOCK 256 #define MAXBLOCKS 65536 //#define PROFILE_TMP #define __CL_ENABLE_EXCEPTIONS #include "CLHelper.h" typedef struct { float weight; long long assign; /* number of point where this one is assigned */ float cost; /* cost of that assignment, weight*distance */ } Point_Struct; /* host memory analogous to device memory */ float *work_mem_h; float *coord_h; float *gl_lower; Point_Struct *p_h; /* device memory */ cl_mem work_mem_d; cl_mem coord_d; cl_mem center_table_d; cl_mem switch_membership_d; cl_mem p_d; static int c; // counters void quit(char *message){ printf("%s\n", message); exit(1); } //free memory void freeDevMem(){ try{ _clFree(work_mem_d); _clFree(center_table_d); _clFree(switch_membership_d); _clFree(p_d); _clFree(coord_d); /*if(work_mem_h!=NULL) free(work_mem_h);*/ _clFreeHost(1, work_mem_h); if(coord_h!=NULL) free(coord_h); if(gl_lower!=NULL) free(gl_lower); if(p_h!=NULL) free(p_h); } catch(string msg){ quit(&(msg[0])); } } //allocate device memory together void allocDevMem(int num, int dim, int kmax){ try{ work_mem_d = _clMalloc(kmax * num * sizeof(float)); center_table_d = _clMalloc(num * sizeof(int)); switch_membership_d = _clMalloc(num * sizeof(char)); p_d = _clMalloc(num * sizeof(Point)); coord_d = _clMalloc(num * dim * sizeof(float)); } catch(string msg){ quit(&(msg[0])); } } float pgain(long long x, Points *points, float z, long int *numcenters, int kmax, bool *is_center, int *center_table, char *switch_membership, double *serial, double *cpu_gpu_memcpy, double *memcpy_back, double *gpu_malloc, double *kernel){ float gl_cost = 0; try{ #ifdef PROFILE_TMP double t1 = gettime(); #endif int K = *numcenters ; // number of centers int num = points->num; // number of points int dim = points->dim; // number of dimension kmax++; /***** build center index table 1*****/ int count = 0; for( int i=0; ip[j].coord[i]; } #ifdef PROFILE_TMP double t4 = gettime(); *serial += t4 - t3; #endif allocDevMem(num, dim, kmax); #ifdef PROFILE_TMP double t5 = gettime(); *gpu_malloc += t5 - t4; #endif // copy coordinate to device memory _clMemcpyH2D(coord_d, coord_h, num*dim*sizeof(float)); #ifdef PROFILE_TMP double t6 = gettime(); *cpu_gpu_memcpy += t6 - t5; #endif } #ifdef PROFILE_TMP double t100 = gettime(); #endif for(int i=0; ip)[i]).weight; p_h[i].assign = ((points->p)[i]).assign; p_h[i].cost = ((points->p)[i]).cost; } #ifdef PROFILE_TMP double t101 = gettime(); *serial += t101 - t100; #endif #ifdef PROFILE_TMP double t7 = gettime(); #endif /***** memory transfer from host to device *****/ /* copy to device memory */ _clMemcpyH2D(center_table_d, center_table, num*sizeof(int)); _clMemcpyH2D(p_d, p_h, num * sizeof(Point_Struct)); #ifdef PROFILE_TMP double t8 = gettime(); *cpu_gpu_memcpy += t8 - t7; #endif /***** kernel execution *****/ /* Determine the number of thread blocks in the x- and y-dimension */ size_t smSize = dim * sizeof(float); #ifdef PROFILE_TMP double t9 = gettime(); #endif _clMemset(switch_membership_d, 0, num*sizeof(char)); _clMemset(work_mem_d, 0, (K+1)*num*sizeof(float)); //--cambine: set kernel argument int kernel_id = 0; int arg_idx = 0; kernel_id = 1; arg_idx = 0; _clSetArgs(kernel_id, arg_idx++, p_d); _clSetArgs(kernel_id, arg_idx++, coord_d); _clSetArgs(kernel_id, arg_idx++, work_mem_d); _clSetArgs(kernel_id, arg_idx++, center_table_d); _clSetArgs(kernel_id, arg_idx++, switch_membership_d); _clSetArgs(kernel_id, arg_idx++, 0, smSize); _clSetArgs(kernel_id, arg_idx++, &num, sizeof(int)); _clSetArgs(kernel_id, arg_idx++, &dim, sizeof(int)); _clSetArgs(kernel_id, arg_idx++, &x, sizeof(long)); _clSetArgs(kernel_id, arg_idx++, &K, sizeof(int)); _clInvokeKernel(kernel_id, num, THREADS_PER_BLOCK); //--cambine: start kernel here _clFinish(); #ifdef PROFILE_TMP double t10 = gettime(); *kernel += t10 - t9; #endif /***** copy back to host for CPU side work *****/ _clMemcpyD2H(switch_membership, switch_membership_d, num * sizeof(char)); _clMemcpyD2H(work_mem_h, work_mem_d, (K+1) *num*sizeof(float)); #ifdef PROFILE_TMP double t11 = gettime(); *memcpy_back += t11 - t10; #endif /****** cpu side work *****/ int numclose = 0; gl_cost = z; /* compute the number of centers to close if we are to open i */ for(int i=0; i < num; i++){ //--cambine:?? if( is_center[i] ) { float low = z; //printf("i=%d ", i); for( int j = 0; j < num; j++ ) low += work_mem_h[ j*(K+1) + center_table[i] ]; //printf("low=%f\n", low); gl_lower[center_table[i]] = low; if ( low > 0 ) { numclose++; work_mem_h[i*(K+1)+K] -= low; } } gl_cost += work_mem_h[i*(K+1)+K]; } /* if opening a center at x saves cost (i.e. cost is negative) do so otherwise, do nothing */ if ( gl_cost < 0 ) { for(int i=0; ip[i].assign]] > 0 ; if ( (switch_membership[i]=='1') || close_center ) { points->p[i].cost = points->p[i].weight * dist(points->p[i], points->p[x], points->dim); points->p[i].assign = x; } } for(int i=0; i 0 ) is_center[i] = false; } is_center[x] = true; *numcenters = *numcenters +1 - numclose; } else gl_cost = 0; // the value we' #ifdef PROFILE_TMP double t12 = gettime(); *serial += t12 - t11; #endif c++; } catch(string msg){ printf("--cambine:%s\n", msg.c_str()); freeDevMem(); _clRelease(); exit(-1); } catch(...){ printf("--cambine: unknow reasons in pgain\n"); } /*FILE *fp = fopen("data_opencl.txt", "a"); fprintf(fp,"%d, %f\n", c, gl_cost); fclose(fp);*/ //printf("%d, %f\n", c, gl_cost); //exit(-1); return -gl_cost; }