kmeans.cpp 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. #include <stdio.h>
  2. #include <string.h>
  3. #include <stdlib.h>
  4. #include <math.h>
  5. #include <iostream>
  6. #include <string>
  7. #include "kmeans.h"
  8. #ifdef WIN
  9. #include <windows.h>
  10. #else
  11. #include <pthread.h>
  12. #include <sys/time.h>
  13. double gettime() {
  14. struct timeval t;
  15. gettimeofday(&t,NULL);
  16. return t.tv_sec+t.tv_usec*1e-6;
  17. }
  18. #endif
  19. #ifdef NV
  20. #include <oclUtils.h>
  21. #else
  22. #include <CL/cl.h>
  23. #endif
  24. #ifndef FLT_MAX
  25. #define FLT_MAX 3.40282347e+38
  26. #endif
  27. #ifdef RD_WG_SIZE_0_0
  28. #define BLOCK_SIZE RD_WG_SIZE_0_0
  29. #elif defined(RD_WG_SIZE_0)
  30. #define BLOCK_SIZE RD_WG_SIZE_0
  31. #elif defined(RD_WG_SIZE)
  32. #define BLOCK_SIZE RD_WG_SIZE
  33. #else
  34. #define BLOCK_SIZE 256
  35. #endif
  36. #ifdef RD_WG_SIZE_1_0
  37. #define BLOCK_SIZE2 RD_WG_SIZE_1_0
  38. #elif defined(RD_WG_SIZE_1)
  39. #define BLOCK_SIZE2 RD_WG_SIZE_1
  40. #elif defined(RD_WG_SIZE)
  41. #define BLOCK_SIZE2 RD_WG_SIZE
  42. #else
  43. #define BLOCK_SIZE2 256
  44. #endif
  45. // local variables
  46. static cl_context context;
  47. static cl_command_queue cmd_queue;
  48. static cl_device_type device_type;
  49. static cl_device_id * device_list;
  50. static cl_int num_devices;
  51. static int initialize(int use_gpu)
  52. {
  53. cl_int result;
  54. size_t size;
  55. // modify a bit the initialization of the platforms in order to handle more than one platform (since on the odroid we have the GPU and the CPU platform).
  56. cl_uint platformCount;
  57. cl_platform_id *platforms_ids;
  58. // create OpenCL context
  59. clGetPlatformIDs(0, NULL, &platformCount);
  60. platforms_ids = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
  61. clGetPlatformIDs(platformCount, platforms_ids, NULL);
  62. if (clGetPlatformIDs(platformCount, platforms_ids, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; }
  63. cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms_ids[1], 0};
  64. device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
  65. context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
  66. if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; }
  67. // get the list of GPUs
  68. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
  69. num_devices = (int) (size / sizeof(cl_device_id));
  70. if( result != CL_SUCCESS || num_devices < 1 ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  71. device_list = new cl_device_id[num_devices];
  72. if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
  73. result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
  74. if( result != CL_SUCCESS ) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }
  75. // create command queue for the first device
  76. cmd_queue = clCreateCommandQueue( context, device_list[0], 0, NULL );
  77. if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
  78. return 0;
  79. }
  80. static int shutdown()
  81. {
  82. // release resources
  83. if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
  84. if( context ) clReleaseContext( context );
  85. if( device_list ) delete device_list;
  86. // reset all variables
  87. cmd_queue = 0;
  88. context = 0;
  89. device_list = 0;
  90. num_devices = 0;
  91. device_type = 0;
  92. return 0;
  93. }
  94. cl_mem d_feature;
  95. cl_mem d_feature_swap;
  96. cl_mem d_cluster;
  97. cl_mem d_membership;
  98. cl_kernel kernel;
  99. cl_kernel kernel_s;
  100. cl_kernel kernel2;
  101. int *membership_OCL;
  102. int *membership_d;
  103. float *feature_d;
  104. float *clusters_d;
  105. float *center_d;
  106. int allocate(int n_points, int n_features, int n_clusters, float **feature)
  107. {
  108. int sourcesize = 1024*1024;
  109. char * source = (char *)calloc(sourcesize, sizeof(char));
  110. if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
  111. // read the kernel core source
  112. char * tempchar = "./kmeans.cl";
  113. FILE * fp = fopen(tempchar, "rb");
  114. if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
  115. fread(source + strlen(source), sourcesize, 1, fp);
  116. fclose(fp);
  117. // OpenCL initialization
  118. int use_gpu = 0;
  119. if(initialize(use_gpu)) return -1;
  120. // compile kernel
  121. cl_int err = 0;
  122. const char * slist[2] = { source, 0 };
  123. cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
  124. if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
  125. err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
  126. { // show warnings/errors
  127. // static char log[65536]; memset(log, 0, sizeof(log));
  128. // cl_device_id device_id = 0;
  129. // err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
  130. // clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
  131. // if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
  132. }
  133. if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
  134. char * kernel_kmeans_c = "kmeans_kernel_c";
  135. char * kernel_swap = "kmeans_swap";
  136. kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err);
  137. if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
  138. kernel2 = clCreateKernel(prog, kernel_swap, &err);
  139. if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
  140. clReleaseProgram(prog);
  141. d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * n_features * sizeof(float), NULL, &err );
  142. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n", n_points * n_features, err); return -1;}
  143. d_feature_swap = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * n_features * sizeof(float), NULL, &err );
  144. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n", n_points * n_features, err); return -1;}
  145. d_cluster = clCreateBuffer(context, CL_MEM_READ_WRITE, n_clusters * n_features * sizeof(float), NULL, &err );
  146. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n", n_clusters * n_features, err); return -1;}
  147. d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * sizeof(int), NULL, &err );
  148. if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points, err); return -1;}
  149. //write buffers
  150. err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0, n_points * n_features * sizeof(float), feature[0], 0, 0, 0);
  151. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n", n_points * n_features, err); return -1; }
  152. clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &d_feature);
  153. clSetKernelArg(kernel2, 1, sizeof(void *), (void*) &d_feature_swap);
  154. clSetKernelArg(kernel2, 2, sizeof(cl_int), (void*) &n_points);
  155. clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &n_features);
  156. size_t global_work[3] = { n_points, 1, 1 };
  157. /// Ke Wang adjustable local group size 2013/08/07 10:37:33
  158. size_t local_work_size= BLOCK_SIZE; // work group size is defined by RD_WG_SIZE_0 or RD_WG_SIZE_0_0 2014/06/10 17:00:51
  159. if(global_work[0]%local_work_size !=0)
  160. global_work[0]=(global_work[0]/local_work_size+1)*local_work_size;
  161. err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work, &local_work_size, 0, 0, 0);
  162. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  163. membership_OCL = (int*) malloc(n_points * sizeof(int));
  164. }
  165. void deallocateMemory()
  166. {
  167. clReleaseMemObject(d_feature);
  168. clReleaseMemObject(d_feature_swap);
  169. clReleaseMemObject(d_cluster);
  170. clReleaseMemObject(d_membership);
  171. free(membership_OCL);
  172. }
  173. int main( int argc, char** argv)
  174. {
  175. printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n", BLOCK_SIZE, BLOCK_SIZE2);
  176. setup(argc, argv);
  177. shutdown();
  178. }
  179. int kmeansOCL(float **feature, /* in: [npoints][nfeatures] */
  180. int n_features,
  181. int n_points,
  182. int n_clusters,
  183. int *membership,
  184. float **clusters,
  185. int *new_centers_len,
  186. float **new_centers)
  187. {
  188. int delta = 0;
  189. int i, j, k;
  190. cl_int err = 0;
  191. size_t global_work[3] = { n_points, 1, 1 };
  192. /// Ke Wang adjustable local group size 2013/08/07 10:37:33
  193. size_t local_work_size=BLOCK_SIZE2; // work group size is defined by RD_WG_SIZE_1 or RD_WG_SIZE_1_0 2014/06/10 17:00:41
  194. if(global_work[0]%local_work_size !=0)
  195. global_work[0]=(global_work[0]/local_work_size+1)*local_work_size;
  196. err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0, n_clusters * n_features * sizeof(float), clusters[0], 0, 0, 0);
  197. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n", n_points, err); return -1; }
  198. int size = 0; int offset = 0;
  199. clSetKernelArg(kernel_s, 0, sizeof(void *), (void*) &d_feature_swap);
  200. clSetKernelArg(kernel_s, 1, sizeof(void *), (void*) &d_cluster);
  201. clSetKernelArg(kernel_s, 2, sizeof(void *), (void*) &d_membership);
  202. clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void*) &n_points);
  203. clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void*) &n_clusters);
  204. clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void*) &n_features);
  205. clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void*) &offset);
  206. clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void*) &size);
  207. err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work, &local_work_size, 0, 0, 0);
  208. if(err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  209. clFinish(cmd_queue);
  210. err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0, n_points * sizeof(int), membership_OCL, 0, 0, 0);
  211. if(err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; }
  212. delta = 0;
  213. for (i = 0; i < n_points; i++)
  214. {
  215. int cluster_id = membership_OCL[i];
  216. new_centers_len[cluster_id]++;
  217. if (membership_OCL[i] != membership[i])
  218. {
  219. delta++;
  220. membership[i] = membership_OCL[i];
  221. }
  222. for (j = 0; j < n_features; j++)
  223. {
  224. new_centers[cluster_id][j] += feature[i][j];
  225. }
  226. }
  227. return delta;
  228. }