read_input.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. /*****************************************************************************/
  2. /*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
  3. /*By downloading, copying, installing or using the software you agree */
  4. /*to this license. If you do not agree to this license, do not download, */
  5. /*install, copy or use the software. */
  6. /* */
  7. /* */
  8. /*Copyright (c) 2005 Northwestern University */
  9. /*All rights reserved. */
  10. /*Redistribution of the software in source and binary forms, */
  11. /*with or without modification, is permitted provided that the */
  12. /*following conditions are met: */
  13. /* */
  14. /*1 Redistributions of source code must retain the above copyright */
  15. /* notice, this list of conditions and the following disclaimer. */
  16. /* */
  17. /*2 Redistributions in binary form must reproduce the above copyright */
  18. /* notice, this list of conditions and the following disclaimer in the */
  19. /* documentation and/or other materials provided with the distribution.*/
  20. /* */
  21. /*3 Neither the name of Northwestern University nor the names of its */
  22. /* contributors may be used to endorse or promote products derived */
  23. /* from this software without specific prior written permission. */
  24. /* */
  25. /*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
  26. /*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
  27. /*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
  28. /*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
  29. /*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
  30. /*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  31. /*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
  32. /*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
  33. /*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
  34. /*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
  35. /*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  36. /*POSSIBILITY OF SUCH DAMAGE. */
  37. /******************************************************************************/
  38. /*************************************************************************/
  39. /** File: example.c **/
  40. /** Description: Takes as input a file: **/
  41. /** ascii file: containing 1 data point per line **/
  42. /** binary file: first int is the number of objects **/
  43. /** 2nd int is the no. of features of each **/
  44. /** object **/
  45. /** This example performs a fuzzy c-means clustering **/
  46. /** on the data. Fuzzy clustering is performed using **/
  47. /** min to max clusters and the clustering that gets **/
  48. /** the best score according to a compactness and **/
  49. /** separation criterion are returned. **/
  50. /** Author: Wei-keng Liao **/
  51. /** ECE Department Northwestern University **/
  52. /** email: wkliao@ece.northwestern.edu **/
  53. /** **/
  54. /** Edited by: Jay Pisharath **/
  55. /** Northwestern University. **/
  56. /** **/
  57. /** ================================================================ **/
  58. /** **/
  59. /** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
  60. /** University of Virginia **/
  61. /** **/
  62. /** Description: No longer supports fuzzy c-means clustering; **/
  63. /** only regular k-means clustering. **/
  64. /** No longer performs "validity" function to analyze **/
  65. /** compactness and separation crietria; instead **/
  66. /** calculate root mean squared error. **/
  67. /** **/
  68. /*************************************************************************/
  69. #define _CRT_SECURE_NO_DEPRECATE 1
  70. #include <stdio.h>
  71. #include <stdlib.h>
  72. #include <string.h>
  73. #include <limits.h>
  74. #include <math.h>
  75. #include <fcntl.h>
  76. #include <omp.h>
  77. #include "kmeans.h"
  78. #include <unistd.h>
  79. extern double wtime(void);
  80. /*---< usage() >------------------------------------------------------------*/
  81. void usage(char *argv0) {
  82. char *help =
  83. "\nUsage: %s [switches] -i filename\n\n"
  84. " -i filename :file containing data to be clustered\n"
  85. " -m max_nclusters :maximum number of clusters allowed [default=5]\n"
  86. " -n min_nclusters :minimum number of clusters allowed [default=5]\n"
  87. " -t threshold :threshold value [default=0.001]\n"
  88. " -l nloops :iteration for each number of clusters [default=1]\n"
  89. " -b :input file is in binary format\n"
  90. " -r :calculate RMSE [default=off]\n"
  91. " -o :output cluster center coordinates [default=off]\n"
  92. " -p platform_id :OCL platform to use [default=0]\n"
  93. " -d device_id :OCL device to use [default=0]\n"
  94. " -g use_gpu :1 for GPU 0 for CPU [default=0]\n";
  95. fprintf(stderr, help, argv0);
  96. exit(-1);
  97. }
  98. /*---< main() >-------------------------------------------------------------*/
  99. int setup(int argc, char **argv) {
  100. int opt;
  101. extern char *optarg;
  102. char *filename = 0;
  103. float *buf;
  104. char line[1024];
  105. int isBinaryFile = 0;
  106. float threshold = 0.001; /* default value */
  107. int max_nclusters=5; /* default value */
  108. int min_nclusters=5; /* default value */
  109. int best_nclusters = 0;
  110. int nfeatures = 0;
  111. int npoints = 0;
  112. float len;
  113. float **features;
  114. float **cluster_centres=NULL;
  115. int i, j, index;
  116. int nloops = 1; /* default value */
  117. int isRMSE = 0;
  118. float rmse;
  119. int isOutput = 0;
  120. //float cluster_timing, io_timing;
  121. // Variables to store information on platform and device to use_gpu
  122. int platform_id = 0;
  123. int device_id = 0;
  124. int use_gpu = 0;
  125. /* obtain command line arguments and change appropriate options */
  126. while ( (opt=getopt(argc,argv,"i:t:m:n:l:brop:d:g:"))!= EOF) {
  127. switch (opt) {
  128. case 'i': filename=optarg;
  129. break;
  130. case 'b': isBinaryFile = 1;
  131. break;
  132. case 't': threshold=atof(optarg);
  133. break;
  134. case 'm': max_nclusters = atoi(optarg);
  135. break;
  136. case 'n': min_nclusters = atoi(optarg);
  137. break;
  138. case 'r': isRMSE = 1;
  139. break;
  140. case 'o': isOutput = 1;
  141. break;
  142. case 'l': nloops = atoi(optarg);
  143. break;
  144. case 'p': platform_id = atoi(optarg);
  145. break;
  146. case 'd': device_id = atoi(optarg);
  147. break;
  148. case 'g': use_gpu = atoi(optarg);
  149. break;
  150. case '?': usage(argv[0]);
  151. break;
  152. default: usage(argv[0]);
  153. break;
  154. }
  155. }
  156. if (filename == 0) usage(argv[0]);
  157. /* ============== I/O begin ==============*/
  158. /* get nfeatures and npoints */
  159. //io_timing = omp_get_wtime();
  160. if (isBinaryFile) { //Binary file input
  161. int infile;
  162. if ((infile = open(filename, O_RDONLY, "0600")) == -1) {
  163. fprintf(stderr, "Error: no such file (%s)\n", filename);
  164. exit(1);
  165. }
  166. read(infile, &npoints, sizeof(int));
  167. read(infile, &nfeatures, sizeof(int));
  168. /* allocate space for features[][] and read attributes of all objects */
  169. buf = (float*) malloc(npoints*nfeatures*sizeof(float));
  170. features = (float**)malloc(npoints* sizeof(float*));
  171. features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
  172. for (i=1; i<npoints; i++)
  173. features[i] = features[i-1] + nfeatures;
  174. read(infile, buf, npoints*nfeatures*sizeof(float));
  175. close(infile);
  176. }
  177. else {
  178. FILE *infile;
  179. if ((infile = fopen(filename, "r")) == NULL) {
  180. fprintf(stderr, "Error: no such file (%s)\n", filename);
  181. exit(1);
  182. }
  183. while (fgets(line, 1024, infile) != NULL)
  184. if (strtok(line, " \t\n") != 0)
  185. npoints++;
  186. rewind(infile);
  187. while (fgets(line, 1024, infile) != NULL) {
  188. if (strtok(line, " \t\n") != 0) {
  189. /* ignore the id (first attribute): nfeatures = 1; */
  190. while (strtok(NULL, " ,\t\n") != NULL) nfeatures++;
  191. break;
  192. }
  193. }
  194. /* allocate space for features[] and read attributes of all objects */
  195. buf = (float*) malloc(npoints*nfeatures*sizeof(float));
  196. features = (float**)malloc(npoints* sizeof(float*));
  197. features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
  198. for (i=1; i<npoints; i++)
  199. features[i] = features[i-1] + nfeatures;
  200. rewind(infile);
  201. i = 0;
  202. while (fgets(line, 1024, infile) != NULL) {
  203. if (strtok(line, " \t\n") == NULL) continue;
  204. for (j=0; j<nfeatures; j++) {
  205. buf[i] = atof(strtok(NULL, " ,\t\n"));
  206. i++;
  207. }
  208. }
  209. fclose(infile);
  210. }
  211. //io_timing = omp_get_wtime() - io_timing;
  212. printf("\nI/O completed\n");
  213. printf("\nNumber of objects: %d\n", npoints);
  214. printf("Number of features: %d\n", nfeatures);
  215. /* ============== I/O end ==============*/
  216. // error check for clusters
  217. if (npoints < min_nclusters)
  218. {
  219. printf("Error: min_nclusters(%d) > npoints(%d) -- cannot proceed\n", min_nclusters, npoints);
  220. exit(0);
  221. }
  222. srand(7); /* seed for future random number generator */
  223. memcpy(features[0], buf, npoints*nfeatures*sizeof(float)); /* now features holds 2-dimensional array of features */
  224. free(buf);
  225. /* ======================= core of the clustering ===================*/
  226. //cluster_timing = omp_get_wtime(); /* Total clustering time */
  227. cluster_centres = NULL;
  228. index = cluster(npoints, /* number of data points */
  229. nfeatures, /* number of features for each point */
  230. features, /* array: [npoints][nfeatures] */
  231. min_nclusters, /* range of min to max number of clusters */
  232. max_nclusters,
  233. threshold, /* loop termination factor */
  234. &best_nclusters, /* return: number between min and max */
  235. &cluster_centres, /* return: [best_nclusters][nfeatures] */
  236. &rmse, /* Root Mean Squared Error */
  237. isRMSE, /* calculate RMSE */
  238. nloops,
  239. platform_id,
  240. device_id,
  241. use_gpu); /* number of iteration for each number of clusters */
  242. //cluster_timing = omp_get_wtime() - cluster_timing;
  243. /* =============== Command Line Output =============== */
  244. /* cluster center coordinates
  245. :displayed only for when k=1*/
  246. if((min_nclusters == max_nclusters) && (isOutput == 1)) {
  247. printf("\n================= Centroid Coordinates =================\n");
  248. for(i = 0; i < max_nclusters; i++){
  249. printf("%d:", i);
  250. for(j = 0; j < nfeatures; j++){
  251. printf(" %.2f", cluster_centres[i][j]);
  252. }
  253. printf("\n\n");
  254. }
  255. }
  256. len = (float) ((max_nclusters - min_nclusters + 1)*nloops);
  257. printf("Number of Iteration: %d\n", nloops);
  258. //printf("Time for I/O: %.5fsec\n", io_timing);
  259. //printf("Time for Entire Clustering: %.5fsec\n", cluster_timing);
  260. if(min_nclusters != max_nclusters){
  261. if(nloops != 1){ //range of k, multiple iteration
  262. //printf("Average Clustering Time: %fsec\n",
  263. // cluster_timing / len);
  264. printf("Best number of clusters is %d\n", best_nclusters);
  265. }
  266. else{ //range of k, single iteration
  267. //printf("Average Clustering Time: %fsec\n",
  268. // cluster_timing / len);
  269. printf("Best number of clusters is %d\n", best_nclusters);
  270. }
  271. }
  272. else{
  273. if(nloops != 1){ // single k, multiple iteration
  274. //printf("Average Clustering Time: %.5fsec\n",
  275. // cluster_timing / nloops);
  276. if(isRMSE) // if calculated RMSE
  277. printf("Number of trials to approach the best RMSE of %.3f is %d\n", rmse, index + 1);
  278. }
  279. else{ // single k, single iteration
  280. if(isRMSE) // if calculated RMSE
  281. printf("Root Mean Squared Error: %.3f\n", rmse);
  282. }
  283. }
  284. /* free up memory */
  285. free(features[0]);
  286. free(features);
  287. return(0);
  288. }