read_input.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. /*****************************************************************************/
  2. /*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
  3. /*By downloading, copying, installing or using the software you agree */
  4. /*to this license. If you do not agree to this license, do not download, */
  5. /*install, copy or use the software. */
  6. /* */
  7. /* */
  8. /*Copyright (c) 2005 Northwestern University */
  9. /*All rights reserved. */
  10. /*Redistribution of the software in source and binary forms, */
  11. /*with or without modification, is permitted provided that the */
  12. /*following conditions are met: */
  13. /* */
  14. /*1 Redistributions of source code must retain the above copyright */
  15. /* notice, this list of conditions and the following disclaimer. */
  16. /* */
  17. /*2 Redistributions in binary form must reproduce the above copyright */
  18. /* notice, this list of conditions and the following disclaimer in the */
  19. /* documentation and/or other materials provided with the distribution.*/
  20. /* */
  21. /*3 Neither the name of Northwestern University nor the names of its */
  22. /* contributors may be used to endorse or promote products derived */
  23. /* from this software without specific prior written permission. */
  24. /* */
  25. /*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
  26. /*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
  27. /*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
  28. /*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
  29. /*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
  30. /*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  31. /*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
  32. /*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
  33. /*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
  34. /*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
  35. /*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  36. /*POSSIBILITY OF SUCH DAMAGE. */
  37. /******************************************************************************/
  38. /*************************************************************************/
  39. /** File: example.c **/
  40. /** Description: Takes as input a file: **/
  41. /** ascii file: containing 1 data point per line **/
  42. /** binary file: first int is the number of objects **/
  43. /** 2nd int is the no. of features of each **/
  44. /** object **/
  45. /** This example performs a fuzzy c-means clustering **/
  46. /** on the data. Fuzzy clustering is performed using **/
  47. /** min to max clusters and the clustering that gets **/
  48. /** the best score according to a compactness and **/
  49. /** separation criterion are returned. **/
  50. /** Author: Wei-keng Liao **/
  51. /** ECE Department Northwestern University **/
  52. /** email: wkliao@ece.northwestern.edu **/
  53. /** **/
  54. /** Edited by: Jay Pisharath **/
  55. /** Northwestern University. **/
  56. /** **/
  57. /** ================================================================ **/
  58. /** **/
  59. /** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
  60. /** University of Virginia **/
  61. /** **/
  62. /** Description: No longer supports fuzzy c-means clustering; **/
  63. /** only regular k-means clustering. **/
  64. /** No longer performs "validity" function to analyze **/
  65. /** compactness and separation crietria; instead **/
  66. /** calculate root mean squared error. **/
  67. /** **/
  68. /*************************************************************************/
  69. #define _CRT_SECURE_NO_DEPRECATE 1
  70. #include <stdio.h>
  71. #include <stdlib.h>
  72. #include <string.h>
  73. #include <limits.h>
  74. #include <math.h>
  75. #include <fcntl.h>
  76. #include <omp.h>
  77. #include "kmeans.h"
  78. #include <unistd.h>
  79. extern double wtime(void);
  80. /*---< usage() >------------------------------------------------------------*/
  81. void usage(char *argv0) {
  82. char *help =
  83. "\nUsage: %s [switches] -i filename\n\n"
  84. " -i filename :file containing data to be clustered\n"
  85. " -m max_nclusters :maximum number of clusters allowed [default=5]\n"
  86. " -n min_nclusters :minimum number of clusters allowed [default=5]\n"
  87. " -t threshold :threshold value [default=0.001]\n"
  88. " -l nloops :iteration for each number of clusters [default=1]\n"
  89. " -b :input file is in binary format\n"
  90. " -r :calculate RMSE [default=off]\n"
  91. " -o :output cluster center coordinates [default=off]\n";
  92. fprintf(stderr, help, argv0);
  93. exit(-1);
  94. }
  95. /*---< main() >-------------------------------------------------------------*/
  96. int setup(int argc, char **argv) {
  97. int opt;
  98. extern char *optarg;
  99. char *filename = 0;
  100. float *buf;
  101. char line[1024];
  102. int isBinaryFile = 0;
  103. float threshold = 0.001; /* default value */
  104. int max_nclusters=5; /* default value */
  105. int min_nclusters=5; /* default value */
  106. int best_nclusters = 0;
  107. int nfeatures = 0;
  108. int npoints = 0;
  109. float len;
  110. float **features;
  111. float **cluster_centres=NULL;
  112. int i, j, index;
  113. int nloops = 1; /* default value */
  114. int isRMSE = 0;
  115. float rmse;
  116. int isOutput = 0;
  117. //float cluster_timing, io_timing;
  118. /* obtain command line arguments and change appropriate options */
  119. while ( (opt=getopt(argc,argv,"i:t:m:n:l:bro"))!= EOF) {
  120. switch (opt) {
  121. case 'i': filename=optarg;
  122. break;
  123. case 'b': isBinaryFile = 1;
  124. break;
  125. case 't': threshold=atof(optarg);
  126. break;
  127. case 'm': max_nclusters = atoi(optarg);
  128. break;
  129. case 'n': min_nclusters = atoi(optarg);
  130. break;
  131. case 'r': isRMSE = 1;
  132. break;
  133. case 'o': isOutput = 1;
  134. break;
  135. case 'l': nloops = atoi(optarg);
  136. break;
  137. case '?': usage(argv[0]);
  138. break;
  139. default: usage(argv[0]);
  140. break;
  141. }
  142. }
  143. if (filename == 0) usage(argv[0]);
  144. /* ============== I/O begin ==============*/
  145. /* get nfeatures and npoints */
  146. //io_timing = omp_get_wtime();
  147. if (isBinaryFile) { //Binary file input
  148. int infile;
  149. if ((infile = open(filename, O_RDONLY, "0600")) == -1) {
  150. fprintf(stderr, "Error: no such file (%s)\n", filename);
  151. exit(1);
  152. }
  153. read(infile, &npoints, sizeof(int));
  154. read(infile, &nfeatures, sizeof(int));
  155. /* allocate space for features[][] and read attributes of all objects */
  156. buf = (float*) malloc(npoints*nfeatures*sizeof(float));
  157. features = (float**)malloc(npoints* sizeof(float*));
  158. features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
  159. for (i=1; i<npoints; i++)
  160. features[i] = features[i-1] + nfeatures;
  161. read(infile, buf, npoints*nfeatures*sizeof(float));
  162. close(infile);
  163. }
  164. else {
  165. FILE *infile;
  166. if ((infile = fopen(filename, "r")) == NULL) {
  167. fprintf(stderr, "Error: no such file (%s)\n", filename);
  168. exit(1);
  169. }
  170. while (fgets(line, 1024, infile) != NULL)
  171. if (strtok(line, " \t\n") != 0)
  172. npoints++;
  173. rewind(infile);
  174. while (fgets(line, 1024, infile) != NULL) {
  175. if (strtok(line, " \t\n") != 0) {
  176. /* ignore the id (first attribute): nfeatures = 1; */
  177. while (strtok(NULL, " ,\t\n") != NULL) nfeatures++;
  178. break;
  179. }
  180. }
  181. /* allocate space for features[] and read attributes of all objects */
  182. buf = (float*) malloc(npoints*nfeatures*sizeof(float));
  183. features = (float**)malloc(npoints* sizeof(float*));
  184. features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
  185. for (i=1; i<npoints; i++)
  186. features[i] = features[i-1] + nfeatures;
  187. rewind(infile);
  188. i = 0;
  189. while (fgets(line, 1024, infile) != NULL) {
  190. if (strtok(line, " \t\n") == NULL) continue;
  191. for (j=0; j<nfeatures; j++) {
  192. buf[i] = atof(strtok(NULL, " ,\t\n"));
  193. i++;
  194. }
  195. }
  196. fclose(infile);
  197. }
  198. //io_timing = omp_get_wtime() - io_timing;
  199. printf("\nI/O completed\n");
  200. printf("\nNumber of objects: %d\n", npoints);
  201. printf("Number of features: %d\n", nfeatures);
  202. /* ============== I/O end ==============*/
  203. // error check for clusters
  204. if (npoints < min_nclusters)
  205. {
  206. printf("Error: min_nclusters(%d) > npoints(%d) -- cannot proceed\n", min_nclusters, npoints);
  207. exit(0);
  208. }
  209. srand(7); /* seed for future random number generator */
  210. memcpy(features[0], buf, npoints*nfeatures*sizeof(float)); /* now features holds 2-dimensional array of features */
  211. free(buf);
  212. /* ======================= core of the clustering ===================*/
  213. //cluster_timing = omp_get_wtime(); /* Total clustering time */
  214. cluster_centres = NULL;
  215. index = cluster(npoints, /* number of data points */
  216. nfeatures, /* number of features for each point */
  217. features, /* array: [npoints][nfeatures] */
  218. min_nclusters, /* range of min to max number of clusters */
  219. max_nclusters,
  220. threshold, /* loop termination factor */
  221. &best_nclusters, /* return: number between min and max */
  222. &cluster_centres, /* return: [best_nclusters][nfeatures] */
  223. &rmse, /* Root Mean Squared Error */
  224. isRMSE, /* calculate RMSE */
  225. nloops); /* number of iteration for each number of clusters */
  226. //cluster_timing = omp_get_wtime() - cluster_timing;
  227. /* =============== Command Line Output =============== */
  228. /* cluster center coordinates
  229. :displayed only for when k=1*/
  230. if((min_nclusters == max_nclusters) && (isOutput == 1)) {
  231. printf("\n================= Centroid Coordinates =================\n");
  232. for(i = 0; i < max_nclusters; i++){
  233. printf("%d:", i);
  234. for(j = 0; j < nfeatures; j++){
  235. printf(" %.2f", cluster_centres[i][j]);
  236. }
  237. printf("\n\n");
  238. }
  239. }
  240. len = (float) ((max_nclusters - min_nclusters + 1)*nloops);
  241. printf("Number of Iteration: %d\n", nloops);
  242. //printf("Time for I/O: %.5fsec\n", io_timing);
  243. //printf("Time for Entire Clustering: %.5fsec\n", cluster_timing);
  244. if(min_nclusters != max_nclusters){
  245. if(nloops != 1){ //range of k, multiple iteration
  246. //printf("Average Clustering Time: %fsec\n",
  247. // cluster_timing / len);
  248. printf("Best number of clusters is %d\n", best_nclusters);
  249. }
  250. else{ //range of k, single iteration
  251. //printf("Average Clustering Time: %fsec\n",
  252. // cluster_timing / len);
  253. printf("Best number of clusters is %d\n", best_nclusters);
  254. }
  255. }
  256. else{
  257. if(nloops != 1){ // single k, multiple iteration
  258. //printf("Average Clustering Time: %.5fsec\n",
  259. // cluster_timing / nloops);
  260. if(isRMSE) // if calculated RMSE
  261. printf("Number of trials to approach the best RMSE of %.3f is %d\n", rmse, index + 1);
  262. }
  263. else{ // single k, single iteration
  264. if(isRMSE) // if calculated RMSE
  265. printf("Root Mean Squared Error: %.3f\n", rmse);
  266. }
  267. }
  268. /* free up memory */
  269. free(features[0]);
  270. free(features);
  271. return(0);
  272. }