SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
14 #include <shogun/labels/Labels.h>
17 #include <shogun/base/Parallel.h>
18 
19 #ifdef HAVE_PTHREAD
20 #include <pthread.h>
21 #endif
22 
23 #define MUSRECALC
24 
25 #define PAR_THRESH 10
26 
27 using namespace shogun;
28 
31 {
32  init();
33 }
34 
35 CKMeans::CKMeans(int32_t k_, CDistance* d)
37 {
38  init();
39  k=k_;
40  set_distance(d);
41 }
42 
44 {
45 }
46 
48 {
50 
51  if (data)
52  distance->init(data, data);
53 
55 
58  ASSERT(lhs);
59  int32_t num=lhs->get_num_vectors();
60  SG_UNREF(lhs);
61 
62  Weights=SGVector<float64_t>(num);
63  for (int32_t i=0; i<num; i++)
64  Weights.vector[i]=1.0;
65 
66  clustknb(false, NULL);
67 
68  return true;
69 }
70 
71 bool CKMeans::load(FILE* srcfile)
72 {
75  return false;
76 }
77 
78 bool CKMeans::save(FILE* dstfile)
79 {
82  return false;
83 }
84 
85 
86 void CKMeans::set_k(int32_t p_k)
87 {
88  ASSERT(p_k>0);
89  this->k=p_k;
90 }
91 
92 int32_t CKMeans::get_k()
93 {
94  return k;
95 }
96 
97 void CKMeans::set_max_iter(int32_t iter)
98 {
99  ASSERT(iter>0);
100  max_iter=iter;
101 }
102 
104 {
105  return max_iter;
106 }
107 
109 {
110  return R;
111 }
112 
114 {
115  if (!R.vector)
116  return SGMatrix<float64_t>();
117 
120  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
121  SG_UNREF(lhs);
122  return centers;
123 }
124 
126 {
127  return dimensions;
128 }
129 
130 #ifndef DOXYGEN_SHOULD_SKIP_THIS
131 struct thread_data
132 {
133  float64_t* x;
135  float64_t* z;
136  int32_t n1, n2, m;
137  int32_t js, je; /* defines the matrix stripe */
138  int32_t offs;
139 };
140 #endif // DOXYGEN_SHOULD_SKIP_THIS
141 
142 namespace shogun
143 {
144 void *sqdist_thread_func(void * P)
145 {
146  struct thread_data *TD=(struct thread_data*) P;
147  float64_t* x=TD->x;
148  CDenseFeatures<float64_t>* y=TD->y;
149  float64_t* z=TD->z;
150  int32_t n1=TD->n1,
151  m=TD->m,
152  js=TD->js,
153  je=TD->je,
154  offs=TD->offs,
155  j,i,k;
156 
157  for (j=js; j<je; j++)
158  {
159  int32_t vlen=0;
160  bool vfree=false;
161  float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
162 
163  for (i=0; i<n1; i++)
164  {
165  float64_t sum=0;
166  for (k=0; k<m; k++)
167  sum = sum + CMath::sq(x[i*m + k] - vec[k]);
168  z[j*n1 + i] = sum;
169  }
170 
171  y->free_feature_vector(vec, j, vfree);
172  }
173  return NULL;
174 }
175 }
176 
177 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
178 {
181  ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
182 
183  int32_t XSize=lhs->get_num_vectors();
184  dimensions=lhs->get_num_features();
185  int32_t i, changed=1;
186  const int32_t XDimk=dimensions*k;
187  int32_t iter=0;
188 
190 
192 
193  int32_t *ClList=SG_CALLOC(int32_t, XSize);
194  float64_t *weights_set=SG_CALLOC(float64_t, k);
195  float64_t *dists=SG_CALLOC(float64_t, k*XSize);
196 
199  CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
200 
201  int32_t vlen=0;
202  bool vfree=false;
203  float64_t* vec=NULL;
204 
205  /* ClList=zeros(XSize,1) ; */
206  memset(ClList, 0, sizeof(int32_t)*XSize);
207  /* weights_set=zeros(k,1) ; */
208  memset(weights_set, 0, sizeof(float64_t)*k);
209 
210  /* cluster_centers=zeros(dimensions, k) ; */
211  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
212 
213  if (!use_old_mus)
214  {
215  for (i=0; i<XSize; i++)
216  {
217  const int32_t Cl=CMath::random(0, k-1);
218  int32_t j;
219  float64_t weight=Weights.vector[i];
220 
221  weights_set[Cl]+=weight;
222  ClList[i]=Cl;
223 
224  vec=lhs->get_feature_vector(i, vlen, vfree);
225 
226  for (j=0; j<dimensions; j++)
227  mus.matrix[Cl*dimensions+j] += weight*vec[j];
228 
229  lhs->free_feature_vector(vec, i, vfree);
230  }
231  for (i=0; i<k; i++)
232  {
233  int32_t j;
234 
235  if (weights_set[i]!=0.0)
236  for (j=0; j<dimensions; j++)
237  mus.matrix[i*dimensions+j] /= weights_set[i];
238  }
239  }
240  else
241  {
242  ASSERT(mus_start);
243 
245  rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
246  float64_t* p_dists=dists;
247 
248  for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
249  distances_rhs(p_dists,0,k,idx);
250  p_dists=NULL;
251 
252  for (i=0; i<XSize; i++)
253  {
254  float64_t mini=dists[i*k];
255  int32_t Cl = 0, j;
256 
257  for (j=1; j<k; j++)
258  {
259  if (dists[i*k+j]<mini)
260  {
261  Cl=j;
262  mini=dists[i*k+j];
263  }
264  }
265  ClList[i]=Cl;
266  }
267 
268  /* Compute the sum of all points belonging to a cluster
269  * and count the points */
270  for (i=0; i<XSize; i++)
271  {
272  const int32_t Cl = ClList[i];
273  float64_t weight=Weights.vector[i];
274  weights_set[Cl]+=weight;
275 #ifndef MUSRECALC
276  vec=lhs->get_feature_vector(i, vlen, vfree);
277 
278  for (j=0; j<dimensions; j++)
279  mus.matrix[Cl*dimensions+j] += weight*vec[j];
280 
281  lhs->free_feature_vector(vec, i, vfree);
282 #endif
283  }
284 #ifndef MUSRECALC
285  /* normalization to get the mean */
286  for (i=0; i<k; i++)
287  {
288  if (weights_set[i]!=0.0)
289  {
290  int32_t j;
291  for (j=0; j<dimensions; j++)
292  mus.matrix[i*dimensions+j] /= weights_set[i];
293  }
294  }
295 #endif
296  }
297 
298 
299 
300  while (changed && (iter<max_iter))
301  {
302  iter++;
303  if (iter==max_iter-1)
304  SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
305 
306  if (iter%1000 == 0)
307  SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
308  changed=0;
309 
310 #ifdef MUSRECALC
311  /* mus=zeros(dimensions, k) ; */
312  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
313 
314  for (i=0; i<XSize; i++)
315  {
316  int32_t j;
317  int32_t Cl=ClList[i];
318  float64_t weight=Weights.vector[i];
319 
320  vec=lhs->get_feature_vector(i, vlen, vfree);
321 
322  for (j=0; j<dimensions; j++)
323  mus.matrix[Cl*dimensions+j] += weight*vec[j];
324 
325  lhs->free_feature_vector(vec, i, vfree);
326  }
327  for (i=0; i<k; i++)
328  {
329  int32_t j;
330 
331  if (weights_set[i]!=0.0)
332  for (j=0; j<dimensions; j++)
333  mus.matrix[i*dimensions+j] /= weights_set[i];
334  }
335 #endif
336 
337  rhs_mus->copy_feature_matrix(mus);
338 
339  for (i=0; i<XSize; i++)
340  {
341  /* ks=ceil(rand(1,XSize)*XSize) ; */
342  const int32_t Pat= CMath::random(0, XSize-1);
343  const int32_t ClList_Pat=ClList[Pat];
344  int32_t imini, j;
345  float64_t mini, weight;
346 
347  weight=Weights.vector[Pat];
348 
349  /* compute the distance of this point to all centers */
350  for(int32_t idx_k=0;idx_k<k;idx_k++)
351  dists[idx_k]=distance->distance(Pat,idx_k);
352 
353  /* [mini,imini]=min(dists(:,i)) ; */
354  imini=0 ; mini=dists[0];
355  for (j=1; j<k; j++)
356  if (dists[j]<mini)
357  {
358  mini=dists[j];
359  imini=j;
360  }
361 
362  if (imini!=ClList_Pat)
363  {
364  changed= changed + 1;
365 
366  /* weights_set(imini) = weights_set(imini) + weight ; */
367  weights_set[imini]+= weight;
368  /* weights_set(j) = weights_set(j) - weight ; */
369  weights_set[ClList_Pat]-= weight;
370 
371  vec=lhs->get_feature_vector(Pat, vlen, vfree);
372 
373  for (j=0; j<dimensions; j++)
374  {
375  mus.matrix[imini*dimensions+j]-=(vec[j]
376  -mus.matrix[imini*dimensions+j])
377  *(weight/weights_set[imini]);
378  }
379 
380  lhs->free_feature_vector(vec, Pat, vfree);
381 
382  /* mu_new = mu_old - (x - mu_old)/(n-1) */
383  /* if weights_set(j)~=0 */
384  if (weights_set[ClList_Pat]!=0.0)
385  {
386  vec=lhs->get_feature_vector(Pat, vlen, vfree);
387 
388  for (j=0; j<dimensions; j++)
389  {
390  mus.matrix[ClList_Pat*dimensions+j]-=
391  (vec[j]
392  -mus.matrix[ClList_Pat
393  *dimensions+j])
394  *(weight/weights_set[ClList_Pat]);
395  }
396  lhs->free_feature_vector(vec, Pat, vfree);
397  }
398  else
399  /* mus(:,j)=zeros(dimensions,1) ; */
400  for (j=0; j<dimensions; j++)
401  mus.matrix[ClList_Pat*dimensions+j]=0;
402 
403  /* ClList(i)= imini ; */
404  ClList[Pat] = imini;
405  }
406  }
407  }
408 
409  /* compute the ,,variances'' of the clusters */
410  for (i=0; i<k; i++)
411  {
412  float64_t rmin1=0;
413  float64_t rmin2=0;
414 
415  bool first_round=true;
416 
417  for (int32_t j=0; j<k; j++)
418  {
419  if (j!=i)
420  {
421  int32_t l;
422  float64_t dist = 0;
423 
424  for (l=0; l<dimensions; l++)
425  {
426  dist+=CMath::sq(
427  mus.matrix[i*dimensions+l]
428  -mus.matrix[j*dimensions+l]);
429  }
430 
431  if (first_round)
432  {
433  rmin1=dist;
434  rmin2=dist;
435  first_round=false;
436  }
437  else
438  {
439  if ((dist<rmin2) && (dist>=rmin1))
440  rmin2=dist;
441 
442  if (dist<rmin1)
443  {
444  rmin2=rmin1;
445  rmin1=dist;
446  }
447  }
448  }
449  }
450 
451  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
452  }
453 
454  distance->replace_rhs(rhs_cache);
455  delete rhs_mus;
456  SG_FREE(ClList);
457  SG_FREE(weights_set);
458  SG_FREE(dists);
459  SG_UNREF(lhs);
460 }
461 
463 {
464  /* set lhs of underlying distance to cluster centers */
466  mus);
467 
468  /* store cluster centers in lhs of distance variable */
469  CFeatures* rhs=distance->get_rhs();
470  distance->init(cluster_centers, rhs);
471  SG_UNREF(rhs);
472 }
473 
474 void CKMeans::init()
475 {
476  max_iter=10000;
477  k=3;
478  dimensions=0;
479 
480  SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
481  SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
482  SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
483  SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
484 }
485 

SHOGUN Machine Learning Toolbox - Documentation