SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
16 #include <shogun/labels/Labels.h>
19 #include <shogun/base/Parallel.h>
20 
21 #ifdef HAVE_PTHREAD
22 #include <pthread.h>
23 #endif
24 
25 #define MUSRECALC
26 
27 using namespace shogun;
28 
31 {
32  init();
33 }
34 
37 {
38  init();
39  k=k_;
40  set_distance(d);
41  train_method=f;
42 }
43 
44 CKMeans::CKMeans(int32_t k_, CDistance* d, bool use_kmpp, EKMeansMethod f)
46 {
47  init();
48  k=k_;
49  set_distance(d);
50  use_kmeanspp=use_kmpp;
51  train_method=f;
52 }
53 
56 {
57  init();
58  k = k_i;
59  set_distance(d_i);
60  set_initial_centers(centers_i);
61  train_method=f;
62 }
63 
65 {
66 }
67 
69 {
70  dimensions = ((CDenseFeatures<float64_t>*) distance->get_lhs())->get_num_features();
71  REQUIRE(centers.num_cols == k,
72  "Expected %d initial cluster centers, got %d", k, centers.num_cols);
73  REQUIRE(centers.num_rows == dimensions,
74  "Expected %d dimensionional cluster centers, got %d", dimensions, centers.num_rows);
75  mus_initial = centers;
76 }
77 
78 void CKMeans::set_random_centers(SGVector<float64_t> weights_set, SGVector<int32_t> ClList, int32_t XSize)
79 {
82 
83  for (int32_t i=0; i<XSize; i++)
84  {
85  const int32_t Cl=CMath::random(0, k-1);
86  weights_set[Cl]+=1.0;
87  ClList[i]=Cl;
88 
89  int32_t vlen=0;
90  bool vfree=false;
91  float64_t* vec=lhs->get_feature_vector(i, vlen, vfree);
92 
93  for (int32_t j=0; j<dimensions; j++)
94  mus.matrix[Cl*dimensions+j] += vec[j];
95 
96  lhs->free_feature_vector(vec, i, vfree);
97  }
98 
99  SG_UNREF(lhs);
100 
101  for (int32_t i=0; i<k; i++)
102  {
103  if (weights_set[i]!=0.0)
104  {
105  for (int32_t j=0; j<dimensions; j++)
106  mus.matrix[i*dimensions+j] /= weights_set[i];
107  }
108  }
109 }
110 
112  SGVector<int32_t> ClList, int32_t XSize)
113 {
114  ASSERT(mus_initial.matrix);
115 
118  CFeatures* rhs_cache=distance->replace_rhs(rhs_mus);
119  rhs_mus->copy_feature_matrix(mus_initial);
120 
122  dists.zero();
123 
124  for(int32_t idx=0;idx<XSize;idx++)
125  {
126  for(int32_t m=0;m<k;m++)
127  dists[k*idx+m] = distance->distance(idx,m);
128  }
129 
130  for (int32_t i=0; i<XSize; i++)
131  {
132  float64_t mini=dists[i*k];
133  int32_t Cl = 0, j;
134 
135  for (j=1; j<k; j++)
136  {
137  if (dists[i*k+j]<mini)
138  {
139  Cl=j;
140  mini=dists[i*k+j];
141  }
142  }
143  ClList[i]=Cl;
144  }
145 
146  /* Compute the sum of all points belonging to a cluster
147  * and count the points */
150  for (int32_t i=0; i<XSize; i++)
151  {
152  const int32_t Cl = ClList[i];
153  weights_set[Cl]+=1.0;
154 
155  int32_t vlen=0;
156  bool vfree=false;
157  float64_t* vec=lhs->get_feature_vector(i, vlen, vfree);
158 
159  for (int32_t j=0; j<dimensions; j++)
160  mus.matrix[Cl*dimensions+j] += vec[j];
161 
162  lhs->free_feature_vector(vec, i, vfree);
163  }
164  SG_UNREF(lhs);
165  distance->replace_rhs(rhs_cache);
166  delete rhs_mus;
167 
168  /* normalization to get the mean */
169  for (int32_t i=0; i<k; i++)
170  {
171  if (weights_set[i]!=0.0)
172  {
173  for (int32_t j=0; j<dimensions; j++)
174  mus.matrix[i*dimensions+j] /= weights_set[i];
175  }
176  }
177 
178 }
179 
180 void CKMeans::compute_cluster_variances()
181 {
182  /* compute the ,,variances'' of the clusters */
183  for (int32_t i=0; i<k; i++)
184  {
185  float64_t rmin1=0;
186  float64_t rmin2=0;
187 
188  bool first_round=true;
189 
190  for (int32_t j=0; j<k; j++)
191  {
192  if (j!=i)
193  {
194  int32_t l;
195  float64_t dist = 0;
196 
197  for (l=0; l<dimensions; l++)
198  {
199  dist+=CMath::sq(
200  mus.matrix[i*dimensions+l]
201  -mus.matrix[j*dimensions+l]);
202  }
203 
204  if (first_round)
205  {
206  rmin1=dist;
207  rmin2=dist;
208  first_round=false;
209  }
210  else
211  {
212  if ((dist<rmin2) && (dist>=rmin1))
213  rmin2=dist;
214 
215  if (dist<rmin1)
216  {
217  rmin2=rmin1;
218  rmin1=dist;
219  }
220  }
221  }
222  }
223 
224  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
225  }
226 }
227 
228 bool CKMeans::train_machine(CFeatures* data)
229 {
231 
232  if (data)
233  distance->init(data, data);
234 
236  CDenseFeatures<float64_t>::obtain_from_generic(distance->get_lhs());
237 
238  ASSERT(lhs);
239  int32_t XSize=lhs->get_num_vectors();
240  dimensions=lhs->get_num_features();
241  const int32_t XDimk=dimensions*k;
242 
243  ASSERT(XSize>0 && dimensions>0);
244 
246  if (use_kmeanspp)
247  mus_initial=kmeanspp();
248 
249  R=SGVector<float64_t>(k);
250 
251  mus=SGMatrix<float64_t>(dimensions, k);
252  /* cluster_centers=zeros(dimensions, k) ; */
253  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
254 
255  SGVector<int32_t> ClList=SGVector<int32_t>(XSize);
256  ClList.zero();
257  SGVector<float64_t> weights_set=SGVector<float64_t>(k);
258  weights_set.zero();
259 
260  if (mus_initial.matrix)
261  set_initial_centers(weights_set, ClList, XSize);
262  else
263  set_random_centers(weights_set, ClList, XSize);
264 
265  if (train_method==KMM_MINI_BATCH)
266  {
267  CKMeansMiniBatchImpl::minibatch_KMeans(k, distance, batch_size, minib_iter, mus);
268  }
269  else
270  {
271  CKMeansLloydImpl::Lloyd_KMeans(k, distance, max_iter, mus, ClList, weights_set, fixed_centers);
272  }
273 
274  compute_cluster_variances();
275  SG_UNREF(lhs);
276  return true;
277 }
278 
279 bool CKMeans::load(FILE* srcfile)
280 {
283  return false;
284 }
285 
286 bool CKMeans::save(FILE* dstfile)
287 {
290  return false;
291 }
292 
294 {
295  use_kmeanspp=kmpp;
296 }
297 
299 {
300  return use_kmeanspp;
301 }
302 
303 void CKMeans::set_k(int32_t p_k)
304 {
305  REQUIRE(p_k>0, "number of clusters should be > 0");
306  this->k=p_k;
307 }
308 
309 int32_t CKMeans::get_k()
310 {
311  return k;
312 }
313 
314 void CKMeans::set_max_iter(int32_t iter)
315 {
316  REQUIRE(iter>0, "number of clusters should be > 0");
317  max_iter=iter;
318 }
319 
321 {
322  return max_iter;
323 }
324 
326 {
327  train_method=f;
328 }
329 
331 {
332  return train_method;
333 }
334 
336 {
337  REQUIRE(b>0, "Parameter bach size should be > 0");
338  batch_size=b;
339 }
340 
342 {
343  return batch_size;
344 }
345 
347 {
348  REQUIRE(i>0, "Parameter number of iterations should be > 0");
349  minib_iter=i;
350 }
351 
353 {
354  return minib_iter;
355 }
356 
357 void CKMeans::set_mbKMeans_params(int32_t b, int32_t t)
358 {
359  REQUIRE(b>0, "Parameter bach size should be > 0");
360  REQUIRE(t>0, "Parameter number of iterations should be > 0");
361  batch_size=b;
362  minib_iter=t;
363 }
364 
366 {
367  return R;
368 }
369 
371 {
372  if (!R.vector)
373  return SGMatrix<float64_t>();
374 
377  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
378  SG_UNREF(lhs);
379  return centers;
380 }
381 
383 {
384  return dimensions;
385 }
386 
388 {
389  fixed_centers=fixed;
390 }
391 
393 {
394  return fixed_centers;
395 }
396 
397 void CKMeans::store_model_features()
398 {
399  /* set lhs of underlying distance to cluster centers */
401  mus);
402 
403  /* store cluster centers in lhs of distance variable */
404  CFeatures* rhs=distance->get_rhs();
405  distance->init(cluster_centers, rhs);
406  SG_UNREF(rhs);
407 }
408 
409 SGMatrix<float64_t> CKMeans::kmeanspp()
410 {
411  int32_t num=distance->get_num_vec_lhs();
414 
415  /* 1st center */
416  int32_t mu_1=CMath::random((int32_t) 0,num-1);
417  mu_index[0]=mu_1;
418 
419  /* choose a center - do k-1 times */
420  int32_t count=0;
421  while (++count<k)
422  {
423  float64_t sum=0.0;
424  /* for each data point find distance to nearest already chosen center */
425  for (int32_t point_idx=0;point_idx<num;point_idx++)
426  {
427  dists[point_idx]=distance->distance(mu_index[0],point_idx);
428  int32_t cent_id=1;
429 
430  while (cent_id<count)
431  {
432  float64_t dist_temp=distance->distance(mu_index[cent_id],point_idx);
433  if (dists[point_idx]>dist_temp)
434  dists[point_idx]=dist_temp;
435  cent_id++;
436  }
437 
438  dists[point_idx]*=dists[point_idx];
439  sum+=dists[point_idx];
440  }
441 
442  /*random choosing - points weighted by square of distance from nearset center*/
443  int32_t mu_next=0;
444  float64_t chosen=CMath::random(0.0,sum);
445  while ((chosen-=dists[mu_next])>0)
446  mu_next++;
447 
448  mu_index[count]=mu_next;
449  }
450 
452  int32_t dim=lhs->get_num_features();
454  for (int32_t c_m=0;c_m<k;c_m++)
455  {
456  SGVector<float64_t> feature=lhs->get_feature_vector(c_m);
457  for (int32_t r_m=0;r_m<dim;r_m++)
458  mat(r_m,c_m)=feature[r_m];
459  }
460  SG_UNREF(lhs);
461  return mat;
462 }
463 
464 void CKMeans::init()
465 {
466  max_iter=10000;
467  k=3;
468  dimensions=0;
469  fixed_centers=false;
470  use_kmeanspp=false;
471  train_method=KMM_LLOYD;
472  batch_size=-1;
473  minib_iter=-1;
474  SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
475  SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
476  SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
477  SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
478 }
479 

SHOGUN Machine Learning Toolbox - Documentation