SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
17 #include <shogun/labels/Labels.h>
20 #include <shogun/base/Parallel.h>
22 
23 #ifdef HAVE_PTHREAD
24 #include <pthread.h>
25 #endif
26 
27 #ifdef HAVE_LINALG_LIB
29 #endif
30 
31 using namespace shogun;
32 using namespace Eigen;
33 
36 {
37  init();
38 }
39 
42 {
43  init();
44  k=k_;
45  set_distance(d);
46  train_method=f;
47 }
48 
49 CKMeans::CKMeans(int32_t k_, CDistance* d, bool use_kmpp, EKMeansMethod f)
51 {
52  init();
53  k=k_;
54  set_distance(d);
55  use_kmeanspp=use_kmpp;
56  train_method=f;
57 }
58 
61 {
62  init();
63  k = k_i;
64  set_distance(d_i);
65  set_initial_centers(centers_i);
66  train_method=f;
67 }
68 
70 {
71 }
72 
74 {
76  dimensions=lhs->get_num_features();
77  REQUIRE(centers.num_cols == k,
78  "Expected %d initial cluster centers, got %d.\n", k, centers.num_cols);
79  REQUIRE(centers.num_rows == dimensions,
80  "Expected %d dimensionional cluster centers, got %d.\n", dimensions, centers.num_rows);
81  mus_initial = centers;
82  SG_UNREF(lhs);
83 }
84 
85 void CKMeans::set_random_centers()
86 {
87  mus.zero();
90  int32_t lhs_size=lhs->get_num_vectors();
91 
92  SGVector<int32_t> temp=SGVector<int32_t>(lhs_size);
93  SGVector<int32_t>::range_fill_vector(temp, lhs_size, 0);
94  CMath::permute(temp);
95 
96  for (int32_t i=0; i<k; i++)
97  {
98  const int32_t cluster_center_i=temp[i];
99  SGVector<float64_t> vec=lhs->get_feature_vector(cluster_center_i);
100 
101  for (int32_t j=0; j<dimensions; j++)
102  mus(j,i)=vec[j];
103 
104  lhs->free_feature_vector(vec, cluster_center_i);
105  }
106 
107  SG_UNREF(lhs);
108 
109 }
110 
111 void CKMeans::compute_cluster_variances()
112 {
113  /* compute the ,,variances'' of the clusters */
114  for (int32_t i=0; i<k; i++)
115  {
116  float64_t rmin1=0;
117  float64_t rmin2=0;
118 
119  bool first_round=true;
120 
121  for (int32_t j=0; j<k; j++)
122  {
123  if (j!=i)
124  {
125  int32_t l;
126  float64_t dist = 0;
127 
128  for (l=0; l<dimensions; l++)
129  {
130  dist+=CMath::sq(
131  mus.matrix[i*dimensions+l]
132  -mus.matrix[j*dimensions+l]);
133  }
134 
135  if (first_round)
136  {
137  rmin1=dist;
138  rmin2=dist;
139  first_round=false;
140  }
141  else
142  {
143  if ((dist<rmin2) && (dist>=rmin1))
144  rmin2=dist;
145 
146  if (dist<rmin1)
147  {
148  rmin2=rmin1;
149  rmin1=dist;
150  }
151  }
152  }
153  }
154 
155  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
156  }
157 }
158 
159 bool CKMeans::train_machine(CFeatures* data)
160 {
161  REQUIRE(distance, "Distance is not provided.\n")
162  REQUIRE(distance->get_feature_type()==F_DREAL,
163  "Distance's features type (%d) should be of type REAL (%d).\n")
164 
165  if (data)
166  distance->init(data, data);
167 
169  CDenseFeatures<float64_t>::obtain_from_generic(distance->get_lhs());
170 
171  REQUIRE(lhs, "Lhs features of distance not provided.\n");
172  int32_t lhs_size=lhs->get_num_vectors();
173  dimensions=lhs->get_num_features();
174  const int32_t centers_size=dimensions*k;
175 
176  REQUIRE(lhs_size>0, "Lhs features should not be empty.\n");
177  REQUIRE(dimensions>0, "Lhs features dimensions (%d) should be >0.\n", dimensions);
178 
179  /* if kmeans++ to be used */
180  if (use_kmeanspp)
181  {
182 #ifdef HAVE_LINALG_LIB
183  mus_initial=kmeanspp();
184 #endif
185  }
186 
187  R=SGVector<float64_t>(k);
188 
189  mus=SGMatrix<float64_t>(dimensions, k);
190  /* cluster_centers=zeros(dimensions, k) ; */
191  memset(mus.matrix, 0, sizeof(float64_t)*centers_size);
192 
193 
194  if (mus_initial.matrix)
195  mus = mus_initial;
196  else
197  set_random_centers();
198 
199  if (train_method==KMM_MINI_BATCH)
200  {
201  CKMeansMiniBatchImpl::minibatch_KMeans(k, distance, batch_size, minib_iter, mus);
202  }
203  else
204  {
205  CKMeansLloydImpl::Lloyd_KMeans(k, distance, max_iter, mus, fixed_centers);
206  }
207 
208  compute_cluster_variances();
209  SG_UNREF(lhs);
210  return true;
211 }
212 
213 bool CKMeans::load(FILE* srcfile)
214 {
217  return false;
218 }
219 
220 bool CKMeans::save(FILE* dstfile)
221 {
224  return false;
225 }
226 
228 {
229  use_kmeanspp=kmpp;
230 }
231 
233 {
234  return use_kmeanspp;
235 }
236 
237 void CKMeans::set_k(int32_t p_k)
238 {
239  REQUIRE(p_k>0, "Number of clusters (%d) should be > 0\n", k);
240  this->k=p_k;
241 }
242 
243 int32_t CKMeans::get_k()
244 {
245  return k;
246 }
247 
248 void CKMeans::set_max_iter(int32_t iter)
249 {
250  REQUIRE(iter>0, "Maximum number of iterations (%d) should be > 0.\n", iter);
251  max_iter=iter;
252 }
253 
255 {
256  return max_iter;
257 }
258 
260 {
261  train_method=f;
262 }
263 
265 {
266  return train_method;
267 }
268 
270 {
271  REQUIRE(b>0, "Batch size (%d) should be > 0.\n", b);
272  batch_size=b;
273 }
274 
276 {
277  return batch_size;
278 }
279 
281 {
282  REQUIRE(i>0, "Number of iterations (%d) should be > 0.\n", i);
283  minib_iter=i;
284 }
285 
287 {
288  return minib_iter;
289 }
290 
291 void CKMeans::set_mini_batch_parameters(int32_t b, int32_t t)
292 {
293  REQUIRE(b>0, "Bach size (%d) should be > 0.\n", b);
294  REQUIRE(t>0, "Number of iterations (%d) should be > 0.\n", t);
295  batch_size=b;
296  minib_iter=t;
297 }
298 
300 {
301  return R;
302 }
303 
305 {
306  if (!R.vector)
307  return SGMatrix<float64_t>();
308 
311  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
312  SG_UNREF(lhs);
313  return centers;
314 }
315 
317 {
318  return dimensions;
319 }
320 
322 {
323  fixed_centers=fixed;
324 }
325 
327 {
328  return fixed_centers;
329 }
330 
331 void CKMeans::store_model_features()
332 {
333  /* set lhs of underlying distance to cluster centers */
335  mus);
336 
337  /* store cluster centers in lhs of distance variable */
338  CFeatures* rhs=distance->get_rhs();
339  distance->init(cluster_centers, rhs);
340  SG_UNREF(rhs);
341 }
342 
343 SGMatrix<float64_t> CKMeans::kmeanspp()
344 {
345  int32_t lhs_size;
347  lhs_size=lhs->get_num_vectors();
348 
349  SGMatrix<float64_t> centers=SGMatrix<float64_t>(dimensions, k);
350  centers.zero();
351  SGVector<float64_t> min_dist=SGVector<float64_t>(lhs_size);
352  min_dist.zero();
353 
354  /* First center is chosen at random */
355  int32_t mu=CMath::random((int32_t) 0, lhs_size-1);
356  SGVector<float64_t> mu_first=lhs->get_feature_vector(mu);
357  for(int32_t j=0; j<dimensions; j++)
358  centers(j, 0)=mu_first[j];
359 
362 #pragma omp parallel for shared(min_dist)
363  for(int32_t i=0; i<lhs_size; i++)
364  min_dist[i]=CMath::sq(distance->distance(i, mu));
365 #ifdef HAVE_LINALG
366  float64_t sum=linalg::vector_sum(min_dist);
367 #else //HAVE_LINALG
368  Map<VectorXd> eigen_min_dist(min_dist.vector, min_dist.vlen);
369  float64_t sum=eigen_min_dist.sum();
370 #endif //HAVE_LINALG
371  int32_t n_rands=2 + int32_t(CMath::log(k));
372 
373  /* Choose centers with weighted probability */
374  for(int32_t i=1; i<k; i++)
375  {
376  int32_t best_center=0;
377  float64_t best_sum=-1.0;
378  SGVector<float64_t> best_min_dist=SGVector<float64_t>(lhs_size);
379 
380  /* local tries for best center */
381  for(int32_t trial=0; trial<n_rands; trial++)
382  {
383  float64_t temp_sum=0.0;
384  float64_t temp_dist=0.0;
385  SGVector<float64_t> temp_min_dist=SGVector<float64_t>(lhs_size);
386  int32_t new_center=0;
387  float64_t prob=CMath::random(0.0, 1.0);
388  prob=prob*sum;
389 
390  for(int32_t j=0; j<lhs_size; j++)
391  {
392  temp_sum+=min_dist[j];
393  if (prob <= temp_sum)
394  {
395  new_center=j;
396  break;
397  }
398  }
399 
400 #pragma omp parallel for firstprivate(lhs_size) \
401  shared(temp_min_dist)
402  for(int32_t j=0; j<lhs_size; j++)
403  {
404  temp_dist=CMath::sq(distance->distance(j, new_center));
405  temp_min_dist[j]=CMath::min(temp_dist, min_dist[j]);
406  }
407 
408 #ifdef HAVE_LINALG
409  temp_sum=linalg::vector_sum(temp_min_dist);
410 #else //HAVE_LINALG
411  Map<VectorXd> eigen_temp_sum(temp_min_dist.vector, temp_min_dist.vlen);
412  temp_sum=eigen_temp_sum.sum();
413 #endif //HAVE_LINALG
414  if ((temp_sum<best_sum) || (best_sum<0))
415  {
416  best_sum=temp_sum;
417  best_min_dist=temp_min_dist;
418  best_center=new_center;
419  }
420  }
421 
422  SGVector<float64_t> vec=lhs->get_feature_vector(best_center);
423  for(int32_t j=0; j<dimensions; j++)
424  centers(j, i)=vec[j];
425  sum=best_sum;
426  min_dist=best_min_dist;
427  }
428 
430  SG_UNREF(lhs);
431  return centers;
432 }
433 
434 void CKMeans::init()
435 {
436  max_iter=10000;
437  k=3;
438  dimensions=0;
439  fixed_centers=false;
440  use_kmeanspp=false;
441  train_method=KMM_LLOYD;
442  batch_size=-1;
443  minib_iter=-1;
444  SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
445  SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
446  SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
447  SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
448 }
449 
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:1144
#define SG_RESET_LOCALE
Definition: SGIO.h:86
virtual bool save(FILE *dstfile)
Definition: KMeans.cpp:220
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
int32_t get_num_features() const
virtual void reset_precompute()
Definition: Distance.h:150
CFeatures * get_lhs()
Definition: Distance.h:224
static T sq(T x)
Definition: Math.h:450
EKMeansMethod
Definition: KMeans.h:28
void set_use_kmeanspp(bool kmpp)
Definition: KMeans.cpp:227
Definition: SGMatrix.h:20
#define REQUIRE(x,...)
Definition: SGIO.h:206
CFeatures * get_rhs()
Definition: Distance.h:230
void set_k(int32_t p_k)
Definition: KMeans.cpp:237
index_t num_cols
Definition: SGMatrix.h:376
int32_t get_dimensions()
Definition: KMeans.cpp:316
#define SG_SET_LOCALE_C
Definition: SGIO.h:85
A generic DistanceMachine interface.
index_t num_rows
Definition: SGMatrix.h:374
static uint64_t random()
Definition: Math.h:1019
virtual ~CKMeans()
Definition: KMeans.cpp:69
index_t vlen
Definition: SGVector.h:492
static void Lloyd_KMeans(int32_t k, CDistance *distance, int32_t max_iter, SGMatrix< float64_t > mus, bool fixed_centers)
bool get_use_kmeanspp() const
Definition: KMeans.cpp:232
SGVector< float64_t > get_radiuses()
Definition: KMeans.cpp:299
virtual int32_t get_num_vectors() const
float64_t get_max_iter()
Definition: KMeans.cpp:254
double float64_t
Definition: common.h:50
virtual bool load(FILE *srcfile)
Definition: KMeans.cpp:213
static void range_fill_vector(T *vec, int32_t len, T start=0)
Definition: SGVector.cpp:228
bool get_fixed_centers()
Definition: KMeans.cpp:326
int32_t get_mini_batch_size() const
Definition: KMeans.cpp:275
void set_max_iter(int32_t iter)
Definition: KMeans.cpp:248
void set_fixed_centers(bool fixed)
Definition: KMeans.cpp:321
void set_mini_batch_num_iterations(int32_t t)
Definition: KMeans.cpp:280
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:189
#define SG_UNREF(x)
Definition: SGObject.h:52
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void set_initial_centers(SGMatrix< float64_t > centers)
Definition: KMeans.cpp:73
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
void set_distance(CDistance *d)
void set_mini_batch_parameters(int32_t b, int32_t t)
Definition: KMeans.cpp:291
void set_train_method(EKMeansMethod f)
Definition: KMeans.cpp:259
virtual void precompute_lhs()
Definition: Distance.h:143
int32_t get_mini_batch_num_iterations() const
Definition: KMeans.cpp:286
static float64_t log(float64_t v)
Definition: Math.h:922
static CDenseFeatures * obtain_from_generic(CFeatures *const base_features)
Vector::Scalar vector_sum(Vector a)
Definition: Redux.h:81
virtual void precompute_rhs()
Definition: Distance.h:135
EKMeansMethod get_train_method() const
Definition: KMeans.cpp:264
#define SG_ADD(...)
Definition: SGObject.h:81
void set_mini_batch_size(int32_t b)
Definition: KMeans.cpp:269
static float32_t sqrt(float32_t x)
Definition: Math.h:459
static void minibatch_KMeans(int32_t k, CDistance *distance, int32_t batch_size, int32_t minib_iter, SGMatrix< float64_t > mus)
int32_t get_k()
Definition: KMeans.cpp:243
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:78
SGMatrix< float64_t > get_cluster_centers()
Definition: KMeans.cpp:304

SHOGUN Machine Learning Toolbox - Documentation