SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
KMeansBase.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
15 #include <shogun/labels/Labels.h>
18 #include <shogun/base/Parallel.h>
20 
21 #ifdef HAVE_LINALG_LIB
23 #endif
24 
25 using namespace shogun;
26 using namespace Eigen;
27 
30 {
31  init();
32 }
33 
34 CKMeansBase::CKMeansBase(int32_t k_, CDistance* d, bool use_kmpp)
36 {
37  init();
38  k=k_;
39  set_distance(d);
40  use_kmeanspp=use_kmpp;
41 }
42 
45 {
46  init();
47  k = k_i;
48  set_distance(d_i);
49  set_initial_centers(centers_i);
50 }
51 
53 {
54 }
55 
57 {
60  REQUIRE(centers.num_cols == k,
61  "Expected %d initial cluster centers, got %d", k, centers.num_cols);
62  REQUIRE(centers.num_rows == dimensions,
63  "Expected %d dimensionional cluster centers, got %d", dimensions, centers.num_rows);
64  mus_initial = centers;
65  SG_UNREF(lhs);
66 }
67 
69 {
70  mus.zero();
73  int32_t lhs_size=lhs->get_num_vectors();
74 
75  SGVector<int32_t> temp=SGVector<int32_t>(lhs_size);
76  SGVector<int32_t>::range_fill_vector(temp, lhs_size, 0);
77  CMath::permute(temp);
78 
79  for (int32_t i=0; i<k; i++)
80  {
81  const int32_t cluster_center_i=temp[i];
82  SGVector<float64_t> vec=lhs->get_feature_vector(cluster_center_i);
83 
84  for (int32_t j=0; j<dimensions; j++)
85  mus(j,i)=vec[j];
86 
87  lhs->free_feature_vector(vec, cluster_center_i);
88  }
89 
90  SG_UNREF(lhs);
91 
92 }
93 
95 {
96  /* compute the ,,variances'' of the clusters */
97  for (int32_t i=0; i<k; i++)
98  {
99  float64_t rmin1=0;
100  float64_t rmin2=0;
101 
102  bool first_round=true;
103 
104  for (int32_t j=0; j<k; j++)
105  {
106  if (j!=i)
107  {
108  int32_t l;
109  float64_t dist = 0;
110 
111  for (l=0; l<dimensions; l++)
112  {
113  dist+=CMath::sq(
114  mus.matrix[i*dimensions+l]
115  -mus.matrix[j*dimensions+l]);
116  }
117 
118  if (first_round)
119  {
120  rmin1=dist;
121  rmin2=dist;
122  first_round=false;
123  }
124  else
125  {
126  if ((dist<rmin2) && (dist>=rmin1))
127  rmin2=dist;
128 
129  if (dist<rmin1)
130  {
131  rmin2=rmin1;
132  rmin1=dist;
133  }
134  }
135  }
136  }
137 
138  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
139  }
140 }
141 
143 {
144  REQUIRE(distance, "Distance is not provided")
145  REQUIRE(distance->get_feature_type()==F_DREAL, "Distance's features type (%d) should be of type REAL (%d)")
146 
147  if (data)
148  distance->init(data, data);
149 
152 
153  REQUIRE(lhs, "Lhs features of distance not provided");
154  int32_t lhs_size=lhs->get_num_vectors();
156  const int32_t centers_size=dimensions*k;
157 
158  REQUIRE(lhs_size>0, "Lhs features should not be empty");
159  REQUIRE(dimensions>0, "Lhs features should have more than zero dimensions");
160 
161  /* if kmeans++ to be used */
162  if (use_kmeanspp)
163  {
164 #ifdef HAVE_LINALG_LIB
166 #else
167  SG_WARNING("LINALG library not available for KMeans++, resorting to random initialisation");
168 #endif
169  }
170 
172 
174  /* cluster_centers=zeros(dimensions, k) ; */
175  memset(mus.matrix, 0, sizeof(float64_t)*centers_size);
176 
177 
178  if (mus_initial.matrix)
179  mus = mus_initial;
180  else
182 
183  SG_UNREF(lhs);
184 }
185 
186 bool CKMeansBase::load(FILE* srcfile)
187 {
190  return false;
191 }
192 
193 bool CKMeansBase::save(FILE* dstfile)
194 {
197  return false;
198 }
199 
201 {
202  use_kmeanspp=kmpp;
203 }
204 
206 {
207  return use_kmeanspp;
208 }
209 
210 void CKMeansBase::set_k(int32_t p_k)
211 {
212  REQUIRE(p_k>0, "number of clusters should be > 0");
213  this->k=p_k;
214 }
215 
217 {
218  return k;
219 }
220 
221 void CKMeansBase::set_max_iter(int32_t iter)
222 {
223  REQUIRE(iter>0, "number of clusters should be > 0");
224  max_iter=iter;
225 }
226 
228 {
229  return max_iter;
230 }
231 
233 {
234  return R;
235 }
236 
238 {
239  if (!R.vector)
240  return SGMatrix<float64_t>();
241 
244  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
245  SG_UNREF(lhs);
246  return centers;
247 }
248 
250 {
251  return dimensions;
252 }
253 
255 {
256  fixed_centers=fixed;
257 }
258 
260 {
261  return fixed_centers;
262 }
263 
265 {
266  /* set lhs of underlying distance to cluster centers */
268  mus);
269 
270  /* store cluster centers in lhs of distance variable */
271  CFeatures* rhs=distance->get_rhs();
272  distance->init(cluster_centers, rhs);
273  SG_UNREF(rhs);
274 }
275 
277 {
278  int32_t lhs_size;
280  lhs_size=lhs->get_num_vectors();
281 
283  centers.zero();
284  SGVector<float64_t> min_dist=SGVector<float64_t>(lhs_size);
285  min_dist.zero();
286 
287  /* First center is chosen at random */
288  int32_t mu=CMath::random((int32_t) 0, lhs_size-1);
289  SGVector<float64_t> mu_first=lhs->get_feature_vector(mu);
290  for(int32_t j=0; j<dimensions; j++)
291  centers(j, 0)=mu_first[j];
292 
295 #pragma omp parallel for shared(min_dist)
296  for(int32_t i=0; i<lhs_size; i++)
297  min_dist[i]=CMath::sq(distance->distance(i, mu));
298 #ifdef HAVE_LINALG
299  float64_t sum=linalg::vector_sum(min_dist);
300 #else //HAVE_LINALG
301  Map<VectorXd> eigen_min_dist(min_dist.vector, min_dist.vlen);
302  float64_t sum=eigen_min_dist.sum();
303 #endif //HAVE_LINALG
304  int32_t n_rands=2 + int32_t(CMath::log(k));
305 
306  /* Choose centers with weighted probability */
307  for(int32_t i=1; i<k; i++)
308  {
309  int32_t best_center=0;
310  float64_t best_sum=-1.0;
311  SGVector<float64_t> best_min_dist=SGVector<float64_t>(lhs_size);
312 
313  /* local tries for best center */
314  for(int32_t trial=0; trial<n_rands; trial++)
315  {
316  float64_t temp_sum=0.0;
317  float64_t temp_dist=0.0;
318  SGVector<float64_t> temp_min_dist=SGVector<float64_t>(lhs_size);
319  int32_t new_center=0;
320  float64_t prob=CMath::random(0.0, 1.0);
321  prob=prob*sum;
322 
323  for(int32_t j=0; j<lhs_size; j++)
324  {
325  temp_sum+=min_dist[j];
326  if (prob <= temp_sum)
327  {
328  new_center=j;
329  break;
330  }
331  }
332 
333 #pragma omp parallel for firstprivate(lhs_size) \
334  shared(temp_min_dist)
335  for(int32_t j=0; j<lhs_size; j++)
336  {
337  temp_dist=CMath::sq(distance->distance(j, new_center));
338  temp_min_dist[j]=CMath::min(temp_dist, min_dist[j]);
339  }
340 
341 #ifdef HAVE_LINALG
342  temp_sum=linalg::vector_sum(temp_min_dist);
343 #else //HAVE_LINALG
344  Map<VectorXd> eigen_temp_sum(temp_min_dist.vector, temp_min_dist.vlen);
345  temp_sum=eigen_temp_sum.sum();
346 #endif //HAVE_LINALG
347  if ((temp_sum<best_sum) || (best_sum<0))
348  {
349  best_sum=temp_sum;
350  best_min_dist=temp_min_dist;
351  best_center=new_center;
352  }
353  }
354 
355  SGVector<float64_t> vec=lhs->get_feature_vector(best_center);
356  for(int32_t j=0; j<dimensions; j++)
357  centers(j, i)=vec[j];
358  sum=best_sum;
359  min_dist=best_min_dist;
360  }
361 
363  SG_UNREF(lhs);
364  return centers;
365 }
366 
368 {
369  max_iter=10000;
370  k=3;
371  dimensions=0;
372  fixed_centers=false;
373  use_kmeanspp=false;
374  SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
375  SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
376  SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
377  SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
378 }
379 
SGVector< float64_t > get_radiuses()
Definition: KMeansBase.cpp:232
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:1144
#define SG_RESET_LOCALE
Definition: SGIO.h:86
virtual void store_model_features()
Definition: KMeansBase.cpp:264
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
int32_t get_num_features() const
virtual void reset_precompute()
Definition: Distance.h:150
CFeatures * get_lhs()
Definition: Distance.h:224
virtual bool save(FILE *dstfile)
Definition: KMeansBase.cpp:193
static T sq(T x)
Definition: Math.h:450
Definition: SGMatrix.h:20
#define REQUIRE(x,...)
Definition: SGIO.h:206
CFeatures * get_rhs()
Definition: Distance.h:230
index_t num_cols
Definition: SGMatrix.h:376
void set_max_iter(int32_t iter)
Definition: KMeansBase.cpp:221
SGMatrix< float64_t > mus
Definition: KMeansBase.h:199
#define SG_SET_LOCALE_C
Definition: SGIO.h:85
A generic DistanceMachine interface.
index_t num_rows
Definition: SGMatrix.h:374
static uint64_t random()
Definition: Math.h:1019
void compute_cluster_variances()
Definition: KMeansBase.cpp:94
void initialize_training(CFeatures *data=NULL)
Definition: KMeansBase.cpp:142
index_t vlen
Definition: SGVector.h:494
float64_t get_max_iter()
Definition: KMeansBase.cpp:227
virtual int32_t get_num_vectors() const
double float64_t
Definition: common.h:50
int32_t get_dimensions()
Definition: KMeansBase.cpp:249
static void range_fill_vector(T *vec, int32_t len, T start=0)
Definition: SGVector.cpp:228
void set_k(int32_t p_k)
Definition: KMeansBase.cpp:210
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:206
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
bool get_use_kmeanspp() const
Definition: KMeansBase.cpp:205
virtual bool load(FILE *srcfile)
Definition: KMeansBase.cpp:186
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
void set_distance(CDistance *d)
SGMatrix< float64_t > kmeanspp()
Definition: KMeansBase.cpp:276
virtual EFeatureType get_feature_type()=0
virtual void precompute_lhs()
Definition: Distance.h:143
SGVector< float64_t > R
Definition: KMeansBase.h:190
static float64_t log(float64_t v)
Definition: Math.h:922
static CDenseFeatures * obtain_from_generic(CFeatures *const base_features)
SGMatrix< float64_t > get_cluster_centers()
Definition: KMeansBase.cpp:237
Vector::Scalar vector_sum(Vector a)
Definition: Redux.h:81
virtual void precompute_rhs()
Definition: Distance.h:135
#define SG_WARNING(...)
Definition: SGIO.h:128
#define SG_ADD(...)
Definition: SGObject.h:84
static float32_t sqrt(float32_t x)
Definition: Math.h:459
virtual ~CKMeansBase()
Definition: KMeansBase.cpp:52
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:78
void set_use_kmeanspp(bool kmpp)
Definition: KMeansBase.cpp:200
virtual void set_initial_centers(SGMatrix< float64_t > centers)
Definition: KMeansBase.cpp:56
void set_fixed_centers(bool fixed)
Definition: KMeansBase.cpp:254
SGMatrix< float64_t > mus_initial
Definition: KMeansBase.h:193

SHOGUN Machine Learning Toolbox - Documentation