SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Parijat Mazumdar
8  * Written (W) 2016 Saurabh Mahindre
9  */
10 
16 #include <shogun/io/SGIO.h>
18 
19 using namespace Eigen;
20 using namespace shogun;
21 
22 
23 namespace shogun
24 {
25 
26 CKMeans::CKMeans():CKMeansBase()
27 {
28 }
29 
30 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, bool use_kmpp_i):CKMeansBase(k_i, d_i, use_kmpp_i)
31 {
32 }
33 
34 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, SGMatrix<float64_t> centers_i):CKMeansBase(k_i, d_i, centers_i)
35 {
36 }
37 
39 {
40 }
41 
42 void CKMeans::Lloyd_KMeans(SGMatrix<float64_t> centers, int32_t num_centers)
43 {
46 
47  int32_t lhs_size=lhs->get_num_vectors();
48  int32_t dim=lhs->get_num_features();
49 
51  CFeatures* rhs_cache=distance->replace_rhs(rhs_mus);
52 
53  SGVector<int32_t> cluster_assignments=SGVector<int32_t>(lhs_size);
54  cluster_assignments.zero();
55 
56  /* Weights : Number of points in each cluster */
57  SGVector<int64_t> weights_set(num_centers);
58  weights_set.zero();
59  /* Initially set all weights for zeroth cluster, Changes in assignement step */
60  weights_set[0]=lhs_size;
61 
63 
64  int32_t changed=1;
65  int32_t iter;
66 
67  for(iter=0; iter<max_iter; iter++)
68  {
69  if (iter==max_iter-1)
70  SG_SWARNING("KMeans clustering has reached maximum number of ( %d ) iterations without having converged. \
71  Terminating. \n", iter)
72 
73  changed=0;
74  rhs_mus->copy_feature_matrix(centers);
75 
76  distance->precompute_rhs();
77 
78 #pragma omp parallel for firstprivate(lhs_size, dim, num_centers) \
79  shared(centers, cluster_assignments, weights_set) \
80  reduction(+:changed) if (!fixed_centers)
81  /* Assigment step : Assign each point to nearest cluster */
82  for (int32_t i=0; i<lhs_size; i++)
83  {
84  const int32_t cluster_assignments_i=cluster_assignments[i];
85  int32_t min_cluster, j;
86  float64_t min_dist, dist;
87 
88  min_cluster=0;
89  min_dist=distance->distance(i,0);
90  for (j=1; j<num_centers; j++)
91  {
92  dist=distance->distance(i,j);
93  if (dist<min_dist)
94  {
95  min_dist=dist;
96  min_cluster=j;
97  }
98  }
99 
100  if (min_cluster!=cluster_assignments_i)
101  {
102  changed++;
103 #pragma omp atomic
104  ++weights_set[min_cluster];
105 #pragma omp atomic
106  --weights_set[cluster_assignments_i];
107 
108  if(fixed_centers)
109  {
111  float64_t temp_min = 1.0 / weights_set[min_cluster];
112 
113  /* mu_new = mu_old + (x - mu_old)/(w) */
114  for (j=0; j<dim; j++)
115  {
116  centers(j, min_cluster)+=
117  (vec[j]-centers(j, min_cluster))*temp_min;
118  }
119 
120  lhs->free_feature_vector(vec, i);
121 
122  /* mu_new = mu_old - (x - mu_old)/(w-1) */
123  /* if weights_set(j)~=0 */
124  if (weights_set[cluster_assignments_i]!=0)
125  {
126  float64_t temp_i = 1.0 / weights_set[cluster_assignments_i];
128 
129  for (j=0; j<dim; j++)
130  {
131  centers(j, cluster_assignments_i)-=
132  (vec1[j]-centers(j, cluster_assignments_i))*temp_i;
133  }
134  lhs->free_feature_vector(vec1, i);
135  }
136  else
137  {
138  /* mus(:,j)=zeros(dim,1) ; */
139  for (j=0; j<dim; j++)
140  centers(j, cluster_assignments_i)=0;
141  }
142 
143  }
144 
145  cluster_assignments[i] = min_cluster;
146  }
147  }
148  if(changed==0)
149  break;
150 
151  /* Update Step : Calculate new means */
152  if (!fixed_centers)
153  {
154  /* mus=zeros(dim, num_centers) ; */
155  centers.zero();
156  Map<MatrixXd> map_centers(centers.matrix, centers.num_rows, centers.num_cols);
157 
158  for (int32_t i=0; i<lhs_size; i++)
159  {
160  int32_t cluster_i=cluster_assignments[i];
161 
163  Map<VectorXd> map_vec(vec.vector, vec.size());
164 
165  map_centers.col(cluster_i) += map_vec;
166 
167  lhs->free_feature_vector(vec, i);
168  }
169 
170  for (int32_t i=0; i<num_centers; i++)
171  {
172  if (weights_set[i]!=0)
173  map_centers.col(i)*=1.0/weights_set[i];
174  }
175  }
176  if (iter%(max_iter/10) == 0)
177  SG_SINFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed)
178  }
180  distance->replace_rhs(rhs_cache);
181  delete rhs_mus;
182  SG_UNREF(lhs);
183 }
184 
185 bool CKMeans::train_machine(CFeatures* data)
186 {
187  initialize_training(data);
188  Lloyd_KMeans(mus, k);
190  return true;
191 }
192 
193 }
194 
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
int32_t get_num_features() const
virtual void reset_precompute()
Definition: Distance.h:150
CFeatures * get_lhs()
Definition: Distance.h:224
#define SG_SWARNING(...)
Definition: SGIO.h:178
Definition: SGMatrix.h:20
SGMatrix< float64_t > mus
Definition: KMeansBase.h:199
virtual ~CKMeans()
Definition: KMeans.cpp:38
int32_t size() const
Definition: SGVector.h:113
void compute_cluster_variances()
Definition: KMeansBase.cpp:94
void initialize_training(CFeatures *data=NULL)
Definition: KMeansBase.cpp:142
virtual int32_t get_num_vectors() const
double float64_t
Definition: common.h:50
virtual CFeatures * replace_rhs(CFeatures *rhs)
Definition: Distance.cpp:170
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:206
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
The class Features is the base class of all feature objects.
Definition: Features.h:68
#define SG_SINFO(...)
Definition: SGIO.h:173
virtual void precompute_lhs()
Definition: Distance.h:143
static CDenseFeatures * obtain_from_generic(CFeatures *const base_features)

SHOGUN Machine Learning Toolbox - Documentation