SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
KMeansLloydImpl.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Parijat Mazumdar
8  */
9 
15 #include <shogun/io/SGIO.h>
16 
17 using namespace shogun;
18 
19 namespace shogun
20 {
21 void CKMeansLloydImpl::Lloyd_KMeans(int32_t k, CDistance* distance, int32_t max_iter, SGMatrix<float64_t> mus,
22  bool fixed_centers)
23 {
26 
27  int32_t lhs_size=lhs->get_num_vectors();
28  int32_t dimensions=lhs->get_num_features();
29 
31  CFeatures* rhs_cache=distance->replace_rhs(rhs_mus);
32 
33  SGVector<int32_t> cluster_assignments=SGVector<int32_t>(lhs_size);
34  cluster_assignments.zero();
35 
36  /* Weights : Number of points in each cluster */
38  weights_set.zero();
39  /* Initially set all weights for zeroth cluster, Changes in assignement step */
40  weights_set[0]=lhs_size;
41 
42  distance->precompute_lhs();
43 
44  int32_t changed=1;
45  int32_t iter;
46 
47  for(iter=0; iter<max_iter; iter++)
48  {
49  if (iter==max_iter-1)
50  SG_SWARNING("KMeans clustering has reached maximum number of ( %d ) iterations without having converged. \
51  Terminating. \n", iter)
52 
53  changed=0;
54  rhs_mus->copy_feature_matrix(mus);
55 
56  distance->precompute_rhs();
57 
58 #pragma omp parallel for firstprivate(lhs_size, dimensions, k) \
59  shared(mus, cluster_assignments, weights_set) \
60  reduction(+:changed) if (!fixed_centers)
61  /* Assigment step : Assign each point to nearest cluster */
62  for (int32_t i=0; i<lhs_size; i++)
63  {
64  const int32_t cluster_assignments_i=cluster_assignments[i];
65  int32_t min_cluster, j;
66  float64_t min_dist, dist;
67 
68  min_cluster=0;
69  min_dist=distance->distance(i,0);
70  for (j=1; j<k; j++)
71  {
72  dist=distance->distance(i,j);
73  if (dist<min_dist)
74  {
75  min_dist=dist;
76  min_cluster=j;
77  }
78  }
79 
80  if (min_cluster!=cluster_assignments_i)
81  {
82  changed++;
83 #pragma omp atomic
84  weights_set[min_cluster]+= 1.0;
85 #pragma omp atomic
86  weights_set[cluster_assignments_i]-= 1.0;
87 
88  if(fixed_centers)
89  {
91 
92  /* mu_new = mu_old + (x - mu_old)/(w) */
93  for (j=0; j<dimensions; j++)
94  {
95  mus(j, min_cluster)+=
96  (vec[j]-mus(j, min_cluster)) / weights_set[min_cluster];
97  }
98 
99  lhs->free_feature_vector(vec, i);
100 
101  /* mu_new = mu_old - (x - mu_old)/(w-1) */
102  /* if weights_set(j)~=0 */
103  if (weights_set[cluster_assignments_i]!=0.0)
104  {
106 
107  for (j=0; j<dimensions; j++)
108  {
109  mus(j, cluster_assignments_i)-=
110  (vec1[j]-mus(j, cluster_assignments_i)) / weights_set[cluster_assignments_i];
111  }
112  lhs->free_feature_vector(vec1, i);
113  }
114  else
115  {
116  /* mus(:,j)=zeros(dimensions,1) ; */
117  for (j=0; j<dimensions; j++)
118  mus(j, cluster_assignments_i)=0;
119  }
120 
121  }
122 
123  cluster_assignments[i] = min_cluster;
124  }
125  }
126  if(changed==0)
127  break;
128 
129  /* Update Step : Calculate new means */
130  if (!fixed_centers)
131  {
132  /* mus=zeros(dimensions, k) ; */
133  mus.zero();
134  for (int32_t i=0; i<lhs_size; i++)
135  {
136  int32_t cluster_i=cluster_assignments[i];
137 
139 
140  for (int32_t j=0; j<dimensions; j++)
141  mus(j, cluster_i) += vec[j];
142  lhs->free_feature_vector(vec, i);
143  }
144 
145  for (int32_t i=0; i<k; i++)
146  {
147  if (weights_set[i]!=0.0)
148  {
149  for (int32_t j=0; j<dimensions; j++)
150  mus(j, i) /= weights_set[i];
151  }
152  }
153  }
154  if (iter%(max_iter/10) == 0)
155  SG_SINFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed)
156  }
157  distance->reset_precompute();
158  distance->replace_rhs(rhs_cache);
159  delete rhs_mus;
160  SG_UNREF(lhs);
161 }
162 }
float distance(CJLCoverTreePoint p1, CJLCoverTreePoint p2, float64_t upper_bound)
virtual void copy_feature_matrix(SGMatrix< ST > src)
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
int32_t get_num_features() const
virtual void reset_precompute()
Definition: Distance.h:150
CFeatures * get_lhs()
Definition: Distance.h:224
#define SG_SWARNING(...)
Definition: SGIO.h:178
static void Lloyd_KMeans(int32_t k, CDistance *distance, int32_t max_iter, SGMatrix< float64_t > mus, bool fixed_centers)
virtual int32_t get_num_vectors() const
double float64_t
Definition: common.h:50
CFeatures * replace_rhs(CFeatures *rhs)
Definition: Distance.cpp:145
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:189
#define SG_UNREF(x)
Definition: SGObject.h:52
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
The class Features is the base class of all feature objects.
Definition: Features.h:68
#define SG_SINFO(...)
Definition: SGIO.h:173
virtual void precompute_lhs()
Definition: Distance.h:143
static CDenseFeatures * obtain_from_generic(CFeatures *const base_features)
virtual void precompute_rhs()
Definition: Distance.h:135

SHOGUN Machine Learning Toolbox - Documentation