00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/SimpleFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 using namespace shogun;
00028
00029 CKMeans::CKMeans()
00030 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00031 mus(NULL), Weights(NULL)
00032 {
00033 }
00034
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00037 mus(NULL), Weights(NULL)
00038 {
00039 set_distance(d);
00040 }
00041
00042 CKMeans::~CKMeans()
00043 {
00044 delete[] R;
00045 delete[] mus;
00046 }
00047
00048 bool CKMeans::train(CFeatures* data)
00049 {
00050 ASSERT(distance);
00051
00052 if (data)
00053 distance->init(data, data);
00054
00055 ASSERT(distance->get_feature_type()==F_DREAL);
00056
00057 CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs();
00058 ASSERT(lhs);
00059 int32_t num=lhs->get_num_vectors();
00060 SG_UNREF(lhs);
00061
00062 Weights=new float64_t[num];
00063 for (int32_t i=0; i<num; i++)
00064 Weights[i]=1.0;
00065
00066 clustknb(false, NULL);
00067 delete[] Weights;
00068
00069 return true;
00070 }
00071
00072 bool CKMeans::load(FILE* srcfile)
00073 {
00074 SG_SET_LOCALE_C;
00075 SG_RESET_LOCALE;
00076 return false;
00077 }
00078
00079 bool CKMeans::save(FILE* dstfile)
00080 {
00081 SG_SET_LOCALE_C;
00082 SG_RESET_LOCALE;
00083 return false;
00084 }
00085
00086 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00087 struct thread_data
00088 {
00089 float64_t* x;
00090 CSimpleFeatures<float64_t>* y;
00091 float64_t* z;
00092 int32_t n1, n2, m;
00093 int32_t js, je;
00094 int32_t offs;
00095 };
00096 #endif // DOXYGEN_SHOULD_SKIP_THIS
00097
00098 namespace shogun
00099 {
00100 void *sqdist_thread_func(void * P)
00101 {
00102 struct thread_data *TD=(struct thread_data*) P;
00103 float64_t* x=TD->x;
00104 CSimpleFeatures<float64_t>* y=TD->y;
00105 float64_t* z=TD->z;
00106 int32_t n1=TD->n1,
00107 m=TD->m,
00108 js=TD->js,
00109 je=TD->je,
00110 offs=TD->offs,
00111 j,i,k;
00112
00113 for (j=js; j<je; j++)
00114 {
00115 int32_t vlen=0;
00116 bool vfree=false;
00117 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00118
00119 for (i=0; i<n1; i++)
00120 {
00121 float64_t sum=0;
00122 for (k=0; k<m; k++)
00123 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00124 z[j*n1 + i] = sum;
00125 }
00126
00127 y->free_feature_vector(vec, j, vfree);
00128 }
00129 return NULL;
00130 }
00131 }
00132
00133 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00134 {
00135 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00136 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00137 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00138
00139 int32_t XSize=lhs->get_num_vectors();
00140 dimensions=lhs->get_num_features();
00141 int32_t i, changed=1;
00142 const int32_t XDimk=dimensions*k;
00143 int32_t iter=0;
00144
00145 delete[] R;
00146 R=new float64_t[k];
00147
00148 delete[] mus;
00149 mus=new float64_t[XDimk];
00150
00151 int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t));
00152 float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t));
00153 float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t));
00154
00156 CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00157 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00158
00159 int32_t vlen=0;
00160 bool vfree=false;
00161 float64_t* vec=NULL;
00162
00163
00164 for (i=0; i<XSize; i++) ClList[i]=0;
00165
00166 for (i=0; i<k; i++) weights_set[i]=0;
00167
00168
00169 for (i=0; i<XDimk; i++) mus[i]=0;
00170
00171 if (!use_old_mus)
00172 {
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191 for (i=0; i<XSize; i++)
00192 {
00193 const int32_t Cl=CMath::random(0, k-1);
00194 int32_t j;
00195 float64_t weight=Weights[i];
00196
00197 weights_set[Cl]+=weight;
00198 ClList[i]=Cl;
00199
00200 vec=lhs->get_feature_vector(i, vlen, vfree);
00201
00202 for (j=0; j<dimensions; j++)
00203 mus[Cl*dimensions+j] += weight*vec[j];
00204
00205 lhs->free_feature_vector(vec, i, vfree);
00206 }
00207 for (i=0; i<k; i++)
00208 {
00209 int32_t j;
00210
00211 if (weights_set[i]!=0.0)
00212 for (j=0; j<dimensions; j++)
00213 mus[i*dimensions+j] /= weights_set[i];
00214 }
00215 }
00216 else
00217 {
00218 ASSERT(mus_start);
00219
00221 rhs_mus->copy_feature_matrix(mus_start,dimensions,k);
00222 float64_t* p_dists=dists;
00223
00224 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00225 distances_rhs(p_dists,0,k,idx);
00226 p_dists=NULL;
00227
00228 for (i=0; i<XSize; i++)
00229 {
00230 float64_t mini=dists[i*k];
00231 int32_t Cl = 0, j;
00232
00233 for (j=1; j<k; j++)
00234 {
00235 if (dists[i*k+j]<mini)
00236 {
00237 Cl=j;
00238 mini=dists[i*k+j];
00239 }
00240 }
00241 ClList[i]=Cl;
00242 }
00243
00244
00245
00246 for (i=0; i<XSize; i++)
00247 {
00248 const int32_t Cl = ClList[i];
00249 float64_t weight=Weights[i];
00250 weights_set[Cl]+=weight;
00251 #ifndef MUSRECALC
00252 vec=lhs->get_feature_vector(i, vlen, vfree);
00253
00254 for (j=0; j<dimensions; j++)
00255 mus[Cl*dimensions+j] += weight*vec[j];
00256
00257 lhs->free_feature_vector(vec, i, vfree);
00258 #endif
00259 }
00260 #ifndef MUSRECALC
00261
00262 for (i=0; i<k; i++)
00263 {
00264 if (weights_set[i]!=0.0)
00265 {
00266 int32_t j;
00267 for (j=0; j<dimensions; j++)
00268 mus[i*dimensions+j] /= weights_set[i];
00269 }
00270 }
00271 #endif
00272 }
00273
00274
00275
00276 while (changed && (iter<max_iter))
00277 {
00278 iter++;
00279 if (iter==max_iter-1)
00280 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00281
00282 if (iter%1000 == 0)
00283 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00284 changed=0;
00285
00286 #ifdef MUSRECALC
00287
00288 for (i=0; i<XDimk; i++) mus[i]=0;
00289
00290 for (i=0; i<XSize; i++)
00291 {
00292 int32_t j;
00293 int32_t Cl=ClList[i];
00294 float64_t weight=Weights[i];
00295
00296 vec=lhs->get_feature_vector(i, vlen, vfree);
00297
00298 for (j=0; j<dimensions; j++)
00299 mus[Cl*dimensions+j] += weight*vec[j];
00300
00301 lhs->free_feature_vector(vec, i, vfree);
00302 }
00303 for (i=0; i<k; i++)
00304 {
00305 int32_t j;
00306
00307 if (weights_set[i]!=0.0)
00308 for (j=0; j<dimensions; j++)
00309 mus[i*dimensions+j] /= weights_set[i];
00310 }
00311 #endif
00312
00313 rhs_mus->copy_feature_matrix(mus,dimensions,k);
00314
00315 for (i=0; i<XSize; i++)
00316 {
00317
00318 const int32_t Pat= CMath::random(0, XSize-1);
00319 const int32_t ClList_Pat=ClList[Pat];
00320 int32_t imini, j;
00321 float64_t mini, weight;
00322
00323 weight=Weights[Pat];
00324
00325
00326 for(int32_t idx_k=0;idx_k<k;idx_k++)
00327 dists[idx_k]=distance->distance(Pat,idx_k);
00328
00329
00330 imini=0 ; mini=dists[0];
00331 for (j=1; j<k; j++)
00332 if (dists[j]<mini)
00333 {
00334 mini=dists[j];
00335 imini=j;
00336 }
00337
00338 if (imini!=ClList_Pat)
00339 {
00340 changed= changed + 1;
00341
00342
00343 weights_set[imini]+= weight;
00344
00345 weights_set[ClList_Pat]-= weight;
00346
00347
00348
00349 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00350
00351 for (j=0; j<dimensions; j++)
00352 mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00353
00354 lhs->free_feature_vector(vec, Pat, vfree);
00355
00356
00357
00358 if (weights_set[ClList_Pat]!=0.0)
00359 {
00360
00361 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00362
00363 for (j=0; j<dimensions; j++)
00364 mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00365 lhs->free_feature_vector(vec, Pat, vfree);
00366 }
00367 else
00368
00369 for (j=0; j<dimensions; j++)
00370 mus[ClList_Pat*dimensions+j]=0;
00371
00372
00373 ClList[Pat] = imini;
00374 }
00375 }
00376 }
00377
00378
00379 for (i=0; i<k; i++)
00380 {
00381 float64_t rmin1=0;
00382 float64_t rmin2=0;
00383
00384 bool first_round=true;
00385
00386 for (int32_t j=0; j<k; j++)
00387 {
00388 if (j!=i)
00389 {
00390 int32_t l;
00391 float64_t dist = 0;
00392
00393 for (l=0; l<dimensions; l++)
00394 dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00395
00396 if (first_round)
00397 {
00398 rmin1=dist;
00399 rmin2=dist;
00400 first_round=false;
00401 }
00402 else
00403 {
00404 if ((dist<rmin2) && (dist>=rmin1))
00405 rmin2=dist;
00406
00407 if (dist<rmin1)
00408 {
00409 rmin2=rmin1;
00410 rmin1=dist;
00411 }
00412 }
00413 }
00414 }
00415
00416 R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00417 }
00418 distance->replace_rhs(rhs_cache);
00419 delete rhs_mus;
00420
00421 free(ClList);
00422 free(weights_set);
00423 free(dists);
00424 SG_UNREF(lhs);
00425 }