KMeans.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/SimpleFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018 
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022 
00023 #define MUSRECALC
00024 
00025 #define PAR_THRESH  10
00026 
00027 using namespace shogun;
00028 
00029 CKMeans::CKMeans()
00030 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00031     mus(NULL), Weights(NULL)
00032 {
00033 }
00034 
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00037     mus(NULL), Weights(NULL)
00038 {
00039     set_distance(d);
00040 }
00041 
00042 CKMeans::~CKMeans()
00043 {
00044     delete[] R;
00045     delete[] mus;
00046 }
00047 
00048 bool CKMeans::train(CFeatures* data)
00049 {
00050     ASSERT(distance);
00051 
00052     if (data)
00053         distance->init(data, data);
00054 
00055     ASSERT(distance->get_feature_type()==F_DREAL);
00056 
00057     CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs();
00058     ASSERT(lhs);
00059     int32_t num=lhs->get_num_vectors();
00060     SG_UNREF(lhs);
00061 
00062     Weights=new float64_t[num];
00063     for (int32_t i=0; i<num; i++)
00064         Weights[i]=1.0;
00065 
00066     clustknb(false, NULL);
00067     delete[] Weights;
00068 
00069     return true;
00070 }
00071 
00072 bool CKMeans::load(FILE* srcfile)
00073 {
00074     SG_SET_LOCALE_C;
00075     SG_RESET_LOCALE;
00076     return false;
00077 }
00078 
00079 bool CKMeans::save(FILE* dstfile)
00080 {
00081     SG_SET_LOCALE_C;
00082     SG_RESET_LOCALE;
00083     return false;
00084 }
00085 
00086 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00087 struct thread_data
00088 {
00089     float64_t* x;
00090     CSimpleFeatures<float64_t>* y;
00091     float64_t* z;
00092     int32_t n1, n2, m;
00093     int32_t js, je; /* defines the matrix stripe */
00094     int32_t offs;
00095 };
00096 #endif // DOXYGEN_SHOULD_SKIP_THIS
00097 
00098 namespace shogun
00099 {
00100 void *sqdist_thread_func(void * P)
00101 {
00102     struct thread_data *TD=(struct thread_data*) P;
00103     float64_t* x=TD->x;
00104     CSimpleFeatures<float64_t>* y=TD->y;
00105     float64_t* z=TD->z;
00106     int32_t n1=TD->n1,
00107         m=TD->m,
00108         js=TD->js,
00109         je=TD->je,
00110         offs=TD->offs,
00111         j,i,k;
00112 
00113     for (j=js; j<je; j++)
00114     {
00115         int32_t vlen=0;
00116         bool vfree=false;
00117         float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00118 
00119         for (i=0; i<n1; i++)
00120         {
00121             float64_t sum=0;
00122             for (k=0; k<m; k++) 
00123                 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00124             z[j*n1 + i] = sum;
00125         }
00126 
00127         y->free_feature_vector(vec, j, vfree);
00128     }
00129     return NULL;
00130 } 
00131 }
00132 
00133 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00134 {
00135     ASSERT(distance && distance->get_feature_type()==F_DREAL);
00136     CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00137     ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00138 
00139     int32_t XSize=lhs->get_num_vectors();
00140     dimensions=lhs->get_num_features();
00141     int32_t i, changed=1;
00142     const int32_t XDimk=dimensions*k;
00143     int32_t iter=0;
00144 
00145     delete[] R;
00146     R=new float64_t[k];
00147 
00148     delete[] mus;
00149     mus=new float64_t[XDimk];
00150 
00151     int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t));
00152     float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t));
00153     float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t));
00154 
00156     CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00157     CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00158  
00159     int32_t vlen=0;
00160     bool vfree=false;
00161     float64_t* vec=NULL;
00162 
00163     /* ClList=zeros(XSize,1) ; */
00164     for (i=0; i<XSize; i++) ClList[i]=0;
00165     /* weights_set=zeros(k,1) ; */
00166     for (i=0; i<k; i++) weights_set[i]=0;
00167 
00168     /* mus=zeros(dimensions, k) ; */
00169     for (i=0; i<XDimk; i++) mus[i]=0;
00170 
00171     if (!use_old_mus)
00172     {
00173         /* random clustering (select random subsets) */
00174         /*  ks=ceil(rand(1,XSize)*k);
00175          *  for i=1:k,
00176          *  actks= (ks==i);
00177          *  c=sum(actks);
00178          *  weights_set(i)=c;
00179          *
00180          *  ClList(actks)=i*ones(1, c);
00181          *
00182          *  if ~mus_recalc,
00183          *      if c>1
00184          *          mus(:,i) = sum(XData(:,actks)')'/c;
00185          *      elseif c>0
00186          *          mus(:,i) = XData(:,actks);
00187          *      end;
00188          *  end;
00189          *   end ; */
00190 
00191         for (i=0; i<XSize; i++) 
00192         {
00193             const int32_t Cl=CMath::random(0, k-1);
00194             int32_t j;
00195             float64_t weight=Weights[i];
00196 
00197             weights_set[Cl]+=weight;
00198             ClList[i]=Cl;
00199 
00200             vec=lhs->get_feature_vector(i, vlen, vfree);
00201 
00202             for (j=0; j<dimensions; j++)
00203                 mus[Cl*dimensions+j] += weight*vec[j];
00204 
00205             lhs->free_feature_vector(vec, i, vfree);
00206         }
00207         for (i=0; i<k; i++)
00208         {
00209             int32_t j;
00210 
00211             if (weights_set[i]!=0.0)
00212                 for (j=0; j<dimensions; j++)
00213                     mus[i*dimensions+j] /= weights_set[i];
00214         }
00215     }
00216     else 
00217     {
00218         ASSERT(mus_start);
00219 
00221         rhs_mus->copy_feature_matrix(mus_start,dimensions,k);
00222         float64_t* p_dists=dists;
00223 
00224         for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00225             distances_rhs(p_dists,0,k,idx);
00226         p_dists=NULL;            
00227 
00228         for (i=0; i<XSize; i++)
00229         {
00230             float64_t mini=dists[i*k];
00231             int32_t Cl = 0, j;
00232 
00233             for (j=1; j<k; j++)
00234             {
00235                 if (dists[i*k+j]<mini)
00236                 {
00237                     Cl=j;
00238                     mini=dists[i*k+j];
00239                 }
00240             }
00241             ClList[i]=Cl;
00242         }
00243 
00244         /* Compute the sum of all points belonging to a cluster 
00245          * and count the points */
00246         for (i=0; i<XSize; i++) 
00247         {
00248             const int32_t Cl = ClList[i];
00249             float64_t weight=Weights[i];
00250             weights_set[Cl]+=weight;
00251 #ifndef MUSRECALC
00252             vec=lhs->get_feature_vector(i, vlen, vfree);
00253 
00254             for (j=0; j<dimensions; j++)
00255                 mus[Cl*dimensions+j] += weight*vec[j];
00256 
00257             lhs->free_feature_vector(vec, i, vfree);
00258 #endif
00259         }
00260 #ifndef MUSRECALC
00261         /* normalization to get the mean */ 
00262         for (i=0; i<k; i++)
00263         {
00264             if (weights_set[i]!=0.0)
00265             {
00266                 int32_t j;
00267                 for (j=0; j<dimensions; j++)
00268                     mus[i*dimensions+j] /= weights_set[i];
00269             }
00270         }
00271 #endif
00272     }
00273 
00274 
00275 
00276     while (changed && (iter<max_iter))
00277     {
00278         iter++;
00279         if (iter==max_iter-1)
00280             SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00281 
00282         if (iter%1000 == 0)
00283             SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00284         changed=0;
00285 
00286 #ifdef MUSRECALC
00287         /* mus=zeros(dimensions, k) ; */
00288         for (i=0; i<XDimk; i++) mus[i]=0;
00289 
00290         for (i=0; i<XSize; i++) 
00291         {
00292             int32_t j;
00293             int32_t Cl=ClList[i];
00294             float64_t weight=Weights[i];
00295 
00296             vec=lhs->get_feature_vector(i, vlen, vfree);
00297 
00298             for (j=0; j<dimensions; j++)
00299                 mus[Cl*dimensions+j] += weight*vec[j];
00300 
00301             lhs->free_feature_vector(vec, i, vfree);
00302         }
00303         for (i=0; i<k; i++)
00304         {
00305             int32_t j;
00306 
00307             if (weights_set[i]!=0.0)
00308                 for (j=0; j<dimensions; j++)
00309                     mus[i*dimensions+j] /= weights_set[i];
00310         }
00311 #endif
00312 
00313         rhs_mus->copy_feature_matrix(mus,dimensions,k);
00314 
00315         for (i=0; i<XSize; i++)
00316         {
00317             /* ks=ceil(rand(1,XSize)*XSize) ; */
00318             const int32_t Pat= CMath::random(0, XSize-1);
00319             const int32_t ClList_Pat=ClList[Pat];
00320             int32_t imini, j;
00321             float64_t mini, weight;
00322 
00323             weight=Weights[Pat];
00324 
00325             /* compute the distance of this point to all centers */
00326             for(int32_t idx_k=0;idx_k<k;idx_k++)
00327                 dists[idx_k]=distance->distance(Pat,idx_k);
00328            
00329             /* [mini,imini]=min(dists(:,i)) ; */
00330             imini=0 ; mini=dists[0];
00331             for (j=1; j<k; j++)
00332                 if (dists[j]<mini)
00333                 {
00334                     mini=dists[j];
00335                     imini=j;
00336                 }
00337 
00338             if (imini!=ClList_Pat)
00339             {
00340                 changed= changed + 1;
00341 
00342                 /* weights_set(imini) = weights_set(imini) + weight ; */
00343                 weights_set[imini]+= weight;
00344                 /* weights_set(j)     = weights_set(j)     - weight ; */
00345                 weights_set[ClList_Pat]-= weight;
00346 
00347                 /* mu_new=mu_old + (x - mu_old)/(n+1) */
00348                 /* mus(:,imini)=mus(:,imini) + (XData(:,i) - mus(:,imini)) * (weight / weights_set(imini)) ; */
00349                 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00350 
00351                 for (j=0; j<dimensions; j++)
00352                     mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00353 
00354                 lhs->free_feature_vector(vec, Pat, vfree);
00355 
00356                 /* mu_new = mu_old - (x - mu_old)/(n-1) */
00357                 /* if weights_set(j)~=0 */
00358                 if (weights_set[ClList_Pat]!=0.0)
00359                 {
00360                     /* mus(:,j)=mus(:,j) - (XData(:,i) - mus(:,j)) * (weight/weights_set(j)) ; */
00361                     vec=lhs->get_feature_vector(Pat, vlen, vfree);
00362 
00363                     for (j=0; j<dimensions; j++)
00364                         mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00365                     lhs->free_feature_vector(vec, Pat, vfree);
00366                 }
00367                 else
00368                     /*  mus(:,j)=zeros(dimensions,1) ; */
00369                     for (j=0; j<dimensions; j++)
00370                         mus[ClList_Pat*dimensions+j]=0;
00371 
00372                 /* ClList(i)= imini ; */
00373                 ClList[Pat] = imini;
00374             }
00375         }
00376     }
00377 
00378     /* compute the ,,variances'' of the clusters */
00379     for (i=0; i<k; i++)
00380     {
00381         float64_t rmin1=0;
00382         float64_t rmin2=0;
00383 
00384         bool first_round=true;
00385 
00386         for (int32_t j=0; j<k; j++)
00387         {
00388             if (j!=i)
00389             {
00390                 int32_t l;
00391                 float64_t dist = 0;
00392 
00393                 for (l=0; l<dimensions; l++)
00394                     dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00395 
00396                 if (first_round)
00397                 {
00398                     rmin1=dist;
00399                     rmin2=dist;
00400                     first_round=false;
00401                 }
00402                 else
00403                 {
00404                     if ((dist<rmin2) && (dist>=rmin1))
00405                         rmin2=dist;
00406 
00407                     if (dist<rmin1) 
00408                     {
00409                         rmin2=rmin1;
00410                         rmin1=dist;
00411                     }
00412                 }
00413             }
00414         }
00415 
00416         R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00417     }
00418         distance->replace_rhs(rhs_cache);
00419         delete rhs_mus;        
00420 
00421     free(ClList);
00422     free(weights_set);
00423     free(dists);
00424     SG_UNREF(lhs);
00425 } 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation