KMeans.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/clustering/KMeans.h>
00013 #include <shogun/distance/Distance.h>
00014 #include <shogun/features/Labels.h>
00015 #include <shogun/features/SimpleFeatures.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/base/Parallel.h>
00018 
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022 
00023 #define MUSRECALC
00024 
00025 #define PAR_THRESH  10
00026 
00027 using namespace shogun;
00028 
00029 CKMeans::CKMeans()
00030 : CDistanceMachine()
00031 {
00032     init();
00033 }
00034 
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine()
00037 {
00038     init();
00039     k=k_;
00040     set_distance(d);
00041 }
00042 
00043 CKMeans::~CKMeans()
00044 {
00045     R.destroy_vector();
00046 }
00047 
00048 bool CKMeans::train_machine(CFeatures* data)
00049 {
00050     ASSERT(distance);
00051 
00052     if (data)
00053         distance->init(data, data);
00054 
00055     ASSERT(distance->get_feature_type()==F_DREAL);
00056 
00057     CSimpleFeatures<float64_t>* lhs=
00058             (CSimpleFeatures<float64_t>*)distance->get_lhs();
00059     ASSERT(lhs);
00060     int32_t num=lhs->get_num_vectors();
00061     SG_UNREF(lhs);
00062 
00063     Weights=SGVector<float64_t>(num);
00064     for (int32_t i=0; i<num; i++)
00065         Weights.vector[i]=1.0;
00066 
00067     clustknb(false, NULL);
00068     Weights.destroy_vector();
00069 
00070     return true;
00071 }
00072 
00073 bool CKMeans::load(FILE* srcfile)
00074 {
00075     SG_SET_LOCALE_C;
00076     SG_RESET_LOCALE;
00077     return false;
00078 }
00079 
00080 bool CKMeans::save(FILE* dstfile)
00081 {
00082     SG_SET_LOCALE_C;
00083     SG_RESET_LOCALE;
00084     return false;
00085 }
00086 
00087 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00088 struct thread_data
00089 {
00090     float64_t* x;
00091     CSimpleFeatures<float64_t>* y;
00092     float64_t* z;
00093     int32_t n1, n2, m;
00094     int32_t js, je; /* defines the matrix stripe */
00095     int32_t offs;
00096 };
00097 #endif // DOXYGEN_SHOULD_SKIP_THIS
00098 
00099 namespace shogun
00100 {
00101 void *sqdist_thread_func(void * P)
00102 {
00103     struct thread_data *TD=(struct thread_data*) P;
00104     float64_t* x=TD->x;
00105     CSimpleFeatures<float64_t>* y=TD->y;
00106     float64_t* z=TD->z;
00107     int32_t n1=TD->n1,
00108         m=TD->m,
00109         js=TD->js,
00110         je=TD->je,
00111         offs=TD->offs,
00112         j,i,k;
00113 
00114     for (j=js; j<je; j++)
00115     {
00116         int32_t vlen=0;
00117         bool vfree=false;
00118         float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00119 
00120         for (i=0; i<n1; i++)
00121         {
00122             float64_t sum=0;
00123             for (k=0; k<m; k++) 
00124                 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00125             z[j*n1 + i] = sum;
00126         }
00127 
00128         y->free_feature_vector(vec, j, vfree);
00129     }
00130     return NULL;
00131 } 
00132 }
00133 
00134 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00135 {
00136     ASSERT(distance && distance->get_feature_type()==F_DREAL);
00137     CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00138     ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00139 
00140     int32_t XSize=lhs->get_num_vectors();
00141     dimensions=lhs->get_num_features();
00142     int32_t i, changed=1;
00143     const int32_t XDimk=dimensions*k;
00144     int32_t iter=0;
00145 
00146     R.destroy_vector();
00147     R=SGVector<float64_t>(k);
00148 
00149     mus=SGMatrix<float64_t>(dimensions, k);
00150 
00151     int32_t *ClList=SG_CALLOC(int32_t, XSize);
00152     float64_t *weights_set=SG_CALLOC(float64_t, k);
00153     float64_t *dists=SG_CALLOC(float64_t, k*XSize);
00154 
00156     CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00157     CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00158  
00159     int32_t vlen=0;
00160     bool vfree=false;
00161     float64_t* vec=NULL;
00162 
00163     /* ClList=zeros(XSize,1) ; */
00164     memset(ClList, 0, sizeof(int32_t)*XSize);
00165     /* weights_set=zeros(k,1) ; */
00166     memset(weights_set, 0, sizeof(float64_t)*k);
00167 
00168     /* cluster_centers=zeros(dimensions, k) ; */
00169     memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00170 
00171     if (!use_old_mus)
00172     {
00173         for (i=0; i<XSize; i++) 
00174         {
00175             const int32_t Cl=CMath::random(0, k-1);
00176             int32_t j;
00177             float64_t weight=Weights.vector[i];
00178 
00179             weights_set[Cl]+=weight;
00180             ClList[i]=Cl;
00181 
00182             vec=lhs->get_feature_vector(i, vlen, vfree);
00183 
00184             for (j=0; j<dimensions; j++)
00185                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00186 
00187             lhs->free_feature_vector(vec, i, vfree);
00188         }
00189         for (i=0; i<k; i++)
00190         {
00191             int32_t j;
00192 
00193             if (weights_set[i]!=0.0)
00194                 for (j=0; j<dimensions; j++)
00195                     mus.matrix[i*dimensions+j] /= weights_set[i];
00196         }
00197     }
00198     else 
00199     {
00200         ASSERT(mus_start);
00201 
00203         rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
00204         float64_t* p_dists=dists;
00205 
00206         for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00207             distances_rhs(p_dists,0,k,idx);
00208         p_dists=NULL;            
00209 
00210         for (i=0; i<XSize; i++)
00211         {
00212             float64_t mini=dists[i*k];
00213             int32_t Cl = 0, j;
00214 
00215             for (j=1; j<k; j++)
00216             {
00217                 if (dists[i*k+j]<mini)
00218                 {
00219                     Cl=j;
00220                     mini=dists[i*k+j];
00221                 }
00222             }
00223             ClList[i]=Cl;
00224         }
00225 
00226         /* Compute the sum of all points belonging to a cluster 
00227          * and count the points */
00228         for (i=0; i<XSize; i++) 
00229         {
00230             const int32_t Cl = ClList[i];
00231             float64_t weight=Weights.vector[i];
00232             weights_set[Cl]+=weight;
00233 #ifndef MUSRECALC
00234             vec=lhs->get_feature_vector(i, vlen, vfree);
00235 
00236             for (j=0; j<dimensions; j++)
00237                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00238 
00239             lhs->free_feature_vector(vec, i, vfree);
00240 #endif
00241         }
00242 #ifndef MUSRECALC
00243         /* normalization to get the mean */ 
00244         for (i=0; i<k; i++)
00245         {
00246             if (weights_set[i]!=0.0)
00247             {
00248                 int32_t j;
00249                 for (j=0; j<dimensions; j++)
00250                     mus.matrix[i*dimensions+j] /= weights_set[i];
00251             }
00252         }
00253 #endif
00254     }
00255 
00256 
00257 
00258     while (changed && (iter<max_iter))
00259     {
00260         iter++;
00261         if (iter==max_iter-1)
00262             SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00263 
00264         if (iter%1000 == 0)
00265             SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00266         changed=0;
00267 
00268 #ifdef MUSRECALC
00269         /* mus=zeros(dimensions, k) ; */
00270         memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00271 
00272         for (i=0; i<XSize; i++) 
00273         {
00274             int32_t j;
00275             int32_t Cl=ClList[i];
00276             float64_t weight=Weights.vector[i];
00277 
00278             vec=lhs->get_feature_vector(i, vlen, vfree);
00279 
00280             for (j=0; j<dimensions; j++)
00281                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00282 
00283             lhs->free_feature_vector(vec, i, vfree);
00284         }
00285         for (i=0; i<k; i++)
00286         {
00287             int32_t j;
00288 
00289             if (weights_set[i]!=0.0)
00290                 for (j=0; j<dimensions; j++)
00291                     mus.matrix[i*dimensions+j] /= weights_set[i];
00292         }
00293 #endif
00294 
00295         rhs_mus->copy_feature_matrix(mus);
00296 
00297         for (i=0; i<XSize; i++)
00298         {
00299             /* ks=ceil(rand(1,XSize)*XSize) ; */
00300             const int32_t Pat= CMath::random(0, XSize-1);
00301             const int32_t ClList_Pat=ClList[Pat];
00302             int32_t imini, j;
00303             float64_t mini, weight;
00304 
00305             weight=Weights.vector[Pat];
00306 
00307             /* compute the distance of this point to all centers */
00308             for(int32_t idx_k=0;idx_k<k;idx_k++)
00309                 dists[idx_k]=distance->distance(Pat,idx_k);
00310            
00311             /* [mini,imini]=min(dists(:,i)) ; */
00312             imini=0 ; mini=dists[0];
00313             for (j=1; j<k; j++)
00314                 if (dists[j]<mini)
00315                 {
00316                     mini=dists[j];
00317                     imini=j;
00318                 }
00319 
00320             if (imini!=ClList_Pat)
00321             {
00322                 changed= changed + 1;
00323 
00324                 /* weights_set(imini) = weights_set(imini) + weight ; */
00325                 weights_set[imini]+= weight;
00326                 /* weights_set(j)     = weights_set(j)     - weight ; */
00327                 weights_set[ClList_Pat]-= weight;
00328 
00329                 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00330 
00331                 for (j=0; j<dimensions; j++)
00332                 {
00333                     mus.matrix[imini*dimensions+j]-=(vec[j]
00334                             -mus.matrix[imini*dimensions+j])
00335                             *(weight/weights_set[imini]);
00336                 }
00337 
00338                 lhs->free_feature_vector(vec, Pat, vfree);
00339 
00340                 /* mu_new = mu_old - (x - mu_old)/(n-1) */
00341                 /* if weights_set(j)~=0 */
00342                 if (weights_set[ClList_Pat]!=0.0)
00343                 {
00344                     vec=lhs->get_feature_vector(Pat, vlen, vfree);
00345 
00346                     for (j=0; j<dimensions; j++)
00347                     {
00348                         mus.matrix[ClList_Pat*dimensions+j]-=
00349                                 (vec[j]
00350                                         -mus.matrix[ClList_Pat
00351                                                 *dimensions+j])
00352                                         *(weight/weights_set[ClList_Pat]);
00353                     }
00354                     lhs->free_feature_vector(vec, Pat, vfree);
00355                 }
00356                 else
00357                     /*  mus(:,j)=zeros(dimensions,1) ; */
00358                     for (j=0; j<dimensions; j++)
00359                         mus.matrix[ClList_Pat*dimensions+j]=0;
00360 
00361                 /* ClList(i)= imini ; */
00362                 ClList[Pat] = imini;
00363             }
00364         }
00365     }
00366 
00367     /* compute the ,,variances'' of the clusters */
00368     for (i=0; i<k; i++)
00369     {
00370         float64_t rmin1=0;
00371         float64_t rmin2=0;
00372 
00373         bool first_round=true;
00374 
00375         for (int32_t j=0; j<k; j++)
00376         {
00377             if (j!=i)
00378             {
00379                 int32_t l;
00380                 float64_t dist = 0;
00381 
00382                 for (l=0; l<dimensions; l++)
00383                 {
00384                     dist+=CMath::sq(
00385                             mus.matrix[i*dimensions+l]
00386                                     -mus.matrix[j*dimensions+l]);
00387                 }
00388 
00389                 if (first_round)
00390                 {
00391                     rmin1=dist;
00392                     rmin2=dist;
00393                     first_round=false;
00394                 }
00395                 else
00396                 {
00397                     if ((dist<rmin2) && (dist>=rmin1))
00398                         rmin2=dist;
00399 
00400                     if (dist<rmin1) 
00401                     {
00402                         rmin2=rmin1;
00403                         rmin1=dist;
00404                     }
00405                 }
00406             }
00407         }
00408 
00409         R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
00410     }
00411         distance->replace_rhs(rhs_cache);
00412         delete rhs_mus;        
00413     SG_FREE(ClList);
00414     SG_FREE(weights_set);
00415     SG_FREE(dists);
00416     SG_UNREF(lhs);
00417 }
00418 
00419 void CKMeans::store_model_features()
00420 {
00421     /* set lhs of underlying distance to cluster centers */
00422     CSimpleFeatures<float64_t>* cluster_centers=new CSimpleFeatures<float64_t>(
00423             mus);
00424 
00425     /* reset mus variable to avoid interference with above features */
00426     mus.do_free=false;
00427     mus.free_matrix();
00428 
00429     /* store cluster centers in lhs of distance variable */
00430     CFeatures* rhs=distance->get_rhs();
00431     distance->init(cluster_centers, rhs);
00432     SG_UNREF(rhs);
00433 }
00434 
00435 void CKMeans::init()
00436 {
00437     max_iter=10000;
00438     k=3;
00439     dimensions=0;
00440 
00441     m_parameters->add(&max_iter, "max_iter", "Maximum number of iterations");
00442     m_parameters->add(&k, "k", "Parameter k");
00443     m_parameters->add(&dimensions, "dimensions", "Dimensions of data");
00444     m_parameters->add(&R, "R", "Cluster radiuses");
00445 }
00446 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation