KMeans.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/clustering/KMeans.h>
00013 #include <shogun/distance/Distance.h>
00014 #include <shogun/features/Labels.h>
00015 #include <shogun/features/SimpleFeatures.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/base/Parallel.h>
00018 
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022 
00023 #define MUSRECALC
00024 
00025 #define PAR_THRESH  10
00026 
00027 using namespace shogun;
00028 
00029 CKMeans::CKMeans()
00030 : CDistanceMachine()
00031 {
00032     init();
00033 }
00034 
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine()
00037 {
00038     init();
00039     k=k_;
00040     set_distance(d);
00041 }
00042 
00043 CKMeans::~CKMeans()
00044 {
00045     R.destroy_vector();
00046 }
00047 
00048 bool CKMeans::train_machine(CFeatures* data)
00049 {
00050     ASSERT(distance);
00051 
00052     if (data)
00053         distance->init(data, data);
00054 
00055     ASSERT(distance->get_feature_type()==F_DREAL);
00056 
00057     CSimpleFeatures<float64_t>* lhs=
00058             (CSimpleFeatures<float64_t>*)distance->get_lhs();
00059     ASSERT(lhs);
00060     int32_t num=lhs->get_num_vectors();
00061     SG_UNREF(lhs);
00062 
00063     Weights=SGVector<float64_t>(num);
00064     for (int32_t i=0; i<num; i++)
00065         Weights.vector[i]=1.0;
00066 
00067     clustknb(false, NULL);
00068     Weights.destroy_vector();
00069 
00070     return true;
00071 }
00072 
00073 bool CKMeans::load(FILE* srcfile)
00074 {
00075     SG_SET_LOCALE_C;
00076     SG_RESET_LOCALE;
00077     return false;
00078 }
00079 
00080 bool CKMeans::save(FILE* dstfile)
00081 {
00082     SG_SET_LOCALE_C;
00083     SG_RESET_LOCALE;
00084     return false;
00085 }
00086 
00087 
00088 void CKMeans::set_k(int32_t p_k)
00089 {
00090     ASSERT(p_k>0);
00091     this->k=p_k;
00092 }
00093 
00094 int32_t CKMeans::get_k()
00095 {
00096     return k;
00097 }
00098 
00099 void CKMeans::set_max_iter(int32_t iter)
00100 {
00101     ASSERT(iter>0);
00102     max_iter=iter;
00103 }
00104 
00105 float64_t CKMeans::get_max_iter()
00106 {
00107     return max_iter;
00108 }
00109 
00110 SGVector<float64_t> CKMeans::get_radiuses()
00111 {
00112     return R;
00113 }
00114 
00115 SGMatrix<float64_t> CKMeans::get_cluster_centers()
00116 {
00117     if (!R.vector)
00118         return SGMatrix<float64_t>();
00119 
00120     CSimpleFeatures<float64_t>* lhs=
00121         (CSimpleFeatures<float64_t>*)distance->get_lhs();
00122     SGMatrix<float64_t> centers=lhs->get_feature_matrix();
00123     SG_UNREF(lhs);
00124     return centers;
00125 }
00126 
00127 int32_t CKMeans::get_dimensions()
00128 {
00129     return dimensions;
00130 }
00131 
00132 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00133 struct thread_data
00134 {
00135     float64_t* x;
00136     CSimpleFeatures<float64_t>* y;
00137     float64_t* z;
00138     int32_t n1, n2, m;
00139     int32_t js, je; /* defines the matrix stripe */
00140     int32_t offs;
00141 };
00142 #endif // DOXYGEN_SHOULD_SKIP_THIS
00143 
00144 namespace shogun
00145 {
00146 void *sqdist_thread_func(void * P)
00147 {
00148     struct thread_data *TD=(struct thread_data*) P;
00149     float64_t* x=TD->x;
00150     CSimpleFeatures<float64_t>* y=TD->y;
00151     float64_t* z=TD->z;
00152     int32_t n1=TD->n1,
00153         m=TD->m,
00154         js=TD->js,
00155         je=TD->je,
00156         offs=TD->offs,
00157         j,i,k;
00158 
00159     for (j=js; j<je; j++)
00160     {
00161         int32_t vlen=0;
00162         bool vfree=false;
00163         float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00164 
00165         for (i=0; i<n1; i++)
00166         {
00167             float64_t sum=0;
00168             for (k=0; k<m; k++) 
00169                 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00170             z[j*n1 + i] = sum;
00171         }
00172 
00173         y->free_feature_vector(vec, j, vfree);
00174     }
00175     return NULL;
00176 } 
00177 }
00178 
00179 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00180 {
00181     ASSERT(distance && distance->get_feature_type()==F_DREAL);
00182     CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00183     ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00184 
00185     int32_t XSize=lhs->get_num_vectors();
00186     dimensions=lhs->get_num_features();
00187     int32_t i, changed=1;
00188     const int32_t XDimk=dimensions*k;
00189     int32_t iter=0;
00190 
00191     R.destroy_vector();
00192     R=SGVector<float64_t>(k);
00193 
00194     mus=SGMatrix<float64_t>(dimensions, k);
00195 
00196     int32_t *ClList=SG_CALLOC(int32_t, XSize);
00197     float64_t *weights_set=SG_CALLOC(float64_t, k);
00198     float64_t *dists=SG_CALLOC(float64_t, k*XSize);
00199 
00201     CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00202     CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00203  
00204     int32_t vlen=0;
00205     bool vfree=false;
00206     float64_t* vec=NULL;
00207 
00208     /* ClList=zeros(XSize,1) ; */
00209     memset(ClList, 0, sizeof(int32_t)*XSize);
00210     /* weights_set=zeros(k,1) ; */
00211     memset(weights_set, 0, sizeof(float64_t)*k);
00212 
00213     /* cluster_centers=zeros(dimensions, k) ; */
00214     memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00215 
00216     if (!use_old_mus)
00217     {
00218         for (i=0; i<XSize; i++) 
00219         {
00220             const int32_t Cl=CMath::random(0, k-1);
00221             int32_t j;
00222             float64_t weight=Weights.vector[i];
00223 
00224             weights_set[Cl]+=weight;
00225             ClList[i]=Cl;
00226 
00227             vec=lhs->get_feature_vector(i, vlen, vfree);
00228 
00229             for (j=0; j<dimensions; j++)
00230                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00231 
00232             lhs->free_feature_vector(vec, i, vfree);
00233         }
00234         for (i=0; i<k; i++)
00235         {
00236             int32_t j;
00237 
00238             if (weights_set[i]!=0.0)
00239                 for (j=0; j<dimensions; j++)
00240                     mus.matrix[i*dimensions+j] /= weights_set[i];
00241         }
00242     }
00243     else 
00244     {
00245         ASSERT(mus_start);
00246 
00248         rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
00249         float64_t* p_dists=dists;
00250 
00251         for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00252             distances_rhs(p_dists,0,k,idx);
00253         p_dists=NULL;            
00254 
00255         for (i=0; i<XSize; i++)
00256         {
00257             float64_t mini=dists[i*k];
00258             int32_t Cl = 0, j;
00259 
00260             for (j=1; j<k; j++)
00261             {
00262                 if (dists[i*k+j]<mini)
00263                 {
00264                     Cl=j;
00265                     mini=dists[i*k+j];
00266                 }
00267             }
00268             ClList[i]=Cl;
00269         }
00270 
00271         /* Compute the sum of all points belonging to a cluster 
00272          * and count the points */
00273         for (i=0; i<XSize; i++) 
00274         {
00275             const int32_t Cl = ClList[i];
00276             float64_t weight=Weights.vector[i];
00277             weights_set[Cl]+=weight;
00278 #ifndef MUSRECALC
00279             vec=lhs->get_feature_vector(i, vlen, vfree);
00280 
00281             for (j=0; j<dimensions; j++)
00282                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00283 
00284             lhs->free_feature_vector(vec, i, vfree);
00285 #endif
00286         }
00287 #ifndef MUSRECALC
00288         /* normalization to get the mean */ 
00289         for (i=0; i<k; i++)
00290         {
00291             if (weights_set[i]!=0.0)
00292             {
00293                 int32_t j;
00294                 for (j=0; j<dimensions; j++)
00295                     mus.matrix[i*dimensions+j] /= weights_set[i];
00296             }
00297         }
00298 #endif
00299     }
00300 
00301 
00302 
00303     while (changed && (iter<max_iter))
00304     {
00305         iter++;
00306         if (iter==max_iter-1)
00307             SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00308 
00309         if (iter%1000 == 0)
00310             SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00311         changed=0;
00312 
00313 #ifdef MUSRECALC
00314         /* mus=zeros(dimensions, k) ; */
00315         memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00316 
00317         for (i=0; i<XSize; i++) 
00318         {
00319             int32_t j;
00320             int32_t Cl=ClList[i];
00321             float64_t weight=Weights.vector[i];
00322 
00323             vec=lhs->get_feature_vector(i, vlen, vfree);
00324 
00325             for (j=0; j<dimensions; j++)
00326                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00327 
00328             lhs->free_feature_vector(vec, i, vfree);
00329         }
00330         for (i=0; i<k; i++)
00331         {
00332             int32_t j;
00333 
00334             if (weights_set[i]!=0.0)
00335                 for (j=0; j<dimensions; j++)
00336                     mus.matrix[i*dimensions+j] /= weights_set[i];
00337         }
00338 #endif
00339 
00340         rhs_mus->copy_feature_matrix(mus);
00341 
00342         for (i=0; i<XSize; i++)
00343         {
00344             /* ks=ceil(rand(1,XSize)*XSize) ; */
00345             const int32_t Pat= CMath::random(0, XSize-1);
00346             const int32_t ClList_Pat=ClList[Pat];
00347             int32_t imini, j;
00348             float64_t mini, weight;
00349 
00350             weight=Weights.vector[Pat];
00351 
00352             /* compute the distance of this point to all centers */
00353             for(int32_t idx_k=0;idx_k<k;idx_k++)
00354                 dists[idx_k]=distance->distance(Pat,idx_k);
00355            
00356             /* [mini,imini]=min(dists(:,i)) ; */
00357             imini=0 ; mini=dists[0];
00358             for (j=1; j<k; j++)
00359                 if (dists[j]<mini)
00360                 {
00361                     mini=dists[j];
00362                     imini=j;
00363                 }
00364 
00365             if (imini!=ClList_Pat)
00366             {
00367                 changed= changed + 1;
00368 
00369                 /* weights_set(imini) = weights_set(imini) + weight ; */
00370                 weights_set[imini]+= weight;
00371                 /* weights_set(j)     = weights_set(j)     - weight ; */
00372                 weights_set[ClList_Pat]-= weight;
00373 
00374                 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00375 
00376                 for (j=0; j<dimensions; j++)
00377                 {
00378                     mus.matrix[imini*dimensions+j]-=(vec[j]
00379                             -mus.matrix[imini*dimensions+j])
00380                             *(weight/weights_set[imini]);
00381                 }
00382 
00383                 lhs->free_feature_vector(vec, Pat, vfree);
00384 
00385                 /* mu_new = mu_old - (x - mu_old)/(n-1) */
00386                 /* if weights_set(j)~=0 */
00387                 if (weights_set[ClList_Pat]!=0.0)
00388                 {
00389                     vec=lhs->get_feature_vector(Pat, vlen, vfree);
00390 
00391                     for (j=0; j<dimensions; j++)
00392                     {
00393                         mus.matrix[ClList_Pat*dimensions+j]-=
00394                                 (vec[j]
00395                                         -mus.matrix[ClList_Pat
00396                                                 *dimensions+j])
00397                                         *(weight/weights_set[ClList_Pat]);
00398                     }
00399                     lhs->free_feature_vector(vec, Pat, vfree);
00400                 }
00401                 else
00402                     /*  mus(:,j)=zeros(dimensions,1) ; */
00403                     for (j=0; j<dimensions; j++)
00404                         mus.matrix[ClList_Pat*dimensions+j]=0;
00405 
00406                 /* ClList(i)= imini ; */
00407                 ClList[Pat] = imini;
00408             }
00409         }
00410     }
00411 
00412     /* compute the ,,variances'' of the clusters */
00413     for (i=0; i<k; i++)
00414     {
00415         float64_t rmin1=0;
00416         float64_t rmin2=0;
00417 
00418         bool first_round=true;
00419 
00420         for (int32_t j=0; j<k; j++)
00421         {
00422             if (j!=i)
00423             {
00424                 int32_t l;
00425                 float64_t dist = 0;
00426 
00427                 for (l=0; l<dimensions; l++)
00428                 {
00429                     dist+=CMath::sq(
00430                             mus.matrix[i*dimensions+l]
00431                                     -mus.matrix[j*dimensions+l]);
00432                 }
00433 
00434                 if (first_round)
00435                 {
00436                     rmin1=dist;
00437                     rmin2=dist;
00438                     first_round=false;
00439                 }
00440                 else
00441                 {
00442                     if ((dist<rmin2) && (dist>=rmin1))
00443                         rmin2=dist;
00444 
00445                     if (dist<rmin1) 
00446                     {
00447                         rmin2=rmin1;
00448                         rmin1=dist;
00449                     }
00450                 }
00451             }
00452         }
00453 
00454         R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
00455     }
00456         distance->replace_rhs(rhs_cache);
00457         delete rhs_mus;        
00458     SG_FREE(ClList);
00459     SG_FREE(weights_set);
00460     SG_FREE(dists);
00461     SG_UNREF(lhs);
00462 }
00463 
00464 void CKMeans::store_model_features()
00465 {
00466     /* set lhs of underlying distance to cluster centers */
00467     CSimpleFeatures<float64_t>* cluster_centers=new CSimpleFeatures<float64_t>(
00468             mus);
00469 
00470     /* reset mus variable to avoid interference with above features */
00471     mus.do_free=false;
00472     mus.free_matrix();
00473 
00474     /* store cluster centers in lhs of distance variable */
00475     CFeatures* rhs=distance->get_rhs();
00476     distance->init(cluster_centers, rhs);
00477     SG_UNREF(rhs);
00478 }
00479 
00480 void CKMeans::init()
00481 {
00482     max_iter=10000;
00483     k=3;
00484     dimensions=0;
00485 
00486     m_parameters->add(&max_iter, "max_iter", "Maximum number of iterations");
00487     m_parameters->add(&k, "k", "Parameter k");
00488     m_parameters->add(&dimensions, "dimensions", "Dimensions of data");
00489     m_parameters->add(&R, "R", "Cluster radiuses");
00490 }
00491 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation