KMeans.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/clustering/KMeans.h>
00013 #include <shogun/distance/Distance.h>
00014 #include <shogun/labels/Labels.h>
00015 #include <shogun/features/DenseFeatures.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/base/Parallel.h>
00018 
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022 
00023 #define MUSRECALC
00024 
00025 #define PAR_THRESH  10
00026 
00027 using namespace shogun;
00028 
00029 CKMeans::CKMeans()
00030 : CDistanceMachine()
00031 {
00032     init();
00033 }
00034 
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine()
00037 {
00038     init();
00039     k=k_;
00040     set_distance(d);
00041 }
00042 
00043 CKMeans::~CKMeans()
00044 {
00045 }
00046 
00047 bool CKMeans::train_machine(CFeatures* data)
00048 {
00049     ASSERT(distance);
00050 
00051     if (data)
00052         distance->init(data, data);
00053 
00054     ASSERT(distance->get_feature_type()==F_DREAL);
00055 
00056     CDenseFeatures<float64_t>* lhs=
00057             (CDenseFeatures<float64_t>*)distance->get_lhs();
00058     ASSERT(lhs);
00059     int32_t num=lhs->get_num_vectors();
00060     SG_UNREF(lhs);
00061 
00062     Weights=SGVector<float64_t>(num);
00063     for (int32_t i=0; i<num; i++)
00064         Weights.vector[i]=1.0;
00065 
00066     clustknb(false, NULL);
00067 
00068     return true;
00069 }
00070 
00071 bool CKMeans::load(FILE* srcfile)
00072 {
00073     SG_SET_LOCALE_C;
00074     SG_RESET_LOCALE;
00075     return false;
00076 }
00077 
00078 bool CKMeans::save(FILE* dstfile)
00079 {
00080     SG_SET_LOCALE_C;
00081     SG_RESET_LOCALE;
00082     return false;
00083 }
00084 
00085 
00086 void CKMeans::set_k(int32_t p_k)
00087 {
00088     ASSERT(p_k>0);
00089     this->k=p_k;
00090 }
00091 
00092 int32_t CKMeans::get_k()
00093 {
00094     return k;
00095 }
00096 
00097 void CKMeans::set_max_iter(int32_t iter)
00098 {
00099     ASSERT(iter>0);
00100     max_iter=iter;
00101 }
00102 
00103 float64_t CKMeans::get_max_iter()
00104 {
00105     return max_iter;
00106 }
00107 
00108 SGVector<float64_t> CKMeans::get_radiuses()
00109 {
00110     return R;
00111 }
00112 
00113 SGMatrix<float64_t> CKMeans::get_cluster_centers()
00114 {
00115     if (!R.vector)
00116         return SGMatrix<float64_t>();
00117 
00118     CDenseFeatures<float64_t>* lhs=
00119         (CDenseFeatures<float64_t>*)distance->get_lhs();
00120     SGMatrix<float64_t> centers=lhs->get_feature_matrix();
00121     SG_UNREF(lhs);
00122     return centers;
00123 }
00124 
00125 int32_t CKMeans::get_dimensions()
00126 {
00127     return dimensions;
00128 }
00129 
00130 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00131 struct thread_data
00132 {
00133     float64_t* x;
00134     CDenseFeatures<float64_t>* y;
00135     float64_t* z;
00136     int32_t n1, n2, m;
00137     int32_t js, je; /* defines the matrix stripe */
00138     int32_t offs;
00139 };
00140 #endif // DOXYGEN_SHOULD_SKIP_THIS
00141 
00142 namespace shogun
00143 {
00144 void *sqdist_thread_func(void * P)
00145 {
00146     struct thread_data *TD=(struct thread_data*) P;
00147     float64_t* x=TD->x;
00148     CDenseFeatures<float64_t>* y=TD->y;
00149     float64_t* z=TD->z;
00150     int32_t n1=TD->n1,
00151         m=TD->m,
00152         js=TD->js,
00153         je=TD->je,
00154         offs=TD->offs,
00155         j,i,k;
00156 
00157     for (j=js; j<je; j++)
00158     {
00159         int32_t vlen=0;
00160         bool vfree=false;
00161         float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00162 
00163         for (i=0; i<n1; i++)
00164         {
00165             float64_t sum=0;
00166             for (k=0; k<m; k++)
00167                 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00168             z[j*n1 + i] = sum;
00169         }
00170 
00171         y->free_feature_vector(vec, j, vfree);
00172     }
00173     return NULL;
00174 }
00175 }
00176 
00177 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00178 {
00179     ASSERT(distance && distance->get_feature_type()==F_DREAL);
00180     CDenseFeatures<float64_t>* lhs = (CDenseFeatures<float64_t>*) distance->get_lhs();
00181     ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00182 
00183     int32_t XSize=lhs->get_num_vectors();
00184     dimensions=lhs->get_num_features();
00185     int32_t i, changed=1;
00186     const int32_t XDimk=dimensions*k;
00187     int32_t iter=0;
00188 
00189     R=SGVector<float64_t>(k);
00190 
00191     mus=SGMatrix<float64_t>(dimensions, k);
00192 
00193     int32_t *ClList=SG_CALLOC(int32_t, XSize);
00194     float64_t *weights_set=SG_CALLOC(float64_t, k);
00195     float64_t *dists=SG_CALLOC(float64_t, k*XSize);
00196 
00198     CDenseFeatures<float64_t>* rhs_mus = new CDenseFeatures<float64_t>(0);
00199     CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00200 
00201     int32_t vlen=0;
00202     bool vfree=false;
00203     float64_t* vec=NULL;
00204 
00205     /* ClList=zeros(XSize,1) ; */
00206     memset(ClList, 0, sizeof(int32_t)*XSize);
00207     /* weights_set=zeros(k,1) ; */
00208     memset(weights_set, 0, sizeof(float64_t)*k);
00209 
00210     /* cluster_centers=zeros(dimensions, k) ; */
00211     memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00212 
00213     if (!use_old_mus)
00214     {
00215         for (i=0; i<XSize; i++)
00216         {
00217             const int32_t Cl=CMath::random(0, k-1);
00218             int32_t j;
00219             float64_t weight=Weights.vector[i];
00220 
00221             weights_set[Cl]+=weight;
00222             ClList[i]=Cl;
00223 
00224             vec=lhs->get_feature_vector(i, vlen, vfree);
00225 
00226             for (j=0; j<dimensions; j++)
00227                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00228 
00229             lhs->free_feature_vector(vec, i, vfree);
00230         }
00231         for (i=0; i<k; i++)
00232         {
00233             int32_t j;
00234 
00235             if (weights_set[i]!=0.0)
00236                 for (j=0; j<dimensions; j++)
00237                     mus.matrix[i*dimensions+j] /= weights_set[i];
00238         }
00239     }
00240     else
00241     {
00242         ASSERT(mus_start);
00243 
00245         rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
00246         float64_t* p_dists=dists;
00247 
00248         for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00249             distances_rhs(p_dists,0,k,idx);
00250         p_dists=NULL;
00251 
00252         for (i=0; i<XSize; i++)
00253         {
00254             float64_t mini=dists[i*k];
00255             int32_t Cl = 0, j;
00256 
00257             for (j=1; j<k; j++)
00258             {
00259                 if (dists[i*k+j]<mini)
00260                 {
00261                     Cl=j;
00262                     mini=dists[i*k+j];
00263                 }
00264             }
00265             ClList[i]=Cl;
00266         }
00267 
00268         /* Compute the sum of all points belonging to a cluster
00269          * and count the points */
00270         for (i=0; i<XSize; i++)
00271         {
00272             const int32_t Cl = ClList[i];
00273             float64_t weight=Weights.vector[i];
00274             weights_set[Cl]+=weight;
00275 #ifndef MUSRECALC
00276             vec=lhs->get_feature_vector(i, vlen, vfree);
00277 
00278             for (j=0; j<dimensions; j++)
00279                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00280 
00281             lhs->free_feature_vector(vec, i, vfree);
00282 #endif
00283         }
00284 #ifndef MUSRECALC
00285         /* normalization to get the mean */
00286         for (i=0; i<k; i++)
00287         {
00288             if (weights_set[i]!=0.0)
00289             {
00290                 int32_t j;
00291                 for (j=0; j<dimensions; j++)
00292                     mus.matrix[i*dimensions+j] /= weights_set[i];
00293             }
00294         }
00295 #endif
00296     }
00297 
00298 
00299 
00300     while (changed && (iter<max_iter))
00301     {
00302         iter++;
00303         if (iter==max_iter-1)
00304             SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00305 
00306         if (iter%1000 == 0)
00307             SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00308         changed=0;
00309 
00310 #ifdef MUSRECALC
00311         /* mus=zeros(dimensions, k) ; */
00312         memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00313 
00314         for (i=0; i<XSize; i++)
00315         {
00316             int32_t j;
00317             int32_t Cl=ClList[i];
00318             float64_t weight=Weights.vector[i];
00319 
00320             vec=lhs->get_feature_vector(i, vlen, vfree);
00321 
00322             for (j=0; j<dimensions; j++)
00323                 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00324 
00325             lhs->free_feature_vector(vec, i, vfree);
00326         }
00327         for (i=0; i<k; i++)
00328         {
00329             int32_t j;
00330 
00331             if (weights_set[i]!=0.0)
00332                 for (j=0; j<dimensions; j++)
00333                     mus.matrix[i*dimensions+j] /= weights_set[i];
00334         }
00335 #endif
00336 
00337         rhs_mus->copy_feature_matrix(mus);
00338 
00339         for (i=0; i<XSize; i++)
00340         {
00341             /* ks=ceil(rand(1,XSize)*XSize) ; */
00342             const int32_t Pat= CMath::random(0, XSize-1);
00343             const int32_t ClList_Pat=ClList[Pat];
00344             int32_t imini, j;
00345             float64_t mini, weight;
00346 
00347             weight=Weights.vector[Pat];
00348 
00349             /* compute the distance of this point to all centers */
00350             for(int32_t idx_k=0;idx_k<k;idx_k++)
00351                 dists[idx_k]=distance->distance(Pat,idx_k);
00352 
00353             /* [mini,imini]=min(dists(:,i)) ; */
00354             imini=0 ; mini=dists[0];
00355             for (j=1; j<k; j++)
00356                 if (dists[j]<mini)
00357                 {
00358                     mini=dists[j];
00359                     imini=j;
00360                 }
00361 
00362             if (imini!=ClList_Pat)
00363             {
00364                 changed= changed + 1;
00365 
00366                 /* weights_set(imini) = weights_set(imini) + weight ; */
00367                 weights_set[imini]+= weight;
00368                 /* weights_set(j)     = weights_set(j)     - weight ; */
00369                 weights_set[ClList_Pat]-= weight;
00370 
00371                 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00372 
00373                 for (j=0; j<dimensions; j++)
00374                 {
00375                     mus.matrix[imini*dimensions+j]-=(vec[j]
00376                             -mus.matrix[imini*dimensions+j])
00377                             *(weight/weights_set[imini]);
00378                 }
00379 
00380                 lhs->free_feature_vector(vec, Pat, vfree);
00381 
00382                 /* mu_new = mu_old - (x - mu_old)/(n-1) */
00383                 /* if weights_set(j)~=0 */
00384                 if (weights_set[ClList_Pat]!=0.0)
00385                 {
00386                     vec=lhs->get_feature_vector(Pat, vlen, vfree);
00387 
00388                     for (j=0; j<dimensions; j++)
00389                     {
00390                         mus.matrix[ClList_Pat*dimensions+j]-=
00391                                 (vec[j]
00392                                         -mus.matrix[ClList_Pat
00393                                                 *dimensions+j])
00394                                         *(weight/weights_set[ClList_Pat]);
00395                     }
00396                     lhs->free_feature_vector(vec, Pat, vfree);
00397                 }
00398                 else
00399                     /*  mus(:,j)=zeros(dimensions,1) ; */
00400                     for (j=0; j<dimensions; j++)
00401                         mus.matrix[ClList_Pat*dimensions+j]=0;
00402 
00403                 /* ClList(i)= imini ; */
00404                 ClList[Pat] = imini;
00405             }
00406         }
00407     }
00408 
00409     /* compute the ,,variances'' of the clusters */
00410     for (i=0; i<k; i++)
00411     {
00412         float64_t rmin1=0;
00413         float64_t rmin2=0;
00414 
00415         bool first_round=true;
00416 
00417         for (int32_t j=0; j<k; j++)
00418         {
00419             if (j!=i)
00420             {
00421                 int32_t l;
00422                 float64_t dist = 0;
00423 
00424                 for (l=0; l<dimensions; l++)
00425                 {
00426                     dist+=CMath::sq(
00427                             mus.matrix[i*dimensions+l]
00428                                     -mus.matrix[j*dimensions+l]);
00429                 }
00430 
00431                 if (first_round)
00432                 {
00433                     rmin1=dist;
00434                     rmin2=dist;
00435                     first_round=false;
00436                 }
00437                 else
00438                 {
00439                     if ((dist<rmin2) && (dist>=rmin1))
00440                         rmin2=dist;
00441 
00442                     if (dist<rmin1)
00443                     {
00444                         rmin2=rmin1;
00445                         rmin1=dist;
00446                     }
00447                 }
00448             }
00449         }
00450 
00451         R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
00452     }
00453 
00454     distance->replace_rhs(rhs_cache);
00455     delete rhs_mus;
00456     SG_FREE(ClList);
00457     SG_FREE(weights_set);
00458     SG_FREE(dists);
00459     SG_UNREF(lhs);
00460 }
00461 
00462 void CKMeans::store_model_features()
00463 {
00464     /* set lhs of underlying distance to cluster centers */
00465     CDenseFeatures<float64_t>* cluster_centers=new CDenseFeatures<float64_t>(
00466             mus);
00467 
00468     /* store cluster centers in lhs of distance variable */
00469     CFeatures* rhs=distance->get_rhs();
00470     distance->init(cluster_centers, rhs);
00471     SG_UNREF(rhs);
00472 }
00473 
00474 void CKMeans::init()
00475 {
00476     max_iter=10000;
00477     k=3;
00478     dimensions=0;
00479 
00480     SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
00481     SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
00482     SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
00483     SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
00484 }
00485 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation