00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/clustering/KMeans.h>
00013 #include <shogun/distance/Distance.h>
00014 #include <shogun/labels/Labels.h>
00015 #include <shogun/features/DenseFeatures.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/base/Parallel.h>
00018
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 using namespace shogun;
00028
00029 CKMeans::CKMeans()
00030 : CDistanceMachine()
00031 {
00032 init();
00033 }
00034
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine()
00037 {
00038 init();
00039 k=k_;
00040 set_distance(d);
00041 }
00042
00043 CKMeans::~CKMeans()
00044 {
00045 }
00046
00047 bool CKMeans::train_machine(CFeatures* data)
00048 {
00049 ASSERT(distance);
00050
00051 if (data)
00052 distance->init(data, data);
00053
00054 ASSERT(distance->get_feature_type()==F_DREAL);
00055
00056 CDenseFeatures<float64_t>* lhs=
00057 (CDenseFeatures<float64_t>*)distance->get_lhs();
00058 ASSERT(lhs);
00059 int32_t num=lhs->get_num_vectors();
00060 SG_UNREF(lhs);
00061
00062 Weights=SGVector<float64_t>(num);
00063 for (int32_t i=0; i<num; i++)
00064 Weights.vector[i]=1.0;
00065
00066 clustknb(false, NULL);
00067
00068 return true;
00069 }
00070
00071 bool CKMeans::load(FILE* srcfile)
00072 {
00073 SG_SET_LOCALE_C;
00074 SG_RESET_LOCALE;
00075 return false;
00076 }
00077
00078 bool CKMeans::save(FILE* dstfile)
00079 {
00080 SG_SET_LOCALE_C;
00081 SG_RESET_LOCALE;
00082 return false;
00083 }
00084
00085
00086 void CKMeans::set_k(int32_t p_k)
00087 {
00088 ASSERT(p_k>0);
00089 this->k=p_k;
00090 }
00091
00092 int32_t CKMeans::get_k()
00093 {
00094 return k;
00095 }
00096
00097 void CKMeans::set_max_iter(int32_t iter)
00098 {
00099 ASSERT(iter>0);
00100 max_iter=iter;
00101 }
00102
00103 float64_t CKMeans::get_max_iter()
00104 {
00105 return max_iter;
00106 }
00107
00108 SGVector<float64_t> CKMeans::get_radiuses()
00109 {
00110 return R;
00111 }
00112
00113 SGMatrix<float64_t> CKMeans::get_cluster_centers()
00114 {
00115 if (!R.vector)
00116 return SGMatrix<float64_t>();
00117
00118 CDenseFeatures<float64_t>* lhs=
00119 (CDenseFeatures<float64_t>*)distance->get_lhs();
00120 SGMatrix<float64_t> centers=lhs->get_feature_matrix();
00121 SG_UNREF(lhs);
00122 return centers;
00123 }
00124
00125 int32_t CKMeans::get_dimensions()
00126 {
00127 return dimensions;
00128 }
00129
00130 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00131 struct thread_data
00132 {
00133 float64_t* x;
00134 CDenseFeatures<float64_t>* y;
00135 float64_t* z;
00136 int32_t n1, n2, m;
00137 int32_t js, je;
00138 int32_t offs;
00139 };
00140 #endif // DOXYGEN_SHOULD_SKIP_THIS
00141
00142 namespace shogun
00143 {
00144 void *sqdist_thread_func(void * P)
00145 {
00146 struct thread_data *TD=(struct thread_data*) P;
00147 float64_t* x=TD->x;
00148 CDenseFeatures<float64_t>* y=TD->y;
00149 float64_t* z=TD->z;
00150 int32_t n1=TD->n1,
00151 m=TD->m,
00152 js=TD->js,
00153 je=TD->je,
00154 offs=TD->offs,
00155 j,i,k;
00156
00157 for (j=js; j<je; j++)
00158 {
00159 int32_t vlen=0;
00160 bool vfree=false;
00161 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00162
00163 for (i=0; i<n1; i++)
00164 {
00165 float64_t sum=0;
00166 for (k=0; k<m; k++)
00167 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00168 z[j*n1 + i] = sum;
00169 }
00170
00171 y->free_feature_vector(vec, j, vfree);
00172 }
00173 return NULL;
00174 }
00175 }
00176
00177 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00178 {
00179 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00180 CDenseFeatures<float64_t>* lhs = (CDenseFeatures<float64_t>*) distance->get_lhs();
00181 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00182
00183 int32_t XSize=lhs->get_num_vectors();
00184 dimensions=lhs->get_num_features();
00185 int32_t i, changed=1;
00186 const int32_t XDimk=dimensions*k;
00187 int32_t iter=0;
00188
00189 R=SGVector<float64_t>(k);
00190
00191 mus=SGMatrix<float64_t>(dimensions, k);
00192
00193 int32_t *ClList=SG_CALLOC(int32_t, XSize);
00194 float64_t *weights_set=SG_CALLOC(float64_t, k);
00195 float64_t *dists=SG_CALLOC(float64_t, k*XSize);
00196
00198 CDenseFeatures<float64_t>* rhs_mus = new CDenseFeatures<float64_t>(0);
00199 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00200
00201 int32_t vlen=0;
00202 bool vfree=false;
00203 float64_t* vec=NULL;
00204
00205
00206 memset(ClList, 0, sizeof(int32_t)*XSize);
00207
00208 memset(weights_set, 0, sizeof(float64_t)*k);
00209
00210
00211 memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00212
00213 if (!use_old_mus)
00214 {
00215 for (i=0; i<XSize; i++)
00216 {
00217 const int32_t Cl=CMath::random(0, k-1);
00218 int32_t j;
00219 float64_t weight=Weights.vector[i];
00220
00221 weights_set[Cl]+=weight;
00222 ClList[i]=Cl;
00223
00224 vec=lhs->get_feature_vector(i, vlen, vfree);
00225
00226 for (j=0; j<dimensions; j++)
00227 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00228
00229 lhs->free_feature_vector(vec, i, vfree);
00230 }
00231 for (i=0; i<k; i++)
00232 {
00233 int32_t j;
00234
00235 if (weights_set[i]!=0.0)
00236 for (j=0; j<dimensions; j++)
00237 mus.matrix[i*dimensions+j] /= weights_set[i];
00238 }
00239 }
00240 else
00241 {
00242 ASSERT(mus_start);
00243
00245 rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
00246 float64_t* p_dists=dists;
00247
00248 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00249 distances_rhs(p_dists,0,k,idx);
00250 p_dists=NULL;
00251
00252 for (i=0; i<XSize; i++)
00253 {
00254 float64_t mini=dists[i*k];
00255 int32_t Cl = 0, j;
00256
00257 for (j=1; j<k; j++)
00258 {
00259 if (dists[i*k+j]<mini)
00260 {
00261 Cl=j;
00262 mini=dists[i*k+j];
00263 }
00264 }
00265 ClList[i]=Cl;
00266 }
00267
00268
00269
00270 for (i=0; i<XSize; i++)
00271 {
00272 const int32_t Cl = ClList[i];
00273 float64_t weight=Weights.vector[i];
00274 weights_set[Cl]+=weight;
00275 #ifndef MUSRECALC
00276 vec=lhs->get_feature_vector(i, vlen, vfree);
00277
00278 for (j=0; j<dimensions; j++)
00279 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00280
00281 lhs->free_feature_vector(vec, i, vfree);
00282 #endif
00283 }
00284 #ifndef MUSRECALC
00285
00286 for (i=0; i<k; i++)
00287 {
00288 if (weights_set[i]!=0.0)
00289 {
00290 int32_t j;
00291 for (j=0; j<dimensions; j++)
00292 mus.matrix[i*dimensions+j] /= weights_set[i];
00293 }
00294 }
00295 #endif
00296 }
00297
00298
00299
00300 while (changed && (iter<max_iter))
00301 {
00302 iter++;
00303 if (iter==max_iter-1)
00304 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00305
00306 if (iter%1000 == 0)
00307 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00308 changed=0;
00309
00310 #ifdef MUSRECALC
00311
00312 memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00313
00314 for (i=0; i<XSize; i++)
00315 {
00316 int32_t j;
00317 int32_t Cl=ClList[i];
00318 float64_t weight=Weights.vector[i];
00319
00320 vec=lhs->get_feature_vector(i, vlen, vfree);
00321
00322 for (j=0; j<dimensions; j++)
00323 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00324
00325 lhs->free_feature_vector(vec, i, vfree);
00326 }
00327 for (i=0; i<k; i++)
00328 {
00329 int32_t j;
00330
00331 if (weights_set[i]!=0.0)
00332 for (j=0; j<dimensions; j++)
00333 mus.matrix[i*dimensions+j] /= weights_set[i];
00334 }
00335 #endif
00336
00337 rhs_mus->copy_feature_matrix(mus);
00338
00339 for (i=0; i<XSize; i++)
00340 {
00341
00342 const int32_t Pat= CMath::random(0, XSize-1);
00343 const int32_t ClList_Pat=ClList[Pat];
00344 int32_t imini, j;
00345 float64_t mini, weight;
00346
00347 weight=Weights.vector[Pat];
00348
00349
00350 for(int32_t idx_k=0;idx_k<k;idx_k++)
00351 dists[idx_k]=distance->distance(Pat,idx_k);
00352
00353
00354 imini=0 ; mini=dists[0];
00355 for (j=1; j<k; j++)
00356 if (dists[j]<mini)
00357 {
00358 mini=dists[j];
00359 imini=j;
00360 }
00361
00362 if (imini!=ClList_Pat)
00363 {
00364 changed= changed + 1;
00365
00366
00367 weights_set[imini]+= weight;
00368
00369 weights_set[ClList_Pat]-= weight;
00370
00371 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00372
00373 for (j=0; j<dimensions; j++)
00374 {
00375 mus.matrix[imini*dimensions+j]-=(vec[j]
00376 -mus.matrix[imini*dimensions+j])
00377 *(weight/weights_set[imini]);
00378 }
00379
00380 lhs->free_feature_vector(vec, Pat, vfree);
00381
00382
00383
00384 if (weights_set[ClList_Pat]!=0.0)
00385 {
00386 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00387
00388 for (j=0; j<dimensions; j++)
00389 {
00390 mus.matrix[ClList_Pat*dimensions+j]-=
00391 (vec[j]
00392 -mus.matrix[ClList_Pat
00393 *dimensions+j])
00394 *(weight/weights_set[ClList_Pat]);
00395 }
00396 lhs->free_feature_vector(vec, Pat, vfree);
00397 }
00398 else
00399
00400 for (j=0; j<dimensions; j++)
00401 mus.matrix[ClList_Pat*dimensions+j]=0;
00402
00403
00404 ClList[Pat] = imini;
00405 }
00406 }
00407 }
00408
00409
00410 for (i=0; i<k; i++)
00411 {
00412 float64_t rmin1=0;
00413 float64_t rmin2=0;
00414
00415 bool first_round=true;
00416
00417 for (int32_t j=0; j<k; j++)
00418 {
00419 if (j!=i)
00420 {
00421 int32_t l;
00422 float64_t dist = 0;
00423
00424 for (l=0; l<dimensions; l++)
00425 {
00426 dist+=CMath::sq(
00427 mus.matrix[i*dimensions+l]
00428 -mus.matrix[j*dimensions+l]);
00429 }
00430
00431 if (first_round)
00432 {
00433 rmin1=dist;
00434 rmin2=dist;
00435 first_round=false;
00436 }
00437 else
00438 {
00439 if ((dist<rmin2) && (dist>=rmin1))
00440 rmin2=dist;
00441
00442 if (dist<rmin1)
00443 {
00444 rmin2=rmin1;
00445 rmin1=dist;
00446 }
00447 }
00448 }
00449 }
00450
00451 R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
00452 }
00453
00454 distance->replace_rhs(rhs_cache);
00455 delete rhs_mus;
00456 SG_FREE(ClList);
00457 SG_FREE(weights_set);
00458 SG_FREE(dists);
00459 SG_UNREF(lhs);
00460 }
00461
00462 void CKMeans::store_model_features()
00463 {
00464
00465 CDenseFeatures<float64_t>* cluster_centers=new CDenseFeatures<float64_t>(
00466 mus);
00467
00468
00469 CFeatures* rhs=distance->get_rhs();
00470 distance->init(cluster_centers, rhs);
00471 SG_UNREF(rhs);
00472 }
00473
00474 void CKMeans::init()
00475 {
00476 max_iter=10000;
00477 k=3;
00478 dimensions=0;
00479
00480 SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
00481 SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
00482 SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
00483 SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
00484 }
00485