00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/clustering/KMeans.h>
00013 #include <shogun/distance/Distance.h>
00014 #include <shogun/features/Labels.h>
00015 #include <shogun/features/SimpleFeatures.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/base/Parallel.h>
00018
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 using namespace shogun;
00028
00029 CKMeans::CKMeans()
00030 : CDistanceMachine()
00031 {
00032 init();
00033 }
00034
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine()
00037 {
00038 init();
00039 k=k_;
00040 set_distance(d);
00041 }
00042
00043 CKMeans::~CKMeans()
00044 {
00045 R.destroy_vector();
00046 }
00047
00048 bool CKMeans::train_machine(CFeatures* data)
00049 {
00050 ASSERT(distance);
00051
00052 if (data)
00053 distance->init(data, data);
00054
00055 ASSERT(distance->get_feature_type()==F_DREAL);
00056
00057 CSimpleFeatures<float64_t>* lhs=
00058 (CSimpleFeatures<float64_t>*)distance->get_lhs();
00059 ASSERT(lhs);
00060 int32_t num=lhs->get_num_vectors();
00061 SG_UNREF(lhs);
00062
00063 Weights=SGVector<float64_t>(num);
00064 for (int32_t i=0; i<num; i++)
00065 Weights.vector[i]=1.0;
00066
00067 clustknb(false, NULL);
00068 Weights.destroy_vector();
00069
00070 return true;
00071 }
00072
00073 bool CKMeans::load(FILE* srcfile)
00074 {
00075 SG_SET_LOCALE_C;
00076 SG_RESET_LOCALE;
00077 return false;
00078 }
00079
00080 bool CKMeans::save(FILE* dstfile)
00081 {
00082 SG_SET_LOCALE_C;
00083 SG_RESET_LOCALE;
00084 return false;
00085 }
00086
00087
00088 void CKMeans::set_k(int32_t p_k)
00089 {
00090 ASSERT(p_k>0);
00091 this->k=p_k;
00092 }
00093
00094 int32_t CKMeans::get_k()
00095 {
00096 return k;
00097 }
00098
00099 void CKMeans::set_max_iter(int32_t iter)
00100 {
00101 ASSERT(iter>0);
00102 max_iter=iter;
00103 }
00104
00105 float64_t CKMeans::get_max_iter()
00106 {
00107 return max_iter;
00108 }
00109
00110 SGVector<float64_t> CKMeans::get_radiuses()
00111 {
00112 return R;
00113 }
00114
00115 SGMatrix<float64_t> CKMeans::get_cluster_centers()
00116 {
00117 if (!R.vector)
00118 return SGMatrix<float64_t>();
00119
00120 CSimpleFeatures<float64_t>* lhs=
00121 (CSimpleFeatures<float64_t>*)distance->get_lhs();
00122 SGMatrix<float64_t> centers=lhs->get_feature_matrix();
00123 SG_UNREF(lhs);
00124 return centers;
00125 }
00126
00127 int32_t CKMeans::get_dimensions()
00128 {
00129 return dimensions;
00130 }
00131
00132 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00133 struct thread_data
00134 {
00135 float64_t* x;
00136 CSimpleFeatures<float64_t>* y;
00137 float64_t* z;
00138 int32_t n1, n2, m;
00139 int32_t js, je;
00140 int32_t offs;
00141 };
00142 #endif // DOXYGEN_SHOULD_SKIP_THIS
00143
00144 namespace shogun
00145 {
00146 void *sqdist_thread_func(void * P)
00147 {
00148 struct thread_data *TD=(struct thread_data*) P;
00149 float64_t* x=TD->x;
00150 CSimpleFeatures<float64_t>* y=TD->y;
00151 float64_t* z=TD->z;
00152 int32_t n1=TD->n1,
00153 m=TD->m,
00154 js=TD->js,
00155 je=TD->je,
00156 offs=TD->offs,
00157 j,i,k;
00158
00159 for (j=js; j<je; j++)
00160 {
00161 int32_t vlen=0;
00162 bool vfree=false;
00163 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00164
00165 for (i=0; i<n1; i++)
00166 {
00167 float64_t sum=0;
00168 for (k=0; k<m; k++)
00169 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00170 z[j*n1 + i] = sum;
00171 }
00172
00173 y->free_feature_vector(vec, j, vfree);
00174 }
00175 return NULL;
00176 }
00177 }
00178
00179 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00180 {
00181 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00182 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00183 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00184
00185 int32_t XSize=lhs->get_num_vectors();
00186 dimensions=lhs->get_num_features();
00187 int32_t i, changed=1;
00188 const int32_t XDimk=dimensions*k;
00189 int32_t iter=0;
00190
00191 R.destroy_vector();
00192 R=SGVector<float64_t>(k);
00193
00194 mus=SGMatrix<float64_t>(dimensions, k);
00195
00196 int32_t *ClList=SG_CALLOC(int32_t, XSize);
00197 float64_t *weights_set=SG_CALLOC(float64_t, k);
00198 float64_t *dists=SG_CALLOC(float64_t, k*XSize);
00199
00201 CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00202 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00203
00204 int32_t vlen=0;
00205 bool vfree=false;
00206 float64_t* vec=NULL;
00207
00208
00209 memset(ClList, 0, sizeof(int32_t)*XSize);
00210
00211 memset(weights_set, 0, sizeof(float64_t)*k);
00212
00213
00214 memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00215
00216 if (!use_old_mus)
00217 {
00218 for (i=0; i<XSize; i++)
00219 {
00220 const int32_t Cl=CMath::random(0, k-1);
00221 int32_t j;
00222 float64_t weight=Weights.vector[i];
00223
00224 weights_set[Cl]+=weight;
00225 ClList[i]=Cl;
00226
00227 vec=lhs->get_feature_vector(i, vlen, vfree);
00228
00229 for (j=0; j<dimensions; j++)
00230 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00231
00232 lhs->free_feature_vector(vec, i, vfree);
00233 }
00234 for (i=0; i<k; i++)
00235 {
00236 int32_t j;
00237
00238 if (weights_set[i]!=0.0)
00239 for (j=0; j<dimensions; j++)
00240 mus.matrix[i*dimensions+j] /= weights_set[i];
00241 }
00242 }
00243 else
00244 {
00245 ASSERT(mus_start);
00246
00248 rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
00249 float64_t* p_dists=dists;
00250
00251 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00252 distances_rhs(p_dists,0,k,idx);
00253 p_dists=NULL;
00254
00255 for (i=0; i<XSize; i++)
00256 {
00257 float64_t mini=dists[i*k];
00258 int32_t Cl = 0, j;
00259
00260 for (j=1; j<k; j++)
00261 {
00262 if (dists[i*k+j]<mini)
00263 {
00264 Cl=j;
00265 mini=dists[i*k+j];
00266 }
00267 }
00268 ClList[i]=Cl;
00269 }
00270
00271
00272
00273 for (i=0; i<XSize; i++)
00274 {
00275 const int32_t Cl = ClList[i];
00276 float64_t weight=Weights.vector[i];
00277 weights_set[Cl]+=weight;
00278 #ifndef MUSRECALC
00279 vec=lhs->get_feature_vector(i, vlen, vfree);
00280
00281 for (j=0; j<dimensions; j++)
00282 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00283
00284 lhs->free_feature_vector(vec, i, vfree);
00285 #endif
00286 }
00287 #ifndef MUSRECALC
00288
00289 for (i=0; i<k; i++)
00290 {
00291 if (weights_set[i]!=0.0)
00292 {
00293 int32_t j;
00294 for (j=0; j<dimensions; j++)
00295 mus.matrix[i*dimensions+j] /= weights_set[i];
00296 }
00297 }
00298 #endif
00299 }
00300
00301
00302
00303 while (changed && (iter<max_iter))
00304 {
00305 iter++;
00306 if (iter==max_iter-1)
00307 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00308
00309 if (iter%1000 == 0)
00310 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00311 changed=0;
00312
00313 #ifdef MUSRECALC
00314
00315 memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00316
00317 for (i=0; i<XSize; i++)
00318 {
00319 int32_t j;
00320 int32_t Cl=ClList[i];
00321 float64_t weight=Weights.vector[i];
00322
00323 vec=lhs->get_feature_vector(i, vlen, vfree);
00324
00325 for (j=0; j<dimensions; j++)
00326 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00327
00328 lhs->free_feature_vector(vec, i, vfree);
00329 }
00330 for (i=0; i<k; i++)
00331 {
00332 int32_t j;
00333
00334 if (weights_set[i]!=0.0)
00335 for (j=0; j<dimensions; j++)
00336 mus.matrix[i*dimensions+j] /= weights_set[i];
00337 }
00338 #endif
00339
00340 rhs_mus->copy_feature_matrix(mus);
00341
00342 for (i=0; i<XSize; i++)
00343 {
00344
00345 const int32_t Pat= CMath::random(0, XSize-1);
00346 const int32_t ClList_Pat=ClList[Pat];
00347 int32_t imini, j;
00348 float64_t mini, weight;
00349
00350 weight=Weights.vector[Pat];
00351
00352
00353 for(int32_t idx_k=0;idx_k<k;idx_k++)
00354 dists[idx_k]=distance->distance(Pat,idx_k);
00355
00356
00357 imini=0 ; mini=dists[0];
00358 for (j=1; j<k; j++)
00359 if (dists[j]<mini)
00360 {
00361 mini=dists[j];
00362 imini=j;
00363 }
00364
00365 if (imini!=ClList_Pat)
00366 {
00367 changed= changed + 1;
00368
00369
00370 weights_set[imini]+= weight;
00371
00372 weights_set[ClList_Pat]-= weight;
00373
00374 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00375
00376 for (j=0; j<dimensions; j++)
00377 {
00378 mus.matrix[imini*dimensions+j]-=(vec[j]
00379 -mus.matrix[imini*dimensions+j])
00380 *(weight/weights_set[imini]);
00381 }
00382
00383 lhs->free_feature_vector(vec, Pat, vfree);
00384
00385
00386
00387 if (weights_set[ClList_Pat]!=0.0)
00388 {
00389 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00390
00391 for (j=0; j<dimensions; j++)
00392 {
00393 mus.matrix[ClList_Pat*dimensions+j]-=
00394 (vec[j]
00395 -mus.matrix[ClList_Pat
00396 *dimensions+j])
00397 *(weight/weights_set[ClList_Pat]);
00398 }
00399 lhs->free_feature_vector(vec, Pat, vfree);
00400 }
00401 else
00402
00403 for (j=0; j<dimensions; j++)
00404 mus.matrix[ClList_Pat*dimensions+j]=0;
00405
00406
00407 ClList[Pat] = imini;
00408 }
00409 }
00410 }
00411
00412
00413 for (i=0; i<k; i++)
00414 {
00415 float64_t rmin1=0;
00416 float64_t rmin2=0;
00417
00418 bool first_round=true;
00419
00420 for (int32_t j=0; j<k; j++)
00421 {
00422 if (j!=i)
00423 {
00424 int32_t l;
00425 float64_t dist = 0;
00426
00427 for (l=0; l<dimensions; l++)
00428 {
00429 dist+=CMath::sq(
00430 mus.matrix[i*dimensions+l]
00431 -mus.matrix[j*dimensions+l]);
00432 }
00433
00434 if (first_round)
00435 {
00436 rmin1=dist;
00437 rmin2=dist;
00438 first_round=false;
00439 }
00440 else
00441 {
00442 if ((dist<rmin2) && (dist>=rmin1))
00443 rmin2=dist;
00444
00445 if (dist<rmin1)
00446 {
00447 rmin2=rmin1;
00448 rmin1=dist;
00449 }
00450 }
00451 }
00452 }
00453
00454 R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
00455 }
00456 distance->replace_rhs(rhs_cache);
00457 delete rhs_mus;
00458 SG_FREE(ClList);
00459 SG_FREE(weights_set);
00460 SG_FREE(dists);
00461 SG_UNREF(lhs);
00462 }
00463
00464 void CKMeans::store_model_features()
00465 {
00466
00467 CSimpleFeatures<float64_t>* cluster_centers=new CSimpleFeatures<float64_t>(
00468 mus);
00469
00470
00471 mus.do_free=false;
00472 mus.free_matrix();
00473
00474
00475 CFeatures* rhs=distance->get_rhs();
00476 distance->init(cluster_centers, rhs);
00477 SG_UNREF(rhs);
00478 }
00479
00480 void CKMeans::init()
00481 {
00482 max_iter=10000;
00483 k=3;
00484 dimensions=0;
00485
00486 m_parameters->add(&max_iter, "max_iter", "Maximum number of iterations");
00487 m_parameters->add(&k, "k", "Parameter k");
00488 m_parameters->add(&dimensions, "dimensions", "Dimensions of data");
00489 m_parameters->add(&R, "R", "Cluster radiuses");
00490 }
00491