00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/clustering/KMeans.h>
00013 #include <shogun/distance/Distance.h>
00014 #include <shogun/features/Labels.h>
00015 #include <shogun/features/SimpleFeatures.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/base/Parallel.h>
00018
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 using namespace shogun;
00028
00029 CKMeans::CKMeans()
00030 : CDistanceMachine()
00031 {
00032 init();
00033 }
00034
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine()
00037 {
00038 init();
00039 k=k_;
00040 set_distance(d);
00041 }
00042
00043 CKMeans::~CKMeans()
00044 {
00045 R.destroy_vector();
00046 }
00047
00048 bool CKMeans::train_machine(CFeatures* data)
00049 {
00050 ASSERT(distance);
00051
00052 if (data)
00053 distance->init(data, data);
00054
00055 ASSERT(distance->get_feature_type()==F_DREAL);
00056
00057 CSimpleFeatures<float64_t>* lhs=
00058 (CSimpleFeatures<float64_t>*)distance->get_lhs();
00059 ASSERT(lhs);
00060 int32_t num=lhs->get_num_vectors();
00061 SG_UNREF(lhs);
00062
00063 Weights=SGVector<float64_t>(num);
00064 for (int32_t i=0; i<num; i++)
00065 Weights.vector[i]=1.0;
00066
00067 clustknb(false, NULL);
00068 Weights.destroy_vector();
00069
00070 return true;
00071 }
00072
00073 bool CKMeans::load(FILE* srcfile)
00074 {
00075 SG_SET_LOCALE_C;
00076 SG_RESET_LOCALE;
00077 return false;
00078 }
00079
00080 bool CKMeans::save(FILE* dstfile)
00081 {
00082 SG_SET_LOCALE_C;
00083 SG_RESET_LOCALE;
00084 return false;
00085 }
00086
00087 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00088 struct thread_data
00089 {
00090 float64_t* x;
00091 CSimpleFeatures<float64_t>* y;
00092 float64_t* z;
00093 int32_t n1, n2, m;
00094 int32_t js, je;
00095 int32_t offs;
00096 };
00097 #endif // DOXYGEN_SHOULD_SKIP_THIS
00098
00099 namespace shogun
00100 {
00101 void *sqdist_thread_func(void * P)
00102 {
00103 struct thread_data *TD=(struct thread_data*) P;
00104 float64_t* x=TD->x;
00105 CSimpleFeatures<float64_t>* y=TD->y;
00106 float64_t* z=TD->z;
00107 int32_t n1=TD->n1,
00108 m=TD->m,
00109 js=TD->js,
00110 je=TD->je,
00111 offs=TD->offs,
00112 j,i,k;
00113
00114 for (j=js; j<je; j++)
00115 {
00116 int32_t vlen=0;
00117 bool vfree=false;
00118 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00119
00120 for (i=0; i<n1; i++)
00121 {
00122 float64_t sum=0;
00123 for (k=0; k<m; k++)
00124 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00125 z[j*n1 + i] = sum;
00126 }
00127
00128 y->free_feature_vector(vec, j, vfree);
00129 }
00130 return NULL;
00131 }
00132 }
00133
00134 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00135 {
00136 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00137 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00138 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00139
00140 int32_t XSize=lhs->get_num_vectors();
00141 dimensions=lhs->get_num_features();
00142 int32_t i, changed=1;
00143 const int32_t XDimk=dimensions*k;
00144 int32_t iter=0;
00145
00146 R.destroy_vector();
00147 R=SGVector<float64_t>(k);
00148
00149 mus=SGMatrix<float64_t>(dimensions, k);
00150
00151 int32_t *ClList=SG_CALLOC(int32_t, XSize);
00152 float64_t *weights_set=SG_CALLOC(float64_t, k);
00153 float64_t *dists=SG_CALLOC(float64_t, k*XSize);
00154
00156 CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00157 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00158
00159 int32_t vlen=0;
00160 bool vfree=false;
00161 float64_t* vec=NULL;
00162
00163
00164 memset(ClList, 0, sizeof(int32_t)*XSize);
00165
00166 memset(weights_set, 0, sizeof(float64_t)*k);
00167
00168
00169 memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00170
00171 if (!use_old_mus)
00172 {
00173 for (i=0; i<XSize; i++)
00174 {
00175 const int32_t Cl=CMath::random(0, k-1);
00176 int32_t j;
00177 float64_t weight=Weights.vector[i];
00178
00179 weights_set[Cl]+=weight;
00180 ClList[i]=Cl;
00181
00182 vec=lhs->get_feature_vector(i, vlen, vfree);
00183
00184 for (j=0; j<dimensions; j++)
00185 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00186
00187 lhs->free_feature_vector(vec, i, vfree);
00188 }
00189 for (i=0; i<k; i++)
00190 {
00191 int32_t j;
00192
00193 if (weights_set[i]!=0.0)
00194 for (j=0; j<dimensions; j++)
00195 mus.matrix[i*dimensions+j] /= weights_set[i];
00196 }
00197 }
00198 else
00199 {
00200 ASSERT(mus_start);
00201
00203 rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
00204 float64_t* p_dists=dists;
00205
00206 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00207 distances_rhs(p_dists,0,k,idx);
00208 p_dists=NULL;
00209
00210 for (i=0; i<XSize; i++)
00211 {
00212 float64_t mini=dists[i*k];
00213 int32_t Cl = 0, j;
00214
00215 for (j=1; j<k; j++)
00216 {
00217 if (dists[i*k+j]<mini)
00218 {
00219 Cl=j;
00220 mini=dists[i*k+j];
00221 }
00222 }
00223 ClList[i]=Cl;
00224 }
00225
00226
00227
00228 for (i=0; i<XSize; i++)
00229 {
00230 const int32_t Cl = ClList[i];
00231 float64_t weight=Weights.vector[i];
00232 weights_set[Cl]+=weight;
00233 #ifndef MUSRECALC
00234 vec=lhs->get_feature_vector(i, vlen, vfree);
00235
00236 for (j=0; j<dimensions; j++)
00237 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00238
00239 lhs->free_feature_vector(vec, i, vfree);
00240 #endif
00241 }
00242 #ifndef MUSRECALC
00243
00244 for (i=0; i<k; i++)
00245 {
00246 if (weights_set[i]!=0.0)
00247 {
00248 int32_t j;
00249 for (j=0; j<dimensions; j++)
00250 mus.matrix[i*dimensions+j] /= weights_set[i];
00251 }
00252 }
00253 #endif
00254 }
00255
00256
00257
00258 while (changed && (iter<max_iter))
00259 {
00260 iter++;
00261 if (iter==max_iter-1)
00262 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00263
00264 if (iter%1000 == 0)
00265 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00266 changed=0;
00267
00268 #ifdef MUSRECALC
00269
00270 memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00271
00272 for (i=0; i<XSize; i++)
00273 {
00274 int32_t j;
00275 int32_t Cl=ClList[i];
00276 float64_t weight=Weights.vector[i];
00277
00278 vec=lhs->get_feature_vector(i, vlen, vfree);
00279
00280 for (j=0; j<dimensions; j++)
00281 mus.matrix[Cl*dimensions+j] += weight*vec[j];
00282
00283 lhs->free_feature_vector(vec, i, vfree);
00284 }
00285 for (i=0; i<k; i++)
00286 {
00287 int32_t j;
00288
00289 if (weights_set[i]!=0.0)
00290 for (j=0; j<dimensions; j++)
00291 mus.matrix[i*dimensions+j] /= weights_set[i];
00292 }
00293 #endif
00294
00295 rhs_mus->copy_feature_matrix(mus);
00296
00297 for (i=0; i<XSize; i++)
00298 {
00299
00300 const int32_t Pat= CMath::random(0, XSize-1);
00301 const int32_t ClList_Pat=ClList[Pat];
00302 int32_t imini, j;
00303 float64_t mini, weight;
00304
00305 weight=Weights.vector[Pat];
00306
00307
00308 for(int32_t idx_k=0;idx_k<k;idx_k++)
00309 dists[idx_k]=distance->distance(Pat,idx_k);
00310
00311
00312 imini=0 ; mini=dists[0];
00313 for (j=1; j<k; j++)
00314 if (dists[j]<mini)
00315 {
00316 mini=dists[j];
00317 imini=j;
00318 }
00319
00320 if (imini!=ClList_Pat)
00321 {
00322 changed= changed + 1;
00323
00324
00325 weights_set[imini]+= weight;
00326
00327 weights_set[ClList_Pat]-= weight;
00328
00329 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00330
00331 for (j=0; j<dimensions; j++)
00332 {
00333 mus.matrix[imini*dimensions+j]-=(vec[j]
00334 -mus.matrix[imini*dimensions+j])
00335 *(weight/weights_set[imini]);
00336 }
00337
00338 lhs->free_feature_vector(vec, Pat, vfree);
00339
00340
00341
00342 if (weights_set[ClList_Pat]!=0.0)
00343 {
00344 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00345
00346 for (j=0; j<dimensions; j++)
00347 {
00348 mus.matrix[ClList_Pat*dimensions+j]-=
00349 (vec[j]
00350 -mus.matrix[ClList_Pat
00351 *dimensions+j])
00352 *(weight/weights_set[ClList_Pat]);
00353 }
00354 lhs->free_feature_vector(vec, Pat, vfree);
00355 }
00356 else
00357
00358 for (j=0; j<dimensions; j++)
00359 mus.matrix[ClList_Pat*dimensions+j]=0;
00360
00361
00362 ClList[Pat] = imini;
00363 }
00364 }
00365 }
00366
00367
00368 for (i=0; i<k; i++)
00369 {
00370 float64_t rmin1=0;
00371 float64_t rmin2=0;
00372
00373 bool first_round=true;
00374
00375 for (int32_t j=0; j<k; j++)
00376 {
00377 if (j!=i)
00378 {
00379 int32_t l;
00380 float64_t dist = 0;
00381
00382 for (l=0; l<dimensions; l++)
00383 {
00384 dist+=CMath::sq(
00385 mus.matrix[i*dimensions+l]
00386 -mus.matrix[j*dimensions+l]);
00387 }
00388
00389 if (first_round)
00390 {
00391 rmin1=dist;
00392 rmin2=dist;
00393 first_round=false;
00394 }
00395 else
00396 {
00397 if ((dist<rmin2) && (dist>=rmin1))
00398 rmin2=dist;
00399
00400 if (dist<rmin1)
00401 {
00402 rmin2=rmin1;
00403 rmin1=dist;
00404 }
00405 }
00406 }
00407 }
00408
00409 R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
00410 }
00411 distance->replace_rhs(rhs_cache);
00412 delete rhs_mus;
00413 SG_FREE(ClList);
00414 SG_FREE(weights_set);
00415 SG_FREE(dists);
00416 SG_UNREF(lhs);
00417 }
00418
00419 void CKMeans::store_model_features()
00420 {
00421
00422 CSimpleFeatures<float64_t>* cluster_centers=new CSimpleFeatures<float64_t>(
00423 mus);
00424
00425
00426 mus.do_free=false;
00427 mus.free_matrix();
00428
00429
00430 CFeatures* rhs=distance->get_rhs();
00431 distance->init(cluster_centers, rhs);
00432 SG_UNREF(rhs);
00433 }
00434
00435 void CKMeans::init()
00436 {
00437 max_iter=10000;
00438 k=3;
00439 dimensions=0;
00440
00441 m_parameters->add(&max_iter, "max_iter", "Maximum number of iterations");
00442 m_parameters->add(&k, "k", "Parameter k");
00443 m_parameters->add(&dimensions, "dimensions", "Dimensions of data");
00444 m_parameters->add(&R, "R", "Cluster radiuses");
00445 }
00446