00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/preprocessor/MultidimensionalScaling.h>
00012 #ifdef HAVE_LAPACK
00013 #include <shogun/preprocessor/DimensionReductionPreprocessor.h>
00014 #include <shogun/mathematics/lapack.h>
00015 #include <shogun/mathematics/arpack.h>
00016 #include <shogun/distance/CustomDistance.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <shogun/io/SGIO.h>
00020 #include <shogun/distance/EuclidianDistance.h>
00021
00022 #ifdef HAVE_PTHREAD
00023 #include <pthread.h>
00024 #endif
00025
00026 using namespace shogun;
00027
00028 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00029 struct TRIANGULATION_THREAD_PARAM
00030 {
00032 int32_t idx_start;
00034 int32_t idx_stop;
00036 int32_t idx_step;
00038 int32_t lmk_N;
00040 int32_t total_N;
00042 int32_t m_target_dim;
00044 float64_t* current_dist_to_lmks;
00046 float64_t* lmk_feature_matrix;
00048 float64_t* new_feature_matrix;
00050 const float64_t* distance_matrix;
00052 const float64_t* mean_sq_dist_vector;
00054 const int32_t* lmk_idxs;
00056 const bool* to_process;
00057 };
00058 #endif
00059
00060 CMultidimensionalScaling::CMultidimensionalScaling() : CDimensionReductionPreprocessor()
00061 {
00062 m_eigenvalues = SGVector<float64_t>(NULL,0,false);
00063 init();
00064 }
00065
00066 void CMultidimensionalScaling::init()
00067 {
00068 m_landmark_number = 3;
00069 m_landmark = false;
00070
00071 m_parameters->add(&m_landmark, "landmark", "indicates if landmark approximation should be used");
00072 m_parameters->add(&m_landmark_number, "landmark number", "the number of landmarks for approximation");
00073 }
00074
00075 bool CMultidimensionalScaling::init(CFeatures* features)
00076 {
00077 return true;
00078 }
00079
00080 void CMultidimensionalScaling::cleanup()
00081 {
00082 }
00083
00084 CMultidimensionalScaling::~CMultidimensionalScaling()
00085 {
00086 m_eigenvalues.destroy_vector();
00087 }
00088
00089 CSimpleFeatures<float64_t>* CMultidimensionalScaling::apply_to_distance(CDistance* distance)
00090 {
00091 ASSERT(distance);
00092
00093 SG_REF(distance);
00094
00095
00096 SGMatrix<float64_t> distance_matrix = distance->get_distance_matrix();
00097 SGMatrix<float64_t> feature_matrix;
00098 if (m_landmark)
00099 feature_matrix = landmark_embedding(distance_matrix);
00100 else
00101 feature_matrix = classic_embedding(distance_matrix);
00102
00103 distance_matrix.destroy_matrix();
00104 CSimpleFeatures<float64_t>* features =
00105 new CSimpleFeatures<float64_t>(feature_matrix);
00106
00107
00108 SG_UNREF(distance);
00109 return features;
00110 }
00111
00112 SGMatrix<float64_t> CMultidimensionalScaling::apply_to_feature_matrix(CFeatures* features)
00113 {
00114 CSimpleFeatures<float64_t>* simple_features = (CSimpleFeatures<float64_t>*) features;
00115
00116 SG_REF(features);
00117
00118 CDistance* distance = new CEuclidianDistance(simple_features,simple_features);
00119
00120 Parallel* distance_parallel = distance->parallel;
00121 distance->parallel = this->parallel;
00122
00123
00124 SGMatrix<float64_t> new_feature_matrix;
00125 SGMatrix<float64_t> distance_matrix = distance->get_distance_matrix();
00126 if (m_landmark)
00127 new_feature_matrix = landmark_embedding(distance_matrix);
00128 else
00129 new_feature_matrix = classic_embedding(distance_matrix);
00130
00131 simple_features->set_feature_matrix(new_feature_matrix);
00132
00133
00134 distance->parallel = distance_parallel;
00135 delete distance;
00136 distance_matrix.destroy_matrix();
00137
00138
00139 SG_UNREF(features);
00140 return simple_features->get_feature_matrix();
00141 }
00142
00143 SGVector<float64_t> CMultidimensionalScaling::apply_to_feature_vector(SGVector<float64_t> vector)
00144 {
00145 SG_NOTIMPLEMENTED;
00146 return vector;
00147 }
00148
00149 SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(SGMatrix<float64_t> distance_matrix)
00150 {
00151 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00152 int32_t N = distance_matrix.num_cols;
00153
00154
00155 int32_t i,j;
00156
00157
00158 float64_t dsq;
00159 for (i=0; i<N; i++)
00160 {
00161 for (j=i; j<N; j++)
00162 {
00163 dsq = CMath::sq(distance_matrix.matrix[i*N+j]);
00164 distance_matrix.matrix[i*N+j] = dsq;
00165 distance_matrix.matrix[j*N+i] = dsq;
00166 }
00167 }
00168 CMath::center_matrix(distance_matrix.matrix,N,N);
00169 for (i=0; i<N; i++)
00170 {
00171 distance_matrix.matrix[i*N+i] *= -0.5;
00172 for (j=i+1; j<N; j++)
00173 {
00174 distance_matrix.matrix[i*N+j] *= -0.5;
00175 distance_matrix.matrix[j*N+i] *= -0.5;
00176 }
00177 }
00178
00179
00180 float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim);
00181
00182
00183 int eigenproblem_status = 0;
00184 #ifdef HAVE_ARPACK
00185
00186 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim);
00187
00188 arpack_dsaeupd_wrap(distance_matrix.matrix, NULL, N, m_target_dim, "LM", 1, false, 0.0, 0.0,
00189 eigenvalues_vector, replace_feature_matrix,
00190 eigenproblem_status);
00191
00192 ASSERT(eigenproblem_status == 0);
00193
00194 float64_t tmp;
00195 for (j=0; j<N; j++)
00196 {
00197 for (i=0; i<m_target_dim/2; i++)
00198 {
00199 tmp = replace_feature_matrix[j*m_target_dim+i];
00200 replace_feature_matrix[j*m_target_dim+i] =
00201 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)];
00202 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp;
00203 }
00204 }
00205
00206 for (i=0; i<m_target_dim/2; i++)
00207 {
00208 tmp = eigenvalues_vector[i];
00209 eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1];
00210 eigenvalues_vector[m_target_dim-i-1] = tmp;
00211 }
00212
00213
00214 for (i=0; i<m_target_dim; i++)
00215 {
00216 for (j=0; j<N; j++)
00217 replace_feature_matrix[j*m_target_dim+i] *=
00218 CMath::sqrt(eigenvalues_vector[i]);
00219 }
00220
00221
00222 m_eigenvalues.destroy_vector();
00223 m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim,true);
00224 #else
00225
00226 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N);
00227 float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N);
00228
00229 wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status);
00230
00231 ASSERT(eigenproblem_status==0);
00232
00233
00234 m_eigenvalues.destroy_vector();
00235 m_eigenvalues = SGVector<float64_t>(m_target_dim);
00236 m_eigenvalues.do_free = false;
00237
00238
00239 for (i=0; i<m_target_dim; i++)
00240 m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1];
00241
00242 SG_FREE(eigenvalues_vector);
00243
00244
00245 for (i=0; i<m_target_dim; i++)
00246 {
00247 for (j=0; j<N; j++)
00248 {
00249 replace_feature_matrix[j*m_target_dim+i] =
00250 eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]);
00251 }
00252 }
00253 SG_FREE(eigenvectors);
00254 #endif
00255
00256
00257 for (i=0; i<m_eigenvalues.vlen; i++)
00258 {
00259 if (m_eigenvalues.vector[i]<=0.0)
00260 {
00261 SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong",
00262 i, m_eigenvalues.vlen-1);
00263 break;
00264 }
00265 }
00266
00267 return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N);
00268 }
00269
00270 SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(SGMatrix<float64_t> distance_matrix)
00271 {
00272 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00273 int32_t lmk_N = m_landmark_number;
00274 int32_t i,j,t;
00275 int32_t total_N = distance_matrix.num_cols;
00276 if (lmk_N<3)
00277 {
00278 SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n",
00279 lmk_N);
00280 }
00281 if (lmk_N>total_N)
00282 {
00283 SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n",
00284 lmk_N, total_N);
00285 }
00286
00287
00288 SGVector<int32_t> lmk_idxs = shuffle(lmk_N,total_N);
00289
00290 float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N);
00291 for (i=0; i<lmk_N; i++)
00292 {
00293 for (j=0; j<lmk_N; j++)
00294 lmk_dist_matrix[i*lmk_N+j] =
00295 distance_matrix.matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]];
00296 }
00297
00298
00299 SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N);
00300
00301 float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N);
00302 for (i=0; i<lmk_N; i++)
00303 {
00304 for (j=0; j<lmk_N; j++)
00305 mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]);
00306
00307 mean_sq_dist_vector[i] /= lmk_N;
00308 }
00309 SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix);
00310
00311 lmk_dist_sgmatrix.destroy_matrix();
00312
00313
00314 float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N);
00315
00316
00317 for (i=0; i<lmk_N; i++)
00318 {
00319 for (j=0; j<m_target_dim; j++)
00320 new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] =
00321 lmk_feature_matrix.matrix[i*m_target_dim+j];
00322 }
00323
00324
00325 ASSERT(m_eigenvalues.vector && m_eigenvalues.vlen == m_target_dim);
00326 for (i=0; i<lmk_N; i++)
00327 {
00328 for (j=0; j<m_target_dim; j++)
00329 lmk_feature_matrix.matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j];
00330 }
00331
00332
00333
00334 bool* to_process = SG_MALLOC(bool, total_N);
00335 for (j=0; j<total_N; j++)
00336 to_process[j] = true;
00337 for (j=0; j<lmk_N; j++)
00338 to_process[lmk_idxs.vector[j]] = false;
00339
00340
00341 #ifdef HAVE_PTHREAD
00342 int32_t num_threads = parallel->get_num_threads();
00343 ASSERT(num_threads>0);
00344
00345 pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00346 TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads);
00347 pthread_attr_t attr;
00348 pthread_attr_init(&attr);
00349 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00350 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads);
00351
00352 for (t=0; t<num_threads; t++)
00353 {
00354 parameters[t].idx_start = t;
00355 parameters[t].idx_stop = total_N;
00356 parameters[t].idx_step = num_threads;
00357 parameters[t].lmk_N = lmk_N;
00358 parameters[t].total_N = total_N;
00359 parameters[t].m_target_dim = m_target_dim;
00360 parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N;
00361 parameters[t].distance_matrix = distance_matrix.matrix;
00362 parameters[t].mean_sq_dist_vector = mean_sq_dist_vector;
00363 parameters[t].lmk_idxs = lmk_idxs.vector;
00364 parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix;
00365 parameters[t].new_feature_matrix = new_feature_matrix;
00366 parameters[t].to_process = to_process;
00367 pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)¶meters[t]);
00368 }
00369
00370 for (t=0; t<num_threads; t++)
00371 pthread_join(threads[t], NULL);
00372 pthread_attr_destroy(&attr);
00373 SG_FREE(parameters);
00374 SG_FREE(threads);
00375 #else
00376
00377 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N);
00378 TRIANGULATION_THREAD_PARAM single_thread_param;
00379 single_thread_param.idx_start = 0;
00380 single_thread_param.idx_stop = total_N;
00381 single_thread_param.idx_step = 1;
00382 single_thread_param.lmk_N = lmk_N;
00383 single_thread_param.total_N = total_N;
00384 single_thread_param.m_target_dim = m_target_dim;
00385 single_thread_param.current_dist_to_lmks = current_dist_to_lmks;
00386 single_thread_param.distance_matrix = distance_matrix.matrix;
00387 single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector;
00388 single_thread_param.lmk_idxs = lmk_idxs.vector;
00389 single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix;
00390 single_thread_param.new_feature_matrix = new_feature_matrix;
00391 single_thread_param.to_process = to_process;
00392 run_triangulation_thread((void*)&single_thread_param);
00393 #endif
00394
00395 lmk_feature_matrix.destroy_matrix();
00396 SG_FREE(current_dist_to_lmks);
00397 lmk_idxs.destroy_vector();
00398 SG_FREE(mean_sq_dist_vector);
00399 SG_FREE(to_process);
00400 lmk_idxs.destroy_vector();
00401
00402 return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N);
00403 }
00404
00405 void* CMultidimensionalScaling::run_triangulation_thread(void* p)
00406 {
00407 TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p;
00408 int32_t idx_start = parameters->idx_start;
00409 int32_t idx_step = parameters->idx_step;
00410 int32_t idx_stop = parameters->idx_stop;
00411 const int32_t* lmk_idxs = parameters->lmk_idxs;
00412 const float64_t* distance_matrix = parameters->distance_matrix;
00413 const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector;
00414 float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks;
00415 int32_t m_target_dim = parameters->m_target_dim;
00416 int32_t lmk_N = parameters->lmk_N;
00417 int32_t total_N = parameters->total_N;
00418 const bool* to_process = parameters->to_process;
00419 float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix;
00420 float64_t* new_feature_matrix = parameters->new_feature_matrix;
00421
00422 int32_t i,k;
00423 for (i=idx_start; i<idx_stop; i+=idx_step)
00424 {
00425
00426 if (!to_process[i])
00427 continue;
00428
00429
00430 for (k=0; k<lmk_N; k++)
00431 {
00432 current_dist_to_lmks[k] =
00433 CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) -
00434 mean_sq_dist_vector[k];
00435 }
00436
00437 cblas_dgemv(CblasColMajor,CblasNoTrans,
00438 m_target_dim,lmk_N,
00439 -0.5,lmk_feature_matrix,m_target_dim,
00440 current_dist_to_lmks,1,
00441 0.0,(new_feature_matrix+i*m_target_dim),1);
00442 }
00443 return NULL;
00444 }
00445
00446
00447 SGVector<int32_t> CMultidimensionalScaling::shuffle(int32_t count, int32_t total_count)
00448 {
00449 int32_t* idxs = SG_MALLOC(int32_t, total_count);
00450 int32_t i,rnd;
00451 int32_t* permuted_idxs = SG_MALLOC(int32_t, count);
00452
00453
00454 for (i=0; i<total_count; i++)
00455 idxs[i] = i;
00456 for (i=0; i<count; i++)
00457 permuted_idxs[i] = idxs[i];
00458 for (i=count; i<total_count; i++)
00459 {
00460 rnd = CMath::random(1,i);
00461 if (rnd<count)
00462 permuted_idxs[rnd] = idxs[i];
00463 }
00464 SG_FREE(idxs);
00465
00466 CMath::qsort(permuted_idxs,count);
00467 return SGVector<int32_t>(permuted_idxs, count);
00468 }
00469
00470 #endif