00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/converter/MultidimensionalScaling.h>
00012 #ifdef HAVE_LAPACK
00013 #include <shogun/converter/EmbeddingConverter.h>
00014 #include <shogun/mathematics/lapack.h>
00015 #include <shogun/mathematics/arpack.h>
00016 #include <shogun/distance/CustomDistance.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <shogun/io/SGIO.h>
00020 #include <shogun/distance/EuclidianDistance.h>
00021
00022 #ifdef HAVE_PTHREAD
00023 #include <pthread.h>
00024 #endif
00025
00026 using namespace shogun;
00027
00028 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00029 struct TRIANGULATION_THREAD_PARAM
00030 {
00032 int32_t idx_start;
00034 int32_t idx_stop;
00036 int32_t idx_step;
00038 int32_t lmk_N;
00040 int32_t total_N;
00042 int32_t m_target_dim;
00044 float64_t* current_dist_to_lmks;
00046 float64_t* lmk_feature_matrix;
00048 float64_t* new_feature_matrix;
00050 const float64_t* distance_matrix;
00052 const float64_t* mean_sq_dist_vector;
00054 const int32_t* lmk_idxs;
00056 const bool* to_process;
00057 };
00058 #endif
00059
00060 CMultidimensionalScaling::CMultidimensionalScaling() : CEmbeddingConverter()
00061 {
00062 m_eigenvalues = SGVector<float64_t>(NULL,0,false);
00063 m_landmark_number = 3;
00064 m_landmark = false;
00065
00066 init();
00067 }
00068
00069 void CMultidimensionalScaling::init()
00070 {
00071 m_parameters->add(&m_eigenvalues, "eigenvalues", "eigenvalues of last embedding");
00072 m_parameters->add(&m_landmark, "landmark", "indicates if landmark approximation should be used");
00073 m_parameters->add(&m_landmark_number, "landmark number", "the number of landmarks for approximation");
00074 }
00075
00076 CMultidimensionalScaling::~CMultidimensionalScaling()
00077 {
00078 m_eigenvalues.destroy_vector();
00079 }
00080
00081 SGVector<float64_t> CMultidimensionalScaling::get_eigenvalues() const
00082 {
00083 return m_eigenvalues;
00084 }
00085
00086 void CMultidimensionalScaling::set_landmark_number(int32_t num)
00087 {
00088 if (num<3)
00089 SG_ERROR("Number of landmarks should be greater than 3 to make triangulation possible while %d given.",
00090 num);
00091 m_landmark_number = num;
00092 }
00093
00094 int32_t CMultidimensionalScaling::get_landmark_number() const
00095 {
00096 return m_landmark_number;
00097 }
00098
00099 void CMultidimensionalScaling::set_landmark(bool landmark)
00100 {
00101 m_landmark = landmark;
00102 }
00103
00104 bool CMultidimensionalScaling::get_landmark() const
00105 {
00106 return m_landmark;
00107 }
00108
00109 const char* CMultidimensionalScaling::get_name() const
00110 {
00111 return "MultidimensionalScaling";
00112 };
00113
00114 CSimpleFeatures<float64_t>* CMultidimensionalScaling::embed_distance(CDistance* distance)
00115 {
00116 ASSERT(distance);
00117
00118
00119 SGMatrix<float64_t> distance_matrix = process_distance_matrix(distance->get_distance_matrix());
00120 SGMatrix<float64_t> feature_matrix;
00121 if (m_landmark)
00122 feature_matrix = landmark_embedding(distance_matrix);
00123 else
00124 feature_matrix = classic_embedding(distance_matrix);
00125
00126 distance_matrix.destroy_matrix();
00127
00128 return new CSimpleFeatures<float64_t>(feature_matrix);
00129 }
00130
00131 SGMatrix<float64_t> CMultidimensionalScaling::process_distance_matrix(SGMatrix<float64_t> distance_matrix)
00132 {
00133 return distance_matrix;
00134 }
00135
00136 CFeatures* CMultidimensionalScaling::apply(CFeatures* features)
00137 {
00138 SG_REF(features);
00139 ASSERT(m_distance);
00140
00141 m_distance->init(features,features);
00142 CSimpleFeatures<float64_t>* embedding = embed_distance(m_distance);
00143 m_distance->remove_lhs_and_rhs();
00144
00145 SG_UNREF(features);
00146 return (CFeatures*)embedding;
00147 }
00148
00149 SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(SGMatrix<float64_t> distance_matrix)
00150 {
00151 ASSERT(m_target_dim>0);
00152 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00153 int32_t N = distance_matrix.num_cols;
00154
00155
00156 int32_t i,j;
00157
00158
00159 float64_t dsq;
00160 for (i=0; i<N; i++)
00161 {
00162 for (j=i; j<N; j++)
00163 {
00164 dsq = CMath::sq(distance_matrix[i*N+j]);
00165 distance_matrix[i*N+j] = dsq;
00166 distance_matrix[j*N+i] = dsq;
00167 }
00168 }
00169 CMath::center_matrix(distance_matrix.matrix,N,N);
00170 for (i=0; i<N; i++)
00171 {
00172 distance_matrix[i*N+i] *= -0.5;
00173 for (j=i+1; j<N; j++)
00174 {
00175 distance_matrix[i*N+j] *= -0.5;
00176 distance_matrix[j*N+i] *= -0.5;
00177 }
00178 }
00179
00180
00181 float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim);
00182
00183
00184 int eigenproblem_status = 0;
00185 #ifdef HAVE_ARPACK
00186
00187 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim);
00188
00189 arpack_dsxupd(distance_matrix.matrix,NULL,false,N,m_target_dim,"LM",false,1,false,0.0,0.0,
00190 eigenvalues_vector,replace_feature_matrix,eigenproblem_status);
00191
00192 ASSERT(eigenproblem_status == 0);
00193
00194 float64_t tmp;
00195 for (j=0; j<N; j++)
00196 {
00197 for (i=0; i<m_target_dim/2; i++)
00198 {
00199 tmp = replace_feature_matrix[j*m_target_dim+i];
00200 replace_feature_matrix[j*m_target_dim+i] =
00201 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)];
00202 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp;
00203 }
00204 }
00205
00206 for (i=0; i<m_target_dim/2; i++)
00207 {
00208 tmp = eigenvalues_vector[i];
00209 eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1];
00210 eigenvalues_vector[m_target_dim-i-1] = tmp;
00211 }
00212
00213
00214 for (i=0; i<m_target_dim; i++)
00215 {
00216 for (j=0; j<N; j++)
00217 replace_feature_matrix[j*m_target_dim+i] *=
00218 CMath::sqrt(eigenvalues_vector[i]);
00219 }
00220
00221
00222 m_eigenvalues.destroy_vector();
00223 m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim,true);
00224 #else
00225
00226 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N);
00227 float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N);
00228
00229 wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status);
00230
00231 ASSERT(eigenproblem_status==0);
00232
00233
00234 m_eigenvalues.destroy_vector();
00235 m_eigenvalues = SGVector<float64_t>(m_target_dim);
00236 m_eigenvalues.do_free = false;
00237
00238
00239 for (i=0; i<m_target_dim; i++)
00240 m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1];
00241
00242 SG_FREE(eigenvalues_vector);
00243
00244
00245 for (i=0; i<m_target_dim; i++)
00246 {
00247 for (j=0; j<N; j++)
00248 {
00249 replace_feature_matrix[j*m_target_dim+i] =
00250 eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]);
00251 }
00252 }
00253 SG_FREE(eigenvectors);
00254 #endif
00255
00256
00257 for (i=0; i<m_eigenvalues.vlen; i++)
00258 {
00259 if (m_eigenvalues.vector[i]<=0.0)
00260 {
00261 SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong",
00262 i, m_eigenvalues.vlen-1);
00263 break;
00264 }
00265 }
00266
00267 return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N);
00268 }
00269
00270 SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(SGMatrix<float64_t> distance_matrix)
00271 {
00272 ASSERT(m_target_dim>0);
00273 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00274 int32_t lmk_N = m_landmark_number;
00275 int32_t i,j,t;
00276 int32_t total_N = distance_matrix.num_cols;
00277 if (lmk_N<3)
00278 {
00279 SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n",
00280 lmk_N);
00281 }
00282 if (lmk_N>total_N)
00283 {
00284 SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n",
00285 lmk_N, total_N);
00286 }
00287
00288
00289 SGVector<int32_t> lmk_idxs = shuffle(lmk_N,total_N);
00290
00291 float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N);
00292 for (i=0; i<lmk_N; i++)
00293 {
00294 for (j=0; j<lmk_N; j++)
00295 lmk_dist_matrix[i*lmk_N+j] =
00296 distance_matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]];
00297 }
00298
00299
00300 SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N);
00301
00302 float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N);
00303 for (i=0; i<lmk_N; i++)
00304 {
00305 for (j=0; j<lmk_N; j++)
00306 mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]);
00307
00308 mean_sq_dist_vector[i] /= lmk_N;
00309 }
00310 SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix);
00311
00312 lmk_dist_sgmatrix.destroy_matrix();
00313
00314
00315 float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N);
00316
00317
00318 for (i=0; i<lmk_N; i++)
00319 {
00320 for (j=0; j<m_target_dim; j++)
00321 new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] =
00322 lmk_feature_matrix[i*m_target_dim+j];
00323 }
00324
00325
00326 ASSERT(m_eigenvalues.vector && m_eigenvalues.vlen == m_target_dim);
00327 for (i=0; i<lmk_N; i++)
00328 {
00329 for (j=0; j<m_target_dim; j++)
00330 lmk_feature_matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j];
00331 }
00332
00333
00334
00335 bool* to_process = SG_MALLOC(bool, total_N);
00336 for (j=0; j<total_N; j++)
00337 to_process[j] = true;
00338 for (j=0; j<lmk_N; j++)
00339 to_process[lmk_idxs.vector[j]] = false;
00340
00341
00342 #ifdef HAVE_PTHREAD
00343 int32_t num_threads = parallel->get_num_threads();
00344 ASSERT(num_threads>0);
00345
00346 pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00347 TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads);
00348 pthread_attr_t attr;
00349 pthread_attr_init(&attr);
00350 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00351 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads);
00352
00353 for (t=0; t<num_threads; t++)
00354 {
00355 parameters[t].idx_start = t;
00356 parameters[t].idx_stop = total_N;
00357 parameters[t].idx_step = num_threads;
00358 parameters[t].lmk_N = lmk_N;
00359 parameters[t].total_N = total_N;
00360 parameters[t].m_target_dim = m_target_dim;
00361 parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N;
00362 parameters[t].distance_matrix = distance_matrix.matrix;
00363 parameters[t].mean_sq_dist_vector = mean_sq_dist_vector;
00364 parameters[t].lmk_idxs = lmk_idxs.vector;
00365 parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix;
00366 parameters[t].new_feature_matrix = new_feature_matrix;
00367 parameters[t].to_process = to_process;
00368 pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)¶meters[t]);
00369 }
00370
00371 for (t=0; t<num_threads; t++)
00372 pthread_join(threads[t], NULL);
00373 pthread_attr_destroy(&attr);
00374 SG_FREE(parameters);
00375 SG_FREE(threads);
00376 #else
00377
00378 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N);
00379 TRIANGULATION_THREAD_PARAM single_thread_param;
00380 single_thread_param.idx_start = 0;
00381 single_thread_param.idx_stop = total_N;
00382 single_thread_param.idx_step = 1;
00383 single_thread_param.lmk_N = lmk_N;
00384 single_thread_param.total_N = total_N;
00385 single_thread_param.m_target_dim = m_target_dim;
00386 single_thread_param.current_dist_to_lmks = current_dist_to_lmks;
00387 single_thread_param.distance_matrix = distance_matrix.matrix;
00388 single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector;
00389 single_thread_param.lmk_idxs = lmk_idxs.vector;
00390 single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix;
00391 single_thread_param.new_feature_matrix = new_feature_matrix;
00392 single_thread_param.to_process = to_process;
00393 run_triangulation_thread((void*)&single_thread_param);
00394 #endif
00395
00396 lmk_feature_matrix.destroy_matrix();
00397 SG_FREE(current_dist_to_lmks);
00398 lmk_idxs.destroy_vector();
00399 SG_FREE(mean_sq_dist_vector);
00400 SG_FREE(to_process);
00401 lmk_idxs.destroy_vector();
00402
00403 return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N);
00404 }
00405
00406 void* CMultidimensionalScaling::run_triangulation_thread(void* p)
00407 {
00408 TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p;
00409 int32_t idx_start = parameters->idx_start;
00410 int32_t idx_step = parameters->idx_step;
00411 int32_t idx_stop = parameters->idx_stop;
00412 const int32_t* lmk_idxs = parameters->lmk_idxs;
00413 const float64_t* distance_matrix = parameters->distance_matrix;
00414 const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector;
00415 float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks;
00416 int32_t m_target_dim = parameters->m_target_dim;
00417 int32_t lmk_N = parameters->lmk_N;
00418 int32_t total_N = parameters->total_N;
00419 const bool* to_process = parameters->to_process;
00420 float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix;
00421 float64_t* new_feature_matrix = parameters->new_feature_matrix;
00422
00423 int32_t i,k;
00424 for (i=idx_start; i<idx_stop; i+=idx_step)
00425 {
00426
00427 if (!to_process[i])
00428 continue;
00429
00430
00431 for (k=0; k<lmk_N; k++)
00432 {
00433 current_dist_to_lmks[k] =
00434 CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) -
00435 mean_sq_dist_vector[k];
00436 }
00437
00438 cblas_dgemv(CblasColMajor,CblasNoTrans,
00439 m_target_dim,lmk_N,
00440 -0.5,lmk_feature_matrix,m_target_dim,
00441 current_dist_to_lmks,1,
00442 0.0,(new_feature_matrix+i*m_target_dim),1);
00443 }
00444 return NULL;
00445 }
00446
00447
00448 SGVector<int32_t> CMultidimensionalScaling::shuffle(int32_t count, int32_t total_count)
00449 {
00450 int32_t* idxs = SG_MALLOC(int32_t, total_count);
00451 int32_t i,rnd;
00452 int32_t* permuted_idxs = SG_MALLOC(int32_t, count);
00453
00454
00455 for (i=0; i<total_count; i++)
00456 idxs[i] = i;
00457 for (i=0; i<count; i++)
00458 permuted_idxs[i] = idxs[i];
00459 for (i=count; i<total_count; i++)
00460 {
00461 rnd = CMath::random(1,i);
00462 if (rnd<count)
00463 permuted_idxs[rnd] = idxs[i];
00464 }
00465 SG_FREE(idxs);
00466
00467 CMath::qsort(permuted_idxs,count);
00468 return SGVector<int32_t>(permuted_idxs, count);
00469 }
00470
00471 #endif