MultidimensionalScaling.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Sergey Lisitsyn
00008  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/converter/MultidimensionalScaling.h>
00012 #ifdef HAVE_LAPACK
00013 #include <shogun/converter/EmbeddingConverter.h>
00014 #include <shogun/mathematics/lapack.h>
00015 #include <shogun/mathematics/arpack.h>
00016 #include <shogun/distance/CustomDistance.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <shogun/io/SGIO.h>
00020 #include <shogun/distance/EuclidianDistance.h>
00021 
00022 #ifdef HAVE_PTHREAD
00023 #include <pthread.h>
00024 #endif
00025 
00026 using namespace shogun;
00027 
00028 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00029 struct TRIANGULATION_THREAD_PARAM
00030 {
00032     int32_t idx_start;
00034     int32_t idx_stop;
00036     int32_t idx_step;
00038     int32_t lmk_N;
00040     int32_t total_N;
00042     int32_t m_target_dim;
00044     float64_t* current_dist_to_lmks;
00046     float64_t* lmk_feature_matrix;
00048     float64_t* new_feature_matrix;
00050     const float64_t* distance_matrix;
00052     const float64_t* mean_sq_dist_vector;
00054     const int32_t* lmk_idxs;
00056     const bool* to_process;
00057 };
00058 #endif
00059 
00060 CMultidimensionalScaling::CMultidimensionalScaling() : CEmbeddingConverter()
00061 {
00062     m_eigenvalues = SGVector<float64_t>(NULL,0,false);
00063     m_landmark_number = 3;
00064     m_landmark = false;
00065     
00066     init();
00067 }
00068 
00069 void CMultidimensionalScaling::init()
00070 {
00071     m_parameters->add(&m_eigenvalues, "eigenvalues", "eigenvalues of last embedding");
00072     m_parameters->add(&m_landmark, "landmark", "indicates if landmark approximation should be used");
00073     m_parameters->add(&m_landmark_number, "landmark number", "the number of landmarks for approximation");
00074 }
00075 
00076 CMultidimensionalScaling::~CMultidimensionalScaling()
00077 {
00078     m_eigenvalues.destroy_vector();
00079 }
00080 
00081 SGVector<float64_t> CMultidimensionalScaling::get_eigenvalues() const
00082 {
00083     return m_eigenvalues;
00084 }
00085 
00086 void CMultidimensionalScaling::set_landmark_number(int32_t num)
00087 {
00088     if (num<3)
00089         SG_ERROR("Number of landmarks should be greater than 3 to make triangulation possible while %d given.",
00090                  num);
00091     m_landmark_number = num;
00092 }
00093 
00094 int32_t CMultidimensionalScaling::get_landmark_number() const
00095 {
00096     return m_landmark_number;
00097 }
00098 
00099 void CMultidimensionalScaling::set_landmark(bool landmark)
00100 {
00101     m_landmark = landmark;
00102 }
00103 
00104 bool CMultidimensionalScaling::get_landmark() const
00105 {
00106     return m_landmark;
00107 }
00108 
00109 const char* CMultidimensionalScaling::get_name() const 
00110 {
00111     return "MultidimensionalScaling";
00112 };
00113 
00114 CSimpleFeatures<float64_t>* CMultidimensionalScaling::embed_distance(CDistance* distance)
00115 {
00116     ASSERT(distance);
00117 
00118     // compute feature_matrix by landmark or classic embedding of distance matrix
00119     SGMatrix<float64_t> distance_matrix = process_distance_matrix(distance->get_distance_matrix());
00120     SGMatrix<float64_t> feature_matrix;
00121     if (m_landmark)
00122         feature_matrix = landmark_embedding(distance_matrix);
00123     else
00124         feature_matrix = classic_embedding(distance_matrix);
00125     
00126     distance_matrix.destroy_matrix();
00127 
00128     return new CSimpleFeatures<float64_t>(feature_matrix);
00129 }
00130 
00131 SGMatrix<float64_t> CMultidimensionalScaling::process_distance_matrix(SGMatrix<float64_t> distance_matrix)
00132 {
00133     return distance_matrix;
00134 }
00135 
00136 CFeatures* CMultidimensionalScaling::apply(CFeatures* features)
00137 {
00138     SG_REF(features);
00139     ASSERT(m_distance);
00140     
00141     m_distance->init(features,features);
00142     CSimpleFeatures<float64_t>* embedding = embed_distance(m_distance);
00143     m_distance->remove_lhs_and_rhs();
00144     
00145     SG_UNREF(features);
00146     return (CFeatures*)embedding;
00147 }
00148 
00149 SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(SGMatrix<float64_t> distance_matrix)
00150 {
00151     ASSERT(m_target_dim>0);
00152     ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00153     int32_t N = distance_matrix.num_cols;
00154 
00155     // loop variables
00156     int32_t i,j;
00157     
00158     // double center distance_matrix
00159     float64_t dsq;
00160     for (i=0; i<N; i++)
00161     {
00162         for (j=i; j<N; j++)
00163         {
00164             dsq = CMath::sq(distance_matrix[i*N+j]);
00165             distance_matrix[i*N+j] = dsq;
00166             distance_matrix[j*N+i] = dsq;
00167         }
00168     }
00169     CMath::center_matrix(distance_matrix.matrix,N,N);
00170     for (i=0; i<N; i++)
00171     {
00172         distance_matrix[i*N+i] *= -0.5;
00173         for (j=i+1; j<N; j++)
00174         {
00175             distance_matrix[i*N+j] *= -0.5;
00176             distance_matrix[j*N+i] *= -0.5;
00177         }
00178     }
00179 
00180     // feature matrix representing given distance
00181     float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim);
00182  
00183     // status of eigenproblem to be solved
00184     int eigenproblem_status = 0;
00185 #ifdef HAVE_ARPACK
00186     // using ARPACK
00187     float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim);
00188     // solve eigenproblem with ARPACK (faster)
00189     arpack_dsxupd(distance_matrix.matrix,NULL,false,N,m_target_dim,"LM",false,1,false,0.0,0.0,
00190                   eigenvalues_vector,replace_feature_matrix,eigenproblem_status);
00191     // check for failure
00192     ASSERT(eigenproblem_status == 0);
00193     // reverse eigenvectors order
00194     float64_t tmp;
00195     for (j=0; j<N; j++)
00196     {
00197         for (i=0; i<m_target_dim/2; i++)
00198         {
00199             tmp = replace_feature_matrix[j*m_target_dim+i];
00200             replace_feature_matrix[j*m_target_dim+i] = 
00201                 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)];
00202             replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp;
00203         }
00204     }
00205     // reverse eigenvalues order
00206     for (i=0; i<m_target_dim/2; i++)
00207     {
00208         tmp = eigenvalues_vector[i];
00209         eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1];
00210         eigenvalues_vector[m_target_dim-i-1] = tmp;
00211     }
00212 
00213     // finally construct embedding
00214     for (i=0; i<m_target_dim; i++)
00215     {
00216         for (j=0; j<N; j++)
00217             replace_feature_matrix[j*m_target_dim+i] *=
00218                 CMath::sqrt(eigenvalues_vector[i]);
00219     }
00220         
00221     // set eigenvalues vector
00222     m_eigenvalues.destroy_vector();
00223     m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim,true);
00224 #else /* not HAVE_ARPACK */
00225     // using LAPACK
00226     float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N);
00227     float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N);
00228     // solve eigenproblem with LAPACK
00229     wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status);
00230     // check for failure
00231     ASSERT(eigenproblem_status==0);
00232     
00233     // set eigenvalues vector
00234     m_eigenvalues.destroy_vector();
00235     m_eigenvalues = SGVector<float64_t>(m_target_dim);
00236     m_eigenvalues.do_free = false;
00237 
00238     // fill eigenvalues vector in backwards order
00239     for (i=0; i<m_target_dim; i++)
00240         m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1];
00241 
00242     SG_FREE(eigenvalues_vector);
00243 
00244     // construct embedding
00245     for (i=0; i<m_target_dim; i++)
00246     {
00247         for (j=0; j<N; j++)
00248         {
00249             replace_feature_matrix[j*m_target_dim+i] = 
00250                   eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]);
00251         }
00252     }
00253     SG_FREE(eigenvectors);
00254 #endif /* HAVE_ARPACK else */
00255     
00256     // warn user if there are negative or zero eigenvalues
00257     for (i=0; i<m_eigenvalues.vlen; i++)
00258     {
00259         if (m_eigenvalues.vector[i]<=0.0)
00260         {
00261             SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong",
00262                        i, m_eigenvalues.vlen-1);
00263             break;
00264         }
00265     }
00266     
00267     return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N);
00268 }
00269 
00270 SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(SGMatrix<float64_t> distance_matrix)
00271 {
00272     ASSERT(m_target_dim>0);
00273     ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00274     int32_t lmk_N = m_landmark_number;
00275     int32_t i,j,t;
00276     int32_t total_N = distance_matrix.num_cols;
00277     if (lmk_N<3)
00278     {
00279         SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n", 
00280                  lmk_N);
00281     }
00282     if (lmk_N>total_N)
00283     {
00284         SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n",
00285                  lmk_N, total_N);
00286     }
00287     
00288     // get landmark indexes with random permutation
00289     SGVector<int32_t> lmk_idxs = shuffle(lmk_N,total_N);
00290     // compute distances between landmarks
00291     float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N);
00292     for (i=0; i<lmk_N; i++)
00293     {
00294         for (j=0; j<lmk_N; j++)
00295             lmk_dist_matrix[i*lmk_N+j] =
00296                 distance_matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]];
00297     }
00298 
00299     // get landmarks embedding
00300     SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N);
00301     // compute mean vector of squared distances
00302     float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N);
00303     for (i=0; i<lmk_N; i++)
00304     {
00305         for (j=0; j<lmk_N; j++)
00306             mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]);
00307 
00308         mean_sq_dist_vector[i] /= lmk_N;
00309     }   
00310     SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix);
00311 
00312     lmk_dist_sgmatrix.destroy_matrix();
00313 
00314     // construct new feature matrix
00315     float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N);
00316 
00317     // fill new feature matrix with embedded landmarks
00318     for (i=0; i<lmk_N; i++)
00319     {
00320         for (j=0; j<m_target_dim; j++)
00321             new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] =
00322                 lmk_feature_matrix[i*m_target_dim+j];
00323     }
00324 
00325     // get exactly defined pseudoinverse of landmarks feature matrix
00326     ASSERT(m_eigenvalues.vector && m_eigenvalues.vlen == m_target_dim);
00327     for (i=0; i<lmk_N; i++)
00328     {
00329         for (j=0; j<m_target_dim; j++)
00330             lmk_feature_matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j];
00331     }
00332 
00333 
00334     // set to_process els true if should be processed
00335     bool* to_process = SG_MALLOC(bool, total_N);
00336     for (j=0; j<total_N; j++)
00337         to_process[j] = true;
00338     for (j=0; j<lmk_N; j++)
00339         to_process[lmk_idxs.vector[j]] = false;
00340 
00341     // get embedding for non-landmark vectors
00342 #ifdef HAVE_PTHREAD
00343     int32_t num_threads = parallel->get_num_threads();
00344     ASSERT(num_threads>0);
00345     // allocate threads and it's parameters
00346     pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00347     TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads);
00348     pthread_attr_t attr;
00349     pthread_attr_init(&attr);
00350     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00351     float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads);
00352     // run threads
00353     for (t=0; t<num_threads; t++)
00354     {
00355         parameters[t].idx_start = t;
00356         parameters[t].idx_stop = total_N;
00357         parameters[t].idx_step = num_threads;
00358         parameters[t].lmk_N = lmk_N;
00359         parameters[t].total_N = total_N;
00360         parameters[t].m_target_dim = m_target_dim;
00361         parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N;
00362         parameters[t].distance_matrix = distance_matrix.matrix;
00363         parameters[t].mean_sq_dist_vector = mean_sq_dist_vector;
00364         parameters[t].lmk_idxs = lmk_idxs.vector;
00365         parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix;
00366         parameters[t].new_feature_matrix = new_feature_matrix;
00367         parameters[t].to_process = to_process;
00368         pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)&parameters[t]);
00369     }
00370     // join threads
00371     for (t=0; t<num_threads; t++)
00372         pthread_join(threads[t], NULL);
00373     pthread_attr_destroy(&attr);
00374     SG_FREE(parameters);
00375     SG_FREE(threads);
00376 #else
00377     // run single 'thread'
00378     float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N);
00379     TRIANGULATION_THREAD_PARAM single_thread_param;
00380     single_thread_param.idx_start = 0;
00381     single_thread_param.idx_stop = total_N;
00382     single_thread_param.idx_step = 1;
00383     single_thread_param.lmk_N = lmk_N;
00384     single_thread_param.total_N = total_N;
00385     single_thread_param.m_target_dim = m_target_dim;
00386     single_thread_param.current_dist_to_lmks = current_dist_to_lmks;
00387     single_thread_param.distance_matrix = distance_matrix.matrix;
00388     single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector;
00389     single_thread_param.lmk_idxs = lmk_idxs.vector;
00390     single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix;
00391     single_thread_param.new_feature_matrix = new_feature_matrix;
00392     single_thread_param.to_process = to_process;
00393     run_triangulation_thread((void*)&single_thread_param);
00394 #endif
00395     // cleanup
00396     lmk_feature_matrix.destroy_matrix();
00397     SG_FREE(current_dist_to_lmks);
00398     lmk_idxs.destroy_vector();
00399     SG_FREE(mean_sq_dist_vector);
00400     SG_FREE(to_process);
00401     lmk_idxs.destroy_vector();
00402 
00403     return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N);
00404 }
00405 
00406 void* CMultidimensionalScaling::run_triangulation_thread(void* p)
00407 {
00408     TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p;
00409     int32_t idx_start = parameters->idx_start;
00410     int32_t idx_step = parameters->idx_step;
00411     int32_t idx_stop = parameters->idx_stop;
00412     const int32_t* lmk_idxs = parameters->lmk_idxs;
00413     const float64_t* distance_matrix = parameters->distance_matrix;
00414     const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector;
00415     float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks;
00416     int32_t m_target_dim = parameters->m_target_dim;
00417     int32_t lmk_N = parameters->lmk_N;
00418     int32_t total_N = parameters->total_N;
00419     const bool* to_process = parameters->to_process;
00420     float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix;
00421     float64_t* new_feature_matrix = parameters->new_feature_matrix;
00422 
00423     int32_t i,k;
00424     for (i=idx_start; i<idx_stop; i+=idx_step)
00425     {
00426         // skip if landmark
00427         if (!to_process[i])
00428             continue;
00429 
00430         // compute difference from mean landmark distance vector
00431         for (k=0; k<lmk_N; k++)
00432         {
00433             current_dist_to_lmks[k] =
00434                 CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) -
00435                 mean_sq_dist_vector[k];
00436         }
00437         // compute embedding
00438         cblas_dgemv(CblasColMajor,CblasNoTrans,
00439                     m_target_dim,lmk_N,
00440                     -0.5,lmk_feature_matrix,m_target_dim,
00441                     current_dist_to_lmks,1,
00442                     0.0,(new_feature_matrix+i*m_target_dim),1);
00443     }
00444     return NULL;
00445 }
00446 
00447 
00448 SGVector<int32_t> CMultidimensionalScaling::shuffle(int32_t count, int32_t total_count)
00449 {
00450     int32_t* idxs = SG_MALLOC(int32_t, total_count);
00451     int32_t i,rnd;
00452     int32_t* permuted_idxs = SG_MALLOC(int32_t, count);
00453 
00454     // reservoir sampling
00455     for (i=0; i<total_count; i++)
00456         idxs[i] = i;
00457     for (i=0; i<count; i++)
00458         permuted_idxs[i] = idxs[i];
00459     for (i=count; i<total_count; i++)
00460     {
00461         rnd = CMath::random(1,i);
00462         if (rnd<count)
00463             permuted_idxs[rnd] = idxs[i];
00464     }
00465     SG_FREE(idxs);
00466 
00467     CMath::qsort(permuted_idxs,count);
00468     return SGVector<int32_t>(permuted_idxs, count);
00469 }
00470 
00471 #endif /* HAVE_LAPACK */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation