MultidimensionalScaling.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Sergey Lisitsyn
00008  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/converter/MultidimensionalScaling.h>
00012 #ifdef HAVE_LAPACK
00013 #include <shogun/converter/EmbeddingConverter.h>
00014 #include <shogun/mathematics/lapack.h>
00015 #include <shogun/mathematics/arpack.h>
00016 #include <shogun/distance/CustomDistance.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <shogun/mathematics/Statistics.h>
00020 #include <shogun/io/SGIO.h>
00021 #include <shogun/distance/EuclideanDistance.h>
00022 
00023 #ifdef HAVE_PTHREAD
00024 #include <pthread.h>
00025 #endif
00026 
00027 using namespace shogun;
00028 
00029 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00030 struct TRIANGULATION_THREAD_PARAM
00031 {
00033     int32_t idx_start;
00035     int32_t idx_stop;
00037     int32_t idx_step;
00039     int32_t lmk_N;
00041     int32_t total_N;
00043     int32_t m_target_dim;
00045     float64_t* current_dist_to_lmks;
00047     float64_t* lmk_feature_matrix;
00049     float64_t* new_feature_matrix;
00051     const float64_t* distance_matrix;
00053     const float64_t* mean_sq_dist_vector;
00055     const int32_t* lmk_idxs;
00057     const bool* to_process;
00058 };
00059 #endif
00060 
00061 CMultidimensionalScaling::CMultidimensionalScaling() : CEmbeddingConverter()
00062 {
00063     m_eigenvalues = SGVector<float64_t>();
00064     m_landmark_number = 3;
00065     m_landmark = false;
00066 
00067     init();
00068 }
00069 
00070 void CMultidimensionalScaling::init()
00071 {
00072     SG_ADD(&m_eigenvalues, "eigenvalues", "eigenvalues of last embedding",
00073         MS_NOT_AVAILABLE);
00074     SG_ADD(&m_landmark, "landmark",
00075         "indicates if landmark approximation should be used", MS_NOT_AVAILABLE);
00076     SG_ADD(&m_landmark_number, "landmark number",
00077         "the number of landmarks for approximation", MS_AVAILABLE);
00078 }
00079 
00080 CMultidimensionalScaling::~CMultidimensionalScaling()
00081 {
00082 }
00083 
00084 SGVector<float64_t> CMultidimensionalScaling::get_eigenvalues() const
00085 {
00086     return m_eigenvalues;
00087 }
00088 
00089 void CMultidimensionalScaling::set_landmark_number(int32_t num)
00090 {
00091     if (num<3)
00092         SG_ERROR("Number of landmarks should be greater than 3 to make triangulation possible while %d given.",
00093                  num);
00094     m_landmark_number = num;
00095 }
00096 
00097 int32_t CMultidimensionalScaling::get_landmark_number() const
00098 {
00099     return m_landmark_number;
00100 }
00101 
00102 void CMultidimensionalScaling::set_landmark(bool landmark)
00103 {
00104     m_landmark = landmark;
00105 }
00106 
00107 bool CMultidimensionalScaling::get_landmark() const
00108 {
00109     return m_landmark;
00110 }
00111 
00112 const char* CMultidimensionalScaling::get_name() const
00113 {
00114     return "MultidimensionalScaling";
00115 };
00116 
00117 CDenseFeatures<float64_t>* CMultidimensionalScaling::embed_distance(CDistance* distance)
00118 {
00119     ASSERT(distance);
00120 
00121     // compute feature_matrix by landmark or classic embedding of distance matrix
00122     SGMatrix<float64_t> distance_matrix = process_distance_matrix(distance->get_distance_matrix());
00123     SGMatrix<float64_t> feature_matrix;
00124     if (m_landmark)
00125         feature_matrix = landmark_embedding(distance_matrix);
00126     else
00127         feature_matrix = classic_embedding(distance_matrix);
00128 
00129     return new CDenseFeatures<float64_t>(feature_matrix);
00130 }
00131 
00132 SGMatrix<float64_t> CMultidimensionalScaling::process_distance_matrix(SGMatrix<float64_t> distance_matrix)
00133 {
00134     return distance_matrix;
00135 }
00136 
00137 CFeatures* CMultidimensionalScaling::apply(CFeatures* features)
00138 {
00139     SG_REF(features);
00140     ASSERT(m_distance);
00141 
00142     m_distance->init(features,features);
00143     CDenseFeatures<float64_t>* embedding = embed_distance(m_distance);
00144     m_distance->remove_lhs_and_rhs();
00145 
00146     SG_UNREF(features);
00147     return (CFeatures*)embedding;
00148 }
00149 
00150 SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(SGMatrix<float64_t> distance_matrix)
00151 {
00152     ASSERT(m_target_dim>0);
00153     ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00154     int32_t N = distance_matrix.num_cols;
00155 
00156     // loop variables
00157     int32_t i,j;
00158 
00159     // double center distance_matrix
00160     for (i=0; i<N; i++)
00161     {
00162         for (j=0; j<N; j++)
00163             distance_matrix(i,j) = CMath::sq(distance_matrix(i,j));
00164     }
00165     SGMatrix<float64_t>::center_matrix(distance_matrix.matrix,N,N);
00166     for (i=0; i<N; i++)
00167     {
00168         for (j=0; j<N; j++)
00169             distance_matrix(i,j) = distance_matrix(i,j)*(-0.5);
00170     }
00171 
00172     // feature matrix representing given distance
00173     float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim);
00174 
00175     // status of eigenproblem to be solved
00176     int eigenproblem_status = 0;
00177 #ifdef HAVE_ARPACK
00178     // using ARPACK
00179     float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim);
00180     // solve eigenproblem with ARPACK (faster)
00181     arpack_dsxupd(distance_matrix.matrix,NULL,false,N,m_target_dim,"LM",false,1,false,false,0.0,0.0,
00182                   eigenvalues_vector,replace_feature_matrix,eigenproblem_status);
00183     // check for failure
00184     ASSERT(eigenproblem_status == 0);
00185     // reverse eigenvectors order
00186     float64_t tmp;
00187     for (j=0; j<N; j++)
00188     {
00189         for (i=0; i<m_target_dim/2; i++)
00190         {
00191             tmp = replace_feature_matrix[j*m_target_dim+i];
00192             replace_feature_matrix[j*m_target_dim+i] =
00193                 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)];
00194             replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp;
00195         }
00196     }
00197     // reverse eigenvalues order
00198     for (i=0; i<m_target_dim/2; i++)
00199     {
00200         tmp = eigenvalues_vector[i];
00201         eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1];
00202         eigenvalues_vector[m_target_dim-i-1] = tmp;
00203     }
00204 
00205     // finally construct embedding
00206     for (i=0; i<m_target_dim; i++)
00207     {
00208         for (j=0; j<N; j++)
00209             replace_feature_matrix[j*m_target_dim+i] *=
00210                 CMath::sqrt(eigenvalues_vector[i]);
00211     }
00212 
00213     // set eigenvalues vector
00214     m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim);
00215 #else /* not HAVE_ARPACK */
00216     // using LAPACK
00217     float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N);
00218     float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N);
00219     // solve eigenproblem with LAPACK
00220     wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status);
00221     // check for failure
00222     ASSERT(eigenproblem_status==0);
00223 
00224     // set eigenvalues vector
00225     m_eigenvalues = SGVector<float64_t>(m_target_dim);
00226 
00227     // fill eigenvalues vector in backwards order
00228     for (i=0; i<m_target_dim; i++)
00229         m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1];
00230 
00231     // construct embedding
00232     for (i=0; i<m_target_dim; i++)
00233     {
00234         for (j=0; j<N; j++)
00235         {
00236             replace_feature_matrix[j*m_target_dim+i] =
00237                   eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]);
00238         }
00239     }
00240     SG_FREE(eigenvectors);
00241     SG_FREE(eigenvalues_vector);
00242 #endif /* HAVE_ARPACK else */
00243 
00244     // warn user if there are negative or zero eigenvalues
00245     for (i=0; i<m_eigenvalues.vlen; i++)
00246     {
00247         if (m_eigenvalues.vector[i]<=0.0)
00248         {
00249             SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong",
00250                        i, m_eigenvalues.vlen-1);
00251             break;
00252         }
00253     }
00254 
00255     return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N);
00256 }
00257 
00258 SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(SGMatrix<float64_t> distance_matrix)
00259 {
00260     ASSERT(m_target_dim>0);
00261     ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00262     int32_t lmk_N = m_landmark_number;
00263     int32_t i,j;
00264     int32_t total_N = distance_matrix.num_cols;
00265     if (lmk_N<3)
00266     {
00267         SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n",
00268                  lmk_N);
00269     }
00270     if (lmk_N>total_N)
00271     {
00272         SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n",
00273                  lmk_N, total_N);
00274     }
00275 
00276     // get landmark indexes with random permutation
00277     SGVector<int32_t> lmk_idxs = CStatistics::sample_indices(lmk_N,total_N);
00278     // compute distances between landmarks
00279     float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N);
00280     for (i=0; i<lmk_N; i++)
00281     {
00282         for (j=0; j<lmk_N; j++)
00283             lmk_dist_matrix[i*lmk_N+j] =
00284                 distance_matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]];
00285     }
00286 
00287     // get landmarks embedding
00288     SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N);
00289     // compute mean vector of squared distances
00290     float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N);
00291     for (i=0; i<lmk_N; i++)
00292     {
00293         for (j=0; j<lmk_N; j++)
00294             mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]);
00295 
00296         mean_sq_dist_vector[i] /= lmk_N;
00297     }
00298     SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix);
00299 
00300     // construct new feature matrix
00301     float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N);
00302 
00303     // fill new feature matrix with embedded landmarks
00304     for (i=0; i<lmk_N; i++)
00305     {
00306         for (j=0; j<m_target_dim; j++)
00307             new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] =
00308                 lmk_feature_matrix[i*m_target_dim+j];
00309     }
00310 
00311     // get exactly defined pseudoinverse of landmarks feature matrix
00312     ASSERT(m_eigenvalues.vector && m_eigenvalues.vlen == m_target_dim);
00313     for (i=0; i<lmk_N; i++)
00314     {
00315         for (j=0; j<m_target_dim; j++)
00316             lmk_feature_matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j];
00317     }
00318 
00319 
00320     // set to_process els true if should be processed
00321     bool* to_process = SG_MALLOC(bool, total_N);
00322     for (j=0; j<total_N; j++)
00323         to_process[j] = true;
00324     for (j=0; j<lmk_N; j++)
00325         to_process[lmk_idxs.vector[j]] = false;
00326 
00327     // get embedding for non-landmark vectors
00328 #ifdef HAVE_PTHREAD
00329     int32_t t;
00330     int32_t num_threads = parallel->get_num_threads();
00331     ASSERT(num_threads>0);
00332     // allocate threads and it's parameters
00333     pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00334     TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads);
00335     pthread_attr_t attr;
00336     pthread_attr_init(&attr);
00337     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00338     float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads);
00339     // run threads
00340     for (t=0; t<num_threads; t++)
00341     {
00342         parameters[t].idx_start = t;
00343         parameters[t].idx_stop = total_N;
00344         parameters[t].idx_step = num_threads;
00345         parameters[t].lmk_N = lmk_N;
00346         parameters[t].total_N = total_N;
00347         parameters[t].m_target_dim = m_target_dim;
00348         parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N;
00349         parameters[t].distance_matrix = distance_matrix.matrix;
00350         parameters[t].mean_sq_dist_vector = mean_sq_dist_vector;
00351         parameters[t].lmk_idxs = lmk_idxs.vector;
00352         parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix;
00353         parameters[t].new_feature_matrix = new_feature_matrix;
00354         parameters[t].to_process = to_process;
00355         pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)&parameters[t]);
00356     }
00357     // join threads
00358     for (t=0; t<num_threads; t++)
00359         pthread_join(threads[t], NULL);
00360     pthread_attr_destroy(&attr);
00361     SG_FREE(parameters);
00362     SG_FREE(threads);
00363 #else
00364     // run single 'thread'
00365     float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N);
00366     TRIANGULATION_THREAD_PARAM single_thread_param;
00367     single_thread_param.idx_start = 0;
00368     single_thread_param.idx_stop = total_N;
00369     single_thread_param.idx_step = 1;
00370     single_thread_param.lmk_N = lmk_N;
00371     single_thread_param.total_N = total_N;
00372     single_thread_param.m_target_dim = m_target_dim;
00373     single_thread_param.current_dist_to_lmks = current_dist_to_lmks;
00374     single_thread_param.distance_matrix = distance_matrix.matrix;
00375     single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector;
00376     single_thread_param.lmk_idxs = lmk_idxs.vector;
00377     single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix;
00378     single_thread_param.new_feature_matrix = new_feature_matrix;
00379     single_thread_param.to_process = to_process;
00380     run_triangulation_thread((void*)&single_thread_param);
00381 #endif
00382     // cleanup
00383     SG_FREE(current_dist_to_lmks);
00384     SG_FREE(mean_sq_dist_vector);
00385     SG_FREE(to_process);
00386 
00387     return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N);
00388 }
00389 
00390 void* CMultidimensionalScaling::run_triangulation_thread(void* p)
00391 {
00392     TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p;
00393     int32_t idx_start = parameters->idx_start;
00394     int32_t idx_step = parameters->idx_step;
00395     int32_t idx_stop = parameters->idx_stop;
00396     const int32_t* lmk_idxs = parameters->lmk_idxs;
00397     const float64_t* distance_matrix = parameters->distance_matrix;
00398     const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector;
00399     float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks;
00400     int32_t m_target_dim = parameters->m_target_dim;
00401     int32_t lmk_N = parameters->lmk_N;
00402     int32_t total_N = parameters->total_N;
00403     const bool* to_process = parameters->to_process;
00404     float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix;
00405     float64_t* new_feature_matrix = parameters->new_feature_matrix;
00406 
00407     int32_t i,k;
00408     for (i=idx_start; i<idx_stop; i+=idx_step)
00409     {
00410         // skip if landmark
00411         if (!to_process[i])
00412             continue;
00413 
00414         // compute difference from mean landmark distance vector
00415         for (k=0; k<lmk_N; k++)
00416         {
00417             current_dist_to_lmks[k] =
00418                 CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) -
00419                 mean_sq_dist_vector[k];
00420         }
00421         // compute embedding
00422         cblas_dgemv(CblasColMajor,CblasNoTrans,
00423                     m_target_dim,lmk_N,
00424                     -0.5,lmk_feature_matrix,m_target_dim,
00425                     current_dist_to_lmks,1,
00426                     0.0,(new_feature_matrix+i*m_target_dim),1);
00427     }
00428     return NULL;
00429 }
00430 
00431 #endif /* HAVE_LAPACK */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation