00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/converter/MultidimensionalScaling.h>
00012 #ifdef HAVE_LAPACK
00013 #include <shogun/converter/EmbeddingConverter.h>
00014 #include <shogun/mathematics/lapack.h>
00015 #include <shogun/mathematics/arpack.h>
00016 #include <shogun/distance/CustomDistance.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <shogun/mathematics/Statistics.h>
00020 #include <shogun/io/SGIO.h>
00021 #include <shogun/distance/EuclideanDistance.h>
00022
00023 #ifdef HAVE_PTHREAD
00024 #include <pthread.h>
00025 #endif
00026
00027 using namespace shogun;
00028
00029 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00030 struct TRIANGULATION_THREAD_PARAM
00031 {
00033 int32_t idx_start;
00035 int32_t idx_stop;
00037 int32_t idx_step;
00039 int32_t lmk_N;
00041 int32_t total_N;
00043 int32_t m_target_dim;
00045 float64_t* current_dist_to_lmks;
00047 float64_t* lmk_feature_matrix;
00049 float64_t* new_feature_matrix;
00051 const float64_t* distance_matrix;
00053 const float64_t* mean_sq_dist_vector;
00055 const int32_t* lmk_idxs;
00057 const bool* to_process;
00058 };
00059 #endif
00060
00061 CMultidimensionalScaling::CMultidimensionalScaling() : CEmbeddingConverter()
00062 {
00063 m_eigenvalues = SGVector<float64_t>();
00064 m_landmark_number = 3;
00065 m_landmark = false;
00066
00067 init();
00068 }
00069
00070 void CMultidimensionalScaling::init()
00071 {
00072 SG_ADD(&m_eigenvalues, "eigenvalues", "eigenvalues of last embedding",
00073 MS_NOT_AVAILABLE);
00074 SG_ADD(&m_landmark, "landmark",
00075 "indicates if landmark approximation should be used", MS_NOT_AVAILABLE);
00076 SG_ADD(&m_landmark_number, "landmark number",
00077 "the number of landmarks for approximation", MS_AVAILABLE);
00078 }
00079
00080 CMultidimensionalScaling::~CMultidimensionalScaling()
00081 {
00082 }
00083
00084 SGVector<float64_t> CMultidimensionalScaling::get_eigenvalues() const
00085 {
00086 return m_eigenvalues;
00087 }
00088
00089 void CMultidimensionalScaling::set_landmark_number(int32_t num)
00090 {
00091 if (num<3)
00092 SG_ERROR("Number of landmarks should be greater than 3 to make triangulation possible while %d given.",
00093 num);
00094 m_landmark_number = num;
00095 }
00096
00097 int32_t CMultidimensionalScaling::get_landmark_number() const
00098 {
00099 return m_landmark_number;
00100 }
00101
00102 void CMultidimensionalScaling::set_landmark(bool landmark)
00103 {
00104 m_landmark = landmark;
00105 }
00106
00107 bool CMultidimensionalScaling::get_landmark() const
00108 {
00109 return m_landmark;
00110 }
00111
00112 const char* CMultidimensionalScaling::get_name() const
00113 {
00114 return "MultidimensionalScaling";
00115 };
00116
00117 CDenseFeatures<float64_t>* CMultidimensionalScaling::embed_distance(CDistance* distance)
00118 {
00119 ASSERT(distance);
00120
00121
00122 SGMatrix<float64_t> distance_matrix = process_distance_matrix(distance->get_distance_matrix());
00123 SGMatrix<float64_t> feature_matrix;
00124 if (m_landmark)
00125 feature_matrix = landmark_embedding(distance_matrix);
00126 else
00127 feature_matrix = classic_embedding(distance_matrix);
00128
00129 return new CDenseFeatures<float64_t>(feature_matrix);
00130 }
00131
00132 SGMatrix<float64_t> CMultidimensionalScaling::process_distance_matrix(SGMatrix<float64_t> distance_matrix)
00133 {
00134 return distance_matrix;
00135 }
00136
00137 CFeatures* CMultidimensionalScaling::apply(CFeatures* features)
00138 {
00139 SG_REF(features);
00140 ASSERT(m_distance);
00141
00142 m_distance->init(features,features);
00143 CDenseFeatures<float64_t>* embedding = embed_distance(m_distance);
00144 m_distance->remove_lhs_and_rhs();
00145
00146 SG_UNREF(features);
00147 return (CFeatures*)embedding;
00148 }
00149
00150 SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(SGMatrix<float64_t> distance_matrix)
00151 {
00152 ASSERT(m_target_dim>0);
00153 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00154 int32_t N = distance_matrix.num_cols;
00155
00156
00157 int32_t i,j;
00158
00159
00160 for (i=0; i<N; i++)
00161 {
00162 for (j=0; j<N; j++)
00163 distance_matrix(i,j) = CMath::sq(distance_matrix(i,j));
00164 }
00165 SGMatrix<float64_t>::center_matrix(distance_matrix.matrix,N,N);
00166 for (i=0; i<N; i++)
00167 {
00168 for (j=0; j<N; j++)
00169 distance_matrix(i,j) = distance_matrix(i,j)*(-0.5);
00170 }
00171
00172
00173 float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim);
00174
00175
00176 int eigenproblem_status = 0;
00177 #ifdef HAVE_ARPACK
00178
00179 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim);
00180
00181 arpack_dsxupd(distance_matrix.matrix,NULL,false,N,m_target_dim,"LM",false,1,false,false,0.0,0.0,
00182 eigenvalues_vector,replace_feature_matrix,eigenproblem_status);
00183
00184 ASSERT(eigenproblem_status == 0);
00185
00186 float64_t tmp;
00187 for (j=0; j<N; j++)
00188 {
00189 for (i=0; i<m_target_dim/2; i++)
00190 {
00191 tmp = replace_feature_matrix[j*m_target_dim+i];
00192 replace_feature_matrix[j*m_target_dim+i] =
00193 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)];
00194 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp;
00195 }
00196 }
00197
00198 for (i=0; i<m_target_dim/2; i++)
00199 {
00200 tmp = eigenvalues_vector[i];
00201 eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1];
00202 eigenvalues_vector[m_target_dim-i-1] = tmp;
00203 }
00204
00205
00206 for (i=0; i<m_target_dim; i++)
00207 {
00208 for (j=0; j<N; j++)
00209 replace_feature_matrix[j*m_target_dim+i] *=
00210 CMath::sqrt(eigenvalues_vector[i]);
00211 }
00212
00213
00214 m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim);
00215 #else
00216
00217 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N);
00218 float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N);
00219
00220 wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status);
00221
00222 ASSERT(eigenproblem_status==0);
00223
00224
00225 m_eigenvalues = SGVector<float64_t>(m_target_dim);
00226
00227
00228 for (i=0; i<m_target_dim; i++)
00229 m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1];
00230
00231
00232 for (i=0; i<m_target_dim; i++)
00233 {
00234 for (j=0; j<N; j++)
00235 {
00236 replace_feature_matrix[j*m_target_dim+i] =
00237 eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]);
00238 }
00239 }
00240 SG_FREE(eigenvectors);
00241 SG_FREE(eigenvalues_vector);
00242 #endif
00243
00244
00245 for (i=0; i<m_eigenvalues.vlen; i++)
00246 {
00247 if (m_eigenvalues.vector[i]<=0.0)
00248 {
00249 SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong",
00250 i, m_eigenvalues.vlen-1);
00251 break;
00252 }
00253 }
00254
00255 return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N);
00256 }
00257
00258 SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(SGMatrix<float64_t> distance_matrix)
00259 {
00260 ASSERT(m_target_dim>0);
00261 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
00262 int32_t lmk_N = m_landmark_number;
00263 int32_t i,j;
00264 int32_t total_N = distance_matrix.num_cols;
00265 if (lmk_N<3)
00266 {
00267 SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n",
00268 lmk_N);
00269 }
00270 if (lmk_N>total_N)
00271 {
00272 SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n",
00273 lmk_N, total_N);
00274 }
00275
00276
00277 SGVector<int32_t> lmk_idxs = CStatistics::sample_indices(lmk_N,total_N);
00278
00279 float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N);
00280 for (i=0; i<lmk_N; i++)
00281 {
00282 for (j=0; j<lmk_N; j++)
00283 lmk_dist_matrix[i*lmk_N+j] =
00284 distance_matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]];
00285 }
00286
00287
00288 SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N);
00289
00290 float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N);
00291 for (i=0; i<lmk_N; i++)
00292 {
00293 for (j=0; j<lmk_N; j++)
00294 mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]);
00295
00296 mean_sq_dist_vector[i] /= lmk_N;
00297 }
00298 SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix);
00299
00300
00301 float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N);
00302
00303
00304 for (i=0; i<lmk_N; i++)
00305 {
00306 for (j=0; j<m_target_dim; j++)
00307 new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] =
00308 lmk_feature_matrix[i*m_target_dim+j];
00309 }
00310
00311
00312 ASSERT(m_eigenvalues.vector && m_eigenvalues.vlen == m_target_dim);
00313 for (i=0; i<lmk_N; i++)
00314 {
00315 for (j=0; j<m_target_dim; j++)
00316 lmk_feature_matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j];
00317 }
00318
00319
00320
00321 bool* to_process = SG_MALLOC(bool, total_N);
00322 for (j=0; j<total_N; j++)
00323 to_process[j] = true;
00324 for (j=0; j<lmk_N; j++)
00325 to_process[lmk_idxs.vector[j]] = false;
00326
00327
00328 #ifdef HAVE_PTHREAD
00329 int32_t t;
00330 int32_t num_threads = parallel->get_num_threads();
00331 ASSERT(num_threads>0);
00332
00333 pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00334 TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads);
00335 pthread_attr_t attr;
00336 pthread_attr_init(&attr);
00337 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00338 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads);
00339
00340 for (t=0; t<num_threads; t++)
00341 {
00342 parameters[t].idx_start = t;
00343 parameters[t].idx_stop = total_N;
00344 parameters[t].idx_step = num_threads;
00345 parameters[t].lmk_N = lmk_N;
00346 parameters[t].total_N = total_N;
00347 parameters[t].m_target_dim = m_target_dim;
00348 parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N;
00349 parameters[t].distance_matrix = distance_matrix.matrix;
00350 parameters[t].mean_sq_dist_vector = mean_sq_dist_vector;
00351 parameters[t].lmk_idxs = lmk_idxs.vector;
00352 parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix;
00353 parameters[t].new_feature_matrix = new_feature_matrix;
00354 parameters[t].to_process = to_process;
00355 pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)¶meters[t]);
00356 }
00357
00358 for (t=0; t<num_threads; t++)
00359 pthread_join(threads[t], NULL);
00360 pthread_attr_destroy(&attr);
00361 SG_FREE(parameters);
00362 SG_FREE(threads);
00363 #else
00364
00365 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N);
00366 TRIANGULATION_THREAD_PARAM single_thread_param;
00367 single_thread_param.idx_start = 0;
00368 single_thread_param.idx_stop = total_N;
00369 single_thread_param.idx_step = 1;
00370 single_thread_param.lmk_N = lmk_N;
00371 single_thread_param.total_N = total_N;
00372 single_thread_param.m_target_dim = m_target_dim;
00373 single_thread_param.current_dist_to_lmks = current_dist_to_lmks;
00374 single_thread_param.distance_matrix = distance_matrix.matrix;
00375 single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector;
00376 single_thread_param.lmk_idxs = lmk_idxs.vector;
00377 single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix;
00378 single_thread_param.new_feature_matrix = new_feature_matrix;
00379 single_thread_param.to_process = to_process;
00380 run_triangulation_thread((void*)&single_thread_param);
00381 #endif
00382
00383 SG_FREE(current_dist_to_lmks);
00384 SG_FREE(mean_sq_dist_vector);
00385 SG_FREE(to_process);
00386
00387 return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N);
00388 }
00389
00390 void* CMultidimensionalScaling::run_triangulation_thread(void* p)
00391 {
00392 TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p;
00393 int32_t idx_start = parameters->idx_start;
00394 int32_t idx_step = parameters->idx_step;
00395 int32_t idx_stop = parameters->idx_stop;
00396 const int32_t* lmk_idxs = parameters->lmk_idxs;
00397 const float64_t* distance_matrix = parameters->distance_matrix;
00398 const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector;
00399 float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks;
00400 int32_t m_target_dim = parameters->m_target_dim;
00401 int32_t lmk_N = parameters->lmk_N;
00402 int32_t total_N = parameters->total_N;
00403 const bool* to_process = parameters->to_process;
00404 float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix;
00405 float64_t* new_feature_matrix = parameters->new_feature_matrix;
00406
00407 int32_t i,k;
00408 for (i=idx_start; i<idx_stop; i+=idx_step)
00409 {
00410
00411 if (!to_process[i])
00412 continue;
00413
00414
00415 for (k=0; k<lmk_N; k++)
00416 {
00417 current_dist_to_lmks[k] =
00418 CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) -
00419 mean_sq_dist_vector[k];
00420 }
00421
00422 cblas_dgemv(CblasColMajor,CblasNoTrans,
00423 m_target_dim,lmk_N,
00424 -0.5,lmk_feature_matrix,m_target_dim,
00425 current_dist_to_lmks,1,
00426 0.0,(new_feature_matrix+i*m_target_dim),1);
00427 }
00428 return NULL;
00429 }
00430
00431 #endif