00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/converter/LocallyLinearEmbedding.h>
00012 #include <shogun/lib/config.h>
00013 #ifdef HAVE_LAPACK
00014 #include <shogun/converter/EmbeddingConverter.h>
00015 #include <shogun/mathematics/arpack.h>
00016 #include <shogun/mathematics/lapack.h>
00017 #include <shogun/lib/FibonacciHeap.h>
00018 #include <shogun/lib/CoverTree.h>
00019 #include <shogun/base/DynArray.h>
00020 #include <shogun/mathematics/Math.h>
00021 #include <shogun/io/SGIO.h>
00022 #include <shogun/lib/Time.h>
00023 #include <shogun/distance/Distance.h>
00024 #include <shogun/lib/Signal.h>
00025
00026 #ifdef HAVE_PTHREAD
00027 #include <pthread.h>
00028 #endif
00029
00030 using namespace shogun;
00031
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00033 struct LINRECONSTRUCTION_THREAD_PARAM
00034 {
00036 int32_t idx_start;
00038 int32_t idx_stop;
00040 int32_t idx_step;
00042 int32_t m_k;
00044 const int32_t* neighborhood_matrix;
00046 const float64_t* feature_matrix;
00048 float64_t* z_matrix;
00050 float64_t* covariance_matrix;
00052 float64_t* id_vector;
00054 float64_t* W_matrix;
00056 float64_t m_reconstruction_shift;
00058 int32_t dim;
00060 int32_t N;
00062 int32_t actual_k;
00063 #ifdef HAVE_PTHREAD
00064
00065 PTHREAD_LOCK_T* W_matrix_lock;
00066 #endif
00067 };
00068
00069 class LLE_COVERTREE_POINT
00070 {
00071 public:
00072
00073 LLE_COVERTREE_POINT(int32_t index, const SGMatrix<float64_t>& dmatrix)
00074 {
00075 point_index = index;
00076 distance_matrix = dmatrix;
00077 }
00078
00079 inline double distance(const LLE_COVERTREE_POINT& p) const
00080 {
00081 return distance_matrix[point_index*distance_matrix.num_rows+p.point_index];
00082 }
00083
00084 inline bool operator==(const LLE_COVERTREE_POINT& p) const
00085 {
00086 return (p.point_index==point_index);
00087 }
00088
00089 int32_t point_index;
00090 SGMatrix<float64_t> distance_matrix;
00091 };
00092
00093 #endif
00094
00095 CLocallyLinearEmbedding::CLocallyLinearEmbedding() :
00096 CEmbeddingConverter()
00097 {
00098 m_k = 10;
00099 m_max_k = 60;
00100 m_auto_k = false;
00101 m_nullspace_shift = -1e-9;
00102 m_reconstruction_shift = 1e-3;
00103 #ifdef HAVE_ARPACK
00104 m_use_arpack = true;
00105 #else
00106 m_use_arpack = false;
00107 #endif
00108 init();
00109 }
00110
00111 void CLocallyLinearEmbedding::init()
00112 {
00113 SG_ADD(&m_auto_k, "auto_k",
00114 "whether k should be determined automatically in range", MS_AVAILABLE);
00115 SG_ADD(&m_k, "k", "number of neighbors", MS_AVAILABLE);
00116 SG_ADD(&m_max_k, "max_k",
00117 "maximum number of neighbors used to compute optimal one", MS_AVAILABLE);
00118 m_parameters->add(&m_nullspace_shift, "nullspace_shift",
00119 "nullspace finding regularization shift");
00120 SG_ADD(&m_reconstruction_shift, "reconstruction_shift",
00121 "shift used to regularize reconstruction step", MS_NOT_AVAILABLE);
00122 SG_ADD(&m_use_arpack, "use_arpack", "whether arpack is being used or not",
00123 MS_NOT_AVAILABLE);
00124 }
00125
00126
00127 CLocallyLinearEmbedding::~CLocallyLinearEmbedding()
00128 {
00129 }
00130
00131 void CLocallyLinearEmbedding::set_k(int32_t k)
00132 {
00133 ASSERT(k>0);
00134 m_k = k;
00135 }
00136
00137 int32_t CLocallyLinearEmbedding::get_k() const
00138 {
00139 return m_k;
00140 }
00141
00142 void CLocallyLinearEmbedding::set_max_k(int32_t max_k)
00143 {
00144 ASSERT(max_k>=m_k);
00145 m_max_k = max_k;
00146 }
00147
00148 int32_t CLocallyLinearEmbedding::get_max_k() const
00149 {
00150 return m_max_k;
00151 }
00152
00153 void CLocallyLinearEmbedding::set_auto_k(bool auto_k)
00154 {
00155 m_auto_k = auto_k;
00156 }
00157
00158 bool CLocallyLinearEmbedding::get_auto_k() const
00159 {
00160 return m_auto_k;
00161 }
00162
00163 void CLocallyLinearEmbedding::set_nullspace_shift(float64_t nullspace_shift)
00164 {
00165 m_nullspace_shift = nullspace_shift;
00166 }
00167
00168 float64_t CLocallyLinearEmbedding::get_nullspace_shift() const
00169 {
00170 return m_nullspace_shift;
00171 }
00172
00173 void CLocallyLinearEmbedding::set_reconstruction_shift(float64_t reconstruction_shift)
00174 {
00175 m_reconstruction_shift = reconstruction_shift;
00176 }
00177
00178 float64_t CLocallyLinearEmbedding::get_reconstruction_shift() const
00179 {
00180 return m_reconstruction_shift;
00181 }
00182
00183 void CLocallyLinearEmbedding::set_use_arpack(bool use_arpack)
00184 {
00185 m_use_arpack = use_arpack;
00186 }
00187
00188 bool CLocallyLinearEmbedding::get_use_arpack() const
00189 {
00190 return m_use_arpack;
00191 }
00192
00193 const char* CLocallyLinearEmbedding::get_name() const
00194 {
00195 return "LocallyLinearEmbedding";
00196 }
00197
00198 CFeatures* CLocallyLinearEmbedding::apply(CFeatures* features)
00199 {
00200 ASSERT(features);
00201
00202 if (!(features->get_feature_class()==C_DENSE &&
00203 features->get_feature_type()==F_DREAL))
00204 {
00205 SG_ERROR("Given features are not of SimpleRealFeatures type.\n");
00206 }
00207
00208 CDenseFeatures<float64_t>* simple_features = (CDenseFeatures<float64_t>*) features;
00209 SG_REF(features);
00210
00211
00212 int32_t N = simple_features->get_num_vectors();
00213 if (m_k>=N)
00214 SG_ERROR("Number of neighbors (%d) should be less than number of objects (%d).\n",
00215 m_k, N);
00216
00217
00218 SG_DEBUG("Computing distance matrix\n");
00219 ASSERT(m_distance);
00220 CTime* time = new CTime();
00221 time->start();
00222 m_distance->init(simple_features,simple_features);
00223 SGMatrix<float64_t> distance_matrix = m_distance->get_distance_matrix();
00224 m_distance->remove_lhs_and_rhs();
00225 SG_DEBUG("Distance matrix computation took %fs\n",time->cur_time_diff());
00226 SG_DEBUG("Calculating neighborhood matrix\n");
00227 SGMatrix<int32_t> neighborhood_matrix;
00228
00229 time->start();
00230 if (m_auto_k)
00231 {
00232 neighborhood_matrix = get_neighborhood_matrix(distance_matrix,m_max_k);
00233 m_k = estimate_k(simple_features,neighborhood_matrix);
00234 SG_DEBUG("Estimated k with value of %d\n",m_k);
00235 }
00236 else
00237 neighborhood_matrix = get_neighborhood_matrix(distance_matrix,m_k);
00238
00239 SG_DEBUG("Neighbors finding took %fs\n",time->cur_time_diff());
00240
00241
00242 float64_t* W_matrix = SG_CALLOC(float64_t, N*N);
00243
00244
00245 SG_DEBUG("Constructing weight matrix\n");
00246 time->start();
00247 SGMatrix<float64_t> weight_matrix = construct_weight_matrix(simple_features,W_matrix,neighborhood_matrix);
00248 SG_DEBUG("Weight matrix construction took %.5fs\n", time->cur_time_diff());
00249
00250
00251 SG_DEBUG("Finding nullspace\n");
00252 time->start();
00253 SGMatrix<float64_t> new_feature_matrix = construct_embedding(weight_matrix,m_target_dim);
00254 SG_DEBUG("Eigenproblem solving took %.5fs\n", time->cur_time_diff());
00255 delete time;
00256
00257 SG_UNREF(features);
00258 return (CFeatures*)(new CDenseFeatures<float64_t>(new_feature_matrix));
00259 }
00260
00261 int32_t CLocallyLinearEmbedding::estimate_k(CDenseFeatures<float64_t>* simple_features, SGMatrix<int32_t> neighborhood_matrix)
00262 {
00263 int32_t right = m_max_k;
00264 int32_t left = m_k;
00265 int32_t left_third;
00266 int32_t right_third;
00267 ASSERT(right>=left);
00268 if (right==left) return left;
00269 int32_t dim;
00270 int32_t N;
00271 float64_t* feature_matrix= simple_features->get_feature_matrix(dim,N);
00272 float64_t* z_matrix = SG_MALLOC(float64_t,right*dim);
00273 float64_t* covariance_matrix = SG_MALLOC(float64_t,right*right);
00274 float64_t* resid_vector = SG_MALLOC(float64_t, right);
00275 float64_t* id_vector = SG_MALLOC(float64_t, right);
00276 while (right-left>2)
00277 {
00278 left_third = (left*2+right)/3;
00279 right_third = (right*2+left)/3;
00280 float64_t left_val = compute_reconstruction_error(left_third,dim,N,feature_matrix,z_matrix,
00281 covariance_matrix,resid_vector,
00282 id_vector,neighborhood_matrix);
00283 float64_t right_val = compute_reconstruction_error(right_third,dim,N,feature_matrix,z_matrix,
00284 covariance_matrix,resid_vector,
00285 id_vector,neighborhood_matrix);
00286 if (left_val<right_val)
00287 right = right_third;
00288 else
00289 left = left_third;
00290 }
00291 SG_FREE(z_matrix);
00292 SG_FREE(covariance_matrix);
00293 SG_FREE(resid_vector);
00294 SG_FREE(id_vector);
00295 return right;
00296 }
00297
00298 float64_t CLocallyLinearEmbedding::compute_reconstruction_error(int32_t k, int dim, int N, float64_t* feature_matrix,
00299 float64_t* z_matrix, float64_t* covariance_matrix,
00300 float64_t* resid_vector, float64_t* id_vector,
00301 SGMatrix<int32_t> neighborhood_matrix)
00302 {
00303
00304 int32_t i,j;
00305 float64_t total_residual_norm = 0.0;
00306 for (i=CMath::random(0,20); i<N; i+=N/20)
00307 {
00308 for (j=0; j<k; j++)
00309 {
00310 cblas_dcopy(dim,feature_matrix+neighborhood_matrix[i*m_k+j]*dim,1,z_matrix+j*dim,1);
00311 cblas_daxpy(dim,-1.0,feature_matrix+i*dim,1,z_matrix+j*dim,1);
00312 }
00313 cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,
00314 k,k,dim,
00315 1.0,z_matrix,dim,
00316 z_matrix,dim,
00317 0.0,covariance_matrix,k);
00318 for (j=0; j<k; j++)
00319 {
00320 resid_vector[j] = 1.0;
00321 id_vector[j] = 1.0;
00322 }
00323 if (k>dim)
00324 {
00325 float64_t trace = 0.0;
00326 for (j=0; j<k; j++)
00327 trace += covariance_matrix[j*k+j];
00328 for (j=0; j<m_k; j++)
00329 covariance_matrix[j*k+j] += m_reconstruction_shift*trace;
00330 }
00331 clapack_dposv(CblasColMajor,CblasLower,k,1,covariance_matrix,k,id_vector,k);
00332 float64_t norming=0.0;
00333 for (j=0; j<k; j++)
00334 norming += id_vector[j];
00335 cblas_dscal(k,-1.0/norming,id_vector,1);
00336 cblas_dsymv(CblasColMajor,CblasLower,k,-1.0,covariance_matrix,k,id_vector,1,1.0,resid_vector,1);
00337 total_residual_norm += cblas_dnrm2(k,resid_vector,1);
00338 }
00339 return total_residual_norm/k;
00340 }
00341
00342 SGMatrix<float64_t> CLocallyLinearEmbedding::construct_weight_matrix(CDenseFeatures<float64_t>* simple_features,
00343 float64_t* W_matrix, SGMatrix<int32_t> neighborhood_matrix)
00344 {
00345 int32_t N = simple_features->get_num_vectors();
00346 int32_t dim = simple_features->get_num_features();
00347 #ifdef HAVE_PTHREAD
00348 int32_t t;
00349 int32_t num_threads = parallel->get_num_threads();
00350 ASSERT(num_threads>0);
00351
00352 pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00353 LINRECONSTRUCTION_THREAD_PARAM* parameters = SG_MALLOC(LINRECONSTRUCTION_THREAD_PARAM, num_threads);
00354 pthread_attr_t attr;
00355 pthread_attr_init(&attr);
00356 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00357 #else
00358 int32_t num_threads = 1;
00359 #endif
00360
00361 float64_t* z_matrix = SG_MALLOC(float64_t, m_k*dim*num_threads);
00362 float64_t* covariance_matrix = SG_MALLOC(float64_t, m_k*m_k*num_threads);
00363 float64_t* id_vector = SG_MALLOC(float64_t, m_k*num_threads);
00364
00365
00366 SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
00367
00368 #ifdef HAVE_PTHREAD
00369 PTHREAD_LOCK_T W_matrix_lock;
00370 PTHREAD_LOCK_INIT(&W_matrix_lock);
00371 for (t=0; t<num_threads; t++)
00372 {
00373 parameters[t].idx_start = t;
00374 parameters[t].idx_step = num_threads;
00375 parameters[t].idx_stop = N;
00376 parameters[t].m_k = m_k;
00377 parameters[t].neighborhood_matrix = neighborhood_matrix.matrix;
00378 parameters[t].z_matrix = z_matrix+(m_k*dim)*t;
00379 parameters[t].feature_matrix = feature_matrix.matrix;
00380 parameters[t].covariance_matrix = covariance_matrix+(m_k*m_k)*t;
00381 parameters[t].id_vector = id_vector+m_k*t;
00382 parameters[t].W_matrix = W_matrix;
00383 parameters[t].m_reconstruction_shift = m_reconstruction_shift;
00384 parameters[t].dim = feature_matrix.num_rows;
00385 parameters[t].N = feature_matrix.num_cols;
00386 parameters[t].actual_k = neighborhood_matrix.num_rows;
00387 parameters[t].W_matrix_lock = &W_matrix_lock;
00388 pthread_create(&threads[t], &attr, run_linearreconstruction_thread, (void*)¶meters[t]);
00389 }
00390 for (t=0; t<num_threads; t++)
00391 pthread_join(threads[t], NULL);
00392 pthread_attr_destroy(&attr);
00393 SG_FREE(parameters);
00394 SG_FREE(threads);
00395 #else
00396 LINRECONSTRUCTION_THREAD_PARAM single_thread_param;
00397 single_thread_param.idx_start = 0;
00398 single_thread_param.idx_step = 1;
00399 single_thread_param.idx_stop = N;
00400 single_thread_param.m_k = m_k;
00401 single_thread_param.neighborhood_matrix = neighborhood_matrix.matrix;
00402 single_thread_param.z_matrix = z_matrix;
00403 single_thread_param.feature_matrix = feature_matrix.matrix;
00404 single_thread_param.covariance_matrix = covariance_matrix;
00405 single_thread_param.id_vector = id_vector;
00406 single_thread_param.W_matrix = W_matrix;
00407 single_thread_param.m_reconstruction_shift = m_reconstruction_shift;
00408 run_linearreconstruction_thread((void*)&single_thread_param);
00409 #endif
00410
00411
00412 SG_FREE(id_vector);
00413 SG_FREE(z_matrix);
00414 SG_FREE(covariance_matrix);
00415
00416 return SGMatrix<float64_t>(W_matrix,N,N);
00417 }
00418
00419 SGMatrix<float64_t> CLocallyLinearEmbedding::construct_embedding(SGMatrix<float64_t> matrix,int dimension)
00420 {
00421 int i,j;
00422 ASSERT(matrix.num_cols==matrix.num_rows);
00423 int N = matrix.num_cols;
00424
00425 int eigenproblem_status = 0;
00426
00427 float64_t* eigenvalues_vector = NULL;
00428 float64_t* eigenvectors = NULL;
00429 float64_t* nullspace_features = NULL;
00430 if (m_use_arpack)
00431 {
00432 #ifndef HAVE_ARPACK
00433 SG_ERROR("ARPACK is not supported in this configuration.\n");
00434 #endif
00435
00436 eigenvalues_vector = SG_MALLOC(float64_t, dimension+1);
00437 #ifdef HAVE_ARPACK
00438 arpack_dsxupd(matrix.matrix,NULL,false,N,dimension+1,"LA",true,3,true,false,m_nullspace_shift,0.0,
00439 eigenvalues_vector,matrix.matrix,eigenproblem_status);
00440 matrix.num_rows = dimension+1;
00441 #endif
00442 if (eigenproblem_status)
00443 SG_ERROR("ARPACK failed with code: %d", eigenproblem_status);
00444 nullspace_features = SG_MALLOC(float64_t, N*dimension);
00445 for (i=0; i<dimension; i++)
00446 {
00447 for (j=0; j<N; j++)
00448 nullspace_features[j*dimension+i] = matrix[j*(dimension+1)+i+1];
00449 }
00450 SG_FREE(eigenvalues_vector);
00451 }
00452 else
00453 {
00454
00455 eigenvalues_vector = SG_MALLOC(float64_t, N);
00456 eigenvectors = SG_MALLOC(float64_t,(dimension+1)*N);
00457 wrap_dsyevr('V','U',N,matrix.matrix,N,2,dimension+2,eigenvalues_vector,eigenvectors,&eigenproblem_status);
00458 if (eigenproblem_status)
00459 SG_ERROR("LAPACK failed with code: %d", eigenproblem_status);
00460 nullspace_features = SG_MALLOC(float64_t, N*dimension);
00461
00462 for (i=0; i<dimension; i++)
00463 {
00464 for (j=0; j<N; j++)
00465 nullspace_features[j*dimension+i] = eigenvectors[i*N+j];
00466 }
00467 SG_FREE(eigenvectors);
00468 SG_FREE(eigenvalues_vector);
00469 }
00470 return SGMatrix<float64_t>(nullspace_features,dimension,N);
00471 }
00472
00473 void* CLocallyLinearEmbedding::run_linearreconstruction_thread(void* p)
00474 {
00475 LINRECONSTRUCTION_THREAD_PARAM* parameters = (LINRECONSTRUCTION_THREAD_PARAM*)p;
00476 int32_t idx_start = parameters->idx_start;
00477 int32_t idx_step = parameters->idx_step;
00478 int32_t idx_stop = parameters->idx_stop;
00479 int32_t m_k = parameters->m_k;
00480 const int32_t* neighborhood_matrix = parameters->neighborhood_matrix;
00481 float64_t* z_matrix = parameters->z_matrix;
00482 const float64_t* feature_matrix = parameters->feature_matrix;
00483 float64_t* covariance_matrix = parameters->covariance_matrix;
00484 float64_t* id_vector = parameters->id_vector;
00485 float64_t* W_matrix = parameters->W_matrix;
00486 float64_t m_reconstruction_shift = parameters->m_reconstruction_shift;
00487 #ifdef HAVE_PTHREAD
00488 PTHREAD_LOCK_T* W_matrix_lock = parameters->W_matrix_lock;
00489 #endif
00490
00491 int32_t i,j,k;
00492 int32_t dim = parameters->dim;
00493 int32_t N = parameters->N;
00494 int32_t actual_k = parameters->actual_k;
00495 float64_t norming,trace;
00496
00497 for (i=idx_start; i<idx_stop; i+=idx_step)
00498 {
00499
00500
00501 for (j=0; j<m_k; j++)
00502 {
00503 cblas_dcopy(dim,feature_matrix+neighborhood_matrix[i*actual_k+j]*dim,1,z_matrix+j*dim,1);
00504 cblas_daxpy(dim,-1.0,feature_matrix+i*dim,1,z_matrix+j*dim,1);
00505 }
00506
00507
00508 cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,
00509 m_k,m_k,dim,
00510 1.0,z_matrix,dim,
00511 z_matrix,dim,
00512 0.0,covariance_matrix,m_k);
00513
00514 for (j=0; j<m_k; j++)
00515 id_vector[j] = 1.0;
00516
00517
00518 if (m_k>dim)
00519 {
00520
00521 trace = 0.0;
00522 for (j=0; j<m_k; j++)
00523 trace += covariance_matrix[j*m_k+j];
00524
00525 for (j=0; j<m_k; j++)
00526 covariance_matrix[j*m_k+j] += m_reconstruction_shift*trace;
00527 }
00528
00529
00530
00531 clapack_dposv(CblasColMajor,CblasLower,m_k,1,covariance_matrix,m_k,id_vector,m_k);
00532
00533
00534 norming=0.0;
00535 for (j=0; j<m_k; j++)
00536 norming += id_vector[j];
00537
00538 cblas_dscal(m_k,1.0/norming,id_vector,1);
00539
00540 memset(covariance_matrix,0,sizeof(float64_t)*m_k*m_k);
00541 cblas_dger(CblasColMajor,m_k,m_k,1.0,id_vector,1,id_vector,1,covariance_matrix,m_k);
00542
00543
00544 W_matrix[N*i+i] += 1.0;
00545 #ifdef HAVE_PTHREAD
00546 PTHREAD_LOCK(W_matrix_lock);
00547 #endif
00548 for (j=0; j<m_k; j++)
00549 {
00550 W_matrix[N*i+neighborhood_matrix[i*actual_k+j]] -= id_vector[j];
00551 W_matrix[N*neighborhood_matrix[i*actual_k+j]+i] -= id_vector[j];
00552 }
00553 for (j=0; j<m_k; j++)
00554 {
00555 for (k=0; k<m_k; k++)
00556 W_matrix[N*neighborhood_matrix[i*actual_k+j]+neighborhood_matrix[i*actual_k+k]]+=
00557 covariance_matrix[j*m_k+k];
00558 }
00559 #ifdef HAVE_PTHREAD
00560 PTHREAD_UNLOCK(W_matrix_lock);
00561 #endif
00562 }
00563 return NULL;
00564 }
00565
00566 SGMatrix<int32_t> CLocallyLinearEmbedding::get_neighborhood_matrix(SGMatrix<float64_t> distance_matrix, int32_t k)
00567 {
00568 int32_t i;
00569 int32_t N = distance_matrix.num_rows;
00570
00571 int32_t* neighborhood_matrix = SG_MALLOC(int32_t, N*k);
00572
00573 float64_t max_dist = SGVector<float64_t>::max(distance_matrix.matrix,N*N);
00574
00575 CoverTree<LLE_COVERTREE_POINT>* coverTree = new CoverTree<LLE_COVERTREE_POINT>(max_dist);
00576
00577 for (i=0; i<N; i++)
00578 coverTree->insert(LLE_COVERTREE_POINT(i,distance_matrix));
00579
00580 for (i=0; i<N; i++)
00581 {
00582 std::vector<LLE_COVERTREE_POINT> neighbors =
00583 coverTree->kNearestNeighbors(LLE_COVERTREE_POINT(i,distance_matrix),k+1);
00584 for (std::size_t m=1; m<unsigned(k+1); m++)
00585 neighborhood_matrix[i*k+m-1] = neighbors[m].point_index;
00586 }
00587
00588 delete coverTree;
00589
00590 return SGMatrix<int32_t>(neighborhood_matrix,k,N);
00591 }
00592 #endif