SHOGUN: StochasticProximityEmbedding.cpp Source File

Go to the documentation of this file.
 /*
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 3 of the License, or
  * (at your option) any later version.
  *
  * Written (W) 2012 Fernando José Iglesias García
  * Copyright (C) 2012 Fernando José Iglesias García
  */
 
 #include <shogun/converter/StochasticProximityEmbedding.h>
 #include <shogun/lib/config.h>
 #ifdef HAVE_LAPACK
 #include <shogun/converter/EmbeddingConverter.h>
 #include <shogun/io/SGIO.h>
 #include <shogun/lib/CoverTree.h>
 #include <shogun/mathematics/Math.h>
 #include <shogun/distance/Distance.h>
 
 using namespace shogun;
 
 class SPE_COVERTREE_POINT
 {
 public:
 
     SPE_COVERTREE_POINT(int32_t index, CDistance* specp_distance)
     {
         m_point_index = index;
         m_distance = specp_distance;
     }
 
     inline double distance(const SPE_COVERTREE_POINT& p) const
     {
         return m_distance->distance(m_point_index, p.m_point_index);
     }
 
     inline bool operator==(const SPE_COVERTREE_POINT& p) const
     {
         return (p.m_point_index == m_point_index);
     }
 
     int32_t m_point_index;
     CDistance* m_distance;
 };
 
 CStochasticProximityEmbedding::CStochasticProximityEmbedding() : 
     CEmbeddingConverter()
 {
     // Initialize to default values
     m_k         = 12;
     m_nupdates  = 100;
     m_strategy  = SPE_GLOBAL;
     m_tolerance = 1e-5;
 
     init();
 }
 
 void CStochasticProximityEmbedding::init()
 {
     SG_ADD(&m_k, "m_k", "Number of neighbors", MS_NOT_AVAILABLE);
     SG_ADD((machine_int_t*) &m_strategy, "m_strategy", "SPE strategy", 
             MS_NOT_AVAILABLE);
     SG_ADD(&m_tolerance, "m_tolerance", "Regularization parameter", 
             MS_NOT_AVAILABLE);
 }
 
 CStochasticProximityEmbedding::~CStochasticProximityEmbedding()
 {
 }
 
 void CStochasticProximityEmbedding::set_k(int32_t k)
 {
     if ( k <= 0 )
         SG_ERROR("Number of neighbors k must be greater than 0");
 
     m_k = k;
 }
 
 int32_t CStochasticProximityEmbedding::get_k() const
 {
     return m_k;
 }
 
 void CStochasticProximityEmbedding::set_strategy(ESPEStrategy strategy)
 {
     m_strategy = strategy;
 }
 
 ESPEStrategy CStochasticProximityEmbedding::get_strategy() const
 {
     return m_strategy;
 }
 
 void CStochasticProximityEmbedding::set_tolerance(float32_t tolerance)
 {
     if ( tolerance <= 0 )
         SG_ERROR("Tolerance regularization parameter must be greater "
              "than 0");
 
     m_tolerance = tolerance;
 }
 
 int32_t CStochasticProximityEmbedding::get_tolerance() const
 {
     return m_tolerance;
 }
 
 void CStochasticProximityEmbedding::set_nupdates(int32_t nupdates)
 {
     if ( nupdates <= 0 )
         SG_ERROR("The number of updates must be greater than 0");
 
     m_nupdates = nupdates;
 }
 
 int32_t CStochasticProximityEmbedding::get_nupdates() const
 {
     return m_nupdates;
 }
 
 const char * CStochasticProximityEmbedding::get_name() const
 {
     return "StochasticProximityEmbedding";
 }
 
 CFeatures* CStochasticProximityEmbedding::apply(CFeatures* features)
 {
     if ( !features )
         SG_ERROR("Features are required to apply SPE\n");
 
     // Shorthand for the DenseFeatures
     CDenseFeatures< float64_t >* simple_features = 
         (CDenseFeatures< float64_t >*) features;
     SG_REF(features);
 
     // Get and check the number of vectors
     int32_t N = simple_features->get_num_vectors();
     if ( m_strategy == SPE_LOCAL && m_k >= N )
         SG_ERROR("The number of neighbors (%d) must be less than "
                  "the number of vectors (%d)\n", m_k, N);
 
     if ( 2*m_nupdates > N )
         SG_ERROR("The number of vectors (%d) must be at least two times "
              "the number of updates (%d)\n", N, m_nupdates);
 
     m_distance->init(simple_features, simple_features);
     CDenseFeatures< float64_t >* embedding = embed_distance(m_distance);
     m_distance->remove_lhs_and_rhs();
 
     SG_UNREF(features);
     return (CFeatures*)embedding;
 }
 
 SGMatrix<int32_t> CStochasticProximityEmbedding::get_neighborhood_matrix(CDistance* distance, int32_t k, int32_t N, float64_t max_dist)
 {
     int32_t i;
     int32_t* neighborhood_matrix = SG_MALLOC(int32_t, N*k);
 
     CoverTree<SPE_COVERTREE_POINT>* coverTree = new CoverTree<SPE_COVERTREE_POINT>(max_dist);
 
     for (i=0; i<N; i++)
         coverTree->insert(SPE_COVERTREE_POINT(i, distance));
 
     for (i=0; i<N; i++)
     {
         std::vector<SPE_COVERTREE_POINT> neighbors =
             coverTree->kNearestNeighbors(SPE_COVERTREE_POINT(i, distance), k+1);
         for (std::size_t m=1; m<unsigned(k+1); m++)
             neighborhood_matrix[i*k+m-1] = neighbors[m].m_point_index;
     }
 
     delete coverTree;
 
     return SGMatrix<int32_t>(neighborhood_matrix,k,N);
 }
 
 CDenseFeatures< float64_t >* CStochasticProximityEmbedding::embed_distance(CDistance* distance)
 {
     if ( !distance )
         SG_ERROR("Embed distance received no instance of CDistance\n");
 
     // Compute distance matrix
     SG_DEBUG("Computing distance matrix\n");
 
     if ( distance->get_distance_type() != D_EUCLIDEAN )
         SG_ERROR("SPE only supports Euclidean distance, %s given\n", 
                 distance->get_name());
 
     // Get the number of features, assume that distance(features, features)
     int32_t N = distance->get_num_vec_rhs();
 
     // Look for the maximum distance (make the same assumption)
     int32_t i, j, k;
     float64_t max = 0.0, tmp = 0.0;
     for ( i = 0 ; i < N ; ++i )
         for ( j = i+1 ; j < N ; ++j )
             if ( ( tmp = distance->distance(i, j) ) > max )
                 max = tmp;
 
     // Compute a normalizer to be used for the distances if global strategy selected
     float64_t alpha = 0.0;
     if ( m_strategy == SPE_GLOBAL )
         alpha = 1.0 / max * CMath::sqrt(2.0);
 
     // Compute neighborhood matrix if local strategy used
     SGMatrix< int32_t > neighbors_mat;
     if ( m_strategy == SPE_LOCAL )
     {
         SG_DEBUG("Computing neighborhood matrix\n");
         neighbors_mat = get_neighborhood_matrix(distance, m_k, N, max);
     }
 
     // Initialize vectors in the embedded space randomly, Y is the short for
     // new_feature_matrix
     SGMatrix< float64_t > Y(m_target_dim, N);
     SGVector<float64_t>::random_vector(Y.matrix, m_target_dim*N, 0.0, 1.0);
 
     // SPE's loop
     
     // Initialize the maximum number of iterations
     int32_t max_iter = 2000 + CMath::round(0.04 * N*N);
     if ( m_strategy == SPE_LOCAL )
         max_iter *= 3;
 
     // Initialize the learning parameter
     float32_t lambda = 1.0;
 
     // Initialize variables to use in the main loop
     float64_t sum = 0.0;
     index_t   idx1 = 0, idx2 = 0, idx = 0;
 
     SGVector< float64_t > scale(m_nupdates);
     SGVector< float64_t > D(m_nupdates);
     SGMatrix< float64_t > Yd(m_nupdates, m_target_dim);
     SGVector< float64_t > Rt(m_nupdates);
     int32_t* ind2 = NULL;
 
     // Variables required just if local strategy used
     SGMatrix< int32_t > ind1Neighbors;
     SGVector< int32_t > J2;
     if ( m_strategy == SPE_LOCAL )
     {
         ind2 = SG_MALLOC(int32_t, m_nupdates);
 
         ind1Neighbors = SGMatrix<int32_t>(m_k,m_nupdates);
 
         J2 = SGVector<int32_t>(m_nupdates);
     }
 
     for ( i = 0 ; i < max_iter ; ++i )
     {
         if ( !(i % 1000) )
             SG_DEBUG("SPE's loop, iteration %d of %d\n", i, max_iter);
 
         // Select the vectors to be updated in this iteration
         int32_t* J = CMath::randperm(N);
 
         // Pointer to the first set of vector indices to update
         int32_t* ind1 = J;
 
         // Pointer ind2 to the second set of vector indices to update
         if ( m_strategy == SPE_GLOBAL )
             ind2 = J + m_nupdates;
         else
         {
             // Select the second set of indices to update among neighbors
             // of the first set
                 
             // Get the neighbors of interest
             for ( j = 0 ; j < m_nupdates ; ++j )
             {
                 for ( k = 0 ; k < m_k ; ++k )
                     ind1Neighbors[k + j*m_k] =
                         neighbors_mat.matrix[k + ind1[j]*m_k];
             }
 
             // Generate pseudo-random indices
             for ( j = 0 ; j < m_nupdates ; ++j )
             {
                 J2[j] = CMath::round( CMath::random(0.0, 1.0)*(m_k-1) )
                         + m_k*j;
             }
 
             // Select final indices
             for ( j = 0 ; j < m_nupdates ; ++j )
                 ind2[j] = ind1Neighbors.matrix[ J2[j] ];
         }
 
         // Compute distances betweeen the selected points in embedded space
 
         for ( j = 0 ; j < m_nupdates ; ++j )
         {
             sum = 0.0;
 
             for ( k = 0 ; k < m_target_dim ; ++k )
             {
                 idx1 = k + ind1[j]*m_target_dim;
                 idx2 = k + ind2[j]*m_target_dim;
                 sum += CMath::sq(Y.matrix[idx1] - Y.matrix[idx2]);
             }
 
             D[j] = CMath::sqrt(sum);
         }
 
         // Get the corresponding distances in the original space
         
         if ( m_strategy == SPE_LOCAL )
             Rt.set_const(1);
         else    // SPE_GLOBAL strategy used
             Rt.set_const(alpha);
 
         for ( j = 0 ; j < m_nupdates ; ++j )
             Rt[j] *= distance->distance( ind1[j], ind2[j] );
 
         // Compute some terms for update
 
         // Scale factor: (Rt - D) ./ (D + m_tolerance)
         for ( j = 0 ; j < m_nupdates ; ++j )
             scale[j] = ( Rt[j] - D[j] ) / ( D[j] + m_tolerance );
 
         // Difference matrix: Y(ind1) - Y(ind2)
         for ( j = 0 ; j < m_nupdates ; ++j )
             for ( k = 0 ; k < m_target_dim ; ++k )
             {
                 idx1 = k + ind1[j]*m_target_dim;
                 idx2 = k + ind2[j]*m_target_dim;
                 idx  = k +      j *m_target_dim;
 
                 Yd[idx] = Y[idx1] - Y[idx2];
             }
 
         // Update the location of the vectors in the embedded space
         for ( j = 0 ; j < m_nupdates ; ++j )
             for ( k = 0 ; k < m_target_dim ; ++k )
             {
                 idx1 = k + ind1[j]*m_target_dim;
                 idx2 = k + ind2[j]*m_target_dim;
                 idx  = k +      j *m_target_dim;
 
                 Y[idx1] += lambda / 2 * scale[j] * Yd[idx];
                 Y[idx2] -= lambda / 2 * scale[j] * Yd[idx];
             }
 
         // Update the learning parameter
         lambda = lambda - ( lambda / max_iter );
 
         // Free memory
         SG_FREE(J);
     }
 
     // Free memory
     if ( m_strategy == SPE_LOCAL )
         SG_FREE(ind2);
 
     return new CDenseFeatures< float64_t >(Y);
 }
 
 #endif /* HAVE_LAPACK */