00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/converter/StochasticProximityEmbedding.h>
00012 #include <shogun/lib/config.h>
00013 #ifdef HAVE_LAPACK
00014 #include <shogun/converter/EmbeddingConverter.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/lib/CoverTree.h>
00017 #include <shogun/mathematics/Math.h>
00018 #include <shogun/distance/Distance.h>
00019
00020 using namespace shogun;
00021
00022 class SPE_COVERTREE_POINT
00023 {
00024 public:
00025
00026 SPE_COVERTREE_POINT(int32_t index, CDistance* specp_distance)
00027 {
00028 m_point_index = index;
00029 m_distance = specp_distance;
00030 }
00031
00032 inline double distance(const SPE_COVERTREE_POINT& p) const
00033 {
00034 return m_distance->distance(m_point_index, p.m_point_index);
00035 }
00036
00037 inline bool operator==(const SPE_COVERTREE_POINT& p) const
00038 {
00039 return (p.m_point_index == m_point_index);
00040 }
00041
00042 int32_t m_point_index;
00043 CDistance* m_distance;
00044 };
00045
00046 CStochasticProximityEmbedding::CStochasticProximityEmbedding() :
00047 CEmbeddingConverter()
00048 {
00049
00050 m_k = 12;
00051 m_nupdates = 100;
00052 m_strategy = SPE_GLOBAL;
00053 m_tolerance = 1e-5;
00054
00055 init();
00056 }
00057
00058 void CStochasticProximityEmbedding::init()
00059 {
00060 SG_ADD(&m_k, "m_k", "Number of neighbors", MS_NOT_AVAILABLE);
00061 SG_ADD((machine_int_t*) &m_strategy, "m_strategy", "SPE strategy",
00062 MS_NOT_AVAILABLE);
00063 SG_ADD(&m_tolerance, "m_tolerance", "Regularization parameter",
00064 MS_NOT_AVAILABLE);
00065 }
00066
00067 CStochasticProximityEmbedding::~CStochasticProximityEmbedding()
00068 {
00069 }
00070
00071 void CStochasticProximityEmbedding::set_k(int32_t k)
00072 {
00073 if ( k <= 0 )
00074 SG_ERROR("Number of neighbors k must be greater than 0");
00075
00076 m_k = k;
00077 }
00078
00079 int32_t CStochasticProximityEmbedding::get_k() const
00080 {
00081 return m_k;
00082 }
00083
00084 void CStochasticProximityEmbedding::set_strategy(ESPEStrategy strategy)
00085 {
00086 m_strategy = strategy;
00087 }
00088
00089 ESPEStrategy CStochasticProximityEmbedding::get_strategy() const
00090 {
00091 return m_strategy;
00092 }
00093
00094 void CStochasticProximityEmbedding::set_tolerance(float32_t tolerance)
00095 {
00096 if ( tolerance <= 0 )
00097 SG_ERROR("Tolerance regularization parameter must be greater "
00098 "than 0");
00099
00100 m_tolerance = tolerance;
00101 }
00102
00103 int32_t CStochasticProximityEmbedding::get_tolerance() const
00104 {
00105 return m_tolerance;
00106 }
00107
00108 void CStochasticProximityEmbedding::set_nupdates(int32_t nupdates)
00109 {
00110 if ( nupdates <= 0 )
00111 SG_ERROR("The number of updates must be greater than 0");
00112
00113 m_nupdates = nupdates;
00114 }
00115
00116 int32_t CStochasticProximityEmbedding::get_nupdates() const
00117 {
00118 return m_nupdates;
00119 }
00120
00121 const char * CStochasticProximityEmbedding::get_name() const
00122 {
00123 return "StochasticProximityEmbedding";
00124 }
00125
00126 CFeatures* CStochasticProximityEmbedding::apply(CFeatures* features)
00127 {
00128 if ( !features )
00129 SG_ERROR("Features are required to apply SPE\n");
00130
00131
00132 CDenseFeatures< float64_t >* simple_features =
00133 (CDenseFeatures< float64_t >*) features;
00134 SG_REF(features);
00135
00136
00137 int32_t N = simple_features->get_num_vectors();
00138 if ( m_strategy == SPE_LOCAL && m_k >= N )
00139 SG_ERROR("The number of neighbors (%d) must be less than "
00140 "the number of vectors (%d)\n", m_k, N);
00141
00142 if ( 2*m_nupdates > N )
00143 SG_ERROR("The number of vectors (%d) must be at least two times "
00144 "the number of updates (%d)\n", N, m_nupdates);
00145
00146 m_distance->init(simple_features, simple_features);
00147 CDenseFeatures< float64_t >* embedding = embed_distance(m_distance);
00148 m_distance->remove_lhs_and_rhs();
00149
00150 SG_UNREF(features);
00151 return (CFeatures*)embedding;
00152 }
00153
00154 SGMatrix<int32_t> CStochasticProximityEmbedding::get_neighborhood_matrix(CDistance* distance, int32_t k, int32_t N, float64_t max_dist)
00155 {
00156 int32_t i;
00157 int32_t* neighborhood_matrix = SG_MALLOC(int32_t, N*k);
00158
00159 CoverTree<SPE_COVERTREE_POINT>* coverTree = new CoverTree<SPE_COVERTREE_POINT>(max_dist);
00160
00161 for (i=0; i<N; i++)
00162 coverTree->insert(SPE_COVERTREE_POINT(i, distance));
00163
00164 for (i=0; i<N; i++)
00165 {
00166 std::vector<SPE_COVERTREE_POINT> neighbors =
00167 coverTree->kNearestNeighbors(SPE_COVERTREE_POINT(i, distance), k+1);
00168 for (std::size_t m=1; m<unsigned(k+1); m++)
00169 neighborhood_matrix[i*k+m-1] = neighbors[m].m_point_index;
00170 }
00171
00172 delete coverTree;
00173
00174 return SGMatrix<int32_t>(neighborhood_matrix,k,N);
00175 }
00176
00177 CDenseFeatures< float64_t >* CStochasticProximityEmbedding::embed_distance(CDistance* distance)
00178 {
00179 if ( !distance )
00180 SG_ERROR("Embed distance received no instance of CDistance\n");
00181
00182
00183 SG_DEBUG("Computing distance matrix\n");
00184
00185 if ( distance->get_distance_type() != D_EUCLIDEAN )
00186 SG_ERROR("SPE only supports Euclidean distance, %s given\n",
00187 distance->get_name());
00188
00189
00190 int32_t N = distance->get_num_vec_rhs();
00191
00192
00193 int32_t i, j, k;
00194 float64_t max = 0.0, tmp = 0.0;
00195 for ( i = 0 ; i < N ; ++i )
00196 for ( j = i+1 ; j < N ; ++j )
00197 if ( ( tmp = distance->distance(i, j) ) > max )
00198 max = tmp;
00199
00200
00201 float64_t alpha = 0.0;
00202 if ( m_strategy == SPE_GLOBAL )
00203 alpha = 1.0 / max * CMath::sqrt(2.0);
00204
00205
00206 SGMatrix< int32_t > neighbors_mat;
00207 if ( m_strategy == SPE_LOCAL )
00208 {
00209 SG_DEBUG("Computing neighborhood matrix\n");
00210 neighbors_mat = get_neighborhood_matrix(distance, m_k, N, max);
00211 }
00212
00213
00214
00215 SGMatrix< float64_t > Y(m_target_dim, N);
00216 SGVector<float64_t>::random_vector(Y.matrix, m_target_dim*N, 0.0, 1.0);
00217
00218
00219
00220
00221 int32_t max_iter = 2000 + CMath::round(0.04 * N*N);
00222 if ( m_strategy == SPE_LOCAL )
00223 max_iter *= 3;
00224
00225
00226 float32_t lambda = 1.0;
00227
00228
00229 float64_t sum = 0.0;
00230 index_t idx1 = 0, idx2 = 0, idx = 0;
00231
00232 SGVector< float64_t > scale(m_nupdates);
00233 SGVector< float64_t > D(m_nupdates);
00234 SGMatrix< float64_t > Yd(m_nupdates, m_target_dim);
00235 SGVector< float64_t > Rt(m_nupdates);
00236 int32_t* ind2 = NULL;
00237
00238
00239 SGMatrix< int32_t > ind1Neighbors;
00240 SGVector< int32_t > J2;
00241 if ( m_strategy == SPE_LOCAL )
00242 {
00243 ind2 = SG_MALLOC(int32_t, m_nupdates);
00244
00245 ind1Neighbors = SGMatrix<int32_t>(m_k,m_nupdates);
00246
00247 J2 = SGVector<int32_t>(m_nupdates);
00248 }
00249
00250 for ( i = 0 ; i < max_iter ; ++i )
00251 {
00252 if ( !(i % 1000) )
00253 SG_DEBUG("SPE's loop, iteration %d of %d\n", i, max_iter);
00254
00255
00256 int32_t* J = CMath::randperm(N);
00257
00258
00259 int32_t* ind1 = J;
00260
00261
00262 if ( m_strategy == SPE_GLOBAL )
00263 ind2 = J + m_nupdates;
00264 else
00265 {
00266
00267
00268
00269
00270 for ( j = 0 ; j < m_nupdates ; ++j )
00271 {
00272 for ( k = 0 ; k < m_k ; ++k )
00273 ind1Neighbors[k + j*m_k] =
00274 neighbors_mat.matrix[k + ind1[j]*m_k];
00275 }
00276
00277
00278 for ( j = 0 ; j < m_nupdates ; ++j )
00279 {
00280 J2[j] = CMath::round( CMath::random(0.0, 1.0)*(m_k-1) )
00281 + m_k*j;
00282 }
00283
00284
00285 for ( j = 0 ; j < m_nupdates ; ++j )
00286 ind2[j] = ind1Neighbors.matrix[ J2[j] ];
00287 }
00288
00289
00290
00291 for ( j = 0 ; j < m_nupdates ; ++j )
00292 {
00293 sum = 0.0;
00294
00295 for ( k = 0 ; k < m_target_dim ; ++k )
00296 {
00297 idx1 = k + ind1[j]*m_target_dim;
00298 idx2 = k + ind2[j]*m_target_dim;
00299 sum += CMath::sq(Y.matrix[idx1] - Y.matrix[idx2]);
00300 }
00301
00302 D[j] = CMath::sqrt(sum);
00303 }
00304
00305
00306
00307 if ( m_strategy == SPE_LOCAL )
00308 Rt.set_const(1);
00309 else
00310 Rt.set_const(alpha);
00311
00312 for ( j = 0 ; j < m_nupdates ; ++j )
00313 Rt[j] *= distance->distance( ind1[j], ind2[j] );
00314
00315
00316
00317
00318 for ( j = 0 ; j < m_nupdates ; ++j )
00319 scale[j] = ( Rt[j] - D[j] ) / ( D[j] + m_tolerance );
00320
00321
00322 for ( j = 0 ; j < m_nupdates ; ++j )
00323 for ( k = 0 ; k < m_target_dim ; ++k )
00324 {
00325 idx1 = k + ind1[j]*m_target_dim;
00326 idx2 = k + ind2[j]*m_target_dim;
00327 idx = k + j *m_target_dim;
00328
00329 Yd[idx] = Y[idx1] - Y[idx2];
00330 }
00331
00332
00333 for ( j = 0 ; j < m_nupdates ; ++j )
00334 for ( k = 0 ; k < m_target_dim ; ++k )
00335 {
00336 idx1 = k + ind1[j]*m_target_dim;
00337 idx2 = k + ind2[j]*m_target_dim;
00338 idx = k + j *m_target_dim;
00339
00340 Y[idx1] += lambda / 2 * scale[j] * Yd[idx];
00341 Y[idx2] -= lambda / 2 * scale[j] * Yd[idx];
00342 }
00343
00344
00345 lambda = lambda - ( lambda / max_iter );
00346
00347
00348 SG_FREE(J);
00349 }
00350
00351
00352 if ( m_strategy == SPE_LOCAL )
00353 SG_FREE(ind2);
00354
00355 return new CDenseFeatures< float64_t >(Y);
00356 }
00357
00358 #endif