SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StochasticProximityEmbedding.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2012 Fernando José Iglesias García
8  * Copyright (C) 2012 Fernando José Iglesias García
9  */
10 
12 #include <shogun/lib/config.h>
13 #ifdef HAVE_LAPACK
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/CoverTree.h>
19 
20 using namespace shogun;
21 
// Lightweight point wrapper stored in the CoverTree used for k-NN queries in
// SPE: it holds a vector index and delegates distance computation to a
// CDistance object.
// NOTE(review): the class header line (presumably "class SPE_COVERTREE_POINT")
// and the declaration of the m_distance member (a CDistance*, assigned in the
// constructor and used by distance()) are missing from this extracted view —
// confirm against the original file.
23 {
24 public:
25 
26  SPE_COVERTREE_POINT(int32_t index, CDistance* specp_distance)
27  {
28  m_point_index = index;
29  m_distance = specp_distance;
30  }
31 
// Distance between this point and p, computed in the ORIGINAL feature space
// via the shared CDistance instance (required by the CoverTree interface).
32  inline double distance(const SPE_COVERTREE_POINT& p) const
33  {
34  return m_distance->distance(m_point_index, p.m_point_index);
35  }
36 
// Two cover-tree points are equal iff they wrap the same vector index.
37  inline bool operator==(const SPE_COVERTREE_POINT& p) const
38  {
39  return (p.m_point_index == m_point_index);
40  }
41 
42  int32_t m_point_index;
44 };
45 
// Default constructor: sets the SPE hyper-parameters to their defaults
// (12 neighbors, 100 pair updates per iteration, global strategy, 1e-5
// tolerance) and registers them via init().
// NOTE(review): the constructor signature line is missing from this extracted
// view (presumably "CStochasticProximityEmbedding::CStochasticProximityEmbedding()
// : CEmbeddingConverter()") — confirm against the original file.
48 {
49  // Initialize to default values
50  m_k = 12;
51  m_nupdates = 100;
52  m_strategy = SPE_GLOBAL;
53  m_tolerance = 1e-5;
54 
55  init();
56 }
57 
// Registers the tunable members with Shogun's parameter framework so they are
// serialized and exposed to model selection.
// NOTE(review): the continuation lines of the last two SG_ADD calls (the
// MS_* availability flag and closing ");") are missing from this extracted
// view — confirm against the original file.
58 void CStochasticProximityEmbedding::init()
59 {
60  SG_ADD(&m_k, "m_k", "Number of neighbors", MS_NOT_AVAILABLE);
61  SG_ADD((machine_int_t*) &m_strategy, "m_strategy", "SPE strategy",
63  SG_ADD(&m_tolerance, "m_tolerance", "Regularization parameter",
65 }
66 
// Empty body — the class owns no resources needing explicit release here.
// NOTE(review): the signature line is missing from this extracted view
// (presumably the destructor ~CStochasticProximityEmbedding()) — confirm.
68 {
69 }
70 
// Setter for m_k (number of neighbors used by the local strategy); rejects
// non-positive values with SG_ERROR.
// NOTE(review): signature line missing from this extracted view — presumably
// "void CStochasticProximityEmbedding::set_k(int32_t k)". Confirm.
72 {
73  if ( k <= 0 )
74  SG_ERROR("Number of neighbors k must be greater than 0");
75 
76  m_k = k;
77 }
78 
// Getter for m_k. NOTE(review): signature line missing from this extracted
// view — presumably "int32_t CStochasticProximityEmbedding::get_k() const".
80 {
81  return m_k;
82 }
83 
// Setter for m_strategy (SPE_GLOBAL or SPE_LOCAL). Unlike the other setters,
// no validation is performed here (the enum type constrains the value).
// NOTE(review): signature line missing from this extracted view — presumably
// "void CStochasticProximityEmbedding::set_strategy(ESPEStrategy strategy)".
85 {
86  m_strategy = strategy;
87 }
88 
// Getter for m_strategy. NOTE(review): signature line missing from this
// extracted view — confirm against the original file.
90 {
91  return m_strategy;
92 }
93 
// Setter for m_tolerance, the regularizer added to the denominator of the
// SPE update scale factor; must be strictly positive to avoid division by
// zero when embedded points coincide.
// NOTE(review): signature line missing from this extracted view — presumably
// "void CStochasticProximityEmbedding::set_tolerance(float32_t tolerance)".
95 {
96  if ( tolerance <= 0 )
97  SG_ERROR("Tolerance regularization parameter must be greater "
98  "than 0");
99 
100  m_tolerance = tolerance;
101 }
102 
// Getter for m_tolerance. NOTE(review): signature line missing from this
// extracted view — confirm against the original file.
104 {
105  return m_tolerance;
106 }
107 
// Setter for m_nupdates, the number of point-pair updates performed per SPE
// iteration; must be strictly positive.
// NOTE(review): signature line missing from this extracted view — presumably
// "void CStochasticProximityEmbedding::set_nupdates(int32_t nupdates)".
109 {
110  if ( nupdates <= 0 )
111  SG_ERROR("The number of updates must be greater than 0");
112 
113  m_nupdates = nupdates;
114 }
115 
// Getter for m_nupdates. NOTE(review): signature line missing from this
// extracted view — confirm against the original file.
117 {
118  return m_nupdates;
119 }
120 
// Returns the class name used by Shogun's SGObject machinery.
// NOTE(review): signature line missing from this extracted view — presumably
// "const char* CStochasticProximityEmbedding::get_name() const".
122 {
123  return "StochasticProximityEmbedding";
124 }
125 
// Entry point: validates the input features against the configured
// hyper-parameters (m_k must be < N for the local strategy; N must be at
// least 2*m_nupdates so two disjoint update sets can be drawn), initializes
// the distance on the features, and delegates to embed_distance().
// NOTE(review): the function signature line is missing from this extracted
// view (presumably "CFeatures* CStochasticProximityEmbedding::apply(CFeatures*
// features)"), as is original line 148 (likely a cleanup call on m_distance
// after embedding) — confirm against the original file.
127 {
128  if ( !features )
129  SG_ERROR("Features are required to apply SPE\n");
130 
131  // Shorthand for the DenseFeatures
132  CDenseFeatures< float64_t >* simple_features =
133  (CDenseFeatures< float64_t >*) features;
134  SG_REF(features);
135 
136  // Get and check the number of vectors
137  int32_t N = simple_features->get_num_vectors();
138  if ( m_strategy == SPE_LOCAL && m_k >= N )
139  SG_ERROR("The number of neighbors (%d) must be less than "
140  "the number of vectors (%d)\n", m_k, N);
141 
142  if ( 2*m_nupdates > N )
143  SG_ERROR("The number of vectors (%d) must be at least two times "
144  "the number of updates (%d)\n", N, m_nupdates);
145 
146  m_distance->init(simple_features, simple_features);
147  CDenseFeatures< float64_t >* embedding = embed_distance(m_distance);
149 
150  SG_UNREF(features);
151  return (CFeatures*)embedding;
152 }
153 
// Builds the k-nearest-neighbor index matrix used by the SPE_LOCAL strategy.
// For each of the N vectors it queries a CoverTree for k+1 nearest neighbors
// (the first returned neighbor is the query point itself and is skipped) and
// stores the k neighbor indices column-wise in a k x N matrix.
// NOTE(review): original line 159 — the construction of `coverTree`,
// presumably "CoverTree<SPE_COVERTREE_POINT>* coverTree = new
// CoverTree<SPE_COVERTREE_POINT>(max_dist);" — is missing from this
// extracted view; confirm against the original file.
154 SGMatrix<int32_t> CStochasticProximityEmbedding::get_neighborhood_matrix(CDistance* distance, int32_t k, int32_t N, float64_t max_dist)
155 {
156  int32_t i;
157  int32_t* neighborhood_matrix = SG_MALLOC(int32_t, N*k);
158 
160 
// Insert every point into the cover tree; distances are computed lazily
// through the shared CDistance instance.
161  for (i=0; i<N; i++)
162  coverTree->insert(SPE_COVERTREE_POINT(i, distance));
163 
164  for (i=0; i<N; i++)
165  {
// Query k+1 neighbors and start copying from m=1 to drop the query point.
166  std::vector<SPE_COVERTREE_POINT> neighbors =
167  coverTree->kNearestNeighbors(SPE_COVERTREE_POINT(i, distance), k+1);
168  for (std::size_t m=1; m<unsigned(k+1); m++)
169  neighborhood_matrix[i*k+m-1] = neighbors[m].m_point_index;
170  }
171 
172  delete coverTree;
173 
// The SGMatrix takes ownership of the SG_MALLOC'd buffer (k rows, N columns).
174  return SGMatrix<int32_t>(neighborhood_matrix,k,N);
175 }
176 
// Core SPE optimization loop. Starting from a random embedding Y
// (m_target_dim x N), it repeatedly picks m_nupdates pairs of points and
// nudges each pair so that its embedded distance moves toward the target
// distance: the raw distance scaled by alpha for SPE_GLOBAL, or the raw
// distance between neighboring points for SPE_LOCAL. The learning rate
// lambda decays linearly to ~0 over max_iter iterations.
// Returns a newly allocated CDenseFeatures wrapping the embedding.
// NOTE(review): this extracted view is missing original line 215 (the
// declaration of the embedding matrix Y, presumably
// "SGMatrix<float64_t> Y(m_target_dim, N);") and line 240 (presumably
// "SGVector<int32_t> J2;") — confirm against the original file.
177 CDenseFeatures< float64_t >* CStochasticProximityEmbedding::embed_distance(CDistance* distance)
178 {
179  if ( !distance )
180  SG_ERROR("Embed distance received no instance of CDistance\n");
181 
182  // Compute distance matrix
183  SG_DEBUG("Computing distance matrix\n");
184 
185  if ( distance->get_distance_type() != D_EUCLIDEAN )
186  SG_ERROR("SPE only supports Euclidean distance, %s given\n",
187  distance->get_name());
188 
189  // Get the number of features, assume that distance(features, features)
190  int32_t N = distance->get_num_vec_rhs();
191 
// O(N^2) scan over the upper triangle to find the largest pairwise distance;
// it is used both to normalize global-strategy targets and to parameterize
// the cover tree for the local strategy.
192  // Look for the maximum distance (make the same assumption)
193  int32_t i, j, k;
194  float64_t max = 0.0, tmp = 0.0;
195  for ( i = 0 ; i < N ; ++i )
196  for ( j = i+1 ; j < N ; ++j )
197  if ( ( tmp = distance->distance(i, j) ) > max )
198  max = tmp;
199 
200  // Compute a normalizer to be used for the distances if global strategy selected
201  float64_t alpha = 0.0;
202  if ( m_strategy == SPE_GLOBAL )
203  alpha = 1.0 / max * CMath::sqrt(2.0);
204 
205  // Compute neighborhood matrix if local strategy used
206  SGMatrix< int32_t > neighbors_mat;
207  if ( m_strategy == SPE_LOCAL )
208  {
209  SG_DEBUG("Computing neighborhood matrix\n");
210  neighbors_mat = get_neighborhood_matrix(distance, m_k, N, max);
211  }
212 
213  // Initialize vectors in the embedded space randomly, Y is the short for
214  // new_feature_matrix
216  SGVector<float64_t>::random_vector(Y.matrix, m_target_dim*N, 0.0, 1.0);
217 
218  // SPE's loop
219 
// Iteration budget grows with N^2; the local strategy needs more iterations
// because each update only involves neighboring pairs.
220  // Initialize the maximum number of iterations
221  int32_t max_iter = 2000 + CMath::round(0.04 * N*N);
222  if ( m_strategy == SPE_LOCAL )
223  max_iter *= 3;
224 
225  // Initialize the learning parameter
226  float32_t lambda = 1.0;
227 
228  // Initialize variables to use in the main loop
229  float64_t sum = 0.0;
230  index_t idx1 = 0, idx2 = 0, idx = 0;
231 
// Per-iteration work buffers, sized for m_nupdates pair updates.
232  SGVector< float64_t > scale(m_nupdates);
233  SGVector< float64_t > D(m_nupdates);
234  SGMatrix< float64_t > Yd(m_nupdates, m_target_dim);
235  SGVector< float64_t > Rt(m_nupdates);
236  int32_t* ind2 = NULL;
237 
238  // Variables required just if local strategy used
239  SGMatrix< int32_t > ind1Neighbors;
241  if ( m_strategy == SPE_LOCAL )
242  {
// For SPE_LOCAL, ind2 is a separately owned buffer (freed after the loop);
// for SPE_GLOBAL it merely aliases into the permutation J.
243  ind2 = SG_MALLOC(int32_t, m_nupdates);
244 
245  ind1Neighbors = SGMatrix<int32_t>(m_k,m_nupdates);
246 
247  J2 = SGVector<int32_t>(m_nupdates);
248  }
249 
250  for ( i = 0 ; i < max_iter ; ++i )
251  {
252  if ( !(i % 1000) )
253  SG_DEBUG("SPE's loop, iteration %d of %d\n", i, max_iter);
254 
// A fresh random permutation of 0..N-1; freed at the end of this iteration.
255  // Select the vectors to be updated in this iteration
256  int32_t* J = CMath::randperm(N);
257 
258  // Pointer to the first set of vector indices to update
259  int32_t* ind1 = J;
260 
// For the global strategy the second set is simply the next m_nupdates
// entries of the permutation (disjoint from ind1 because N >= 2*m_nupdates,
// enforced by the caller).
261  // Pointer ind2 to the second set of vector indices to update
262  if ( m_strategy == SPE_GLOBAL )
263  ind2 = J + m_nupdates;
264  else
265  {
266  // Select the second set of indices to update among neighbors
267  // of the first set
268 
269  // Get the neighbors of interest
270  for ( j = 0 ; j < m_nupdates ; ++j )
271  {
272  for ( k = 0 ; k < m_k ; ++k )
273  ind1Neighbors[k + j*m_k] =
274  neighbors_mat.matrix[k + ind1[j]*m_k];
275  }
276 
// J2[j] picks a uniformly random neighbor slot (0..m_k-1) within column j
// of ind1Neighbors.
277  // Generate pseudo-random indices
278  for ( j = 0 ; j < m_nupdates ; ++j )
279  {
280  J2[j] = CMath::round( CMath::random(0.0, 1.0)*(m_k-1) )
281  + m_k*j;
282  }
283 
284  // Select final indices
285  for ( j = 0 ; j < m_nupdates ; ++j )
286  ind2[j] = ind1Neighbors.matrix[ J2[j] ];
287  }
288 
// D[j] = Euclidean distance between the pair (ind1[j], ind2[j]) in the
// current embedding; Y is stored column-major with m_target_dim rows.
289  // Compute distances between the selected points in embedded space
290 
291  for ( j = 0 ; j < m_nupdates ; ++j )
292  {
293  sum = 0.0;
294 
295  for ( k = 0 ; k < m_target_dim ; ++k )
296  {
297  idx1 = k + ind1[j]*m_target_dim;
298  idx2 = k + ind2[j]*m_target_dim;
299  sum += CMath::sq(Y.matrix[idx1] - Y.matrix[idx2]);
300  }
301 
302  D[j] = CMath::sqrt(sum);
303  }
304 
// Target distances Rt: raw distance for SPE_LOCAL, alpha-normalized distance
// for SPE_GLOBAL.
305  // Get the corresponding distances in the original space
306 
307  if ( m_strategy == SPE_LOCAL )
308  Rt.set_const(1);
309  else // SPE_GLOBAL strategy used
310  Rt.set_const(alpha);
311 
312  for ( j = 0 ; j < m_nupdates ; ++j )
313  Rt[j] *= distance->distance( ind1[j], ind2[j] );
314 
315  // Compute some terms for update
316 
// m_tolerance regularizes the division when a pair coincides (D[j] ~ 0).
317  // Scale factor: (Rt - D) ./ (D + m_tolerance)
318  for ( j = 0 ; j < m_nupdates ; ++j )
319  scale[j] = ( Rt[j] - D[j] ) / ( D[j] + m_tolerance );
320 
321  // Difference matrix: Y(ind1) - Y(ind2)
322  for ( j = 0 ; j < m_nupdates ; ++j )
323  for ( k = 0 ; k < m_target_dim ; ++k )
324  {
325  idx1 = k + ind1[j]*m_target_dim;
326  idx2 = k + ind2[j]*m_target_dim;
327  idx = k + j *m_target_dim;
328 
329  Yd[idx] = Y[idx1] - Y[idx2];
330  }
331 
// Symmetric update: each point of the pair moves half of lambda*scale along
// the difference vector, toward (or away from) its partner.
332  // Update the location of the vectors in the embedded space
333  for ( j = 0 ; j < m_nupdates ; ++j )
334  for ( k = 0 ; k < m_target_dim ; ++k )
335  {
336  idx1 = k + ind1[j]*m_target_dim;
337  idx2 = k + ind2[j]*m_target_dim;
338  idx = k + j *m_target_dim;
339 
340  Y[idx1] += lambda / 2 * scale[j] * Yd[idx];
341  Y[idx2] -= lambda / 2 * scale[j] * Yd[idx];
342  }
343 
// Linear decay: lambda reaches ~0 at the final iteration.
344  // Update the learning parameter
345  lambda = lambda - ( lambda / max_iter );
346 
347  // Free memory
348  SG_FREE(J);
349  }
350 
351  // Free memory
352  if ( m_strategy == SPE_LOCAL )
353  SG_FREE(ind2);
354 
355  return new CDenseFeatures< float64_t >(Y);
356 }
357 
358 #endif /* HAVE_LAPACK */

SHOGUN Machine Learning Toolbox - Documentation