SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Distance.cpp
Go to the documentation of this file.
1 /*
2  * this program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006-2009 Christian Gehl
8  * Written (W) 2006-2009 Soeren Sonnenburg
9  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <shogun/lib/config.h>
13 #include <shogun/lib/common.h>
14 #include <shogun/io/SGIO.h>
15 #include <shogun/io/File.h>
16 #include <shogun/lib/Time.h>
17 #include <shogun/base/Parallel.h>
18 #include <shogun/base/Parameter.h>
19 
22 
23 #include <string.h>
24 #include <unistd.h>
25 
26 #ifdef HAVE_PTHREAD
27 #include <pthread.h>
28 #endif
29 
30 using namespace shogun;
31 
32 #ifndef DOXYGEN_SHOULD_SKIP_THIS
33 struct DISTANCE_THREAD_PARAM
34 {
35  // CDistance instance used by thread to compute distance
37  // distance matrix to store computed distances
38  float64_t* distance_matrix;
39  // starting index of the main loop
40  int32_t idx_start;
41  // end index of the main loop
42  int32_t idx_stop;
43  // step of the main loop
44  int32_t idx_step;
45  // number of lhs vectors
46  int32_t lhs_vectors_number;
47  // number of rhs vectors
48  int32_t rhs_vectors_number;
49  // whether matrix distance is symmetric
50  bool symmetric;
51  // chunking method
52  bool chunk_by_lhs;
53 };
54 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
55 
57 {
58  init();
59 }
60 
61 
63 {
64  init();
65  init(p_lhs, p_rhs);
66 }
67 
69 {
71  precomputed_matrix=NULL;
72 
74 }
75 
76 bool CDistance::init(CFeatures* l, CFeatures* r)
77 {
78  //make sure features were indeed supplied
79  ASSERT(l);
80  ASSERT(r);
81 
82  //make sure features are compatible
85 
86  //remove references to previous features
88 
89  //increase reference counts
90  SG_REF(l);
91  SG_REF(r);
92 
93  lhs=l;
94  rhs=r;
95 
98 
100  precomputed_matrix=NULL ;
101 
102  return true;
103 }
104 
105 void CDistance::load(CFile* loader)
106 {
109 }
110 
111 void CDistance::save(CFile* writer)
112 {
115 }
116 
118 {
119  SG_UNREF(rhs);
120  rhs = NULL;
121  num_rhs=0;
122 
123  SG_UNREF(lhs);
124  lhs = NULL;
125  num_lhs=0;
126 }
127 
129 {
130  SG_UNREF(lhs);
131  lhs = NULL;
132  num_lhs=0;
133 }
134 
137 {
138  SG_UNREF(rhs);
139  rhs = NULL;
140  num_rhs=0;
141 }
142 
144 {
145  //make sure features were indeed supplied
146  ASSERT(r);
147 
148  //make sure features are compatible
151 
152  //remove references to previous rhs features
153  CFeatures* tmp=rhs;
154 
155  rhs=r;
157 
159  precomputed_matrix=NULL ;
160 
161  // return old features including reference count
162  return tmp;
163 }
164 
166 {
167  //make sure features were indeed supplied
168  ASSERT(l);
169 
170  //make sure features are compatible
173 
174  //remove references to previous rhs features
175  CFeatures* tmp=lhs;
176 
177  lhs=l;
179 
181  precomputed_matrix=NULL ;
182 
183  // return old features including reference count
184  return tmp;
185 }
186 
188 {
189  int32_t num_left=lhs->get_num_vectors();
190  int32_t num_right=rhs->get_num_vectors();
191  SG_INFO( "precomputing distance matrix (%ix%i)\n", num_left, num_right) ;
192 
193  ASSERT(num_left==num_right);
194  ASSERT(lhs==rhs);
195  int32_t num=num_left;
196 
198  precomputed_matrix=SG_MALLOC(float32_t, num*(num+1)/2);
199 
200  for (int32_t i=0; i<num; i++)
201  {
202  SG_PROGRESS(i*i,0,num*num);
203  for (int32_t j=0; j<=i; j++)
204  precomputed_matrix[i*(i+1)/2+j] = compute(i,j) ;
205  }
206 
207  SG_PROGRESS(num*num,0,num*num);
208  SG_DONE();
209 }
210 
212 {
213  int32_t m,n;
214  float64_t* data=get_distance_matrix_real(m,n,NULL);
215  return SGMatrix<float64_t>(data, m,n);
216 }
217 
219  int32_t &num_vec1, int32_t &num_vec2, float32_t* target)
220 {
221  float32_t* result = NULL;
222  CFeatures* f1 = lhs;
223  CFeatures* f2 = rhs;
224 
225  if (has_features())
226  {
227  if (target && (num_vec1!=get_num_vec_lhs() || num_vec2!=get_num_vec_rhs()))
228  SG_ERROR("distance matrix does not fit into target\n");
229 
230  num_vec1=get_num_vec_lhs();
231  num_vec2=get_num_vec_rhs();
232  int64_t total_num=num_vec1*num_vec2;
233  int32_t num_done=0;
234 
235  SG_DEBUG("returning distance matrix of size %dx%d\n", num_vec1, num_vec2);
236 
237  if (target)
238  result=target;
239  else
240  result=SG_MALLOC(float32_t, total_num);
241 
242  if ( (f1 == f2) && (num_vec1 == num_vec2) && (f1!=NULL && f2!=NULL) )
243  {
244  for (int32_t i=0; i<num_vec1; i++)
245  {
246  for (int32_t j=i; j<num_vec1; j++)
247  {
248  float64_t v=distance(i,j);
249 
250  result[i+j*num_vec1]=v;
251  result[j+i*num_vec1]=v;
252 
253  if (num_done%100000)
254  SG_PROGRESS(num_done, 0, total_num-1);
255 
256  if (i!=j)
257  num_done+=2;
258  else
259  num_done+=1;
260  }
261  }
262  }
263  else
264  {
265  for (int32_t i=0; i<num_vec1; i++)
266  {
267  for (int32_t j=0; j<num_vec2; j++)
268  {
269  result[i+j*num_vec1]=distance(i,j) ;
270 
271  if (num_done%100000)
272  SG_PROGRESS(num_done, 0, total_num-1);
273 
274  num_done++;
275  }
276  }
277  }
278 
279  SG_DONE();
280  }
281  else
282  SG_ERROR("no features assigned to distance\n");
283 
284  return result;
285 }
286 
288  int32_t &lhs_vectors_number, int32_t &rhs_vectors_number, float64_t* target)
289 {
290  float64_t* distance_matrix = NULL;
291  CFeatures* lhs_features = lhs;
292  CFeatures* rhs_features = rhs;
293 
294  // check for errors
295  if (!has_features())
296  SG_ERROR("No features assigned to the distance.\n");
297 
298  if (target &&
299  (lhs_vectors_number!=get_num_vec_lhs() ||
300  rhs_vectors_number!=get_num_vec_rhs()))
301  SG_ERROR("Distance matrix does not fit into the given target.\n");
302 
303  // init numbers of vectors and total number of distances
304  lhs_vectors_number = get_num_vec_lhs();
305  rhs_vectors_number = get_num_vec_rhs();
306  int64_t total_distances_number = lhs_vectors_number*rhs_vectors_number;
307 
308  SG_DEBUG("Calculating distance matrix of size %dx%d.\n", lhs_vectors_number, rhs_vectors_number);
309 
310  // redirect to target or allocate memory
311  if (target)
312  distance_matrix = target;
313  else
314  distance_matrix = SG_MALLOC(float64_t, total_distances_number);
315 
316  // check if we're computing symmetric distance_matrix
317  bool symmetric = (lhs_features==rhs_features) || (lhs_vectors_number==rhs_vectors_number);
318  // select chunking method according to greatest dimension
319  bool chunk_by_lhs = (lhs_vectors_number >= rhs_vectors_number);
320 
321 #ifdef HAVE_PTHREAD
322  // init parallel to work
323  int32_t num_threads = parallel->get_num_threads();
324  ASSERT(num_threads>0);
325  pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
326  DISTANCE_THREAD_PARAM* parameters = SG_MALLOC(DISTANCE_THREAD_PARAM,num_threads);
327  pthread_attr_t attr;
328  pthread_attr_init(&attr);
329  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
330  // run threads
331  for (int32_t t=0; t<num_threads; t++)
332  {
333  parameters[t].idx_start = t;
334  parameters[t].idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number;
335  parameters[t].idx_step = num_threads;
336  parameters[t].distance_matrix = distance_matrix;
337  parameters[t].symmetric = symmetric;
338  parameters[t].lhs_vectors_number = lhs_vectors_number;
339  parameters[t].rhs_vectors_number = rhs_vectors_number;
340  parameters[t].chunk_by_lhs = chunk_by_lhs;
341  parameters[t].distance = this;
342  pthread_create(&threads[t], &attr, run_distance_thread, (void*)&parameters[t]);
343  }
344  // join, i.e. wait threads for finish
345  for (int32_t t=0; t<num_threads; t++)
346  {
347  pthread_join(threads[t], NULL);
348  }
349  // cleanup
350  pthread_attr_destroy(&attr);
351  SG_FREE(parameters);
352  SG_FREE(threads);
353 #else
354  // init one-threaded parameters
355  DISTANCE_THREAD_PARAM single_thread_param;
356  single_thread_param.idx_start = 0;
357  single_thread_param.idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number;
358  single_thread_param.idx_step = 1;
359  single_thread_param.distance_matrix = distance_matrix;
360  single_thread_param.symmetric = symmetric;
361  single_thread_param.lhs_vectors_number = lhs_vectors_number;
362  single_thread_param.rhs_vectors_number = rhs_vectors_number;
363  single_thread_param.chunk_by_lhs = chunk_by_lhs;
364  single_thread_param.distance = this;
365  // run thread
366  run_distance_thread((void*)&single_thread_param);
367 #endif
368 
369  return distance_matrix;
370 }
371 
372 void CDistance::init()
373 {
374  precomputed_matrix = NULL;
375  precompute_matrix = false;
376  lhs = NULL;
377  rhs = NULL;
378  num_lhs=0;
379  num_rhs=0;
380 
381  m_parameters->add((CSGObject**) &lhs, "lhs",
382  "Feature vectors to occur on left hand side.");
383  m_parameters->add((CSGObject**) &rhs, "rhs",
384  "Feature vectors to occur on right hand side.");
385 }
386 
388 {
389  DISTANCE_THREAD_PARAM* parameters = (DISTANCE_THREAD_PARAM*)p;
390  float64_t* distance_matrix = parameters->distance_matrix;
391  CDistance* distance = parameters->distance;
392  int32_t idx_start = parameters->idx_start;
393  int32_t idx_stop = parameters->idx_stop;
394  int32_t idx_step = parameters->idx_step;
395  int32_t lhs_vectors_number = parameters->lhs_vectors_number;
396  int32_t rhs_vectors_number = parameters->rhs_vectors_number;
397  bool symmetric = parameters->symmetric;
398  bool chunk_by_lhs = parameters->chunk_by_lhs;
399 
400  if (symmetric)
401  {
402  for (int32_t i=idx_start; i<idx_stop; i+=idx_step)
403  {
404  for (int32_t j=i; j<rhs_vectors_number; j++)
405  {
406  float64_t ij_distance = distance->compute(i,j);
407  distance_matrix[i*rhs_vectors_number+j] = ij_distance;
408  distance_matrix[j*rhs_vectors_number+i] = ij_distance;
409  }
410  }
411  }
412  else
413  {
414  if (chunk_by_lhs)
415  {
416  for (int32_t i=idx_start; i<idx_stop; i+=idx_step)
417  {
418  for (int32_t j=0; j<rhs_vectors_number; j++)
419  {
420  distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j);
421  }
422  }
423  }
424  else
425  {
426  for (int32_t j=idx_start; j<idx_stop; j+=idx_step)
427  {
428  for (int32_t i=0; i<lhs_vectors_number; i++)
429  {
430  distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j);
431  }
432  }
433  }
434  }
435 
436  return NULL;
437 }

SHOGUN Machine Learning Toolbox - Documentation