KNN.cpp (SHOGUN v3.0.0)
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2006 Christian Gehl
 * Written (W) 2006-2009 Soeren Sonnenburg
 * Written (W) 2011 Sergey Lisitsyn
 * Written (W) 2012 Fernando José Iglesias García, cover tree support
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/multiclass/KNN.h>
#include <shogun/labels/Labels.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>
#include <shogun/lib/JLCoverTree.h>
#include <shogun/lib/Time.h>
#include <shogun/base/Parameter.h>

//#define BENCHMARK_KNN
//#define DEBUG_KNN

using namespace shogun;

CKNN::CKNN()
: CDistanceMachine()
{
    init();
}

CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
    init();

    m_k=k;

    ASSERT(d)
    ASSERT(trainlab)

    set_distance(d);
    set_labels(trainlab);
    m_train_labels.vlen=trainlab->get_num_labels();
}
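
/* Usage sketch (not part of the original file): constructing and training a
 * CKNN classifier. CDenseFeatures, CEuclideanDistance and the contents of
 * `feat_mat`/`lab_vec` are assumptions for illustration; any CDistance
 * subclass can be plugged in the same way.
 *
 * @code
 * #include <shogun/features/DenseFeatures.h>
 * #include <shogun/distance/EuclideanDistance.h>
 * #include <shogun/labels/MulticlassLabels.h>
 * #include <shogun/multiclass/KNN.h>
 *
 * using namespace shogun;
 *
 * int32_t dim=2, num_train=10;
 * SGMatrix<float64_t> feat_mat(dim, num_train); // one column per example
 * SGVector<float64_t> lab_vec(num_train);       // integer-valued class labels
 * // ... fill feat_mat and lab_vec ...
 *
 * CDenseFeatures<float64_t>* feats=new CDenseFeatures<float64_t>(feat_mat);
 * CMulticlassLabels* labels=new CMulticlassLabels(lab_vec);
 * CDistance* dist=new CEuclideanDistance(feats, feats);
 *
 * CKNN* knn=new CKNN(3, dist, labels); // k=3
 * knn->train();
 * @endcode
 */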

void CKNN::init()
{
    /* do not store model features by default (CDistanceMachine::apply(...) is
     * overwritten) */
    set_store_model_features(false);

    m_k=3;
    m_q=1.0;
    m_use_covertree=false;
    m_num_classes=0;

    /* use the method classify_for_multiple_k to experiment with different
     * values of k */
    SG_ADD(&m_k, "m_k", "Parameter k", MS_NOT_AVAILABLE);
    SG_ADD(&m_q, "m_q", "Parameter q", MS_AVAILABLE);
    SG_ADD(&m_use_covertree, "m_use_covertree", "Parameter use_covertree", MS_NOT_AVAILABLE);
    SG_ADD(&m_num_classes, "m_num_classes", "Number of classes", MS_NOT_AVAILABLE);
}
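
/* The registered parameters can also be changed after construction through
 * the setters declared in KNN.h, e.g. (sketch):
 *
 * @code
 * knn->set_k(5);    // use 5 neighbors
 * knn->set_q(0.5);  // down-weight farther neighbors
 * @endcode
 */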

CKNN::~CKNN()
{
}

bool CKNN::train_machine(CFeatures* data)
{
    ASSERT(m_labels)
    ASSERT(distance)

    if (data)
    {
        if (m_labels->get_num_labels() != data->get_num_vectors())
            SG_ERROR("Number of training vectors does not match number of labels\n")
        distance->init(data, data);
    }

    SGVector<int32_t> lab=((CMulticlassLabels*) m_labels)->get_int_labels();
    m_train_labels=lab.clone();
    ASSERT(m_train_labels.vlen>0)

    int32_t max_class=m_train_labels.vector[0];
    int32_t min_class=m_train_labels.vector[0];

    for (int32_t i=1; i<m_train_labels.vlen; i++)
    {
        max_class=CMath::max(max_class, m_train_labels.vector[i]);
        min_class=CMath::min(min_class, m_train_labels.vector[i]);
    }

    //shift the labels so that they are contiguous and start at zero
    for (int32_t i=0; i<m_train_labels.vlen; i++)
        m_train_labels.vector[i]-=min_class;

    m_min_label=min_class;
    m_num_classes=max_class-min_class+1;

    SG_INFO("m_num_classes: %d (%+d to %+d) num_train: %d\n", m_num_classes,
            min_class, max_class, m_train_labels.vlen);

    return true;
}
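
/* Worked example of the label remapping above: for training labels {2,4,5},
 * min_class=2 and max_class=5, so the labels are stored internally as
 * {0,2,3} with m_min_label=2 and m_num_classes=4. m_min_label is added back
 * whenever a prediction is written, e.g. in apply_multiclass() below. */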

SGMatrix<index_t> CKNN::nearest_neighbors()
{
    //number of examples to which kNN is applied
    int32_t n=distance->get_num_vec_rhs();
    //distances to train data
    float64_t* dists=SG_MALLOC(float64_t, m_train_labels.vlen);
    //indices to train data
    index_t* train_idxs=SG_MALLOC(index_t, m_train_labels.vlen);
    //pre-allocation of the nearest neighbors
    SGMatrix<index_t> NN(m_k, n);

    //for each test example
    for (int32_t i=0; i<n && (!CSignal::cancel_computations()); i++)
    {
        SG_PROGRESS(i, 0, n)

        //lhs idx 0..num train examples-1 (i.e., all train examples) and rhs idx i
        distances_lhs(dists,0,m_train_labels.vlen-1,i);

        //fill in an array with 0..num train examples-1
        for (int32_t j=0; j<m_train_labels.vlen; j++)
            train_idxs[j]=j;

        //sort the distance vector between test example i and all train examples
        CMath::qsort_index(dists, train_idxs, m_train_labels.vlen);

#ifdef DEBUG_KNN
        SG_PRINT("\nQuick sort query %d\n", i)
        for (int32_t j=0; j<m_k; j++)
            SG_PRINT("%d ", train_idxs[j])
        SG_PRINT("\n")
#endif

        //write the indices of the k nearest neighbors into the output
        for (int32_t j=0; j<m_k; j++)
            NN(j,i) = train_idxs[j];
    }

    SG_FREE(train_idxs);
    SG_FREE(dists);

    return NN;
}
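
/* A minimal sketch of the co-sorting CMath::qsort_index performs above: the
 * distances are sorted in ascending order and the index array is permuted in
 * lockstep, so afterwards train_idxs[0..m_k-1] name the k closest training
 * vectors. Plain C++ equivalent (illustrative only, not Shogun API):
 *
 * @code
 * #include <algorithm>
 * #include <numeric>
 * #include <vector>
 *
 * std::vector<double> dists = {0.7, 0.1, 0.4};
 * std::vector<int> idxs(dists.size());
 * std::iota(idxs.begin(), idxs.end(), 0); // 0,1,2
 * std::sort(idxs.begin(), idxs.end(),
 *         [&](int a, int b) { return dists[a] < dists[b]; });
 * // idxs is now {1, 2, 0}: training vectors ordered by distance
 * @endcode
 */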

CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
{
    if (data)
        init_distance(data);

    //redirect to the fast (sorting-free) classify_NN if k==1
    if (m_k == 1)
        return classify_NN();

    ASSERT(m_num_classes>0)
    ASSERT(distance)
    ASSERT(distance->get_num_vec_rhs())

    int32_t num_lab=distance->get_num_vec_rhs();
    ASSERT(m_k<=distance->get_num_vec_lhs())

    CMulticlassLabels* output=new CMulticlassLabels(num_lab);

    //labels of the k nearest neighbors
    int32_t* train_lab=SG_MALLOC(int32_t, m_k);

    SG_INFO("%d test examples\n", num_lab)
    CSignal::clear_cancel();

    //histogram of classes and returned output
    float64_t* classes=SG_MALLOC(float64_t, m_num_classes);

#ifdef BENCHMARK_KNN
    CTime tstart;
    float64_t tfinish, tparsed, tcreated, tqueried;
#endif

    if ( ! m_use_covertree )
    {
        //get the k nearest neighbors of each example
        SGMatrix<index_t> NN = nearest_neighbors();

        //from the indices of the nearest neighbors, compute the class labels
        for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
        {
            //write the labels of the k nearest neighbors from their indices
            for (int32_t j=0; j<m_k; j++)
                train_lab[j] = m_train_labels[ NN(j,i) ];

            //get the index of the 'nearest' class
            int32_t out_idx = choose_class(classes, train_lab);
            //write the label of the 'nearest' class in the output
            output->set_label(i, out_idx + m_min_label);
        }

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Quick sort applied in %9.4f\n",
                (tfinish = tstart.cur_time_diff(false)));
#endif
    }
    else // Use cover tree
    {
        // m_q != 1.0 not supported with cover tree because the neighbors
        // are not retrieved in increasing order of distance to the query
        float64_t old_q = m_q;
        if ( old_q != 1.0 )
            SG_INFO("q != 1.0 not supported with cover tree, using q = 1\n")

        // From the sets of features (lhs and rhs) stored in distance,
        // build arrays of cover tree points
        v_array< CJLCoverTreePoint > set_of_points  =
            parse_points(distance, FC_LHS);
        v_array< CJLCoverTreePoint > set_of_queries =
            parse_points(distance, FC_RHS);

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> JL parsed in %9.4f\n",
                ( tparsed = tstart.cur_time_diff(false) ) - tfinish);
#endif
        // Build the cover trees, one for the test vectors (rhs features)
        // and another for the training vectors (lhs features)
        CFeatures* r = distance->replace_rhs( distance->get_lhs() );
        node< CJLCoverTreePoint > top = batch_create(set_of_points);
        CFeatures* l = distance->replace_lhs(r);
        distance->replace_rhs(r);
        node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Cover trees created in %9.4f\n",
                (tcreated = tstart.cur_time_diff(false)) - tparsed);
#endif

        // Get the k nearest neighbors of all the test vectors (batch method)
        distance->replace_lhs(l);
        v_array< v_array< CJLCoverTreePoint > > res;
        k_nearest_neighbor(top, top_query, res, m_k);

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Query finished in %9.4f\n",
                (tqueried = tstart.cur_time_diff(false)) - tcreated);
#endif

#ifdef DEBUG_KNN
        SG_PRINT("\nJL Results:\n")
        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            for ( int32_t j = 0 ; j < res[i].index ; ++j )
            {
                printf("%d ", res[i][j].m_index);
            }
            printf("\n");
        }
        SG_PRINT("\n")
#endif

        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            // Translate from indices to labels of the nearest neighbors
            for ( int32_t j = 0; j < m_k; ++j )
                // The first index in res[i] points to the test vector
                train_lab[j] = m_train_labels.vector[ res[i][j+1].m_index ];

            // Get the index of the 'nearest' class
            int32_t out_idx = choose_class(classes, train_lab);
            output->set_label(res[i][0].m_index, out_idx+m_min_label);
        }

        m_q = old_q;

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> JL applied in %9.4f\n", tstart.cur_time_diff(false))
#endif
    }

    SG_FREE(classes);
    SG_FREE(train_lab);

    return output;
}
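
/* Usage sketch (hypothetical `test_mat` and the `knn` object from the
 * training sketch above): apply_multiclass() re-initialises the distance so
 * that lhs holds the training vectors and rhs the test vectors passed in.
 *
 * @code
 * CDenseFeatures<float64_t>* test_feats=
 *     new CDenseFeatures<float64_t>(test_mat); // test_mat filled elsewhere
 * CMulticlassLabels* predicted=knn->apply_multiclass(test_feats);
 * for (int32_t i=0; i<predicted->get_num_labels(); i++)
 *     SG_SPRINT("label[%d]=%f\n", i, predicted->get_label(i));
 * SG_UNREF(predicted);
 * @endcode
 */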

CMulticlassLabels* CKNN::classify_NN()
{
    ASSERT(distance)
    ASSERT(m_num_classes>0)

    int32_t num_lab = distance->get_num_vec_rhs();
    ASSERT(num_lab)

    CMulticlassLabels* output = new CMulticlassLabels(num_lab);
    float64_t* distances = SG_MALLOC(float64_t, m_train_labels.vlen);

    SG_INFO("%d test examples\n", num_lab)
    CSignal::clear_cancel();

    // for each test example
    for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
    {
        SG_PROGRESS(i,0,num_lab)

        // get distances from the i-th test example to train examples 0..m_train_labels.vlen-1
        distances_lhs(distances,0,m_train_labels.vlen-1,i);
        int32_t j;

        // assume the 0th train example is nearest to the i-th test example
        int32_t out_idx = 0;
        float64_t min_dist = distances[0];

        // search for the nearest neighbor by comparing distances
        for (j=0; j<m_train_labels.vlen; j++)
        {
            if (distances[j]<min_dist)
            {
                min_dist = distances[j];
                out_idx = j;
            }
        }

        // label the i-th test example with the label of its nearest neighbor
        output->set_label(i,m_train_labels.vector[out_idx]+m_min_label);
    }

    SG_FREE(distances);
    return output;
}

SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
    ASSERT(distance)
    ASSERT(m_num_classes>0)

    int32_t num_lab=distance->get_num_vec_rhs();
    ASSERT(m_k<=num_lab)

    int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

    //working buffer of m_train_labels
    int32_t* train_lab=SG_MALLOC(int32_t, m_k);

    //histogram of classes and returned output
    int32_t* classes=SG_MALLOC(int32_t, m_num_classes);

    SG_INFO("%d test examples\n", num_lab)
    CSignal::clear_cancel();

    if ( ! m_use_covertree )
    {
        //get the k nearest neighbors of each example
        SGMatrix<index_t> NN = nearest_neighbors();

        for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
        {
            //write the labels of the k nearest neighbors from their indices
            for (int32_t j=0; j<m_k; j++)
                train_lab[j] = m_train_labels[ NN(j,i) ];

            choose_class_for_multiple_k(output+i, classes, train_lab, num_lab);
        }
    }
    else // Use cover tree
    {
        //allocation for distances to nearest neighbors
        float64_t* dists=SG_MALLOC(float64_t, m_k);

        // From the sets of features (lhs and rhs) stored in distance,
        // build arrays of cover tree points
        v_array< CJLCoverTreePoint > set_of_points  =
            parse_points(distance, FC_LHS);
        v_array< CJLCoverTreePoint > set_of_queries =
            parse_points(distance, FC_RHS);

        // Build the cover trees, one for the test vectors (rhs features)
        // and another for the training vectors (lhs features)
        CFeatures* r = distance->replace_rhs( distance->get_lhs() );
        node< CJLCoverTreePoint > top = batch_create(set_of_points);
        CFeatures* l = distance->replace_lhs(r);
        distance->replace_rhs(r);
        node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);

        // Get the k nearest neighbors of all the test vectors (batch method)
        distance->replace_lhs(l);
        v_array< v_array< CJLCoverTreePoint > > res;
        k_nearest_neighbor(top, top_query, res, m_k);

        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            // Handle the fact that the cover tree doesn't return neighbors
            // ordered by distance

            for ( int32_t j = 0 ; j < m_k ; ++j )
            {
                // The first index in res[i] points to the test vector
                dists[j] = distance->distance(res[i][j+1].m_index,
                        res[i][0].m_index);
                train_lab[j] = m_train_labels.vector[
                        res[i][j+1].m_index ];
            }

            // Now get the indices of the neighbors sorted by distance
            CMath::qsort_index(dists, train_lab, m_k);

            choose_class_for_multiple_k(output+res[i][0].m_index, classes,
                    train_lab, num_lab);
        }

        SG_FREE(dists);
    }

    SG_FREE(train_lab);
    SG_FREE(classes);

    return SGMatrix<int32_t>(output,num_lab,m_k,true);
}
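
/* Sketch of how the returned matrix might be used (assumes a trained `knn`
 * with test features already set, and ground-truth labels `gt`): column j
 * holds the predictions obtained with k=j+1 neighbors, so the accuracy of
 * every k up to m_k can be compared in a single pass over the data.
 *
 * @code
 * SGMatrix<int32_t> preds=knn->classify_for_multiple_k();
 * for (int32_t j=0; j<preds.num_cols; j++)
 * {
 *     int32_t correct=0;
 *     for (int32_t i=0; i<preds.num_rows; i++)
 *     {
 *         if (preds(i,j)==(int32_t) gt->get_label(i))
 *             correct++;
 *     }
 *     SG_SPRINT("k=%d accuracy=%f\n", j+1, correct/(float64_t) preds.num_rows);
 * }
 * @endcode
 */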

void CKNN::init_distance(CFeatures* data)
{
    if (!distance)
        SG_ERROR("No distance assigned!\n")
    CFeatures* lhs=distance->get_lhs();
    if (!lhs || !lhs->get_num_vectors())
    {
        SG_UNREF(lhs);
        SG_ERROR("No vectors on left hand side\n")
    }
    distance->init(lhs, data);
    SG_UNREF(lhs);
}

bool CKNN::load(FILE* srcfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}

bool CKNN::save(FILE* dstfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}

void CKNN::store_model_features()
{
    CFeatures* d_lhs=distance->get_lhs();
    CFeatures* d_rhs=distance->get_rhs();

    /* copy lhs of underlying distance */
    distance->init(d_lhs->duplicate(), d_rhs);

    SG_UNREF(d_lhs);
    SG_UNREF(d_rhs);
}

int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)
{
    memset(classes, 0, sizeof(float64_t)*m_num_classes);

    //accumulate a weighted vote per class; the j-th nearest neighbor
    //contributes weight q^(2^j) (each weight is the square of the previous
    //one), so for q<1 closer neighbors dominate, and for q=1 this reduces
    //to plain majority voting
    float64_t multiplier = m_q;
    for (int32_t j=0; j<m_k; j++)
    {
        classes[train_lab[j]]+= multiplier;
        multiplier*= multiplier;
    }

    //choose the class that collected the largest vote
    int32_t out_idx=0;
    float64_t out_max=0;

    for (int32_t j=0; j<m_num_classes; j++)
    {
        if (out_max< classes[j])
        {
            out_idx= j;
            out_max= classes[j];
        }
    }

    return out_idx;
}
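
/* Worked example of the vote above (illustrative): with q=0.5 and m_k=3, the
 * three nearest neighbors contribute weights 0.5, 0.25 and 0.0625. If their
 * labels are {A, B, B}, class A collects 0.5 while class B collects
 * 0.25+0.0625=0.3125, so the single nearest neighbor outvotes the other two
 * and A is returned. */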

void CKNN::choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step)
{
    //incrementally compute the histogram of class outputs of the k nearest
    //neighbours
    memset(classes, 0, sizeof(int32_t)*m_num_classes);

    for (int32_t j=0; j<m_k; j++)
    {
        classes[train_lab[j]]++;

        //choose the class that got 'outputted' most often among the first
        //j+1 neighbors
        int32_t out_idx=0;
        int32_t out_max=0;

        for (int32_t c=0; c<m_num_classes; c++)
        {
            if (out_max< classes[c])
            {
                out_idx= c;
                out_max= classes[c];
            }
        }

        //the output is strided: entry j*step holds the prediction for k=j+1
        output[j*step]=out_idx+m_min_label;
    }
}
