KNN.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2006 Christian Gehl
 * Written (W) 2006-2009 Soeren Sonnenburg
 * Written (W) 2011 Sergey Lisitsyn
 * Written (W) 2012 Fernando José Iglesias García, cover tree support
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/multiclass/KNN.h>
#include <shogun/labels/Labels.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>
#include <shogun/lib/JLCoverTree.h>
#include <shogun/lib/Time.h>
#include <shogun/base/Parameter.h>

//#define BENCHMARK_KNN
//#define DEBUG_KNN

using namespace shogun;

CKNN::CKNN()
: CDistanceMachine()
{
    init();
}

CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
    init();

    m_k=k;

    ASSERT(d);
    ASSERT(trainlab);

    set_distance(d);
    set_labels(trainlab);
    m_train_labels.vlen=trainlab->get_num_labels();
}

void CKNN::init()
{
    /* do not store model features by default (CDistanceMachine::apply(...) is
     * overwritten) */
    set_store_model_features(false);

    m_k=3;
    m_q=1.0;
    m_use_covertree=false;
    m_num_classes=0;

    /* use the method classify_for_multiple_k to experiment with different
     * values of k */
    SG_ADD(&m_k, "m_k", "Parameter k", MS_NOT_AVAILABLE);
    SG_ADD(&m_q, "m_q", "Parameter q", MS_AVAILABLE);
    SG_ADD(&m_use_covertree, "m_use_covertree", "Parameter use_covertree", MS_NOT_AVAILABLE);
    SG_ADD(&m_num_classes, "m_num_classes", "Number of classes", MS_NOT_AVAILABLE);
}

CKNN::~CKNN()
{
}

bool CKNN::train_machine(CFeatures* data)
{
    ASSERT(m_labels);
    ASSERT(distance);

    if (data)
    {
        if (m_labels->get_num_labels() != data->get_num_vectors())
            SG_ERROR("Number of training vectors does not match number of labels\n");
        distance->init(data, data);
    }

    SGVector<int32_t> lab=((CMulticlassLabels*) m_labels)->get_int_labels();
    m_train_labels=lab.clone();
    ASSERT(m_train_labels.vlen>0);

    int32_t max_class=m_train_labels.vector[0];
    int32_t min_class=m_train_labels.vector[0];

    for (int32_t i=1; i<m_train_labels.vlen; i++)
    {
        max_class=CMath::max(max_class, m_train_labels.vector[i]);
        min_class=CMath::min(min_class, m_train_labels.vector[i]);
    }

    for (int32_t i=0; i<m_train_labels.vlen; i++)
        m_train_labels.vector[i]-=min_class;

    m_min_label=min_class;
    m_num_classes=max_class-min_class+1;
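    // e.g. labels {-1,+1}: min_class=-1, max_class=+1, so the labels are
    // stored internally as {0,2} and m_num_classes=3; gaps in the label
    // range simply become empty classes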

    SG_INFO("m_num_classes: %d (%+d to %+d) num_train: %d\n", m_num_classes,
            min_class, max_class, m_train_labels.vlen);

    return true;
}

CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
{
    if (data)
        init_distance(data);

    // redirecting to fast (without sorting) classify if k==1
    if (m_k == 1)
        return classify_NN();

    ASSERT(m_num_classes>0);
    ASSERT(distance);
    ASSERT(distance->get_num_vec_rhs());

    int32_t num_lab=distance->get_num_vec_rhs();
    ASSERT(m_k<=distance->get_num_vec_lhs());

    CMulticlassLabels* output=new CMulticlassLabels(num_lab);

    float64_t* dists = NULL;
    int32_t* train_lab = NULL;

    //distances to train data and working buffer of m_train_labels
    if ( ! m_use_covertree )
    {
        dists=SG_MALLOC(float64_t, m_train_labels.vlen);
        train_lab=SG_MALLOC(int32_t, m_train_labels.vlen);
    }
    else
    {
        train_lab=SG_MALLOC(int32_t, m_k);
    }
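    // the cover tree materializes only the k nearest neighbors per query, so
    // m_k label slots suffice there; brute force needs one distance and one
    // label slot per training vector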

    SG_INFO("%d test examples\n", num_lab);
    CSignal::clear_cancel();

    //histogram of classes used to choose the predicted label
    float64_t* classes=SG_MALLOC(float64_t, m_num_classes);

#ifdef BENCHMARK_KNN
    CTime tstart;
    float64_t tfinish, tparsed, tcreated, tqueried;
#endif

    if ( ! m_use_covertree )
    {
        for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
        {
            SG_PROGRESS(i, 0, num_lab);

#ifdef DEBUG_KNN
            distances_lhs(dists,0,m_train_labels.vlen-1,i);

            for (int32_t j=0; j<m_train_labels.vlen; j++)
                train_lab[j]=j;

            CMath::qsort_index(dists, train_lab, m_train_labels.vlen);

            SG_PRINT("\nQuick sort query %d\n", i);
            for (int32_t j=0; j<m_k; j++)
                SG_PRINT("%d ", train_lab[j]);
            SG_PRINT("\n");
#endif

            //lhs idx 0..n-1 and rhs idx i
            distances_lhs(dists,0,m_train_labels.vlen-1,i);

            for (int32_t j=0; j<m_train_labels.vlen; j++)
                train_lab[j]=m_train_labels.vector[j];

            //sort the distance vector for test example i to all
            //train examples
            CMath::qsort_index(dists, train_lab, m_train_labels.vlen);
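            // qsort_index sorts dists in ascending order and applies the same
            // permutation to train_lab, so train_lab[0..m_k-1] now holds the
            // labels of the k nearest training vectors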

            // Get the index of the 'nearest' class
            int32_t out_idx = choose_class(classes, train_lab);
            output->set_label(i, out_idx + m_min_label);
        }

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Quick sort applied in %9.4f\n",
                (tfinish = tstart.cur_time_diff(false)));
#endif
    }
    else // Use cover tree
    {
        // m_q != 1.0 not supported with cover tree because the neighbors
        // are not retrieved in increasing order of distance to the query
        float64_t old_q = m_q;
        if ( old_q != 1.0 )
            SG_INFO("q != 1.0 not supported with cover tree, using q = 1\n");
        m_q = 1.0;

        // From the sets of features (lhs and rhs) stored in distance,
        // build arrays of cover tree points
        v_array< CJLCoverTreePoint > set_of_points =
            parse_points(distance, FC_LHS);
        v_array< CJLCoverTreePoint > set_of_queries =
            parse_points(distance, FC_RHS);

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> JL parsed in %9.4f\n",
                ( tparsed = tstart.cur_time_diff(false) ) - tfinish);
#endif
        // Build the cover trees, one for the test vectors (rhs features)
        // and another for the training vectors (lhs features)
        CFeatures* r = distance->replace_rhs( distance->get_lhs() );
        node< CJLCoverTreePoint > top = batch_create(set_of_points);
        CFeatures* l = distance->replace_lhs(r);
        distance->replace_rhs(r);
        node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);
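        // replace_lhs()/replace_rhs() swap new features in and return the old
        // ones, so the same distance object is train x train while the point
        // tree is built and test x test while the query tree is built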

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Cover trees created in %9.4f\n",
                (tcreated = tstart.cur_time_diff(false)) - tparsed);
#endif

        // Get the k nearest neighbors to all the test vectors (batch method)
        distance->replace_lhs(l);
        v_array< v_array< CJLCoverTreePoint > > res;
        k_nearest_neighbor(top, top_query, res, m_k);
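        // res[i][0] is the i-th test point itself; res[i][1..m_k] are its k
        // nearest training points, not necessarily ordered by distance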

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Query finished in %9.4f\n",
                (tqueried = tstart.cur_time_diff(false)) - tcreated);
#endif

#ifdef DEBUG_KNN
        SG_PRINT("\nJL Results:\n");
        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            for ( int32_t j = 0 ; j < res[i].index ; ++j )
            {
                SG_PRINT("%d ", res[i][j].m_index);
            }
            SG_PRINT("\n");
        }
        SG_PRINT("\n");
#endif

        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            // Translate from indices to labels of the nearest neighbors
            for ( int32_t j = 0; j < m_k; ++j )
                // The first index in res[i] points to the test vector
                train_lab[j] = m_train_labels.vector[ res[i][j+1].m_index ];

            // Get the index of the 'nearest' class
            int32_t out_idx = choose_class(classes, train_lab);
            output->set_label(res[i][0].m_index, out_idx+m_min_label);
        }

        m_q = old_q;

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> JL applied in %9.4f\n", tstart.cur_time_diff(false));
#endif
    }

    SG_FREE(classes);
    SG_FREE(train_lab);
    if ( ! m_use_covertree )
        SG_FREE(dists);

    return output;
}

CMulticlassLabels* CKNN::classify_NN()
{
    ASSERT(distance);
    ASSERT(m_num_classes>0);

    int32_t num_lab = distance->get_num_vec_rhs();
    ASSERT(num_lab);

    CMulticlassLabels* output = new CMulticlassLabels(num_lab);
    float64_t* distances = SG_MALLOC(float64_t, m_train_labels.vlen);

    SG_INFO("%d test examples\n", num_lab);
    CSignal::clear_cancel();

    // for each test example
    for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
    {
        SG_PROGRESS(i,0,num_lab);

        // get distances from i-th test example to 0..m_train_labels.vlen-1 train examples
        distances_lhs(distances,0,m_train_labels.vlen-1,i);
        int32_t j;

        // assume the 0th train example is the nearest to the i-th test example
        int32_t out_idx = 0;
        float64_t min_dist = distances[0];

        // search for the nearest neighbor by comparing distances
        for (j=0; j<m_train_labels.vlen; j++)
        {
            if (distances[j]<min_dist)
            {
                min_dist = distances[j];
                out_idx = j;
            }
        }

        // label the i-th test example with the label of its nearest neighbor
        output->set_label(i,m_train_labels.vector[out_idx]+m_min_label);
    }

    SG_FREE(distances);
    return output;
}

SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
    ASSERT(m_num_classes>0);
    ASSERT(distance);
    ASSERT(distance->get_num_vec_rhs());

    int32_t num_lab=distance->get_num_vec_rhs();
    ASSERT(m_k<=num_lab);

    int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

    float64_t* dists;
    int32_t* train_lab;
    //distances to train data and working buffer of m_train_labels
    if ( ! m_use_covertree )
    {
        dists=SG_MALLOC(float64_t, m_train_labels.vlen);
        train_lab=SG_MALLOC(int32_t, m_train_labels.vlen);
    }
    else
    {
        dists=SG_MALLOC(float64_t, m_k);
        train_lab=SG_MALLOC(int32_t, m_k);
    }

    //histogram of classes
    int32_t* classes=SG_MALLOC(int32_t, m_num_classes);

    SG_INFO("%d test examples\n", num_lab);
    CSignal::clear_cancel();

    if ( ! m_use_covertree )
    {
        for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
        {
            SG_PROGRESS(i, 0, num_lab);

            // lhs idx 0..n-1 and rhs idx i
            distances_lhs(dists,0,m_train_labels.vlen-1,i);
            for (int32_t j=0; j<m_train_labels.vlen; j++)
                train_lab[j]=m_train_labels.vector[j];

            //sort the distance vector for test example i to all train examples;
            //train_lab[0..k-1] then holds the labels of the k nearest neighbors
            CMath::qsort_index(dists, train_lab, m_train_labels.vlen);

            //compute histogram of class outputs of the first k nearest
            //neighbours
            for (int32_t j=0; j<m_num_classes; j++)
                classes[j]=0;

            for (int32_t j=0; j<m_k; j++)
            {
                classes[train_lab[j]]++;

                //choose the class that got 'outputted' most often
                int32_t out_idx=0;
                int32_t out_max=0;

                for (int32_t c=0; c<m_num_classes; c++)
                {
                    if (out_max< classes[c])
                    {
                        out_idx= c;
                        out_max= classes[c];
                    }
                }
                output[j*num_lab+i]=out_idx+m_min_label;
            }
        }
    }
    else
    {
        // From the sets of features (lhs and rhs) stored in distance,
        // build arrays of cover tree points
        v_array< CJLCoverTreePoint > set_of_points =
            parse_points(distance, FC_LHS);
        v_array< CJLCoverTreePoint > set_of_queries =
            parse_points(distance, FC_RHS);

        // Build the cover trees, one for the test vectors (rhs features)
        // and another for the training vectors (lhs features)
        CFeatures* r = distance->replace_rhs( distance->get_lhs() );
        node< CJLCoverTreePoint > top = batch_create(set_of_points);
        CFeatures* l = distance->replace_lhs(r);
        distance->replace_rhs(r);
        node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);

        // Get the k nearest neighbors to all the test vectors (batch method)
        distance->replace_lhs(l);
        v_array< v_array< CJLCoverTreePoint > > res;
        k_nearest_neighbor(top, top_query, res, m_k);

        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            // Handle the fact that the cover tree doesn't return neighbors
            // ordered by distance
            for ( int32_t j = 0 ; j < m_k ; ++j )
            {
                // The first index in res[i] points to the test vector
                dists[j] = distance->distance(res[i][j+1].m_index,
                        res[i][0].m_index);
                train_lab[j] = m_train_labels.vector[
                        res[i][j+1].m_index ];
            }

            // Now we get the indices to the neighbors sorted by distance
            CMath::qsort_index(dists, train_lab, m_k);

            //compute histogram of class outputs of the first k nearest
            //neighbours
            for (int32_t j=0; j<m_num_classes; j++)
                classes[j]=0;

            for (int32_t j=0; j<m_k; j++)
            {
                classes[train_lab[j]]++;

                //choose the class that got 'outputted' most often
                int32_t out_idx=0;
                int32_t out_max=0;

                for (int32_t c=0; c<m_num_classes; c++)
                {
                    if (out_max< classes[c])
                    {
                        out_idx= c;
                        out_max= classes[c];
                    }
                }
                output[j*num_lab+res[i][0].m_index]=out_idx+m_min_label;
            }
        }
    }

    SG_FREE(train_lab);
    SG_FREE(classes);
    SG_FREE(dists);

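    // column-major SGMatrix: entry (i,j) at output[j*num_lab+i] is the
    // prediction for test vector i using j+1 nearest neighbors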
    return SGMatrix<int32_t>(output,num_lab,m_k,true);
}

void CKNN::init_distance(CFeatures* data)
{
    if (!distance)
        SG_ERROR("No distance assigned!\n");
    CFeatures* lhs=distance->get_lhs();
    if (!lhs || !lhs->get_num_vectors())
    {
        SG_UNREF(lhs);
        SG_ERROR("No vectors on left hand side\n");
    }
    distance->init(lhs, data);
    SG_UNREF(lhs);
}

bool CKNN::load(FILE* srcfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}

bool CKNN::save(FILE* dstfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}

void CKNN::store_model_features()
{
    CFeatures* d_lhs=distance->get_lhs();
    CFeatures* d_rhs=distance->get_rhs();

    /* copy lhs of underlying distance */
    distance->init(d_lhs->duplicate(), d_rhs);

    SG_UNREF(d_lhs);
    SG_UNREF(d_rhs);
}

int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)
{
    memset(classes, 0, sizeof(float64_t)*m_num_classes);

    // weight the j-th ranked neighbor's vote by q^(j+1), q in (0,1]
    float64_t multiplier = m_q;
    for (int32_t j=0; j<m_k; j++)
    {
        classes[train_lab[j]]+= multiplier;
        multiplier*= m_q;
    }
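    // e.g. k=3, q=0.5, neighbor labels {1,1,2}: class 1 scores
    // 0.5+0.25=0.75, class 2 scores 0.125, so class 1 is chosen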

    //choose the class that got 'outputted' most often
    int32_t out_idx=0;
    float64_t out_max=0;

    for (int32_t j=0; j<m_num_classes; j++)
    {
        if (out_max< classes[j])
        {
            out_idx= j;
            out_max= classes[j];
        }
    }

    return out_idx;
}
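
/* Minimal usage sketch (not part of this file; assumes the toolbox's dense
 * real-valued features and Euclidean distance, with train_mat, label_vec and
 * test_feats standing in for user data):
 *
 *   #include <shogun/features/DenseFeatures.h>
 *   #include <shogun/distance/EuclideanDistance.h>
 *   #include <shogun/multiclass/KNN.h>
 *
 *   CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(train_mat);
 *   CMulticlassLabels* labs = new CMulticlassLabels(label_vec);
 *   CKNN* knn = new CKNN(3, new CEuclideanDistance(feats, feats), labs);
 *   knn->train();
 *   CMulticlassLabels* pred = knn->apply_multiclass(test_feats);
 *   SG_UNREF(pred);
 *   SG_UNREF(knn);
 */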
