KNN.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  *
00008  * Written (W) 2006 Christian Gehl
00009  * Written (W) 2006-2009 Soeren Sonnenburg
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #include "classifier/KNN.h"
00014 #include "features/Labels.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/Signal.h"
00017 
00018 using namespace shogun;
00019 
00020 CKNN::CKNN()
00021 : CDistanceMachine(), k(3), num_classes(0), num_train_labels(0), train_labels(NULL)
00022 {
00023 }
00024 
00025 CKNN::CKNN(int32_t k_, CDistance* d, CLabels* trainlab)
00026 : CDistanceMachine(), k(k_), num_classes(0), train_labels(NULL)
00027 {
00028     ASSERT(d);
00029     ASSERT(trainlab);
00030 
00031     set_distance(d);
00032     set_labels(trainlab);
00033     num_train_labels=trainlab->get_num_labels();
00034 }
00035 
00036 
00037 CKNN::~CKNN()
00038 {
00039     delete[] train_labels;
00040 }
00041 
00042 bool CKNN::train(CFeatures* data)
00043 {
00044     ASSERT(labels);
00045     ASSERT(distance);
00046 
00047     if (data)
00048     {
00049         if (labels->get_num_labels() != data->get_num_vectors())
00050             SG_ERROR("Number of training vectors does not match number of labels\n");
00051         distance->init(data, data);
00052     }
00053 
00054     train_labels=labels->get_int_labels(num_train_labels);
00055     ASSERT(train_labels);
00056     ASSERT(num_train_labels>0);
00057 
00058     int32_t max_class=train_labels[0];
00059     int32_t min_class=train_labels[0];
00060 
00061     int32_t i;
00062     for (i=1; i<num_train_labels; i++)
00063     {
00064         max_class=CMath::max(max_class, train_labels[i]);
00065         min_class=CMath::min(min_class, train_labels[i]);
00066     }
00067 
00068     for (i=0; i<num_train_labels; i++)
00069         train_labels[i]-=min_class;
00070 
00071     min_label=min_class;
00072     num_classes=max_class-min_class+1;
00073 
00074     SG_INFO( "num_classes: %d (%+d to %+d) num_train: %d\n", num_classes, min_class, max_class, num_train_labels);
00075     return true;
00076 }
00077 
00078 CLabels* CKNN::classify()
00079 {
00080     ASSERT(num_classes>0);
00081     ASSERT(distance);
00082     ASSERT(distance->get_num_vec_rhs());
00083 
00084     int32_t num_lab=distance->get_num_vec_rhs();
00085     ASSERT(k<=num_lab);
00086 
00087     CLabels* output=new CLabels(num_lab);
00088 
00089     //distances to train data and working buffer of train_labels
00090     float64_t* dists=new float64_t[num_train_labels];
00091     int32_t* train_lab=new int32_t[num_train_labels];
00092 
00094     int32_t* classes=new int32_t[num_classes];
00095 
00096     ASSERT(dists);
00097     ASSERT(train_lab);
00098     ASSERT(classes);
00099 
00100     SG_INFO( "%d test examples\n", num_lab);
00101     CSignal::clear_cancel();
00102 
00103     for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
00104     {
00105         SG_PROGRESS(i, 0, num_lab);
00106 
00107         // lhs idx 1..n and rhs idx i
00108         distances_lhs(dists,0,num_train_labels-1,i);
00109         int32_t j;
00110         for (j=0; j<num_train_labels; j++)
00111             train_lab[j]=train_labels[j];
00112 
00113         //sort the distance vector for test example j to all train examples
00114         //classes[1..k] then holds the classes for minimum distance
00115         CMath::qsort_index(dists, train_lab, num_train_labels);
00116 
00117         //compute histogram of class outputs of the first k nearest neighbours
00118         for (j=0; j<num_classes; j++)
00119             classes[j]=0;
00120 
00121         for (j=0; j<k; j++)
00122             classes[train_lab[j]]++;
00123 
00124         //choose the class that got 'outputted' most often
00125         int32_t out_idx=0;
00126         int32_t out_max=0;
00127 
00128         for (j=0; j<num_classes; j++)
00129         {
00130             if (out_max< classes[j])
00131             {
00132                 out_idx= j;
00133                 out_max= classes[j];
00134             }
00135         }
00136 
00137         output->set_label(i, out_idx+min_label);
00138     }
00139 
00140     delete[] dists;
00141     delete[] train_lab;
00142     delete[] classes;
00143 
00144     return output;
00145 }
00146 
00147 CLabels* CKNN::classify(CFeatures* data)
00148 {
00149     if (!distance)
00150         SG_ERROR("No distance assigned!\n");
00151 
00152     CFeatures* lhs=distance->get_lhs();
00153     if (!lhs || !lhs->get_num_vectors())
00154     {
00155         SG_UNREF(lhs);
00156         SG_ERROR("No vectors on left hand side\n");
00157     }
00158     distance->init(lhs, data);
00159     SG_UNREF(lhs);
00160 
00161     return classify();
00162 }
00163 
00164 void CKNN::classify_for_multiple_k(int32_t** dst, int32_t* num_vec, int32_t* k_out)
00165 {
00166     ASSERT(dst);
00167     ASSERT(k_out);
00168     ASSERT(num_vec);
00169 
00170     ASSERT(num_classes>0);
00171     ASSERT(distance);
00172     ASSERT(distance->get_num_vec_rhs());
00173 
00174     int32_t num_lab=distance->get_num_vec_rhs();
00175     ASSERT(k<=num_lab);
00176 
00177     int32_t* output=(int32_t*) malloc(sizeof(int32_t)*k*num_lab);
00178 
00179     //distances to train data and working buffer of train_labels
00180     float64_t* dists=new float64_t[num_train_labels];
00181     int32_t* train_lab=new int32_t[num_train_labels];
00182 
00184     int32_t* classes=new int32_t[num_classes];
00185 
00186     SG_INFO( "%d test examples\n", num_lab);
00187     CSignal::clear_cancel();
00188 
00189     for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
00190     {
00191         SG_PROGRESS(i, 0, num_lab);
00192 
00193         // lhs idx 1..n and rhs idx i
00194         distances_lhs(dists,0,num_train_labels-1,i);
00195         for (int32_t j=0; j<num_train_labels; j++)
00196             train_lab[j]=train_labels[j];
00197 
00198         //sort the distance vector for test example j to all train examples
00199         //classes[1..k] then holds the classes for minimum distance
00200         CMath::qsort_index(dists, train_lab, num_train_labels);
00201 
00202         //compute histogram of class outputs of the first k nearest neighbours
00203         for (int32_t j=0; j<num_classes; j++)
00204             classes[j]=0;
00205 
00206         for (int32_t j=0; j<k; j++)
00207         {
00208             classes[train_lab[j]]++;
00209 
00210             //choose the class that got 'outputted' most often
00211             int32_t out_idx=0;
00212             int32_t out_max=0;
00213 
00214             for (int32_t c=0; c<num_classes; c++)
00215             {
00216                 if (out_max< classes[c])
00217                 {
00218                     out_idx= c;
00219                     out_max= classes[c];
00220                 }
00221             }
00222             output[j*num_lab+i]=out_idx+min_label;
00223         }
00224     }
00225 
00226     delete[] dists;
00227     delete[] train_lab;
00228     delete[] classes;
00229 
00230     *dst=output;
00231     *k_out=k;
00232     *num_vec=num_lab;
00233 }
00234 
00235 bool CKNN::load(FILE* srcfile)
00236 {
00237     SG_SET_LOCALE_C;
00238     SG_RESET_LOCALE;
00239     return false;
00240 }
00241 
00242 bool CKNN::save(FILE* dstfile)
00243 {
00244     SG_SET_LOCALE_C;
00245     SG_RESET_LOCALE;
00246     return false;
00247 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation