KNN.cpp

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2006 Christian Gehl
 * Written (W) 2006-2009 Soeren Sonnenburg
 * Written (W) 2011 Sergey Lisitsyn
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/classifier/KNN.h>
#include <shogun/features/Labels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>
#include <shogun/base/Parameter.h>

using namespace shogun;

CKNN::CKNN()
: CDistanceMachine()
{
    init();
}

CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
    init();

    m_k=k;

    ASSERT(d);
    ASSERT(trainlab);

    set_distance(d);
    set_labels(trainlab);
    train_labels.vlen=trainlab->get_num_labels();
}

void CKNN::init()
{
    /* do not store model features by default (CDistanceMachine::apply(...) is
     * overridden) */
    set_store_model_features(false);

    m_k=3;
    m_q=1.0;
    num_classes=0;

    m_parameters->add(&m_k, "k", "Parameter k");
    m_parameters->add(&m_q, "q", "Parameter q");
    m_parameters->add(&num_classes, "num_classes", "Number of classes");
}

CKNN::~CKNN()
{
    SG_FREE(train_labels.vector);
}

bool CKNN::train_machine(CFeatures* data)
{
    ASSERT(labels);
    ASSERT(distance);

    if (data)
    {
        if (labels->get_num_labels() != data->get_num_vectors())
            SG_ERROR("Number of training vectors does not match number of labels\n");
        distance->init(data, data);
    }

    SGVector<int32_t> lab=labels->get_int_labels();
    train_labels.vlen=lab.vlen;
    train_labels.vector=CMath::clone_vector(lab.vector, lab.vlen);
    lab.free_vector();
    ASSERT(train_labels.vlen>0);

    int32_t max_class=train_labels.vector[0];
    int32_t min_class=train_labels.vector[0];

    int32_t i;
    for (i=1; i<train_labels.vlen; i++)
    {
        max_class=CMath::max(max_class, train_labels.vector[i]);
        min_class=CMath::min(min_class, train_labels.vector[i]);
    }

    for (i=0; i<train_labels.vlen; i++)
        train_labels.vector[i]-=min_class;

    min_label=min_class;
    num_classes=max_class-min_class+1;

    SG_INFO("num_classes: %d (%+d to %+d) num_train: %d\n", num_classes,
            min_class, max_class, train_labels.vlen);
    return true;
}
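
/* Example of the label remapping above (illustration only): for binary training
 * labels {-1, +1}, min_class=-1 and max_class=+1, so the stored train_labels become
 * {0, 2}, min_label is set to -1 and num_classes to 3 (the middle class simply stays
 * empty); apply() and classify_NN() add min_label back to map predictions onto the
 * original label range. */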

CLabels* CKNN::apply()
{
    ASSERT(num_classes>0);
    ASSERT(distance);
    ASSERT(distance->get_num_vec_rhs());

    int32_t num_lab=distance->get_num_vec_rhs();
    ASSERT(m_k<=num_lab);

    CLabels* output=new CLabels(num_lab);

    //distances to train data and working buffer of train_labels
    float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
    int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

    SG_INFO("%d test examples\n", num_lab);
    CSignal::clear_cancel();

    float64_t* classes=SG_MALLOC(float64_t, num_classes);

    for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
    {
        SG_PROGRESS(i, 0, num_lab);

        // distances from all train examples (lhs idx 0..num_train-1) to test example i (rhs)
        distances_lhs(dists,0,train_labels.vlen-1,i);
        int32_t j;

        for (j=0; j<train_labels.vlen; j++)
            train_lab[j]=train_labels.vector[j];

        //sort the distances of test example i to all train examples;
        //train_lab[0..k-1] then holds the classes of the k nearest neighbours
        CMath::qsort_index(dists, train_lab, train_labels.vlen);

        //compute histogram of class outputs of the k nearest neighbours
        for (j=0; j<num_classes; j++)
            classes[j]=0.0;

        //each neighbour votes with the current multiplier; the multiplier is squared
        //after every neighbour, so the default m_q=1 yields a plain majority vote
        float64_t multiplier = m_q;
        for (j=0; j<m_k; j++)
        {
            classes[train_lab[j]]+= multiplier;
            multiplier*= multiplier;
        }

        //choose the class that received the most votes
        int32_t out_idx=0;
        float64_t out_max=0;

        for (j=0; j<num_classes; j++)
        {
            if (out_max< classes[j])
            {
                out_idx= j;
                out_max= classes[j];
            }
        }
        output->set_label(i, out_idx+min_label);
    }

    SG_FREE(classes);
    SG_FREE(dists);
    SG_FREE(train_lab);

    return output;
}
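
/* Illustration of the voting loop above: with m_k=3 and m_q=0.5 the three nearest
 * neighbours contribute 0.5, 0.25 and 0.0625 to their classes, since the multiplier
 * is squared after each neighbour; with the default m_q=1.0 every neighbour
 * contributes 1.0, i.e. an unweighted majority vote. */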

CLabels* CKNN::apply(CFeatures* data)
{
    init_distance(data);

    // redirect to the fast (sorting-free) 1-nearest-neighbour classifier if k==1
    if (m_k == 1)
        return classify_NN();

    return apply();
}

CLabels* CKNN::classify_NN()
{
    ASSERT(distance);
    ASSERT(num_classes>0);

    int32_t num_lab = distance->get_num_vec_rhs();
    ASSERT(num_lab);

    CLabels* output = new CLabels(num_lab);
    float64_t* distances = SG_MALLOC(float64_t, train_labels.vlen);

    SG_INFO("%d test examples\n", num_lab);
    CSignal::clear_cancel();

    // for each test example
    for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
    {
        SG_PROGRESS(i,0,num_lab);

        // get distances from i-th test example to train examples 0..num_train_labels-1
        distances_lhs(distances,0,train_labels.vlen-1,i);
        int32_t j;

        // assume the 0th train example is nearest to the i-th test example
        int32_t out_idx = 0;
        float64_t min_dist = distances[0];

        // search for the nearest neighbour by comparing distances
        for (j=0; j<train_labels.vlen; j++)
        {
            if (distances[j]<min_dist)
            {
                min_dist = distances[j];
                out_idx = j;
            }
        }

        // label the i-th test example with the label of its nearest neighbour
        output->set_label(i,train_labels.vector[out_idx]+min_label);
    }

    SG_FREE(distances);
    return output;
}

SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
    ASSERT(num_classes>0);
    ASSERT(distance);
    ASSERT(distance->get_num_vec_rhs());

    int32_t num_lab=distance->get_num_vec_rhs();
    ASSERT(m_k<=num_lab);

    int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

    //distances to train data and working buffer of train_labels
    float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
    int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

    int32_t* classes=SG_MALLOC(int32_t, num_classes);

    SG_INFO("%d test examples\n", num_lab);
    CSignal::clear_cancel();

    for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
    {
        SG_PROGRESS(i, 0, num_lab);

        // distances from all train examples (lhs idx 0..num_train-1) to test example i (rhs)
        distances_lhs(dists,0,train_labels.vlen-1,i);
        for (int32_t j=0; j<train_labels.vlen; j++)
            train_lab[j]=train_labels.vector[j];

        //sort the distances of test example i to all train examples;
        //train_lab[0..k-1] then holds the classes of the k nearest neighbours
        CMath::qsort_index(dists, train_lab, train_labels.vlen);

        //histogram of class outputs of the nearest neighbours seen so far
        for (int32_t j=0; j<num_classes; j++)
            classes[j]=0;

        for (int32_t j=0; j<m_k; j++)
        {
            classes[train_lab[j]]++;

            //after adding the (j+1)-th neighbour, choose the class that was
            //'outputted' most often, i.e. the prediction when only j+1 neighbours vote
            int32_t out_idx=0;
            int32_t out_max=0;

            for (int32_t c=0; c<num_classes; c++)
            {
                if (out_max< classes[c])
                {
                    out_idx= c;
                    out_max= classes[c];
                }
            }
            output[j*num_lab+i]=out_idx+min_label;
        }
    }

    SG_FREE(dists);
    SG_FREE(train_lab);
    SG_FREE(classes);

    return SGMatrix<int32_t>(output,num_lab,m_k);
}
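
/* A note on the layout of the result above (illustration only; the variable names
 * below are hypothetical): the returned SGMatrix has num_lab rows and m_k columns
 * stored column-major, so entry (i, j) holds the prediction for test example i when
 * only the j+1 nearest neighbours vote:
 *
 *   SGMatrix<int32_t> multi = knn->classify_for_multiple_k();
 *   int32_t prediction = multi.matrix[j*num_lab + i];  // example i, using j+1 neighbours
 */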

void CKNN::init_distance(CFeatures* data)
{
    if (!distance)
        SG_ERROR("No distance assigned!\n");
    CFeatures* lhs=distance->get_lhs();
    if (!lhs || !lhs->get_num_vectors())
    {
        SG_UNREF(lhs);
        SG_ERROR("No vectors on left hand side\n");
    }
    distance->init(lhs, data);
    SG_UNREF(lhs);
}

bool CKNN::load(FILE* srcfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}

bool CKNN::save(FILE* dstfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}

void CKNN::store_model_features()
{
    CFeatures* d_lhs=distance->get_lhs();
    CFeatures* d_rhs=distance->get_rhs();

    /* copy lhs of underlying distance */
    distance->init(d_lhs->duplicate(), d_rhs);

    SG_UNREF(d_lhs);
    SG_UNREF(d_rhs);
}
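
/* Usage sketch (not part of the original file): a minimal example of training and
 * applying CKNN. It assumes the dense feature, distance and label classes typical of
 * this Shogun release (CSimpleFeatures<float64_t>, CEuclidianDistance, CLabels); the
 * variables feat_matrix, lab_vector and test_feats are hypothetical placeholders.
 *
 *   #include <shogun/classifier/KNN.h>
 *   #include <shogun/features/SimpleFeatures.h>
 *   #include <shogun/distance/EuclidianDistance.h>
 *
 *   CSimpleFeatures<float64_t>* feats=new CSimpleFeatures<float64_t>(feat_matrix);
 *   CLabels* labels=new CLabels(lab_vector);
 *   CEuclidianDistance* dist=new CEuclidianDistance(feats, feats);
 *
 *   CKNN* knn=new CKNN(3, dist, labels);           // k=3; q defaults to 1.0 (plain vote)
 *   knn->train();
 *
 *   CLabels* predictions=knn->apply(test_feats);   // test_feats: CFeatures* with test vectors
 *   SG_UNREF(predictions);
 *   SG_UNREF(knn);
 */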