00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #include <shogun/classifier/KNN.h>
00014 #include <shogun/features/Labels.h>
00015 #include <shogun/mathematics/Math.h>
00016 #include <shogun/lib/Signal.h>
00017 #include <shogun/base/Parameter.h>
00018
00019 using namespace shogun;
00020
00021 CKNN::CKNN()
00022 : CDistanceMachine()
00023 {
00024 init();
00025 }
00026
00027 CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
00028 : CDistanceMachine()
00029 {
00030 init();
00031
00032 m_k=k;
00033
00034 ASSERT(d);
00035 ASSERT(trainlab);
00036
00037 set_distance(d);
00038 set_labels(trainlab);
00039 train_labels.vlen=trainlab->get_num_labels();
00040 }
00041
00042 void CKNN::init()
00043 {
00044
00045
00046 set_store_model_features(false);
00047
00048 m_k=3;
00049 m_q=1.0;
00050 num_classes=0;
00051
00052 m_parameters->add(&m_k, "k", "Parameter k");
00053 m_parameters->add(&m_q, "q", "Parameter q");
00054 m_parameters->add(&num_classes, "num_classes", "Number of classes");
00055 }
00056
00057 CKNN::~CKNN()
00058 {
00059 SG_FREE(train_labels.vector);
00060 }
00061
00062 bool CKNN::train_machine(CFeatures* data)
00063 {
00064 ASSERT(labels);
00065 ASSERT(distance);
00066
00067 if (data)
00068 {
00069 if (labels->get_num_labels() != data->get_num_vectors())
00070 SG_ERROR("Number of training vectors does not match number of labels\n");
00071 distance->init(data, data);
00072 }
00073
00074 SGVector<int32_t> lab=labels->get_int_labels();
00075 train_labels.vlen=lab.vlen;
00076 train_labels.vector=CMath::clone_vector(lab.vector, lab.vlen);
00077 lab.free_vector();
00078 ASSERT(train_labels.vlen>0);
00079
00080 int32_t max_class=train_labels.vector[0];
00081 int32_t min_class=train_labels.vector[0];
00082
00083 int32_t i;
00084 for (i=1; i<train_labels.vlen; i++)
00085 {
00086 max_class=CMath::max(max_class, train_labels.vector[i]);
00087 min_class=CMath::min(min_class, train_labels.vector[i]);
00088 }
00089
00090 for (i=0; i<train_labels.vlen; i++)
00091 train_labels.vector[i]-=min_class;
00092
00093 min_label=min_class;
00094 num_classes=max_class-min_class+1;
00095
00096 SG_INFO( "num_classes: %d (%+d to %+d) num_train: %d\n", num_classes,
00097 min_class, max_class, train_labels.vlen);
00098 return true;
00099 }
00100
00101 CLabels* CKNN::apply()
00102 {
00103 ASSERT(num_classes>0);
00104 ASSERT(distance);
00105 ASSERT(distance->get_num_vec_rhs());
00106
00107 int32_t num_lab=distance->get_num_vec_rhs();
00108 ASSERT(m_k<=num_lab);
00109
00110 CLabels* output=new CLabels(num_lab);
00111
00112
00113 float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
00114 int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);
00115
00116 SG_INFO( "%d test examples\n", num_lab);
00117 CSignal::clear_cancel();
00118
00120 float64_t* classes=SG_MALLOC(float64_t, num_classes);
00121
00122 for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
00123 {
00124 SG_PROGRESS(i, 0, num_lab);
00125
00126
00127 distances_lhs(dists,0,train_labels.vlen-1,i);
00128 int32_t j;
00129
00130 for (j=0; j<train_labels.vlen; j++)
00131 train_lab[j]=train_labels.vector[j];
00132
00133
00134
00135 CMath::qsort_index(dists, train_lab, train_labels.vlen);
00136
00137
00138 for (j=0; j<num_classes; j++)
00139 classes[j]=0.0;
00140
00141 float64_t multiplier = m_q;
00142 for (j=0; j<m_k; j++)
00143 {
00144 classes[train_lab[j]]+= multiplier;
00145 multiplier*= multiplier;
00146 }
00147
00148
00149 int32_t out_idx=0;
00150 float64_t out_max=0;
00151
00152 for (j=0; j<num_classes; j++)
00153 {
00154 if (out_max< classes[j])
00155 {
00156 out_idx= j;
00157 out_max= classes[j];
00158 }
00159 }
00160 output->set_label(i, out_idx+min_label);
00161 }
00162
00163 SG_FREE(classes);
00164 SG_FREE(dists);
00165 SG_FREE(train_lab);
00166
00167 return output;
00168 }
00169
00170 CLabels* CKNN::apply(CFeatures* data)
00171 {
00172 init_distance(data);
00173
00174
00175 if (m_k == 1)
00176 return classify_NN();
00177
00178 return apply();
00179 }
00180
00181 CLabels* CKNN::classify_NN()
00182 {
00183 ASSERT(distance);
00184 ASSERT(num_classes>0);
00185
00186 int32_t num_lab = distance->get_num_vec_rhs();
00187 ASSERT(num_lab);
00188
00189 CLabels* output = new CLabels(num_lab);
00190 float64_t* distances = SG_MALLOC(float64_t, train_labels.vlen);
00191
00192 SG_INFO("%d test examples\n", num_lab);
00193 CSignal::clear_cancel();
00194
00195
00196 for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
00197 {
00198 SG_PROGRESS(i,0,num_lab);
00199
00200
00201 distances_lhs(distances,0,train_labels.vlen-1,i);
00202 int32_t j;
00203
00204
00205 int32_t out_idx = 0;
00206 float64_t min_dist = distances[0];
00207
00208
00209 for (j=0; j<train_labels.vlen; j++)
00210 {
00211 if (distances[j]<min_dist)
00212 {
00213 min_dist = distances[j];
00214 out_idx = j;
00215 }
00216 }
00217
00218
00219 output->set_label(i,train_labels.vector[out_idx]+min_label);
00220 }
00221
00222 delete [] distances;
00223 return output;
00224 }
00225
00226 SGMatrix<int32_t> CKNN::classify_for_multiple_k()
00227 {
00228 ASSERT(num_classes>0);
00229 ASSERT(distance);
00230 ASSERT(distance->get_num_vec_rhs());
00231
00232 int32_t num_lab=distance->get_num_vec_rhs();
00233 ASSERT(m_k<=num_lab);
00234
00235 int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);
00236
00237
00238 float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
00239 int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);
00240
00242 int32_t* classes=SG_MALLOC(int32_t, num_classes);
00243
00244 SG_INFO( "%d test examples\n", num_lab);
00245 CSignal::clear_cancel();
00246
00247 for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
00248 {
00249 SG_PROGRESS(i, 0, num_lab);
00250
00251
00252 distances_lhs(dists,0,train_labels.vlen-1,i);
00253 for (int32_t j=0; j<train_labels.vlen; j++)
00254 train_lab[j]=train_labels.vector[j];
00255
00256
00257
00258 CMath::qsort_index(dists, train_lab, train_labels.vlen);
00259
00260
00261 for (int32_t j=0; j<num_classes; j++)
00262 classes[j]=0;
00263
00264 for (int32_t j=0; j<m_k; j++)
00265 {
00266 classes[train_lab[j]]++;
00267
00268
00269 int32_t out_idx=0;
00270 int32_t out_max=0;
00271
00272 for (int32_t c=0; c<num_classes; c++)
00273 {
00274 if (out_max< classes[c])
00275 {
00276 out_idx= c;
00277 out_max= classes[c];
00278 }
00279 }
00280 output[j*num_lab+i]=out_idx+min_label;
00281 }
00282 }
00283
00284 SG_FREE(dists);
00285 SG_FREE(train_lab);
00286 SG_FREE(classes);
00287
00288 return SGMatrix<int32_t>(output,num_lab,m_k);
00289 }
00290
00291 void CKNN::init_distance(CFeatures* data)
00292 {
00293 if (!distance)
00294 SG_ERROR("No distance assigned!\n");
00295 CFeatures* lhs=distance->get_lhs();
00296 if (!lhs || !lhs->get_num_vectors())
00297 {
00298 SG_UNREF(lhs);
00299 SG_ERROR("No vectors on left hand side\n");
00300 }
00301 distance->init(lhs, data);
00302 SG_UNREF(lhs);
00303 }
00304
00305 bool CKNN::load(FILE* srcfile)
00306 {
00307 SG_SET_LOCALE_C;
00308 SG_RESET_LOCALE;
00309 return false;
00310 }
00311
00312 bool CKNN::save(FILE* dstfile)
00313 {
00314 SG_SET_LOCALE_C;
00315 SG_RESET_LOCALE;
00316 return false;
00317 }
00318
00319 void CKNN::store_model_features()
00320 {
00321 CFeatures* d_lhs=distance->get_lhs();
00322 CFeatures* d_rhs=distance->get_rhs();
00323
00324
00325 distance->init(d_lhs->duplicate(), d_rhs);
00326
00327 SG_UNREF(d_lhs);
00328 SG_UNREF(d_rhs);
00329 }