Distance.cpp

Go to the documentation of this file.
00001 /*
00002  * this program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Christian Gehl
00008  * Written (W) 2006-2009 Soeren Sonnenburg
00009  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/lib/config.h>
00013 #include <shogun/lib/common.h>
00014 #include <shogun/io/SGIO.h>
00015 #include <shogun/io/File.h>
00016 #include <shogun/lib/Time.h>
00017 #include <shogun/base/Parallel.h>
00018 #include <shogun/base/Parameter.h>
00019 
00020 #include <shogun/distance/Distance.h>
00021 #include <shogun/features/Features.h>
00022 
00023 #include <string.h>
00024 #include <unistd.h>
00025 
00026 #ifdef HAVE_PTHREAD
00027 #include <pthread.h>
00028 #endif
00029 
00030 using namespace shogun;
00031 
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00033 struct DISTANCE_THREAD_PARAM
00034 {
00035     // CDistance instance used by thread to compute distance
00036     CDistance* distance;
00037     // distance matrix to store computed distances
00038     float64_t* distance_matrix;
00039     // starting index of the main loop
00040     int32_t idx_start;
00041     // end index of the main loop
00042     int32_t idx_stop;
00043     // step of the main loop
00044     int32_t idx_step;
00045     // number of lhs vectors
00046     int32_t lhs_vectors_number;
00047     // number of rhs vectors
00048     int32_t rhs_vectors_number;
00049     // whether matrix distance is symmetric
00050     bool symmetric;
00051     // chunking method
00052     bool chunk_by_lhs;
00053 };
00054 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
00055 
00056 CDistance::CDistance() : CSGObject()
00057 {
00058     init();
00059 }
00060 
00061 
00062 CDistance::CDistance(CFeatures* p_lhs, CFeatures* p_rhs) : CSGObject()
00063 {
00064     init();
00065     init(p_lhs, p_rhs);
00066 }
00067 
00068 CDistance::~CDistance()
00069 {
00070     SG_FREE(precomputed_matrix);
00071     precomputed_matrix=NULL;
00072 
00073     remove_lhs_and_rhs();
00074 }
00075 
00076 bool CDistance::init(CFeatures* l, CFeatures* r)
00077 {
00078     //make sure features were indeed supplied
00079     ASSERT(l);
00080     ASSERT(r);
00081 
00082     //make sure features are compatible
00083     ASSERT(l->get_feature_class()==r->get_feature_class());
00084     ASSERT(l->get_feature_type()==r->get_feature_type());
00085 
00086     //remove references to previous features
00087     remove_lhs_and_rhs();
00088 
00089     //increase reference counts
00090     SG_REF(l);
00091     SG_REF(r);
00092 
00093     lhs=l;
00094     rhs=r;
00095 
00096     num_lhs=l->get_num_vectors();
00097     num_rhs=r->get_num_vectors();
00098 
00099     SG_FREE(precomputed_matrix);
00100     precomputed_matrix=NULL ;
00101 
00102     return true;
00103 }
00104 
00105 void CDistance::load(CFile* loader)
00106 {
00107     SG_SET_LOCALE_C;
00108     SG_RESET_LOCALE;
00109 }
00110 
00111 void CDistance::save(CFile* writer)
00112 {
00113     SG_SET_LOCALE_C;
00114     SG_RESET_LOCALE;
00115 }
00116 
00117 void CDistance::remove_lhs_and_rhs()
00118 {
00119     SG_UNREF(rhs);
00120     rhs = NULL;
00121     num_rhs=0;
00122 
00123     SG_UNREF(lhs);
00124     lhs = NULL;
00125     num_lhs=0;
00126 }
00127 
00128 void CDistance::remove_lhs()
00129 {
00130     SG_UNREF(lhs);
00131     lhs = NULL;
00132     num_lhs=0;
00133 }
00134 
00136 void CDistance::remove_rhs()
00137 {
00138     SG_UNREF(rhs);
00139     rhs = NULL;
00140     num_rhs=0;
00141 }
00142 
00143 CFeatures* CDistance::replace_rhs(CFeatures* r)
00144 {
00145     //make sure features were indeed supplied
00146     ASSERT(r);
00147 
00148     //make sure features are compatible
00149     ASSERT(lhs->get_feature_class()==r->get_feature_class());
00150     ASSERT(lhs->get_feature_type()==r->get_feature_type());
00151 
00152     //remove references to previous rhs features
00153     CFeatures* tmp=rhs;
00154 
00155     rhs=r;
00156     num_rhs=r->get_num_vectors();
00157 
00158     SG_FREE(precomputed_matrix);
00159     precomputed_matrix=NULL ;
00160 
00161     // return old features including reference count
00162     return tmp;
00163 }
00164 
00165 CFeatures* CDistance::replace_lhs(CFeatures* l)
00166 {
00167     //make sure features were indeed supplied
00168     ASSERT(l);
00169 
00170     //make sure features are compatible
00171     ASSERT(rhs->get_feature_class()==l->get_feature_class());
00172     ASSERT(rhs->get_feature_type()==l->get_feature_type());
00173 
00174     //remove references to previous rhs features
00175     CFeatures* tmp=lhs;
00176 
00177     lhs=l;
00178     num_lhs=l->get_num_vectors();
00179 
00180     SG_FREE(precomputed_matrix);
00181     precomputed_matrix=NULL ;
00182 
00183     // return old features including reference count
00184     return tmp;
00185 }
00186 
00187 void CDistance::do_precompute_matrix()
00188 {
00189     int32_t num_left=lhs->get_num_vectors();
00190     int32_t num_right=rhs->get_num_vectors();
00191     SG_INFO( "precomputing distance matrix (%ix%i)\n", num_left, num_right) ;
00192 
00193     ASSERT(num_left==num_right);
00194     ASSERT(lhs==rhs);
00195     int32_t num=num_left;
00196 
00197     SG_FREE(precomputed_matrix);
00198     precomputed_matrix=SG_MALLOC(float32_t, num*(num+1)/2);
00199 
00200     for (int32_t i=0; i<num; i++)
00201     {
00202         SG_PROGRESS(i*i,0,num*num);
00203         for (int32_t j=0; j<=i; j++)
00204             precomputed_matrix[i*(i+1)/2+j] = compute(i,j) ;
00205     }
00206 
00207     SG_PROGRESS(num*num,0,num*num);
00208     SG_DONE();
00209 }
00210 
00211 SGMatrix<float64_t> CDistance::get_distance_matrix()
00212 {
00213     int32_t m,n;
00214     float64_t* data=get_distance_matrix_real(m,n,NULL);
00215     return SGMatrix<float64_t>(data, m,n);
00216 }
00217 
00218 float32_t* CDistance::get_distance_matrix_shortreal(
00219     int32_t &num_vec1, int32_t &num_vec2, float32_t* target)
00220 {
00221     float32_t* result = NULL;
00222     CFeatures* f1 = lhs;
00223     CFeatures* f2 = rhs;
00224 
00225     if (has_features())
00226     {
00227         if (target && (num_vec1!=get_num_vec_lhs() || num_vec2!=get_num_vec_rhs()))
00228             SG_ERROR("distance matrix does not fit into target\n");
00229 
00230         num_vec1=get_num_vec_lhs();
00231         num_vec2=get_num_vec_rhs();
00232         int64_t total_num=num_vec1*num_vec2;
00233         int32_t num_done=0;
00234 
00235         SG_DEBUG("returning distance matrix of size %dx%d\n", num_vec1, num_vec2);
00236 
00237         if (target)
00238             result=target;
00239         else
00240             result=SG_MALLOC(float32_t, total_num);
00241 
00242         if ( (f1 == f2) && (num_vec1 == num_vec2) && (f1!=NULL && f2!=NULL) )
00243         {
00244             for (int32_t i=0; i<num_vec1; i++)
00245             {
00246                 for (int32_t j=i; j<num_vec1; j++)
00247                 {
00248                     float64_t v=distance(i,j);
00249 
00250                     result[i+j*num_vec1]=v;
00251                     result[j+i*num_vec1]=v;
00252 
00253                     if (num_done%100000)
00254                         SG_PROGRESS(num_done, 0, total_num-1);
00255 
00256                     if (i!=j)
00257                         num_done+=2;
00258                     else
00259                         num_done+=1;
00260                 }
00261             }
00262         }
00263         else
00264         {
00265             for (int32_t i=0; i<num_vec1; i++)
00266             {
00267                 for (int32_t j=0; j<num_vec2; j++)
00268                 {
00269                     result[i+j*num_vec1]=distance(i,j) ;
00270 
00271                     if (num_done%100000)
00272                         SG_PROGRESS(num_done, 0, total_num-1);
00273 
00274                     num_done++;
00275                 }
00276             }
00277         }
00278 
00279         SG_DONE();
00280     }
00281     else
00282             SG_ERROR("no features assigned to distance\n");
00283 
00284     return result;
00285 }
00286 
00287 float64_t* CDistance::get_distance_matrix_real(
00288     int32_t &lhs_vectors_number, int32_t &rhs_vectors_number, float64_t* target)
00289 {
00290     float64_t* distance_matrix = NULL;
00291     CFeatures* lhs_features = lhs;
00292     CFeatures* rhs_features = rhs;
00293 
00294     // check for errors
00295     if (!has_features())
00296         SG_ERROR("No features assigned to the distance.\n");
00297 
00298     if (target &&
00299         (lhs_vectors_number!=get_num_vec_lhs() ||
00300          rhs_vectors_number!=get_num_vec_rhs()))
00301         SG_ERROR("Distance matrix does not fit into the given target.\n");
00302 
00303     // init numbers of vectors and total number of distances
00304     lhs_vectors_number = get_num_vec_lhs();
00305     rhs_vectors_number = get_num_vec_rhs();
00306     int64_t total_distances_number = lhs_vectors_number*rhs_vectors_number;
00307 
00308     SG_DEBUG("Calculating distance matrix of size %dx%d.\n", lhs_vectors_number, rhs_vectors_number);
00309 
00310     // redirect to target or allocate memory
00311     if (target)
00312         distance_matrix = target;
00313     else
00314         distance_matrix = SG_MALLOC(float64_t, total_distances_number);
00315 
00316     // check if we're computing symmetric distance_matrix
00317     bool symmetric = (lhs_features==rhs_features) || (lhs_vectors_number==rhs_vectors_number);
00318     // select chunking method according to greatest dimension
00319     bool chunk_by_lhs = (lhs_vectors_number >= rhs_vectors_number);
00320 
00321 #ifdef HAVE_PTHREAD
00322     // init parallel to work
00323     int32_t num_threads = parallel->get_num_threads();
00324     ASSERT(num_threads>0);
00325     pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00326     DISTANCE_THREAD_PARAM* parameters = SG_MALLOC(DISTANCE_THREAD_PARAM,num_threads);
00327     pthread_attr_t attr;
00328     pthread_attr_init(&attr);
00329     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00330     // run threads
00331     for (int32_t t=0; t<num_threads; t++)
00332     {
00333         parameters[t].idx_start = t;
00334         parameters[t].idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number;
00335         parameters[t].idx_step = num_threads;
00336         parameters[t].distance_matrix = distance_matrix;
00337         parameters[t].symmetric = symmetric;
00338         parameters[t].lhs_vectors_number = lhs_vectors_number;
00339         parameters[t].rhs_vectors_number = rhs_vectors_number;
00340         parameters[t].chunk_by_lhs = chunk_by_lhs;
00341         parameters[t].distance = this;
00342         pthread_create(&threads[t], &attr, run_distance_thread, (void*)&parameters[t]);
00343     }
00344     // join, i.e. wait threads for finish
00345     for (int32_t t=0; t<num_threads; t++)
00346     {
00347         pthread_join(threads[t], NULL);
00348     }
00349     // cleanup
00350     pthread_attr_destroy(&attr);
00351     SG_FREE(parameters);
00352     SG_FREE(threads);
00353 #else
00354     // init one-threaded parameters
00355     DISTANCE_THREAD_PARAM single_thread_param;
00356     single_thread_param.idx_start = 0;
00357     single_thread_param.idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number;
00358     single_thread_param.idx_step = 1;
00359     single_thread_param.distance_matrix = distance_matrix;
00360     single_thread_param.symmetric = symmetric;
00361     single_thread_param.lhs_vectors_number = lhs_vectors_number;
00362     single_thread_param.rhs_vectors_number = rhs_vectors_number;
00363     single_thread_param.chunk_by_lhs = chunk_by_lhs;
00364     single_thread_param.distance = this;
00365     // run thread
00366     run_distance_thread((void*)&single_thread_param);
00367 #endif
00368 
00369     return distance_matrix;
00370 }
00371 
00372 void CDistance::init()
00373 {
00374     precomputed_matrix = NULL;
00375     precompute_matrix = false;
00376     lhs = NULL;
00377     rhs = NULL;
00378     num_lhs=0;
00379     num_rhs=0;
00380 
00381     m_parameters->add((CSGObject**) &lhs, "lhs",
00382                       "Feature vectors to occur on left hand side.");
00383     m_parameters->add((CSGObject**) &rhs, "rhs",
00384                       "Feature vectors to occur on right hand side.");
00385 }
00386 
00387 void* CDistance::run_distance_thread(void* p)
00388 {
00389     DISTANCE_THREAD_PARAM* parameters = (DISTANCE_THREAD_PARAM*)p;
00390     float64_t* distance_matrix = parameters->distance_matrix;
00391     CDistance* distance = parameters->distance;
00392     int32_t idx_start = parameters->idx_start;
00393     int32_t idx_stop = parameters->idx_stop;
00394     int32_t idx_step = parameters->idx_step;
00395     int32_t lhs_vectors_number = parameters->lhs_vectors_number;
00396     int32_t rhs_vectors_number = parameters->rhs_vectors_number;
00397     bool symmetric = parameters->symmetric;
00398     bool chunk_by_lhs = parameters->chunk_by_lhs;
00399 
00400     if (symmetric)
00401     {
00402         for (int32_t i=idx_start; i<idx_stop; i+=idx_step)
00403         {
00404             for (int32_t j=i; j<rhs_vectors_number; j++)
00405             {
00406                 float64_t ij_distance = distance->compute(i,j);
00407                 distance_matrix[i*rhs_vectors_number+j] = ij_distance;
00408                 distance_matrix[j*rhs_vectors_number+i] = ij_distance;
00409             }
00410         }
00411     }
00412     else
00413     {
00414         if (chunk_by_lhs)
00415         {
00416             for (int32_t i=idx_start; i<idx_stop; i+=idx_step)
00417             {
00418                 for (int32_t j=0; j<rhs_vectors_number; j++)
00419                 {
00420                     distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j);
00421                 }
00422             }
00423         }
00424         else
00425         {
00426             for (int32_t j=idx_start; j<idx_stop; j+=idx_step)
00427             {
00428                 for (int32_t i=0; i<lhs_vectors_number; i++)
00429                 {
00430                     distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j);
00431                 }
00432             }
00433         }
00434     }
00435 
00436     return NULL;
00437 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation