Distance.cpp

Go to the documentation of this file.
00001 /*
00002  * this program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Christian Gehl
00008  * Written (W) 2006-2009 Soeren Sonnenburg
00009  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/lib/config.h>
00013 #include <shogun/lib/common.h>
00014 #include <shogun/io/SGIO.h>
00015 #include <shogun/io/File.h>
00016 #include <shogun/lib/Time.h>
00017 #include <shogun/base/Parallel.h>
00018 #include <shogun/base/Parameter.h>
00019 
00020 #include <shogun/distance/Distance.h>
00021 #include <shogun/features/Features.h>
00022 
00023 #include <string.h>
00024 #include <unistd.h>
00025 
00026 #ifdef HAVE_PTHREAD
00027 #include <pthread.h>
00028 #endif
00029 
00030 using namespace shogun;
00031 
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00033 struct DISTANCE_THREAD_PARAM
00034 {
00035     // CDistance instance used by thread to compute distance
00036     CDistance* distance;
00037     // distance matrix to store computed distances
00038     float64_t* distance_matrix;
00039     // starting index of the main loop
00040     int32_t idx_start;
00041     // end index of the main loop
00042     int32_t idx_stop;
00043     // step of the main loop
00044     int32_t idx_step;
00045     // number of lhs vectors
00046     int32_t lhs_vectors_number;
00047     // number of rhs vectors
00048     int32_t rhs_vectors_number;
00049     // whether matrix distance is symmetric
00050     bool symmetric;
00051     // chunking method
00052     bool chunk_by_lhs;
00053 };
00054 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
00055 
00056 CDistance::CDistance() : CSGObject()
00057 {
00058     init();
00059 }
00060 
00061         
00062 CDistance::CDistance(CFeatures* p_lhs, CFeatures* p_rhs) : CSGObject()
00063 {
00064     init();
00065     init(p_lhs, p_rhs);
00066 }
00067 
00068 CDistance::~CDistance()
00069 {
00070     SG_FREE(precomputed_matrix);
00071     precomputed_matrix=NULL;
00072 
00073     remove_lhs_and_rhs();
00074 }
00075 
00076 bool CDistance::init(CFeatures* l, CFeatures* r)
00077 {
00078     //make sure features were indeed supplied
00079     ASSERT(l);
00080     ASSERT(r);
00081 
00082     //make sure features are compatible
00083     ASSERT(l->get_feature_class()==r->get_feature_class());
00084     ASSERT(l->get_feature_type()==r->get_feature_type());
00085 
00086     //remove references to previous features
00087     remove_lhs_and_rhs();
00088 
00089     //increase reference counts
00090     SG_REF(l);
00091     SG_REF(r);
00092 
00093     lhs=l;
00094     rhs=r;
00095 
00096     SG_FREE(precomputed_matrix);
00097     precomputed_matrix=NULL ;
00098 
00099     return true;
00100 }
00101 
00102 void CDistance::load(CFile* loader)
00103 {
00104     SG_SET_LOCALE_C;
00105     SG_RESET_LOCALE;
00106 }
00107 
00108 void CDistance::save(CFile* writer)
00109 {
00110     SG_SET_LOCALE_C;
00111     SG_RESET_LOCALE;
00112 }
00113 
00114 void CDistance::remove_lhs_and_rhs()
00115 {
00116     SG_UNREF(rhs);
00117     rhs = NULL;
00118 
00119     SG_UNREF(lhs);
00120     lhs = NULL;
00121 }
00122 
00123 void CDistance::remove_lhs()
00124 { 
00125     SG_UNREF(lhs);
00126     lhs = NULL;
00127 }
00128 
00130 void CDistance::remove_rhs()
00131 {
00132     SG_UNREF(rhs);
00133     rhs = NULL;
00134 }
00135 
00136 CFeatures* CDistance::replace_rhs(CFeatures* r)
00137 {
00138      //make sure features were indeed supplied
00139      ASSERT(r);
00140 
00141      //make sure features are compatible
00142      ASSERT(lhs->get_feature_class()==r->get_feature_class());
00143      ASSERT(lhs->get_feature_type()==r->get_feature_type());
00144 
00145      //remove references to previous rhs features
00146      CFeatures* tmp=rhs;
00147      
00148      rhs=r;
00149 
00150      SG_FREE(precomputed_matrix);
00151      precomputed_matrix=NULL ;
00152 
00153      // return old features including reference count
00154      return tmp;
00155 }
00156 
00157 void CDistance::do_precompute_matrix()
00158 {
00159     int32_t num_left=lhs->get_num_vectors();
00160     int32_t num_right=rhs->get_num_vectors();
00161     SG_INFO( "precomputing distance matrix (%ix%i)\n", num_left, num_right) ;
00162 
00163     ASSERT(num_left==num_right);
00164     ASSERT(lhs==rhs);
00165     int32_t num=num_left;
00166     
00167     SG_FREE(precomputed_matrix);
00168     precomputed_matrix=SG_MALLOC(float32_t, num*(num+1)/2);
00169 
00170     for (int32_t i=0; i<num; i++)
00171     {
00172         SG_PROGRESS(i*i,0,num*num);
00173         for (int32_t j=0; j<=i; j++)
00174             precomputed_matrix[i*(i+1)/2+j] = compute(i,j) ;
00175     }
00176 
00177     SG_PROGRESS(num*num,0,num*num);
00178     SG_DONE();
00179 }
00180 
00181 SGMatrix<float64_t> CDistance::get_distance_matrix()
00182 {
00183     int32_t m,n;
00184     float64_t* data=get_distance_matrix_real(m,n,NULL);
00185     return SGMatrix<float64_t>(data, m,n);
00186 }
00187 
00188 float32_t* CDistance::get_distance_matrix_shortreal(
00189     int32_t &num_vec1, int32_t &num_vec2, float32_t* target)
00190 {
00191     float32_t* result = NULL;
00192     CFeatures* f1 = lhs;
00193     CFeatures* f2 = rhs;
00194 
00195     if (f1 && f2)
00196     {
00197         if (target && (num_vec1!=f1->get_num_vectors() || num_vec2!=f2->get_num_vectors()))
00198             SG_ERROR("distance matrix does not fit into target\n");
00199 
00200         num_vec1=f1->get_num_vectors();
00201         num_vec2=f2->get_num_vectors();
00202         int64_t total_num=num_vec1*num_vec2;
00203         int32_t num_done=0;
00204 
00205         SG_DEBUG("returning distance matrix of size %dx%d\n", num_vec1, num_vec2);
00206 
00207         if (target)
00208             result=target;
00209         else
00210             result=SG_MALLOC(float32_t, total_num);
00211 
00212         if ( (f1 == f2) && (num_vec1 == num_vec2) )
00213         {
00214             for (int32_t i=0; i<num_vec1; i++)
00215             {
00216                 for (int32_t j=i; j<num_vec1; j++)
00217                 {
00218                     float64_t v=distance(i,j);
00219 
00220                     result[i+j*num_vec1]=v;
00221                     result[j+i*num_vec1]=v;
00222 
00223                     if (num_done%100000)
00224                         SG_PROGRESS(num_done, 0, total_num-1);
00225 
00226                     if (i!=j)
00227                         num_done+=2;
00228                     else
00229                         num_done+=1;
00230                 }
00231             }
00232         }
00233         else
00234         {
00235             for (int32_t i=0; i<num_vec1; i++)
00236             {
00237                 for (int32_t j=0; j<num_vec2; j++)
00238                 {
00239                     result[i+j*num_vec1]=distance(i,j) ;
00240 
00241                     if (num_done%100000)
00242                         SG_PROGRESS(num_done, 0, total_num-1);
00243 
00244                     num_done++;
00245                 }
00246             }
00247         }
00248 
00249         SG_DONE();
00250     }
00251     else
00252             SG_ERROR("no features assigned to distance\n");
00253 
00254     return result;
00255 }
00256 
00257 float64_t* CDistance::get_distance_matrix_real(
00258     int32_t &lhs_vectors_number, int32_t &rhs_vectors_number, float64_t* target)
00259 {
00260     float64_t* distance_matrix = NULL;
00261     CFeatures* lhs_features = lhs;
00262     CFeatures* rhs_features = rhs;
00263 
00264     // check for errors
00265     if (!lhs_features || !rhs_features)
00266         SG_ERROR("No features assigned to the distance.\n");
00267 
00268     if (target && 
00269         (lhs_vectors_number!=lhs_features->get_num_vectors() ||
00270          rhs_vectors_number!=rhs_features->get_num_vectors()))
00271         SG_ERROR("Distance matrix does not fit into the given target.\n");
00272 
00273     // init numbers of vectors and total number of distances
00274     lhs_vectors_number = lhs_features->get_num_vectors();
00275     rhs_vectors_number = rhs_features->get_num_vectors();
00276     int64_t total_distances_number = lhs_vectors_number*rhs_vectors_number;
00277 
00278     SG_DEBUG("Calculating distance matrix of size %dx%d.\n", lhs_vectors_number, rhs_vectors_number);
00279 
00280     // redirect to target or allocate memory 
00281     if (target)
00282         distance_matrix = target;
00283     else
00284         distance_matrix = SG_MALLOC(float64_t, total_distances_number);
00285 
00286     // check if we're computing symmetric distance_matrix
00287     bool symmetric = (lhs_features==rhs_features) || (lhs_vectors_number==rhs_vectors_number);
00288     // select chunking method according to greatest dimension
00289     bool chunk_by_lhs = (lhs_vectors_number >= rhs_vectors_number);
00290 
00291 #ifdef HAVE_PTHREAD
00292     // init parallel to work
00293     int32_t num_threads = parallel->get_num_threads();
00294     ASSERT(num_threads>0);
00295     pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
00296     DISTANCE_THREAD_PARAM* parameters = SG_MALLOC(DISTANCE_THREAD_PARAM,num_threads);
00297     pthread_attr_t attr;
00298     pthread_attr_init(&attr);
00299     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
00300     // run threads
00301     for (int32_t t=0; t<num_threads; t++)
00302     {
00303         parameters[t].idx_start = t;
00304         parameters[t].idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number;
00305         parameters[t].idx_step = num_threads;
00306         parameters[t].distance_matrix = distance_matrix;
00307         parameters[t].symmetric = symmetric;
00308         parameters[t].lhs_vectors_number = lhs_vectors_number;
00309         parameters[t].rhs_vectors_number = rhs_vectors_number;
00310         parameters[t].chunk_by_lhs = chunk_by_lhs;
00311         parameters[t].distance = this;
00312         pthread_create(&threads[t], &attr, run_distance_thread, (void*)&parameters[t]);
00313     }
00314     // join, i.e. wait threads for finish
00315     for (int32_t t=0; t<num_threads; t++)
00316     {
00317         pthread_join(threads[t], NULL);
00318     }
00319     // cleanup
00320     pthread_attr_destroy(&attr);
00321     SG_FREE(parameters);
00322     SG_FREE(threads);
00323 #else
00324     // init one-threaded parameters
00325     DISTANCE_THREAD_PARAM single_thread_param;
00326     single_thread_param.idx_start = 0;
00327     single_thread_param.idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number;
00328     single_thread_param.idx_step = 1;
00329     single_thread_param.distance_matrix = distance_matrix;
00330     single_thread_param.symmetric = symmetric;
00331     single_thread_param.lhs_vectors_number = lhs_vectors_number;
00332     single_thread_param.rhs_vectors_number = rhs_vectors_number;
00333     single_thread_param.chunk_by_lhs = chunk_by_lhs;
00334     single_thread_param.distance = this;
00335     // run thread
00336     run_distance_thread((void*)&single_thread_param);
00337 #endif
00338 
00339     return distance_matrix;
00340 }
00341 
00342 void CDistance::init()
00343 {
00344     precomputed_matrix = NULL;
00345     precompute_matrix = false;
00346     lhs = NULL;
00347     rhs = NULL;
00348 
00349     m_parameters->add((CSGObject**) &lhs, "lhs",
00350                       "Feature vectors to occur on left hand side.");
00351     m_parameters->add((CSGObject**) &rhs, "rhs",
00352                       "Feature vectors to occur on right hand side.");
00353 }
00354 
00355 void* CDistance::run_distance_thread(void* p)
00356 {
00357     DISTANCE_THREAD_PARAM* parameters = (DISTANCE_THREAD_PARAM*)p;
00358     float64_t* distance_matrix = parameters->distance_matrix;
00359     CDistance* distance = parameters->distance;
00360     int32_t idx_start = parameters->idx_start;
00361     int32_t idx_stop = parameters->idx_stop;
00362     int32_t idx_step = parameters->idx_step;
00363     int32_t lhs_vectors_number = parameters->lhs_vectors_number;
00364     int32_t rhs_vectors_number = parameters->rhs_vectors_number;
00365     bool symmetric = parameters->symmetric;
00366     bool chunk_by_lhs = parameters->chunk_by_lhs;
00367 
00368     if (symmetric)
00369     {
00370         for (int32_t i=idx_start; i<idx_stop; i+=idx_step)
00371         {
00372             for (int32_t j=i; j<rhs_vectors_number; j++)
00373             {
00374                 float64_t ij_distance = distance->compute(i,j);
00375                 distance_matrix[i*rhs_vectors_number+j] = ij_distance;
00376                 distance_matrix[j*rhs_vectors_number+i] = ij_distance;
00377             }
00378         }
00379     }
00380     else
00381     {
00382         if (chunk_by_lhs)
00383         {
00384             for (int32_t i=idx_start; i<idx_stop; i+=idx_step)
00385             {
00386                 for (int32_t j=0; j<rhs_vectors_number; j++)
00387                 {
00388                     distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j);
00389                 }
00390             }
00391         }
00392         else
00393         {
00394             for (int32_t j=idx_start; j<idx_stop; j+=idx_step)
00395             {
00396                 for (int32_t i=0; i<lhs_vectors_number; i++)
00397                 {
00398                     distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j);
00399                 }
00400             }
00401         }
00402     }
00403 
00404     return NULL;
00405 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation