SHOGUN: DotFeatures.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/DotFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/lib/Signal.h>
00014 #include <shogun/lib/Time.h>
00015 #include <shogun/mathematics/Math.h>
00016 #include <shogun/base/Parallel.h>
00017 #include <shogun/base/Parameter.h>
00018 
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022 
00023 using namespace shogun;
00024 
00025 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00026 struct DF_THREAD_PARAM
00027 {
00028     CDotFeatures* df;
00029     int32_t* sub_index;
00030     float64_t* output;
00031     int32_t start;
00032     int32_t stop;
00033     float64_t* alphas;
00034     float64_t* vec;
00035     int32_t dim;
00036     float64_t bias;
00037     bool progress;
00038 };
00039 #endif // DOXYGEN_SHOULD_SKIP_THIS
00040 
00041 
00042 CDotFeatures::CDotFeatures(int32_t size)
00043     :CFeatures(size), combined_weight(1.0)
00044 {
00045     init();
00046 }
00047 
00048 
00049 CDotFeatures::CDotFeatures(const CDotFeatures & orig)
00050     :CFeatures(orig), combined_weight(orig.combined_weight)
00051 {
00052     init();
00053 }
00054 
00055 
00056 CDotFeatures::CDotFeatures(CFile* loader)
00057     :CFeatures(loader)
00058 {
00059     init();
00060 }
00061 
00062 void CDotFeatures::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00063 {
00064     ASSERT(output);
00065     // write access is internally between output[start..stop] so the following
00066     // line is necessary to write to output[0...(stop-start-1)]
00067     output-=start; 
00068     ASSERT(start>=0);
00069     ASSERT(start<stop);
00070     ASSERT(stop<=get_num_vectors());
00071 
00072     int32_t num_vectors=stop-start;
00073     ASSERT(num_vectors>0);
00074 
00075     int32_t num_threads=parallel->get_num_threads();
00076     ASSERT(num_threads>0);
00077 
00078     CSignal::clear_cancel();
00079 
00080 #ifdef HAVE_PTHREAD
00081     if (num_threads < 2)
00082     {
00083 #endif
00084         DF_THREAD_PARAM params;
00085         params.df=this;
00086         params.sub_index=NULL;
00087         params.output=output;
00088         params.start=start;
00089         params.stop=stop;
00090         params.alphas=alphas;
00091         params.vec=vec;
00092         params.dim=dim;
00093         params.bias=b;
00094         params.progress=false; //true;
00095         dense_dot_range_helper((void*) &params);
00096 #ifdef HAVE_PTHREAD
00097     }
00098     else
00099     {
00100         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00101         DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads);
00102         int32_t step= num_vectors/num_threads;
00103 
00104         int32_t t;
00105 
00106         for (t=0; t<num_threads-1; t++)
00107         {
00108             params[t].df = this;
00109             params[t].sub_index=NULL;
00110             params[t].output = output;
00111             params[t].start = start+t*step;
00112             params[t].stop = start+(t+1)*step;
00113             params[t].alphas=alphas;
00114             params[t].vec=vec;
00115             params[t].dim=dim;
00116             params[t].bias=b;
00117             params[t].progress = false;
00118             pthread_create(&threads[t], NULL,
00119                     CDotFeatures::dense_dot_range_helper, (void*)&params[t]);
00120         }
00121 
00122         params[t].df = this;
00123         params[t].output = output;
00124         params[t].sub_index=NULL;
00125         params[t].start = start+t*step;
00126         params[t].stop = stop;
00127         params[t].alphas=alphas;
00128         params[t].vec=vec;
00129         params[t].dim=dim;
00130         params[t].bias=b;
00131         params[t].progress = false; //true;
00132         dense_dot_range_helper((void*) &params[t]);
00133 
00134         for (t=0; t<num_threads-1; t++)
00135             pthread_join(threads[t], NULL);
00136 
00137         SG_FREE(params);
00138         SG_FREE(threads);
00139     }
00140 #endif
00141 
00142 #ifndef WIN32
00143         if ( CSignal::cancel_computations() )
00144             SG_INFO( "prematurely stopped.           \n");
00145 #endif
00146 }
00147 
00148 void CDotFeatures::dense_dot_range_subset(int32_t* sub_index, int32_t num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00149 {
00150     ASSERT(sub_index);
00151     ASSERT(output);
00152 
00153     int32_t num_threads=parallel->get_num_threads();
00154     ASSERT(num_threads>0);
00155 
00156     CSignal::clear_cancel();
00157 
00158 #ifdef HAVE_PTHREAD
00159     if (num_threads < 2)
00160     {
00161 #endif
00162         DF_THREAD_PARAM params;
00163         params.df=this;
00164         params.sub_index=sub_index;
00165         params.output=output;
00166         params.start=0;
00167         params.stop=num;
00168         params.alphas=alphas;
00169         params.vec=vec;
00170         params.dim=dim;
00171         params.bias=b;
00172         params.progress=false; //true;
00173         dense_dot_range_helper((void*) &params);
00174 #ifdef HAVE_PTHREAD
00175     }
00176     else
00177     {
00178         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00179         DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads);
00180         int32_t step= num/num_threads;
00181 
00182         int32_t t;
00183 
00184         for (t=0; t<num_threads-1; t++)
00185         {
00186             params[t].df = this;
00187             params[t].sub_index=sub_index;
00188             params[t].output = output;
00189             params[t].start = t*step;
00190             params[t].stop = (t+1)*step;
00191             params[t].alphas=alphas;
00192             params[t].vec=vec;
00193             params[t].dim=dim;
00194             params[t].bias=b;
00195             params[t].progress = false;
00196             pthread_create(&threads[t], NULL,
00197                     CDotFeatures::dense_dot_range_helper, (void*)&params[t]);
00198         }
00199 
00200         params[t].df = this;
00201         params[t].sub_index=sub_index;
00202         params[t].output = output;
00203         params[t].start = t*step;
00204         params[t].stop = num;
00205         params[t].alphas=alphas;
00206         params[t].vec=vec;
00207         params[t].dim=dim;
00208         params[t].bias=b;
00209         params[t].progress = false; //true;
00210         dense_dot_range_helper((void*) &params[t]);
00211 
00212         for (t=0; t<num_threads-1; t++)
00213             pthread_join(threads[t], NULL);
00214 
00215         SG_FREE(params);
00216         SG_FREE(threads);
00217     }
00218 #endif
00219 
00220 #ifndef WIN32
00221         if ( CSignal::cancel_computations() )
00222             SG_INFO( "prematurely stopped.           \n");
00223 #endif
00224 }
00225 
00226 void* CDotFeatures::dense_dot_range_helper(void* p)
00227 {
00228     DF_THREAD_PARAM* par=(DF_THREAD_PARAM*) p;
00229     CDotFeatures* df=par->df;
00230     int32_t* sub_index=par->sub_index;
00231     float64_t* output=par->output;
00232     int32_t start=par->start;
00233     int32_t stop=par->stop;
00234     float64_t* alphas=par->alphas;
00235     float64_t* vec=par->vec;
00236     int32_t dim=par->dim;
00237     float64_t bias=par->bias;
00238     bool progress=par->progress;
00239 
00240     if (sub_index)
00241     {
00242 #ifdef WIN32
00243         for (int32_t i=start; i<stop i++)
00244 #else
00245         for (int32_t i=start; i<stop &&
00246                 !CSignal::cancel_computations(); i++)
00247 #endif
00248         {
00249             if (alphas)
00250                 output[i]=alphas[sub_index[i]]*df->dense_dot(sub_index[i], vec, dim)+bias;
00251             else
00252                 output[i]=df->dense_dot(sub_index[i], vec, dim)+bias;
00253             if (progress)
00254                 df->display_progress(start, stop, i);
00255         }
00256 
00257     }
00258     else
00259     {
00260 #ifdef WIN32
00261         for (int32_t i=start; i<stop i++)
00262 #else
00263         for (int32_t i=start; i<stop &&
00264                 !CSignal::cancel_computations(); i++)
00265 #endif
00266         {
00267             if (alphas)
00268                 output[i]=alphas[i]*df->dense_dot(i, vec, dim)+bias;
00269             else
00270                 output[i]=df->dense_dot(i, vec, dim)+bias;
00271             if (progress)
00272                 df->display_progress(start, stop, i);
00273         }
00274     }
00275 
00276     return NULL;
00277 }
00278 
00279 SGMatrix<float64_t> CDotFeatures::get_computed_dot_feature_matrix()
00280 {
00281     SGMatrix<float64_t> m;
00282     
00283     int64_t offs=0;
00284     int32_t num=get_num_vectors();
00285     int32_t dim=get_dim_feature_space();
00286     ASSERT(num>0);
00287     ASSERT(dim>0);
00288 
00289     int64_t sz=((uint64_t) num)* dim;
00290 
00291     m.do_free=true;
00292     m.num_cols=dim;
00293     m.num_rows=num;
00294     m.matrix=SG_MALLOC(float64_t, sz);
00295     memset(m.matrix, 0, sz*sizeof(float64_t));
00296 
00297     for (int32_t i=0; i<num; i++)
00298     {
00299         add_to_dense_vec(1.0, i, &(m.matrix[offs]), dim);
00300         offs+=dim;
00301     }
00302 
00303     return m;
00304 }
00305 
00306 SGVector<float64_t> CDotFeatures::get_computed_dot_feature_vector(int32_t num)
00307 {
00308     SGVector<float64_t> v;
00309 
00310     int32_t dim=get_dim_feature_space();
00311     ASSERT(num>=0 && num<=get_num_vectors());
00312     ASSERT(dim>0);
00313 
00314     v.do_free=true;
00315     v.vlen=dim;
00316     v.vector=SG_MALLOC(float64_t, dim);
00317     memset(v.vector, 0, dim*sizeof(float64_t));
00318 
00319     add_to_dense_vec(1.0, num, v.vector, dim);
00320     return v;
00321 }
00322 
00323 void CDotFeatures::benchmark_add_to_dense_vector(int32_t repeats)
00324 {
00325     int32_t num=get_num_vectors();
00326     int32_t d=get_dim_feature_space();
00327     float64_t* w= SG_MALLOC(float64_t, d);
00328     CMath::fill_vector(w, d, 0.0);
00329 
00330     CTime t;
00331     float64_t start_cpu=t.get_runtime();
00332     float64_t start_wall=t.get_curtime();
00333     for (int32_t r=0; r<repeats; r++)
00334     {
00335         for (int32_t i=0; i<num; i++)
00336             add_to_dense_vec(1.172343*(r+1), i, w, d);
00337     }
00338 
00339     SG_PRINT("Time to process %d x num=%d add_to_dense_vector ops: cputime %fs walltime %fs\n",
00340             repeats, num, (t.get_runtime()-start_cpu)/repeats,
00341             (t.get_curtime()-start_wall)/repeats);
00342 
00343     SG_FREE(w);
00344 }
00345 
00346 void CDotFeatures::benchmark_dense_dot_range(int32_t repeats)
00347 {
00348     int32_t num=get_num_vectors();
00349     int32_t d=get_dim_feature_space();
00350     float64_t* w= SG_MALLOC(float64_t, d);
00351     float64_t* out= SG_MALLOC(float64_t, num);
00352     float64_t* alphas= SG_MALLOC(float64_t, num);
00353     CMath::range_fill_vector(w, d, 17.0);
00354     CMath::range_fill_vector(alphas, num, 1.2345);
00355     //CMath::fill_vector(w, d, 17.0);
00356     //CMath::fill_vector(alphas, num, 1.2345);
00357 
00358     CTime t;
00359     float64_t start_cpu=t.get_runtime();
00360     float64_t start_wall=t.get_curtime();
00361 
00362     for (int32_t r=0; r<repeats; r++)
00363             dense_dot_range(out, 0, num, alphas, w, d, 23);
00364 
00365 #ifdef DEBUG_DOTFEATURES
00366     CMath::display_vector(out, 40, "dense_dot_range");
00367     float64_t* out2= SG_MALLOC(float64_t, num);
00368 
00369     for (int32_t r=0; r<repeats; r++)
00370     {
00371         CMath::fill_vector(out2, num, 0.0);
00372         for (int32_t i=0; i<num; i++)
00373             out2[i]+=dense_dot(i, w, d)*alphas[i]+23;
00374     }
00375     CMath::display_vector(out2, 40, "dense_dot");
00376     for (int32_t i=0; i<num; i++)
00377         out2[i]-=out[i];
00378     CMath::display_vector(out2, 40, "diff");
00379 #endif
00380     SG_PRINT("Time to process %d x num=%d dense_dot_range ops: cputime %fs walltime %fs\n",
00381             repeats, num, (t.get_runtime()-start_cpu)/repeats,
00382             (t.get_curtime()-start_wall)/repeats);
00383 
00384     SG_FREE(alphas);
00385     SG_FREE(out);
00386     SG_FREE(w);
00387 }
00388 
00389 SGVector<float64_t> CDotFeatures::get_mean()
00390 {
00391     int32_t num=get_num_vectors();
00392     int32_t dim=get_dim_feature_space();
00393     ASSERT(num>0);
00394     ASSERT(dim>0);
00395 
00396     SGVector<float64_t> mean(dim);
00397     memset(mean.vector, 0, sizeof(float64_t)*dim);
00398 
00399     for (int i = 0; i < num; i++)
00400         add_to_dense_vec(1, i, mean.vector, dim);
00401     for (int j = 0; j < dim; j++)
00402         mean.vector[j] /= num;
00403 
00404     return mean;
00405 }                                   
00406 
00407 SGMatrix<float64_t> CDotFeatures::get_cov()
00408 {
00409     int32_t num=get_num_vectors();
00410     int32_t dim=get_dim_feature_space();
00411     ASSERT(num>0);
00412     ASSERT(dim>0);
00413 
00414     SGMatrix<float64_t> cov(dim, dim);
00415 
00416     memset(cov.matrix, 0, sizeof(float64_t)*dim*dim);
00417 
00418     SGVector<float64_t> mean = get_mean();
00419 
00420     for (int i = 0; i < num; i++)
00421     {
00422         SGVector<float64_t> v = get_computed_dot_feature_vector(i);
00423         CMath::add<float64_t>(v.vector, 1, v.vector, -1, mean.vector, v.vlen);
00424         for (int m = 0; m < v.vlen; m++)
00425         {
00426             for (int n = 0; n <= m ; n++)
00427             {
00428                 (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n];
00429             }
00430         }
00431         v.free_vector();
00432     }
00433     for (int m = 0; m < dim; m++)
00434     {
00435         for (int n = 0; n <= m ; n++)
00436         {
00437             (cov.matrix)[m*dim+n] /= num;
00438         }
00439     }
00440     for (int m = 0; m < dim-1; m++)
00441     {
00442         for (int n = m+1; n < dim; n++)
00443         {
00444             (cov.matrix)[m*dim+n] = (cov.matrix)[n*dim+m];
00445         }
00446     }
00447     mean.destroy_vector();
00448     return cov;
00449 }
00450 
00451 void CDotFeatures::display_progress(int32_t start, int32_t stop, int32_t v)
00452 {
00453     int32_t num_vectors=stop-start;
00454     int32_t i=v-start;
00455 
00456     if ( (i% (num_vectors/100+1))== 0)
00457         SG_PROGRESS(v, 0.0, num_vectors-1);
00458 }
00459 
00460 void CDotFeatures::init()
00461 {
00462     set_property(FP_DOT);
00463     m_parameters->add(&combined_weight, "combined_weight",
00464                       "Feature weighting in combined dot features.");
00465 }