SHOGUN: DotFeatures.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/DotFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/lib/Signal.h>
00014 #include <shogun/lib/Time.h>
00015 #include <shogun/mathematics/Math.h>
00016 #include <shogun/base/Parallel.h>
00017 #include <shogun/base/Parameter.h>
00018 
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022 
00023 using namespace shogun;
00024 
00025 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00026 struct DF_THREAD_PARAM
00027 {
00028     CDotFeatures* df;
00029     int32_t* sub_index;
00030     float64_t* output;
00031     int32_t start;
00032     int32_t stop;
00033     float64_t* alphas;
00034     float64_t* vec;
00035     int32_t dim;
00036     float64_t bias;
00037     bool progress;
00038 };
00039 #endif // DOXYGEN_SHOULD_SKIP_THIS
00040 
00041 
00042 CDotFeatures::CDotFeatures(int32_t size)
00043     :CFeatures(size), combined_weight(1.0)
00044 {
00045     init();
00046 }
00047 
00048 
00049 CDotFeatures::CDotFeatures(const CDotFeatures & orig)
00050     :CFeatures(orig), combined_weight(orig.combined_weight)
00051 {
00052     init();
00053 }
00054 
00055 
00056 CDotFeatures::CDotFeatures(CFile* loader)
00057     :CFeatures(loader)
00058 {
00059     init();
00060 }
00061 
00062 float64_t CDotFeatures::dense_dot_sgvec(int32_t vec_idx1, SGVector<float64_t> vec2)
00063 {
00064     return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
00065 }
00066 
00067 void CDotFeatures::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00068 {
00069     ASSERT(output);
00070     // write access is internally between output[start..stop] so the following
00071     // line is necessary to write to output[0...(stop-start-1)]
00072     output-=start;
00073     ASSERT(start>=0);
00074     ASSERT(start<stop);
00075     ASSERT(stop<=get_num_vectors());
00076 
00077     int32_t num_vectors=stop-start;
00078     ASSERT(num_vectors>0);
00079 
00080     int32_t num_threads=parallel->get_num_threads();
00081     ASSERT(num_threads>0);
00082 
00083     CSignal::clear_cancel();
00084 
00085 #ifdef HAVE_PTHREAD
00086     if (num_threads < 2)
00087     {
00088 #endif
00089         DF_THREAD_PARAM params;
00090         params.df=this;
00091         params.sub_index=NULL;
00092         params.output=output;
00093         params.start=start;
00094         params.stop=stop;
00095         params.alphas=alphas;
00096         params.vec=vec;
00097         params.dim=dim;
00098         params.bias=b;
00099         params.progress=false; //true;
00100         dense_dot_range_helper((void*) &params);
00101 #ifdef HAVE_PTHREAD
00102     }
00103     else
00104     {
00105         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00106         DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads);
00107         int32_t step= num_vectors/num_threads;
00108 
00109         int32_t t;
00110 
00111         for (t=0; t<num_threads-1; t++)
00112         {
00113             params[t].df = this;
00114             params[t].sub_index=NULL;
00115             params[t].output = output;
00116             params[t].start = start+t*step;
00117             params[t].stop = start+(t+1)*step;
00118             params[t].alphas=alphas;
00119             params[t].vec=vec;
00120             params[t].dim=dim;
00121             params[t].bias=b;
00122             params[t].progress = false;
00123             pthread_create(&threads[t], NULL,
00124                     CDotFeatures::dense_dot_range_helper, (void*)&params[t]);
00125         }
00126 
00127         params[t].df = this;
00128         params[t].output = output;
00129         params[t].sub_index=NULL;
00130         params[t].start = start+t*step;
00131         params[t].stop = stop;
00132         params[t].alphas=alphas;
00133         params[t].vec=vec;
00134         params[t].dim=dim;
00135         params[t].bias=b;
00136         params[t].progress = false; //true;
00137         dense_dot_range_helper((void*) &params[t]);
00138 
00139         for (t=0; t<num_threads-1; t++)
00140             pthread_join(threads[t], NULL);
00141 
00142         SG_FREE(params);
00143         SG_FREE(threads);
00144     }
00145 #endif
00146 
00147 #ifndef WIN32
00148         if ( CSignal::cancel_computations() )
00149             SG_INFO( "prematurely stopped.           \n");
00150 #endif
00151 }
00152 
00153 void CDotFeatures::dense_dot_range_subset(int32_t* sub_index, int32_t num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00154 {
00155     ASSERT(sub_index);
00156     ASSERT(output);
00157 
00158     int32_t num_threads=parallel->get_num_threads();
00159     ASSERT(num_threads>0);
00160 
00161     CSignal::clear_cancel();
00162 
00163 #ifdef HAVE_PTHREAD
00164     if (num_threads < 2)
00165     {
00166 #endif
00167         DF_THREAD_PARAM params;
00168         params.df=this;
00169         params.sub_index=sub_index;
00170         params.output=output;
00171         params.start=0;
00172         params.stop=num;
00173         params.alphas=alphas;
00174         params.vec=vec;
00175         params.dim=dim;
00176         params.bias=b;
00177         params.progress=false; //true;
00178         dense_dot_range_helper((void*) &params);
00179 #ifdef HAVE_PTHREAD
00180     }
00181     else
00182     {
00183         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00184         DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads);
00185         int32_t step= num/num_threads;
00186 
00187         int32_t t;
00188 
00189         for (t=0; t<num_threads-1; t++)
00190         {
00191             params[t].df = this;
00192             params[t].sub_index=sub_index;
00193             params[t].output = output;
00194             params[t].start = t*step;
00195             params[t].stop = (t+1)*step;
00196             params[t].alphas=alphas;
00197             params[t].vec=vec;
00198             params[t].dim=dim;
00199             params[t].bias=b;
00200             params[t].progress = false;
00201             pthread_create(&threads[t], NULL,
00202                     CDotFeatures::dense_dot_range_helper, (void*)&params[t]);
00203         }
00204 
00205         params[t].df = this;
00206         params[t].sub_index=sub_index;
00207         params[t].output = output;
00208         params[t].start = t*step;
00209         params[t].stop = num;
00210         params[t].alphas=alphas;
00211         params[t].vec=vec;
00212         params[t].dim=dim;
00213         params[t].bias=b;
00214         params[t].progress = false; //true;
00215         dense_dot_range_helper((void*) &params[t]);
00216 
00217         for (t=0; t<num_threads-1; t++)
00218             pthread_join(threads[t], NULL);
00219 
00220         SG_FREE(params);
00221         SG_FREE(threads);
00222     }
00223 #endif
00224 
00225 #ifndef WIN32
00226         if ( CSignal::cancel_computations() )
00227             SG_INFO( "prematurely stopped.           \n");
00228 #endif
00229 }
00230 
00231 void* CDotFeatures::dense_dot_range_helper(void* p)
00232 {
00233     DF_THREAD_PARAM* par=(DF_THREAD_PARAM*) p;
00234     CDotFeatures* df=par->df;
00235     int32_t* sub_index=par->sub_index;
00236     float64_t* output=par->output;
00237     int32_t start=par->start;
00238     int32_t stop=par->stop;
00239     float64_t* alphas=par->alphas;
00240     float64_t* vec=par->vec;
00241     int32_t dim=par->dim;
00242     float64_t bias=par->bias;
00243     bool progress=par->progress;
00244 
00245     if (sub_index)
00246     {
00247 #ifdef WIN32
00248         for (int32_t i=start; i<stop i++)
00249 #else
00250         for (int32_t i=start; i<stop &&
00251                 !CSignal::cancel_computations(); i++)
00252 #endif
00253         {
00254             if (alphas)
00255                 output[i]=alphas[sub_index[i]]*df->dense_dot(sub_index[i], vec, dim)+bias;
00256             else
00257                 output[i]=df->dense_dot(sub_index[i], vec, dim)+bias;
00258             if (progress)
00259                 df->display_progress(start, stop, i);
00260         }
00261 
00262     }
00263     else
00264     {
00265 #ifdef WIN32
00266         for (int32_t i=start; i<stop i++)
00267 #else
00268         for (int32_t i=start; i<stop &&
00269                 !CSignal::cancel_computations(); i++)
00270 #endif
00271         {
00272             if (alphas)
00273                 output[i]=alphas[i]*df->dense_dot(i, vec, dim)+bias;
00274             else
00275                 output[i]=df->dense_dot(i, vec, dim)+bias;
00276             if (progress)
00277                 df->display_progress(start, stop, i);
00278         }
00279     }
00280 
00281     return NULL;
00282 }
00283 
00284 SGMatrix<float64_t> CDotFeatures::get_computed_dot_feature_matrix()
00285 {
00286 
00287     int64_t offs=0;
00288     int32_t num=get_num_vectors();
00289     int32_t dim=get_dim_feature_space();
00290     ASSERT(num>0);
00291     ASSERT(dim>0);
00292 
00293     SGMatrix<float64_t> m(dim, num);
00294     m.zero();
00295 
00296     for (int32_t i=0; i<num; i++)
00297     {
00298         add_to_dense_vec(1.0, i, &(m.matrix[offs]), dim);
00299         offs+=dim;
00300     }
00301 
00302     return m;
00303 }
00304 
00305 SGVector<float64_t> CDotFeatures::get_computed_dot_feature_vector(int32_t num)
00306 {
00307 
00308     int32_t dim=get_dim_feature_space();
00309     ASSERT(num>=0 && num<=get_num_vectors());
00310     ASSERT(dim>0);
00311 
00312     SGVector<float64_t> v(dim);
00313     v.zero();
00314     add_to_dense_vec(1.0, num, v.vector, dim);
00315     return v;
00316 }
00317 
00318 void CDotFeatures::benchmark_add_to_dense_vector(int32_t repeats)
00319 {
00320     int32_t num=get_num_vectors();
00321     int32_t d=get_dim_feature_space();
00322     float64_t* w= SG_MALLOC(float64_t, d);
00323     SGVector<float64_t>::fill_vector(w, d, 0.0);
00324 
00325     CTime t;
00326     float64_t start_cpu=t.get_runtime();
00327     float64_t start_wall=t.get_curtime();
00328     for (int32_t r=0; r<repeats; r++)
00329     {
00330         for (int32_t i=0; i<num; i++)
00331             add_to_dense_vec(1.172343*(r+1), i, w, d);
00332     }
00333 
00334     SG_PRINT("Time to process %d x num=%d add_to_dense_vector ops: cputime %fs walltime %fs\n",
00335             repeats, num, (t.get_runtime()-start_cpu)/repeats,
00336             (t.get_curtime()-start_wall)/repeats);
00337 
00338     SG_FREE(w);
00339 }
00340 
00341 void CDotFeatures::benchmark_dense_dot_range(int32_t repeats)
00342 {
00343     int32_t num=get_num_vectors();
00344     int32_t d=get_dim_feature_space();
00345     float64_t* w= SG_MALLOC(float64_t, d);
00346     float64_t* out= SG_MALLOC(float64_t, num);
00347     float64_t* alphas= SG_MALLOC(float64_t, num);
00348     SGVector<float64_t>::range_fill_vector(w, d, 17.0);
00349     SGVector<float64_t>::range_fill_vector(alphas, num, 1.2345);
00350     //SGVector<float64_t>::fill_vector(w, d, 17.0);
00351     //SGVector<float64_t>::fill_vector(alphas, num, 1.2345);
00352 
00353     CTime t;
00354     float64_t start_cpu=t.get_runtime();
00355     float64_t start_wall=t.get_curtime();
00356 
00357     for (int32_t r=0; r<repeats; r++)
00358             dense_dot_range(out, 0, num, alphas, w, d, 23);
00359 
00360 #ifdef DEBUG_DOTFEATURES
00361     CMath::display_vector(out, 40, "dense_dot_range");
00362     float64_t* out2= SG_MALLOC(float64_t, num);
00363 
00364     for (int32_t r=0; r<repeats; r++)
00365     {
00366         CMath::fill_vector(out2, num, 0.0);
00367         for (int32_t i=0; i<num; i++)
00368             out2[i]+=dense_dot(i, w, d)*alphas[i]+23;
00369     }
00370     CMath::display_vector(out2, 40, "dense_dot");
00371     for (int32_t i=0; i<num; i++)
00372         out2[i]-=out[i];
00373     CMath::display_vector(out2, 40, "diff");
00374 #endif
00375     SG_PRINT("Time to process %d x num=%d dense_dot_range ops: cputime %fs walltime %fs\n",
00376             repeats, num, (t.get_runtime()-start_cpu)/repeats,
00377             (t.get_curtime()-start_wall)/repeats);
00378 
00379     SG_FREE(alphas);
00380     SG_FREE(out);
00381     SG_FREE(w);
00382 }
00383 
00384 SGVector<float64_t> CDotFeatures::get_mean()
00385 {
00386     int32_t num=get_num_vectors();
00387     int32_t dim=get_dim_feature_space();
00388     ASSERT(num>0);
00389     ASSERT(dim>0);
00390 
00391     SGVector<float64_t> mean(dim);
00392     memset(mean.vector, 0, sizeof(float64_t)*dim);
00393 
00394     for (int i = 0; i < num; i++)
00395         add_to_dense_vec(1, i, mean.vector, dim);
00396     for (int j = 0; j < dim; j++)
00397         mean.vector[j] /= num;
00398 
00399     return mean;
00400 }
00401 
00402 SGVector<float64_t> CDotFeatures::get_mean(CDotFeatures* lhs, CDotFeatures* rhs)
00403 {
00404     ASSERT(lhs && rhs);
00405     ASSERT(lhs->get_dim_feature_space() == rhs->get_dim_feature_space());
00406 
00407     int32_t num_lhs=lhs->get_num_vectors();
00408     int32_t num_rhs=rhs->get_num_vectors();
00409     int32_t dim=lhs->get_dim_feature_space();
00410     ASSERT(num_lhs>0);
00411     ASSERT(num_rhs>0);
00412     ASSERT(dim>0);
00413 
00414     SGVector<float64_t> mean(dim);
00415     memset(mean.vector, 0, sizeof(float64_t)*dim);
00416 
00417     for (int i = 0; i < num_lhs; i++)
00418         lhs->add_to_dense_vec(1, i, mean.vector, dim);
00419     for (int i = 0; i < num_rhs; i++)
00420         rhs->add_to_dense_vec(1, i, mean.vector, dim);
00421     for (int j = 0; j < dim; j++)
00422         mean.vector[j] /= (num_lhs+num_rhs);
00423 
00424     return mean;
00425 }
00426 
00427 SGMatrix<float64_t> CDotFeatures::get_cov()
00428 {
00429     int32_t num=get_num_vectors();
00430     int32_t dim=get_dim_feature_space();
00431     ASSERT(num>0);
00432     ASSERT(dim>0);
00433 
00434     SGMatrix<float64_t> cov(dim, dim);
00435 
00436     memset(cov.matrix, 0, sizeof(float64_t)*dim*dim);
00437 
00438     SGVector<float64_t> mean = get_mean();
00439 
00440     for (int i = 0; i < num; i++)
00441     {
00442         SGVector<float64_t> v = get_computed_dot_feature_vector(i);
00443         SGVector<float64_t>::add(v.vector, 1, v.vector, -1, mean.vector, v.vlen);
00444         for (int m = 0; m < v.vlen; m++)
00445         {
00446             for (int n = 0; n <= m ; n++)
00447             {
00448                 (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n];
00449             }
00450         }
00451     }
00452     for (int m = 0; m < dim; m++)
00453     {
00454         for (int n = 0; n <= m ; n++)
00455         {
00456             (cov.matrix)[m*dim+n] /= num;
00457         }
00458     }
00459     for (int m = 0; m < dim-1; m++)
00460     {
00461         for (int n = m+1; n < dim; n++)
00462         {
00463             (cov.matrix)[m*dim+n] = (cov.matrix)[n*dim+m];
00464         }
00465     }
00466     return cov;
00467 }
00468 
00469 SGMatrix<float64_t> CDotFeatures::compute_cov(CDotFeatures* lhs, CDotFeatures* rhs)
00470 {
00471     CDotFeatures* feats[2];
00472     feats[0]=lhs;
00473     feats[1]=rhs;
00474 
00475     int32_t nums[2], dims[2], num=0;
00476 
00477     for (int i = 0; i < 2; i++)
00478     {
00479         nums[i]=feats[i]->get_num_vectors();
00480         dims[i]=feats[i]->get_dim_feature_space();
00481         ASSERT(nums[i]>0);
00482         ASSERT(dims[i]>0);
00483         num += nums[i];
00484     }
00485 
00486     ASSERT(dims[0]==dims[1]);
00487     int32_t dim = dims[0];
00488 
00489     SGMatrix<float64_t> cov(dim, dim);
00490 
00491     memset(cov.matrix, 0, sizeof(float64_t)*dim*dim);
00492 
00493     SGVector<float64_t>  mean=get_mean(lhs,rhs);
00494 
00495     for (int i = 0; i < 2; i++)
00496     {
00497         for (int j = 0; j < nums[i]; j++)
00498         {
00499             SGVector<float64_t> v = feats[i]->get_computed_dot_feature_vector(j);
00500             SGVector<float64_t>::add(v.vector, 1, v.vector, -1, mean.vector, v.vlen);
00501             for (int m = 0; m < v.vlen; m++)
00502             {
00503                 for (int n = 0; n <= m; n++)
00504                 {
00505                     (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n];
00506                 }
00507             }
00508         }
00509     }
00510     for (int m = 0; m < dim; m++)
00511     {
00512         for (int n = 0; n <= m; n++)
00513         {
00514             (cov.matrix)[m*dim+n] /= num;
00515         }
00516     }
00517     for (int m = 0; m < dim-1; m++)
00518     {
00519         for (int n = m+1; n < dim; n++)
00520         {
00521             (cov.matrix[m*dim+n]) = (cov.matrix)[n*dim+m];
00522         }
00523     }
00524 
00525     return cov;
00526 }
00527 
00528 void CDotFeatures::display_progress(int32_t start, int32_t stop, int32_t v)
00529 {
00530     int32_t num_vectors=stop-start;
00531     int32_t i=v-start;
00532 
00533     if ( (i% (num_vectors/100+1))== 0)
00534         SG_PROGRESS(v, 0.0, num_vectors-1);
00535 }
00536 
00537 void CDotFeatures::init()
00538 {
00539     set_property(FP_DOT);
00540     m_parameters->add(&combined_weight, "combined_weight",
00541                       "Feature weighting in combined dot features.");
00542 }