00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/features/DotFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/lib/Signal.h>
00014 #include <shogun/lib/Time.h>
00015 #include <shogun/mathematics/Math.h>
00016 #include <shogun/base/Parallel.h>
00017 #include <shogun/base/Parameter.h>
00018
00019 #ifdef HAVE_PTHREAD
00020 #include <pthread.h>
00021 #endif
00022
00023 using namespace shogun;
00024
00025 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00026 struct DF_THREAD_PARAM
00027 {
00028 CDotFeatures* df;
00029 int32_t* sub_index;
00030 float64_t* output;
00031 int32_t start;
00032 int32_t stop;
00033 float64_t* alphas;
00034 float64_t* vec;
00035 int32_t dim;
00036 float64_t bias;
00037 bool progress;
00038 };
00039 #endif // DOXYGEN_SHOULD_SKIP_THIS
00040
00041
00042 CDotFeatures::CDotFeatures(int32_t size)
00043 :CFeatures(size), combined_weight(1.0)
00044 {
00045 init();
00046 }
00047
00048
00049 CDotFeatures::CDotFeatures(const CDotFeatures & orig)
00050 :CFeatures(orig), combined_weight(orig.combined_weight)
00051 {
00052 init();
00053 }
00054
00055
00056 CDotFeatures::CDotFeatures(CFile* loader)
00057 :CFeatures(loader)
00058 {
00059 init();
00060 }
00061
00062 float64_t CDotFeatures::dense_dot_sgvec(int32_t vec_idx1, SGVector<float64_t> vec2)
00063 {
00064 return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
00065 }
00066
00067 void CDotFeatures::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00068 {
00069 ASSERT(output);
00070
00071
00072 output-=start;
00073 ASSERT(start>=0);
00074 ASSERT(start<stop);
00075 ASSERT(stop<=get_num_vectors());
00076
00077 int32_t num_vectors=stop-start;
00078 ASSERT(num_vectors>0);
00079
00080 int32_t num_threads=parallel->get_num_threads();
00081 ASSERT(num_threads>0);
00082
00083 CSignal::clear_cancel();
00084
00085 #ifdef HAVE_PTHREAD
00086 if (num_threads < 2)
00087 {
00088 #endif
00089 DF_THREAD_PARAM params;
00090 params.df=this;
00091 params.sub_index=NULL;
00092 params.output=output;
00093 params.start=start;
00094 params.stop=stop;
00095 params.alphas=alphas;
00096 params.vec=vec;
00097 params.dim=dim;
00098 params.bias=b;
00099 params.progress=false;
00100 dense_dot_range_helper((void*) ¶ms);
00101 #ifdef HAVE_PTHREAD
00102 }
00103 else
00104 {
00105 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00106 DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads);
00107 int32_t step= num_vectors/num_threads;
00108
00109 int32_t t;
00110
00111 for (t=0; t<num_threads-1; t++)
00112 {
00113 params[t].df = this;
00114 params[t].sub_index=NULL;
00115 params[t].output = output;
00116 params[t].start = start+t*step;
00117 params[t].stop = start+(t+1)*step;
00118 params[t].alphas=alphas;
00119 params[t].vec=vec;
00120 params[t].dim=dim;
00121 params[t].bias=b;
00122 params[t].progress = false;
00123 pthread_create(&threads[t], NULL,
00124 CDotFeatures::dense_dot_range_helper, (void*)¶ms[t]);
00125 }
00126
00127 params[t].df = this;
00128 params[t].output = output;
00129 params[t].sub_index=NULL;
00130 params[t].start = start+t*step;
00131 params[t].stop = stop;
00132 params[t].alphas=alphas;
00133 params[t].vec=vec;
00134 params[t].dim=dim;
00135 params[t].bias=b;
00136 params[t].progress = false;
00137 dense_dot_range_helper((void*) ¶ms[t]);
00138
00139 for (t=0; t<num_threads-1; t++)
00140 pthread_join(threads[t], NULL);
00141
00142 SG_FREE(params);
00143 SG_FREE(threads);
00144 }
00145 #endif
00146
00147 #ifndef WIN32
00148 if ( CSignal::cancel_computations() )
00149 SG_INFO( "prematurely stopped. \n");
00150 #endif
00151 }
00152
00153 void CDotFeatures::dense_dot_range_subset(int32_t* sub_index, int32_t num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00154 {
00155 ASSERT(sub_index);
00156 ASSERT(output);
00157
00158 int32_t num_threads=parallel->get_num_threads();
00159 ASSERT(num_threads>0);
00160
00161 CSignal::clear_cancel();
00162
00163 #ifdef HAVE_PTHREAD
00164 if (num_threads < 2)
00165 {
00166 #endif
00167 DF_THREAD_PARAM params;
00168 params.df=this;
00169 params.sub_index=sub_index;
00170 params.output=output;
00171 params.start=0;
00172 params.stop=num;
00173 params.alphas=alphas;
00174 params.vec=vec;
00175 params.dim=dim;
00176 params.bias=b;
00177 params.progress=false;
00178 dense_dot_range_helper((void*) ¶ms);
00179 #ifdef HAVE_PTHREAD
00180 }
00181 else
00182 {
00183 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00184 DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads);
00185 int32_t step= num/num_threads;
00186
00187 int32_t t;
00188
00189 for (t=0; t<num_threads-1; t++)
00190 {
00191 params[t].df = this;
00192 params[t].sub_index=sub_index;
00193 params[t].output = output;
00194 params[t].start = t*step;
00195 params[t].stop = (t+1)*step;
00196 params[t].alphas=alphas;
00197 params[t].vec=vec;
00198 params[t].dim=dim;
00199 params[t].bias=b;
00200 params[t].progress = false;
00201 pthread_create(&threads[t], NULL,
00202 CDotFeatures::dense_dot_range_helper, (void*)¶ms[t]);
00203 }
00204
00205 params[t].df = this;
00206 params[t].sub_index=sub_index;
00207 params[t].output = output;
00208 params[t].start = t*step;
00209 params[t].stop = num;
00210 params[t].alphas=alphas;
00211 params[t].vec=vec;
00212 params[t].dim=dim;
00213 params[t].bias=b;
00214 params[t].progress = false;
00215 dense_dot_range_helper((void*) ¶ms[t]);
00216
00217 for (t=0; t<num_threads-1; t++)
00218 pthread_join(threads[t], NULL);
00219
00220 SG_FREE(params);
00221 SG_FREE(threads);
00222 }
00223 #endif
00224
00225 #ifndef WIN32
00226 if ( CSignal::cancel_computations() )
00227 SG_INFO( "prematurely stopped. \n");
00228 #endif
00229 }
00230
00231 void* CDotFeatures::dense_dot_range_helper(void* p)
00232 {
00233 DF_THREAD_PARAM* par=(DF_THREAD_PARAM*) p;
00234 CDotFeatures* df=par->df;
00235 int32_t* sub_index=par->sub_index;
00236 float64_t* output=par->output;
00237 int32_t start=par->start;
00238 int32_t stop=par->stop;
00239 float64_t* alphas=par->alphas;
00240 float64_t* vec=par->vec;
00241 int32_t dim=par->dim;
00242 float64_t bias=par->bias;
00243 bool progress=par->progress;
00244
00245 if (sub_index)
00246 {
00247 #ifdef WIN32
00248 for (int32_t i=start; i<stop i++)
00249 #else
00250 for (int32_t i=start; i<stop &&
00251 !CSignal::cancel_computations(); i++)
00252 #endif
00253 {
00254 if (alphas)
00255 output[i]=alphas[sub_index[i]]*df->dense_dot(sub_index[i], vec, dim)+bias;
00256 else
00257 output[i]=df->dense_dot(sub_index[i], vec, dim)+bias;
00258 if (progress)
00259 df->display_progress(start, stop, i);
00260 }
00261
00262 }
00263 else
00264 {
00265 #ifdef WIN32
00266 for (int32_t i=start; i<stop i++)
00267 #else
00268 for (int32_t i=start; i<stop &&
00269 !CSignal::cancel_computations(); i++)
00270 #endif
00271 {
00272 if (alphas)
00273 output[i]=alphas[i]*df->dense_dot(i, vec, dim)+bias;
00274 else
00275 output[i]=df->dense_dot(i, vec, dim)+bias;
00276 if (progress)
00277 df->display_progress(start, stop, i);
00278 }
00279 }
00280
00281 return NULL;
00282 }
00283
00284 SGMatrix<float64_t> CDotFeatures::get_computed_dot_feature_matrix()
00285 {
00286
00287 int64_t offs=0;
00288 int32_t num=get_num_vectors();
00289 int32_t dim=get_dim_feature_space();
00290 ASSERT(num>0);
00291 ASSERT(dim>0);
00292
00293 SGMatrix<float64_t> m(dim, num);
00294 m.zero();
00295
00296 for (int32_t i=0; i<num; i++)
00297 {
00298 add_to_dense_vec(1.0, i, &(m.matrix[offs]), dim);
00299 offs+=dim;
00300 }
00301
00302 return m;
00303 }
00304
00305 SGVector<float64_t> CDotFeatures::get_computed_dot_feature_vector(int32_t num)
00306 {
00307
00308 int32_t dim=get_dim_feature_space();
00309 ASSERT(num>=0 && num<=get_num_vectors());
00310 ASSERT(dim>0);
00311
00312 SGVector<float64_t> v(dim);
00313 v.zero();
00314 add_to_dense_vec(1.0, num, v.vector, dim);
00315 return v;
00316 }
00317
00318 void CDotFeatures::benchmark_add_to_dense_vector(int32_t repeats)
00319 {
00320 int32_t num=get_num_vectors();
00321 int32_t d=get_dim_feature_space();
00322 float64_t* w= SG_MALLOC(float64_t, d);
00323 SGVector<float64_t>::fill_vector(w, d, 0.0);
00324
00325 CTime t;
00326 float64_t start_cpu=t.get_runtime();
00327 float64_t start_wall=t.get_curtime();
00328 for (int32_t r=0; r<repeats; r++)
00329 {
00330 for (int32_t i=0; i<num; i++)
00331 add_to_dense_vec(1.172343*(r+1), i, w, d);
00332 }
00333
00334 SG_PRINT("Time to process %d x num=%d add_to_dense_vector ops: cputime %fs walltime %fs\n",
00335 repeats, num, (t.get_runtime()-start_cpu)/repeats,
00336 (t.get_curtime()-start_wall)/repeats);
00337
00338 SG_FREE(w);
00339 }
00340
00341 void CDotFeatures::benchmark_dense_dot_range(int32_t repeats)
00342 {
00343 int32_t num=get_num_vectors();
00344 int32_t d=get_dim_feature_space();
00345 float64_t* w= SG_MALLOC(float64_t, d);
00346 float64_t* out= SG_MALLOC(float64_t, num);
00347 float64_t* alphas= SG_MALLOC(float64_t, num);
00348 SGVector<float64_t>::range_fill_vector(w, d, 17.0);
00349 SGVector<float64_t>::range_fill_vector(alphas, num, 1.2345);
00350
00351
00352
00353 CTime t;
00354 float64_t start_cpu=t.get_runtime();
00355 float64_t start_wall=t.get_curtime();
00356
00357 for (int32_t r=0; r<repeats; r++)
00358 dense_dot_range(out, 0, num, alphas, w, d, 23);
00359
00360 #ifdef DEBUG_DOTFEATURES
00361 CMath::display_vector(out, 40, "dense_dot_range");
00362 float64_t* out2= SG_MALLOC(float64_t, num);
00363
00364 for (int32_t r=0; r<repeats; r++)
00365 {
00366 CMath::fill_vector(out2, num, 0.0);
00367 for (int32_t i=0; i<num; i++)
00368 out2[i]+=dense_dot(i, w, d)*alphas[i]+23;
00369 }
00370 CMath::display_vector(out2, 40, "dense_dot");
00371 for (int32_t i=0; i<num; i++)
00372 out2[i]-=out[i];
00373 CMath::display_vector(out2, 40, "diff");
00374 #endif
00375 SG_PRINT("Time to process %d x num=%d dense_dot_range ops: cputime %fs walltime %fs\n",
00376 repeats, num, (t.get_runtime()-start_cpu)/repeats,
00377 (t.get_curtime()-start_wall)/repeats);
00378
00379 SG_FREE(alphas);
00380 SG_FREE(out);
00381 SG_FREE(w);
00382 }
00383
00384 SGVector<float64_t> CDotFeatures::get_mean()
00385 {
00386 int32_t num=get_num_vectors();
00387 int32_t dim=get_dim_feature_space();
00388 ASSERT(num>0);
00389 ASSERT(dim>0);
00390
00391 SGVector<float64_t> mean(dim);
00392 memset(mean.vector, 0, sizeof(float64_t)*dim);
00393
00394 for (int i = 0; i < num; i++)
00395 add_to_dense_vec(1, i, mean.vector, dim);
00396 for (int j = 0; j < dim; j++)
00397 mean.vector[j] /= num;
00398
00399 return mean;
00400 }
00401
00402 SGVector<float64_t> CDotFeatures::get_mean(CDotFeatures* lhs, CDotFeatures* rhs)
00403 {
00404 ASSERT(lhs && rhs);
00405 ASSERT(lhs->get_dim_feature_space() == rhs->get_dim_feature_space());
00406
00407 int32_t num_lhs=lhs->get_num_vectors();
00408 int32_t num_rhs=rhs->get_num_vectors();
00409 int32_t dim=lhs->get_dim_feature_space();
00410 ASSERT(num_lhs>0);
00411 ASSERT(num_rhs>0);
00412 ASSERT(dim>0);
00413
00414 SGVector<float64_t> mean(dim);
00415 memset(mean.vector, 0, sizeof(float64_t)*dim);
00416
00417 for (int i = 0; i < num_lhs; i++)
00418 lhs->add_to_dense_vec(1, i, mean.vector, dim);
00419 for (int i = 0; i < num_rhs; i++)
00420 rhs->add_to_dense_vec(1, i, mean.vector, dim);
00421 for (int j = 0; j < dim; j++)
00422 mean.vector[j] /= (num_lhs+num_rhs);
00423
00424 return mean;
00425 }
00426
00427 SGMatrix<float64_t> CDotFeatures::get_cov()
00428 {
00429 int32_t num=get_num_vectors();
00430 int32_t dim=get_dim_feature_space();
00431 ASSERT(num>0);
00432 ASSERT(dim>0);
00433
00434 SGMatrix<float64_t> cov(dim, dim);
00435
00436 memset(cov.matrix, 0, sizeof(float64_t)*dim*dim);
00437
00438 SGVector<float64_t> mean = get_mean();
00439
00440 for (int i = 0; i < num; i++)
00441 {
00442 SGVector<float64_t> v = get_computed_dot_feature_vector(i);
00443 SGVector<float64_t>::add(v.vector, 1, v.vector, -1, mean.vector, v.vlen);
00444 for (int m = 0; m < v.vlen; m++)
00445 {
00446 for (int n = 0; n <= m ; n++)
00447 {
00448 (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n];
00449 }
00450 }
00451 }
00452 for (int m = 0; m < dim; m++)
00453 {
00454 for (int n = 0; n <= m ; n++)
00455 {
00456 (cov.matrix)[m*dim+n] /= num;
00457 }
00458 }
00459 for (int m = 0; m < dim-1; m++)
00460 {
00461 for (int n = m+1; n < dim; n++)
00462 {
00463 (cov.matrix)[m*dim+n] = (cov.matrix)[n*dim+m];
00464 }
00465 }
00466 return cov;
00467 }
00468
00469 SGMatrix<float64_t> CDotFeatures::compute_cov(CDotFeatures* lhs, CDotFeatures* rhs)
00470 {
00471 CDotFeatures* feats[2];
00472 feats[0]=lhs;
00473 feats[1]=rhs;
00474
00475 int32_t nums[2], dims[2], num=0;
00476
00477 for (int i = 0; i < 2; i++)
00478 {
00479 nums[i]=feats[i]->get_num_vectors();
00480 dims[i]=feats[i]->get_dim_feature_space();
00481 ASSERT(nums[i]>0);
00482 ASSERT(dims[i]>0);
00483 num += nums[i];
00484 }
00485
00486 ASSERT(dims[0]==dims[1]);
00487 int32_t dim = dims[0];
00488
00489 SGMatrix<float64_t> cov(dim, dim);
00490
00491 memset(cov.matrix, 0, sizeof(float64_t)*dim*dim);
00492
00493 SGVector<float64_t> mean=get_mean(lhs,rhs);
00494
00495 for (int i = 0; i < 2; i++)
00496 {
00497 for (int j = 0; j < nums[i]; j++)
00498 {
00499 SGVector<float64_t> v = feats[i]->get_computed_dot_feature_vector(j);
00500 SGVector<float64_t>::add(v.vector, 1, v.vector, -1, mean.vector, v.vlen);
00501 for (int m = 0; m < v.vlen; m++)
00502 {
00503 for (int n = 0; n <= m; n++)
00504 {
00505 (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n];
00506 }
00507 }
00508 }
00509 }
00510 for (int m = 0; m < dim; m++)
00511 {
00512 for (int n = 0; n <= m; n++)
00513 {
00514 (cov.matrix)[m*dim+n] /= num;
00515 }
00516 }
00517 for (int m = 0; m < dim-1; m++)
00518 {
00519 for (int n = m+1; n < dim; n++)
00520 {
00521 (cov.matrix[m*dim+n]) = (cov.matrix)[n*dim+m];
00522 }
00523 }
00524
00525 return cov;
00526 }
00527
00528 void CDotFeatures::display_progress(int32_t start, int32_t stop, int32_t v)
00529 {
00530 int32_t num_vectors=stop-start;
00531 int32_t i=v-start;
00532
00533 if ( (i% (num_vectors/100+1))== 0)
00534 SG_PROGRESS(v, 0.0, num_vectors-1);
00535 }
00536
00537 void CDotFeatures::init()
00538 {
00539 set_property(FP_DOT);
00540 m_parameters->add(&combined_weight, "combined_weight",
00541 "Feature weighting in combined dot features.");
00542 }