00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "features/HashedWDFeaturesTransposed.h"
00012 #include "lib/io.h"
00013 #include "lib/Signal.h"
00014 #include "base/Parallel.h"
00015
00016 #ifndef WIN32
00017 #include <pthread.h>
00018 #endif
00019
00020 using namespace shogun;
00021
00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00023 struct HASHEDWD_THREAD_PARAM
00024 {
00025 CHashedWDFeaturesTransposed* hf;
00026 int32_t* sub_index;
00027 float64_t* output;
00028 int32_t start;
00029 int32_t stop;
00030 float64_t* alphas;
00031 float64_t* vec;
00032 float64_t bias;
00033 bool progress;
00034 uint32_t* index;
00035 };
00036 #endif // DOXYGEN_SHOULD_SKIP_THIS
00037
00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(void)
00039 :CDotFeatures()
00040 {
00041 SG_UNSTABLE(
00042 "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(void)",
00043 "\n");
00044
00045 strings = NULL;
00046 transposed_strings = NULL;
00047
00048 degree = 0;
00049 start_degree = 0;
00050 from_degree = 0;
00051 string_length = 0;
00052 num_strings = 0;
00053 alphabet_size = 0;
00054 w_dim = 0;
00055 partial_w_dim = 0;
00056 wd_weights = NULL;
00057 mask = 0;
00058 m_hash_bits = 0;
00059
00060 normalization_const = 0.0;
00061 }
00062
00063 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
00064 int32_t start_order, int32_t order, int32_t from_order,
00065 int32_t hash_bits) : CDotFeatures()
00066 {
00067 ASSERT(start_order>=0);
00068 ASSERT(start_order<order);
00069 ASSERT(order<=from_order);
00070 ASSERT(hash_bits>0);
00071 ASSERT(str);
00072 ASSERT(str->have_same_length());
00073 SG_REF(str);
00074
00075 strings=str;
00076 int32_t transposed_num_feat=0;
00077 int32_t transposed_num_vec=0;
00078 transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);
00079
00080 string_length=str->get_max_vector_length();
00081 num_strings=str->get_num_vectors();
00082 ASSERT(transposed_num_feat==num_strings);
00083 ASSERT(transposed_num_vec==string_length);
00084
00085 CAlphabet* alpha=str->get_alphabet();
00086 alphabet_size=alpha->get_num_symbols();
00087 SG_UNREF(alpha);
00088
00089 degree=order;
00090 start_degree=start_order;
00091 if (start_degree!=0)
00092 SG_NOTIMPLEMENTED;
00093 from_degree=from_order;
00094 m_hash_bits=hash_bits;
00095 set_wd_weights();
00096 set_normalization_const();
00097 }
00098
00099 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig)
00100 : CDotFeatures(orig), strings(orig.strings), transposed_strings(transposed_strings),
00101 degree(orig.degree), start_degree(orig.start_degree),
00102 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00103 normalization_const(orig.normalization_const)
00104 {
00105 SG_REF(strings);
00106 string_length=strings->get_max_vector_length();
00107 num_strings=strings->get_num_vectors();
00108 CAlphabet* alpha=strings->get_alphabet();
00109 alphabet_size=alpha->get_num_symbols();
00110 SG_UNREF(alpha);
00111
00112 set_wd_weights();
00113 }
00114
00115 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
00116 {
00117 for (int32_t i=0; i<string_length; i++)
00118 delete[] transposed_strings[i].string;
00119 delete[] transposed_strings;
00120
00121 SG_UNREF(strings);
00122 delete[] wd_weights;
00123 }
00124
00125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00126 {
00127 ASSERT(df);
00128 ASSERT(df->get_feature_type() == get_feature_type());
00129 ASSERT(df->get_feature_class() == get_feature_class());
00130 CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df;
00131
00132 int32_t len1, len2;
00133 bool free_vec1, free_vec2;
00134
00135 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00136 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00137
00138 ASSERT(len1==len2);
00139
00140 float64_t sum=0.0;
00141
00142 for (int32_t i=0; i<len1; i++)
00143 {
00144 for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00145 {
00146 if (vec1[i+j]!=vec2[i+j])
00147 break;
00148 if (j>=start_degree)
00149 sum += wd_weights[j]*wd_weights[j];
00150 }
00151 }
00152 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00153 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00154 return sum/CMath::sq(normalization_const);
00155 }
00156
00157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00158 {
00159 if (vec2_len != w_dim)
00160 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00161
00162 float64_t sum=0;
00163 int32_t len;
00164 bool free_vec1;
00165 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00166 uint32_t* val=new uint32_t[len];
00167
00168 uint32_t offs=0;
00169
00170 CMath::fill_vector(val, len, 0xDEADBEAF);
00171
00172 for (int32_t i=0; i < len; i++)
00173 {
00174 uint32_t o=offs;
00175 for (int32_t k=0; k<degree && i+k<len; k++)
00176 {
00177 const float64_t wd = wd_weights[k];
00178 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00179 val[i]=h;
00180 #ifdef DEBUG_HASHEDWD
00181 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00182 #endif
00183 sum+=vec2[o+(h & mask)]*wd;
00184 o+=partial_w_dim;
00185 }
00186 offs+=partial_w_dim*degree;
00187 }
00188 delete[] val;
00189 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00190
00191 return sum/normalization_const;
00192 }
00193
00194 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00195 {
00196 ASSERT(output);
00197
00198
00199 output-=start;
00200 ASSERT(start>=0);
00201 ASSERT(start<stop);
00202 ASSERT(stop<=get_num_vectors());
00203 uint32_t* index=new uint32_t[stop];
00204
00205 int32_t num_vectors=stop-start;
00206 ASSERT(num_vectors>0);
00207
00208 int32_t num_threads=parallel->get_num_threads();
00209 ASSERT(num_threads>0);
00210
00211 CSignal::clear_cancel();
00212
00213 if (dim != w_dim)
00214 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00215
00216 #ifndef WIN32
00217 if (num_threads < 2)
00218 {
00219 #endif
00220 HASHEDWD_THREAD_PARAM params;
00221 params.hf=this;
00222 params.sub_index=NULL;
00223 params.output=output;
00224 params.start=start;
00225 params.stop=stop;
00226 params.alphas=alphas;
00227 params.vec=vec;
00228 params.bias=b;
00229 params.progress=false;
00230 params.index=index;
00231 dense_dot_range_helper((void*) ¶ms);
00232 #ifndef WIN32
00233 }
00234 else
00235 {
00236 pthread_t* threads = new pthread_t[num_threads-1];
00237 HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads];
00238 int32_t step= num_vectors/num_threads;
00239
00240 int32_t t;
00241
00242 for (t=0; t<num_threads-1; t++)
00243 {
00244 params[t].hf = this;
00245 params[t].sub_index=NULL;
00246 params[t].output = output;
00247 params[t].start = start+t*step;
00248 params[t].stop = start+(t+1)*step;
00249 params[t].alphas=alphas;
00250 params[t].vec=vec;
00251 params[t].bias=b;
00252 params[t].progress = false;
00253 params[t].index=index;
00254 pthread_create(&threads[t], NULL,
00255 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]);
00256 }
00257
00258 params[t].hf = this;
00259 params[t].sub_index=NULL;
00260 params[t].output = output;
00261 params[t].start = start+t*step;
00262 params[t].stop = stop;
00263 params[t].alphas=alphas;
00264 params[t].vec=vec;
00265 params[t].bias=b;
00266 params[t].progress = false;
00267 params[t].index=index;
00268 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]);
00269
00270 for (t=0; t<num_threads-1; t++)
00271 pthread_join(threads[t], NULL);
00272
00273 delete[] params;
00274 delete[] threads;
00275 delete[] index;
00276 }
00277 #endif
00278
00279 #ifndef WIN32
00280 if ( CSignal::cancel_computations() )
00281 SG_INFO( "prematurely stopped. \n");
00282 #endif
00283 }
00284
00285 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00286 {
00287 ASSERT(sub_index);
00288 ASSERT(output);
00289
00290 uint32_t* index=new uint32_t[num];
00291
00292 int32_t num_threads=parallel->get_num_threads();
00293 ASSERT(num_threads>0);
00294
00295 CSignal::clear_cancel();
00296
00297 if (dim != w_dim)
00298 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00299
00300 #ifndef WIN32
00301 if (num_threads < 2)
00302 {
00303 #endif
00304 HASHEDWD_THREAD_PARAM params;
00305 params.hf=this;
00306 params.sub_index=sub_index;
00307 params.output=output;
00308 params.start=0;
00309 params.stop=num;
00310 params.alphas=alphas;
00311 params.vec=vec;
00312 params.bias=b;
00313 params.progress=false;
00314 params.index=index;
00315 dense_dot_range_helper((void*) ¶ms);
00316 #ifndef WIN32
00317 }
00318 else
00319 {
00320 pthread_t* threads = new pthread_t[num_threads-1];
00321 HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads];
00322 int32_t step= num/num_threads;
00323
00324 int32_t t;
00325
00326 for (t=0; t<num_threads-1; t++)
00327 {
00328 params[t].hf = this;
00329 params[t].sub_index=sub_index;
00330 params[t].output = output;
00331 params[t].start = t*step;
00332 params[t].stop = (t+1)*step;
00333 params[t].alphas=alphas;
00334 params[t].vec=vec;
00335 params[t].bias=b;
00336 params[t].progress = false;
00337 params[t].index=index;
00338 pthread_create(&threads[t], NULL,
00339 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]);
00340 }
00341
00342 params[t].hf = this;
00343 params[t].sub_index=sub_index;
00344 params[t].output = output;
00345 params[t].start = t*step;
00346 params[t].stop = num;
00347 params[t].alphas=alphas;
00348 params[t].vec=vec;
00349 params[t].bias=b;
00350 params[t].progress = false;
00351 params[t].index=index;
00352 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]);
00353
00354 for (t=0; t<num_threads-1; t++)
00355 pthread_join(threads[t], NULL);
00356
00357 delete[] params;
00358 delete[] threads;
00359 delete[] index;
00360 }
00361 #endif
00362
00363 #ifndef WIN32
00364 if ( CSignal::cancel_computations() )
00365 SG_INFO( "prematurely stopped. \n");
00366 #endif
00367 }
00368
00369 void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
00370 {
00371 HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
00372 CHashedWDFeaturesTransposed* hf=par->hf;
00373 int32_t* sub_index=par->sub_index;
00374 float64_t* output=par->output;
00375 int32_t start=par->start;
00376 int32_t stop=par->stop;
00377 float64_t* alphas=par->alphas;
00378 float64_t* vec=par->vec;
00379 float64_t bias=par->bias;
00380 bool progress=par->progress;
00381 uint32_t* index=par->index;
00382 int32_t string_length=hf->string_length;
00383 int32_t degree=hf->degree;
00384 float64_t* wd_weights=hf->wd_weights;
00385 TString<uint8_t>* transposed_strings=hf->transposed_strings;
00386 uint32_t mask=hf->mask;
00387 int32_t partial_w_dim=hf->partial_w_dim;
00388 float64_t normalization_const=hf->normalization_const;
00389
00390 if (sub_index)
00391 {
00392 for (int32_t j=start; j<stop; j++)
00393 output[j]=0.0;
00394
00395 uint32_t offs=0;
00396 for (int32_t i=0; i<string_length; i++)
00397 {
00398 uint32_t o=offs;
00399 for (int32_t k=0; k<degree && i+k<string_length; k++)
00400 {
00401 const float64_t wd = wd_weights[k];
00402 uint8_t* dim=transposed_strings[i+k].string;
00403 uint32_t h;
00404
00405 for (int32_t j=start; j<stop; j++)
00406 {
00407 uint8_t bval=dim[sub_index[j]];
00408 if (k==0)
00409 h=CHash::IncrementalMurmurHash2(bval, 0xDEADBEAF);
00410 else
00411 h=CHash::IncrementalMurmurHash2(bval, index[j]);
00412 index[j]=h;
00413 output[j]+=vec[o + (h & mask)]*wd;
00414 }
00415 o+=partial_w_dim;
00416 }
00417 offs+=partial_w_dim*degree;
00418
00419 if (progress)
00420 hf->io->progress(i, 0,string_length, i);
00421 }
00422
00423 for (int32_t j=start; j<stop; j++)
00424 {
00425 if (alphas)
00426 output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
00427 else
00428 output[j]=output[j]/normalization_const+bias;
00429 }
00430 }
00431 else
00432 {
00433 CMath::fill_vector(&output[start], stop-start, 0.0);
00434
00435 uint32_t offs=0;
00436 for (int32_t i=0; i<string_length; i++)
00437 {
00438 uint32_t o=offs;
00439 for (int32_t k=0; k<degree && i+k<string_length; k++)
00440 {
00441 const float64_t wd = wd_weights[k];
00442 uint8_t* dim=transposed_strings[i+k].string;
00443 uint32_t h;
00444
00445 for (int32_t j=start; j<stop; j++)
00446 {
00447 if (k==0)
00448 h=CHash::IncrementalMurmurHash2(dim[j], 0xDEADBEAF);
00449 else
00450 h=CHash::IncrementalMurmurHash2(dim[j], index[j]);
00451 index[j]=h;
00452 output[j]+=vec[o + (h & mask)]*wd;
00453 }
00454 o+=partial_w_dim;
00455 }
00456 offs+=partial_w_dim*degree;
00457
00458 if (progress)
00459 hf->io->progress(i, 0,string_length, i);
00460 }
00461
00462 for (int32_t j=start; j<stop; j++)
00463 {
00464 if (alphas)
00465 output[j]=output[j]*alphas[j]/normalization_const+bias;
00466 else
00467 output[j]=output[j]/normalization_const+bias;
00468 }
00469 }
00470
00471 return NULL;
00472 }
00473
00474 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00475 {
00476 if (vec2_len != w_dim)
00477 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00478
00479 int32_t len;
00480 bool free_vec1;
00481 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00482 uint32_t* val=new uint32_t[len];
00483
00484 uint32_t offs=0;
00485 float64_t factor=alpha/normalization_const;
00486 if (abs_val)
00487 factor=CMath::abs(factor);
00488
00489 CMath::fill_vector(val, len, 0xDEADBEAF);
00490
00491 for (int32_t i=0; i<len; i++)
00492 {
00493 uint32_t o=offs;
00494 for (int32_t k=0; k<degree && i+k<len; k++)
00495 {
00496 float64_t wd = wd_weights[k]*factor;
00497
00498 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00499 val[i]=h;
00500
00501 #ifdef DEBUG_HASHEDWD
00502 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
00503 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00504 #endif
00505 vec2[o+(h & mask)]+=wd;
00506 o+=partial_w_dim;
00507 }
00508 offs+=partial_w_dim*degree;
00509 }
00510
00511 delete[] val;
00512 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00513 }
00514
00515 void CHashedWDFeaturesTransposed::set_wd_weights()
00516 {
00517 ASSERT(degree>0);
00518
00519 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00520 partial_w_dim=1<<m_hash_bits;
00521 w_dim=partial_w_dim*string_length*(degree-start_degree);
00522
00523 wd_weights=new float64_t[degree];
00524
00525 for (int32_t i=0; i<degree; i++)
00526 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00527
00528 SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
00529 "dim=%d partial_dim=%d num=%d, len=%d\n",
00530 degree, from_degree, alphabet_size,
00531 w_dim, partial_w_dim, num_strings, string_length);
00532 }
00533
00534
00535 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
00536 {
00537 if (n==0)
00538 {
00539 normalization_const=0;
00540 for (int32_t i=0; i<degree; i++)
00541 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00542
00543 normalization_const=CMath::sqrt(normalization_const);
00544 }
00545 else
00546 normalization_const=n;
00547
00548 SG_DEBUG("normalization_const:%f\n", normalization_const);
00549 }
00550
00551 CFeatures* CHashedWDFeaturesTransposed::duplicate() const
00552 {
00553 return new CHashedWDFeaturesTransposed(*this);
00554 }