00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/features/HashedWDFeaturesTransposed.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/lib/Signal.h>
00014 #include <shogun/base/Parallel.h>
00015
00016 #ifdef HAVE_PTHREAD
00017 #include <pthread.h>
00018 #endif
00019
00020 using namespace shogun;
00021
00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00023 struct HASHEDWD_THREAD_PARAM
00024 {
00025 CHashedWDFeaturesTransposed* hf;
00026 int32_t* sub_index;
00027 float64_t* output;
00028 int32_t start;
00029 int32_t stop;
00030 float64_t* alphas;
00031 float64_t* vec;
00032 float64_t bias;
00033 bool progress;
00034 uint32_t* index;
00035 };
00036 #endif // DOXYGEN_SHOULD_SKIP_THIS
00037
00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()
00039 :CDotFeatures()
00040 {
00041 SG_UNSTABLE(
00042 "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()",
00043 "\n");
00044
00045 strings = NULL;
00046 transposed_strings = NULL;
00047
00048 degree = 0;
00049 start_degree = 0;
00050 from_degree = 0;
00051 string_length = 0;
00052 num_strings = 0;
00053 alphabet_size = 0;
00054 w_dim = 0;
00055 partial_w_dim = 0;
00056 wd_weights = NULL;
00057 mask = 0;
00058 m_hash_bits = 0;
00059
00060 normalization_const = 0.0;
00061 }
00062
00063 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
00064 int32_t start_order, int32_t order, int32_t from_order,
00065 int32_t hash_bits) : CDotFeatures()
00066 {
00067 ASSERT(start_order>=0);
00068 ASSERT(start_order<order);
00069 ASSERT(order<=from_order);
00070 ASSERT(hash_bits>0);
00071 ASSERT(str);
00072 ASSERT(str->have_same_length());
00073 SG_REF(str);
00074
00075 strings=str;
00076 int32_t transposed_num_feat=0;
00077 int32_t transposed_num_vec=0;
00078 transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);
00079
00080 string_length=str->get_max_vector_length();
00081 num_strings=str->get_num_vectors();
00082 ASSERT(transposed_num_feat==num_strings);
00083 ASSERT(transposed_num_vec==string_length);
00084
00085 CAlphabet* alpha=str->get_alphabet();
00086 alphabet_size=alpha->get_num_symbols();
00087 SG_UNREF(alpha);
00088
00089 degree=order;
00090 start_degree=start_order;
00091 if (start_degree!=0)
00092 SG_NOTIMPLEMENTED;
00093 from_degree=from_order;
00094 m_hash_bits=hash_bits;
00095 set_wd_weights();
00096 set_normalization_const();
00097 }
00098
00099 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig)
00100 : CDotFeatures(orig), strings(orig.strings), transposed_strings(orig.transposed_strings),
00101 degree(orig.degree), start_degree(orig.start_degree),
00102 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00103 normalization_const(orig.normalization_const)
00104 {
00105 SG_REF(strings);
00106 string_length=strings->get_max_vector_length();
00107 num_strings=strings->get_num_vectors();
00108 CAlphabet* alpha=strings->get_alphabet();
00109 alphabet_size=alpha->get_num_symbols();
00110 SG_UNREF(alpha);
00111
00112 set_wd_weights();
00113 }
00114
00115 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
00116 {
00117 for (int32_t i=0; i<string_length; i++)
00118 SG_FREE(transposed_strings[i].string);
00119 SG_FREE(transposed_strings);
00120
00121 SG_UNREF(strings);
00122 SG_FREE(wd_weights);
00123 }
00124
00125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00126 {
00127 ASSERT(df);
00128 ASSERT(df->get_feature_type() == get_feature_type());
00129 ASSERT(df->get_feature_class() == get_feature_class());
00130 CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df;
00131
00132 int32_t len1, len2;
00133 bool free_vec1, free_vec2;
00134
00135 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00136 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00137
00138 ASSERT(len1==len2);
00139
00140 float64_t sum=0.0;
00141
00142 for (int32_t i=0; i<len1; i++)
00143 {
00144 for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00145 {
00146 if (vec1[i+j]!=vec2[i+j])
00147 break;
00148 if (j>=start_degree)
00149 sum += wd_weights[j]*wd_weights[j];
00150 }
00151 }
00152 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00153 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00154 return sum/CMath::sq(normalization_const);
00155 }
00156
00157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00158 {
00159 if (vec2_len != w_dim)
00160 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00161
00162 float64_t sum=0;
00163 int32_t len;
00164 bool free_vec1;
00165 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00166 uint32_t* val=SG_MALLOC(uint32_t, len);
00167
00168 uint32_t offs=0;
00169
00170 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00171
00172 for (int32_t i=0; i < len; i++)
00173 {
00174 uint32_t o=offs;
00175 uint32_t carry = 0;
00176 uint32_t chunk = 0;
00177
00178 for (int32_t k=0; k<degree && i+k<len; k++)
00179 {
00180 const float64_t wd = wd_weights[k];
00181 chunk++;
00182 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00183 uint32_t h =
00184 CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00185 #ifdef DEBUG_HASHEDWD
00186 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00187 #endif
00188 sum+=vec2[o+(h & mask)]*wd;
00189 o+=partial_w_dim;
00190 }
00191 val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00192 offs+=partial_w_dim*degree;
00193 }
00194 SG_FREE(val);
00195 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00196
00197 return sum/normalization_const;
00198 }
00199
00200 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00201 {
00202 ASSERT(output);
00203
00204
00205 output-=start;
00206 ASSERT(start>=0);
00207 ASSERT(start<stop);
00208 ASSERT(stop<=get_num_vectors());
00209 uint32_t* index=SG_MALLOC(uint32_t, stop);
00210
00211 int32_t num_vectors=stop-start;
00212 ASSERT(num_vectors>0);
00213
00214 int32_t num_threads=parallel->get_num_threads();
00215 ASSERT(num_threads>0);
00216
00217 CSignal::clear_cancel();
00218
00219 if (dim != w_dim)
00220 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00221
00222 #ifdef HAVE_PTHREAD
00223 if (num_threads < 2)
00224 {
00225 #endif
00226 HASHEDWD_THREAD_PARAM params;
00227 params.hf=this;
00228 params.sub_index=NULL;
00229 params.output=output;
00230 params.start=start;
00231 params.stop=stop;
00232 params.alphas=alphas;
00233 params.vec=vec;
00234 params.bias=b;
00235 params.progress=false;
00236 params.index=index;
00237 dense_dot_range_helper((void*) ¶ms);
00238 #ifdef HAVE_PTHREAD
00239 }
00240 else
00241 {
00242 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00243 HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
00244 int32_t step= num_vectors/num_threads;
00245
00246 int32_t t;
00247
00248 for (t=0; t<num_threads-1; t++)
00249 {
00250 params[t].hf = this;
00251 params[t].sub_index=NULL;
00252 params[t].output = output;
00253 params[t].start = start+t*step;
00254 params[t].stop = start+(t+1)*step;
00255 params[t].alphas=alphas;
00256 params[t].vec=vec;
00257 params[t].bias=b;
00258 params[t].progress = false;
00259 params[t].index=index;
00260 pthread_create(&threads[t], NULL,
00261 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]);
00262 }
00263
00264 params[t].hf = this;
00265 params[t].sub_index=NULL;
00266 params[t].output = output;
00267 params[t].start = start+t*step;
00268 params[t].stop = stop;
00269 params[t].alphas=alphas;
00270 params[t].vec=vec;
00271 params[t].bias=b;
00272 params[t].progress = false;
00273 params[t].index=index;
00274 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]);
00275
00276 for (t=0; t<num_threads-1; t++)
00277 pthread_join(threads[t], NULL);
00278
00279 SG_FREE(params);
00280 SG_FREE(threads);
00281 }
00282 #endif
00283 SG_FREE(index);
00284
00285 #ifndef WIN32
00286 if ( CSignal::cancel_computations() )
00287 SG_INFO( "prematurely stopped. \n");
00288 #endif
00289 }
00290
00291 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00292 {
00293 ASSERT(sub_index);
00294 ASSERT(output);
00295
00296 uint32_t* index=SG_MALLOC(uint32_t, num);
00297
00298 int32_t num_threads=parallel->get_num_threads();
00299 ASSERT(num_threads>0);
00300
00301 CSignal::clear_cancel();
00302
00303 if (dim != w_dim)
00304 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00305
00306 #ifdef HAVE_PTHREAD
00307 if (num_threads < 2)
00308 {
00309 #endif
00310 HASHEDWD_THREAD_PARAM params;
00311 params.hf=this;
00312 params.sub_index=sub_index;
00313 params.output=output;
00314 params.start=0;
00315 params.stop=num;
00316 params.alphas=alphas;
00317 params.vec=vec;
00318 params.bias=b;
00319 params.progress=false;
00320 params.index=index;
00321 dense_dot_range_helper((void*) ¶ms);
00322 #ifdef HAVE_PTHREAD
00323 }
00324 else
00325 {
00326 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00327 HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
00328 int32_t step= num/num_threads;
00329
00330 int32_t t;
00331
00332 for (t=0; t<num_threads-1; t++)
00333 {
00334 params[t].hf = this;
00335 params[t].sub_index=sub_index;
00336 params[t].output = output;
00337 params[t].start = t*step;
00338 params[t].stop = (t+1)*step;
00339 params[t].alphas=alphas;
00340 params[t].vec=vec;
00341 params[t].bias=b;
00342 params[t].progress = false;
00343 params[t].index=index;
00344 pthread_create(&threads[t], NULL,
00345 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]);
00346 }
00347
00348 params[t].hf = this;
00349 params[t].sub_index=sub_index;
00350 params[t].output = output;
00351 params[t].start = t*step;
00352 params[t].stop = num;
00353 params[t].alphas=alphas;
00354 params[t].vec=vec;
00355 params[t].bias=b;
00356 params[t].progress = false;
00357 params[t].index=index;
00358 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]);
00359
00360 for (t=0; t<num_threads-1; t++)
00361 pthread_join(threads[t], NULL);
00362
00363 SG_FREE(params);
00364 SG_FREE(threads);
00365 SG_FREE(index);
00366 }
00367 #endif
00368
00369 #ifndef WIN32
00370 if ( CSignal::cancel_computations() )
00371 SG_INFO( "prematurely stopped. \n");
00372 #endif
00373 }
00374
00375 void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
00376 {
00377 HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
00378 CHashedWDFeaturesTransposed* hf=par->hf;
00379 int32_t* sub_index=par->sub_index;
00380 float64_t* output=par->output;
00381 int32_t start=par->start;
00382 int32_t stop=par->stop;
00383 float64_t* alphas=par->alphas;
00384 float64_t* vec=par->vec;
00385 float64_t bias=par->bias;
00386 bool progress=par->progress;
00387 uint32_t* index=par->index;
00388 int32_t string_length=hf->string_length;
00389 int32_t degree=hf->degree;
00390 float64_t* wd_weights=hf->wd_weights;
00391 SGString<uint8_t>* transposed_strings=hf->transposed_strings;
00392 uint32_t mask=hf->mask;
00393 int32_t partial_w_dim=hf->partial_w_dim;
00394 float64_t normalization_const=hf->normalization_const;
00395
00396 if (sub_index)
00397 {
00398 for (int32_t j=start; j<stop; j++)
00399 output[j]=0.0;
00400
00401 uint32_t offs=0;
00402 for (int32_t i=0; i<string_length; i++)
00403 {
00404 uint32_t o=offs;
00405 for (int32_t k=0; k<degree && i+k<string_length; k++)
00406 {
00407 const float64_t wd = wd_weights[k];
00408 uint8_t* dim=transposed_strings[i+k].string;
00409 uint32_t carry = 0;
00410 uint32_t chunk = 0;
00411 for (int32_t j=start; j<stop; j++)
00412 {
00413 uint8_t bval=dim[sub_index[j]];
00414 if (k==0)
00415 index[j] = 0xDEADBEAF;
00416
00417 CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);
00418
00419 chunk++;
00420 uint32_t h =
00421 CHash::FinalizeIncrementalMurmurHash3(
00422 index[j], carry, chunk);
00423
00424 output[j]+=vec[o + (h & mask)]*wd;
00425 index[j] = h;
00426 }
00427
00428 index[stop-1] =
00429 CHash::FinalizeIncrementalMurmurHash3(
00430 index[stop-1], carry, chunk);
00431
00432 o+=partial_w_dim;
00433 }
00434 offs+=partial_w_dim*degree;
00435
00436 if (progress)
00437 hf->io->progress(i, 0,string_length, i);
00438 }
00439
00440 for (int32_t j=start; j<stop; j++)
00441 {
00442 if (alphas)
00443 output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
00444 else
00445 output[j]=output[j]/normalization_const+bias;
00446 }
00447 }
00448 else
00449 {
00450 SGVector<float64_t>::fill_vector(&output[start], stop-start, 0.0);
00451
00452 uint32_t offs=0;
00453 for (int32_t i=0; i<string_length; i++)
00454 {
00455 uint32_t o=offs;
00456 for (int32_t k=0; k<degree && i+k<string_length; k++)
00457 {
00458 const float64_t wd = wd_weights[k];
00459 uint8_t* dim=transposed_strings[i+k].string;
00460 uint32_t carry = 0;
00461 uint32_t chunk = 0;
00462
00463 for (int32_t j=start; j<stop; j++)
00464 {
00465 uint8_t bval=dim[sub_index[j]];
00466 if (k==0)
00467 index[j] = 0xDEADBEAF;
00468
00469 CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);
00470
00471 chunk++;
00472 uint32_t h =
00473 CHash::FinalizeIncrementalMurmurHash3(
00474 index[j], carry, chunk);
00475
00476 index[j] = h;
00477 output[j]+=vec[o + (h & mask)]*wd;
00478 }
00479
00480 index[stop-1] = CHash::FinalizeIncrementalMurmurHash3(
00481 index[stop-1], carry, chunk);
00482
00483 o+=partial_w_dim;
00484 }
00485 offs+=partial_w_dim*degree;
00486
00487 if (progress)
00488 hf->io->progress(i, 0,string_length, i);
00489 }
00490
00491 for (int32_t j=start; j<stop; j++)
00492 {
00493 if (alphas)
00494 output[j]=output[j]*alphas[j]/normalization_const+bias;
00495 else
00496 output[j]=output[j]/normalization_const+bias;
00497 }
00498 }
00499
00500 return NULL;
00501 }
00502
00503 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00504 {
00505 if (vec2_len != w_dim)
00506 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00507
00508 int32_t len;
00509 bool free_vec1;
00510 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00511 uint32_t* val=SG_MALLOC(uint32_t, len);
00512
00513 uint32_t offs=0;
00514 float64_t factor=alpha/normalization_const;
00515 if (abs_val)
00516 factor=CMath::abs(factor);
00517
00518 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00519
00520 for (int32_t i=0; i<len; i++)
00521 {
00522 uint32_t o=offs;
00523 uint32_t carry = 0;
00524 uint32_t chunk = 0;
00525
00526 for (int32_t k=0; k<degree && i+k<len; k++)
00527 {
00528 float64_t wd = wd_weights[k]*factor;
00529 chunk++;
00530 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00531 uint32_t h =
00532 CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00533 #ifdef DEBUG_HASHEDWD
00534 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
00535 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o);
00536 #endif
00537 vec2[o+(h & mask)]+=wd;
00538 val[i] = h;
00539 o+=partial_w_dim;
00540 }
00541
00542 val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00543 offs+=partial_w_dim*degree;
00544 }
00545
00546 SG_FREE(val);
00547 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00548 }
00549
00550 void CHashedWDFeaturesTransposed::set_wd_weights()
00551 {
00552 ASSERT(degree>0);
00553
00554 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00555 partial_w_dim=1<<m_hash_bits;
00556 w_dim=partial_w_dim*string_length*(degree-start_degree);
00557
00558 wd_weights=SG_MALLOC(float64_t, degree);
00559
00560 for (int32_t i=0; i<degree; i++)
00561 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00562
00563 SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
00564 "dim=%d partial_dim=%d num=%d, len=%d\n",
00565 degree, from_degree, alphabet_size,
00566 w_dim, partial_w_dim, num_strings, string_length);
00567 }
00568
00569
00570 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
00571 {
00572 if (n==0)
00573 {
00574 normalization_const=0;
00575 for (int32_t i=0; i<degree; i++)
00576 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00577
00578 normalization_const=CMath::sqrt(normalization_const);
00579 }
00580 else
00581 normalization_const=n;
00582
00583 SG_DEBUG("normalization_const:%f\n", normalization_const);
00584 }
00585
00586 CFeatures* CHashedWDFeaturesTransposed::duplicate() const
00587 {
00588 return new CHashedWDFeaturesTransposed(*this);
00589 }
00590
00591 void* CHashedWDFeaturesTransposed::get_feature_iterator(int32_t vector_index)
00592 {
00593 SG_NOTIMPLEMENTED;
00594 return NULL;
00595 }
00596
00597 bool CHashedWDFeaturesTransposed::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00598 {
00599 SG_NOTIMPLEMENTED;
00600 return NULL;
00601 }
00602
00603 void CHashedWDFeaturesTransposed::free_feature_iterator(void* iterator)
00604 {
00605 SG_NOTIMPLEMENTED;
00606 }