HashedWDFeaturesTransposed.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 
00011 #include <shogun/features/HashedWDFeaturesTransposed.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/lib/Signal.h>
00014 #include <shogun/base/Parallel.h>
00015 
00016 #ifdef HAVE_PTHREAD
00017 #include <pthread.h>
00018 #endif
00019 
00020 using namespace shogun;
00021 
00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00023 struct HASHEDWD_THREAD_PARAM
00024 {
00025     CHashedWDFeaturesTransposed* hf;
00026     int32_t* sub_index;
00027     float64_t* output;
00028     int32_t start;
00029     int32_t stop;
00030     float64_t* alphas;
00031     float64_t* vec;
00032     float64_t bias;
00033     bool progress;
00034     uint32_t* index;
00035 };
00036 #endif // DOXYGEN_SHOULD_SKIP_THIS
00037 
00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(void)
00039     :CDotFeatures()
00040 {
00041     SG_UNSTABLE(
00042         "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(void)",
00043         "\n");
00044 
00045     strings = NULL;
00046     transposed_strings = NULL;
00047 
00048     degree = 0;
00049     start_degree = 0;
00050     from_degree = 0;
00051     string_length = 0;
00052     num_strings = 0;
00053     alphabet_size = 0;
00054     w_dim = 0;
00055     partial_w_dim = 0;
00056     wd_weights = NULL;
00057     mask = 0;
00058     m_hash_bits = 0;
00059 
00060     normalization_const = 0.0;
00061 }
00062 
00063 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
00064         int32_t start_order, int32_t order, int32_t from_order,
00065         int32_t hash_bits) : CDotFeatures()
00066 {
00067     ASSERT(start_order>=0);
00068     ASSERT(start_order<order);
00069     ASSERT(order<=from_order);
00070     ASSERT(hash_bits>0);
00071     ASSERT(str);
00072     ASSERT(str->have_same_length());
00073     SG_REF(str);
00074 
00075     strings=str;
00076     int32_t transposed_num_feat=0;
00077     int32_t transposed_num_vec=0;
00078     transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);
00079 
00080     string_length=str->get_max_vector_length();
00081     num_strings=str->get_num_vectors();
00082     ASSERT(transposed_num_feat==num_strings);
00083     ASSERT(transposed_num_vec==string_length);
00084 
00085     CAlphabet* alpha=str->get_alphabet();
00086     alphabet_size=alpha->get_num_symbols();
00087     SG_UNREF(alpha);
00088 
00089     degree=order;
00090     start_degree=start_order;
00091     if (start_degree!=0)
00092         SG_NOTIMPLEMENTED;
00093     from_degree=from_order;
00094     m_hash_bits=hash_bits;
00095     set_wd_weights();
00096     set_normalization_const();
00097 }
00098 
00099 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig)
00100     : CDotFeatures(orig), strings(orig.strings), transposed_strings(transposed_strings),
00101     degree(orig.degree), start_degree(orig.start_degree), 
00102     from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00103     normalization_const(orig.normalization_const)
00104 {
00105     SG_REF(strings);
00106     string_length=strings->get_max_vector_length();
00107     num_strings=strings->get_num_vectors();
00108     CAlphabet* alpha=strings->get_alphabet();
00109     alphabet_size=alpha->get_num_symbols();
00110     SG_UNREF(alpha);
00111 
00112     set_wd_weights();
00113 }
00114 
00115 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
00116 {
00117     for (int32_t i=0; i<string_length; i++)
00118         SG_FREE(transposed_strings[i].string);
00119     SG_FREE(transposed_strings);
00120 
00121     SG_UNREF(strings);
00122     SG_FREE(wd_weights);
00123 }
00124 
00125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00126 {
00127     ASSERT(df);
00128     ASSERT(df->get_feature_type() == get_feature_type());
00129     ASSERT(df->get_feature_class() == get_feature_class());
00130     CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df;
00131 
00132     int32_t len1, len2;
00133     bool free_vec1, free_vec2;
00134 
00135     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00136     uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00137 
00138     ASSERT(len1==len2);
00139 
00140     float64_t sum=0.0;
00141 
00142     for (int32_t i=0; i<len1; i++)
00143     {
00144         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00145         {
00146             if (vec1[i+j]!=vec2[i+j])
00147                 break;
00148             if (j>=start_degree)
00149                 sum += wd_weights[j]*wd_weights[j];
00150         }
00151     }
00152     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00153     wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00154     return sum/CMath::sq(normalization_const);
00155 }
00156 
00157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00158 {
00159     if (vec2_len != w_dim)
00160         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00161 
00162     float64_t sum=0;
00163     int32_t len;
00164     bool free_vec1;
00165     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00166     uint32_t* val=SG_MALLOC(uint32_t, len);
00167 
00168     uint32_t offs=0;
00169 
00170     CMath::fill_vector(val, len, 0xDEADBEAF);
00171 
00172     for (int32_t i=0; i < len; i++) 
00173     {
00174         uint32_t o=offs;
00175         for (int32_t k=0; k<degree && i+k<len; k++)
00176         {
00177             const float64_t wd = wd_weights[k];
00178             const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00179             val[i]=h;
00180 #ifdef DEBUG_HASHEDWD
00181             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00182 #endif
00183             sum+=vec2[o+(h & mask)]*wd;
00184             o+=partial_w_dim;
00185         }
00186         offs+=partial_w_dim*degree;
00187     }
00188     SG_FREE(val);
00189     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00190 
00191     return sum/normalization_const;
00192 }
00193 
00194 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00195 {
00196     ASSERT(output);
00197     // write access is internally between output[start..stop] so the following
00198     // line is necessary to write to output[0...(stop-start-1)]
00199     output-=start; 
00200     ASSERT(start>=0);
00201     ASSERT(start<stop);
00202     ASSERT(stop<=get_num_vectors());
00203     uint32_t* index=SG_MALLOC(uint32_t, stop);
00204 
00205     int32_t num_vectors=stop-start;
00206     ASSERT(num_vectors>0);
00207 
00208     int32_t num_threads=parallel->get_num_threads();
00209     ASSERT(num_threads>0);
00210 
00211     CSignal::clear_cancel();
00212 
00213     if (dim != w_dim)
00214         SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00215 
00216 #ifdef HAVE_PTHREAD
00217     if (num_threads < 2)
00218     {
00219 #endif
00220         HASHEDWD_THREAD_PARAM params;
00221         params.hf=this;
00222         params.sub_index=NULL;
00223         params.output=output;
00224         params.start=start;
00225         params.stop=stop;
00226         params.alphas=alphas;
00227         params.vec=vec;
00228         params.bias=b;
00229         params.progress=false; //true;
00230         params.index=index;
00231         dense_dot_range_helper((void*) &params);
00232 #ifdef HAVE_PTHREAD
00233     }
00234     else
00235     {
00236         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00237         HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
00238         int32_t step= num_vectors/num_threads;
00239 
00240         int32_t t;
00241 
00242         for (t=0; t<num_threads-1; t++)
00243         {
00244             params[t].hf = this;
00245             params[t].sub_index=NULL;
00246             params[t].output = output;
00247             params[t].start = start+t*step;
00248             params[t].stop = start+(t+1)*step;
00249             params[t].alphas=alphas;
00250             params[t].vec=vec;
00251             params[t].bias=b;
00252             params[t].progress = false;
00253             params[t].index=index;
00254             pthread_create(&threads[t], NULL,
00255                     CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
00256         }
00257 
00258         params[t].hf = this;
00259         params[t].sub_index=NULL;
00260         params[t].output = output;
00261         params[t].start = start+t*step;
00262         params[t].stop = stop;
00263         params[t].alphas=alphas;
00264         params[t].vec=vec;
00265         params[t].bias=b;
00266         params[t].progress = false; //true;
00267         params[t].index=index;
00268         CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);
00269 
00270         for (t=0; t<num_threads-1; t++)
00271             pthread_join(threads[t], NULL);
00272 
00273         SG_FREE(params);
00274         SG_FREE(threads);
00275     }
00276 #endif
00277     SG_FREE(index);
00278 
00279 #ifndef WIN32
00280         if ( CSignal::cancel_computations() )
00281             SG_INFO( "prematurely stopped.           \n");
00282 #endif
00283 }
00284 
00285 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00286 {
00287     ASSERT(sub_index);
00288     ASSERT(output);
00289 
00290     uint32_t* index=SG_MALLOC(uint32_t, num);
00291 
00292     int32_t num_threads=parallel->get_num_threads();
00293     ASSERT(num_threads>0);
00294 
00295     CSignal::clear_cancel();
00296 
00297     if (dim != w_dim)
00298         SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00299 
00300 #ifdef HAVE_PTHREAD
00301     if (num_threads < 2)
00302     {
00303 #endif
00304         HASHEDWD_THREAD_PARAM params;
00305         params.hf=this;
00306         params.sub_index=sub_index;
00307         params.output=output;
00308         params.start=0;
00309         params.stop=num;
00310         params.alphas=alphas;
00311         params.vec=vec;
00312         params.bias=b;
00313         params.progress=false; //true;
00314         params.index=index;
00315         dense_dot_range_helper((void*) &params);
00316 #ifdef HAVE_PTHREAD
00317     }
00318     else
00319     {
00320         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00321         HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
00322         int32_t step= num/num_threads;
00323 
00324         int32_t t;
00325 
00326         for (t=0; t<num_threads-1; t++)
00327         {
00328             params[t].hf = this;
00329             params[t].sub_index=sub_index;
00330             params[t].output = output;
00331             params[t].start = t*step;
00332             params[t].stop = (t+1)*step;
00333             params[t].alphas=alphas;
00334             params[t].vec=vec;
00335             params[t].bias=b;
00336             params[t].progress = false;
00337             params[t].index=index;
00338             pthread_create(&threads[t], NULL,
00339                     CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
00340         }
00341 
00342         params[t].hf = this;
00343         params[t].sub_index=sub_index;
00344         params[t].output = output;
00345         params[t].start = t*step;
00346         params[t].stop = num;
00347         params[t].alphas=alphas;
00348         params[t].vec=vec;
00349         params[t].bias=b;
00350         params[t].progress = false; //true;
00351         params[t].index=index;
00352         CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);
00353 
00354         for (t=0; t<num_threads-1; t++)
00355             pthread_join(threads[t], NULL);
00356 
00357         SG_FREE(params);
00358         SG_FREE(threads);
00359         SG_FREE(index);
00360     }
00361 #endif
00362 
00363 #ifndef WIN32
00364         if ( CSignal::cancel_computations() )
00365             SG_INFO( "prematurely stopped.           \n");
00366 #endif
00367 }
00368 
00369 void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
00370 {
00371     HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
00372     CHashedWDFeaturesTransposed* hf=par->hf;
00373     int32_t* sub_index=par->sub_index;
00374     float64_t* output=par->output;
00375     int32_t start=par->start;
00376     int32_t stop=par->stop;
00377     float64_t* alphas=par->alphas;
00378     float64_t* vec=par->vec;
00379     float64_t bias=par->bias;
00380     bool progress=par->progress;
00381     uint32_t* index=par->index;
00382     int32_t string_length=hf->string_length;
00383     int32_t degree=hf->degree;
00384     float64_t* wd_weights=hf->wd_weights;
00385     SGString<uint8_t>* transposed_strings=hf->transposed_strings;
00386     uint32_t mask=hf->mask;
00387     int32_t partial_w_dim=hf->partial_w_dim;
00388     float64_t normalization_const=hf->normalization_const;
00389 
00390     if (sub_index)
00391     {
00392         for (int32_t j=start; j<stop; j++)
00393             output[j]=0.0;
00394 
00395         uint32_t offs=0;
00396         for (int32_t i=0; i<string_length; i++)
00397         {
00398             uint32_t o=offs;
00399             for (int32_t k=0; k<degree && i+k<string_length; k++)
00400             {
00401                 const float64_t wd = wd_weights[k];
00402                 uint8_t* dim=transposed_strings[i+k].string;
00403                 uint32_t h;
00404 
00405                 for (int32_t j=start; j<stop; j++)
00406                 {
00407                     uint8_t bval=dim[sub_index[j]];
00408                     if (k==0)
00409                         h=CHash::IncrementalMurmurHash2(bval, 0xDEADBEAF);
00410                     else
00411                         h=CHash::IncrementalMurmurHash2(bval, index[j]);
00412                     index[j]=h;
00413                     output[j]+=vec[o + (h & mask)]*wd;
00414                 }
00415                 o+=partial_w_dim;
00416             }
00417             offs+=partial_w_dim*degree;
00418 
00419             if (progress)
00420                 hf->io->progress(i, 0,string_length, i);
00421         }
00422 
00423         for (int32_t j=start; j<stop; j++)
00424         {
00425             if (alphas)
00426                 output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
00427             else
00428                 output[j]=output[j]/normalization_const+bias;
00429         }
00430     }
00431     else
00432     {
00433         CMath::fill_vector(&output[start], stop-start, 0.0);
00434 
00435         uint32_t offs=0;
00436         for (int32_t i=0; i<string_length; i++)
00437         {
00438             uint32_t o=offs;
00439             for (int32_t k=0; k<degree && i+k<string_length; k++)
00440             {
00441                 const float64_t wd = wd_weights[k];
00442                 uint8_t* dim=transposed_strings[i+k].string;
00443                 uint32_t h;
00444 
00445                 for (int32_t j=start; j<stop; j++)
00446                 {
00447                     if (k==0)
00448                         h=CHash::IncrementalMurmurHash2(dim[j], 0xDEADBEAF);
00449                     else
00450                         h=CHash::IncrementalMurmurHash2(dim[j], index[j]);
00451                     index[j]=h;
00452                     output[j]+=vec[o + (h & mask)]*wd;
00453                 }
00454                 o+=partial_w_dim;
00455             }
00456             offs+=partial_w_dim*degree;
00457 
00458             if (progress)
00459                 hf->io->progress(i, 0,string_length, i);
00460         }
00461 
00462         for (int32_t j=start; j<stop; j++)
00463         {
00464             if (alphas)
00465                 output[j]=output[j]*alphas[j]/normalization_const+bias;
00466             else
00467                 output[j]=output[j]/normalization_const+bias;
00468         }
00469     }
00470 
00471     return NULL;
00472 }
00473 
00474 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00475 {
00476     if (vec2_len != w_dim)
00477         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00478 
00479     int32_t len;
00480     bool free_vec1;
00481     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00482     uint32_t* val=SG_MALLOC(uint32_t, len);
00483 
00484     uint32_t offs=0;
00485     float64_t factor=alpha/normalization_const;
00486     if (abs_val)
00487         factor=CMath::abs(factor);
00488 
00489     CMath::fill_vector(val, len, 0xDEADBEAF);
00490 
00491     for (int32_t i=0; i<len; i++) 
00492     {
00493         uint32_t o=offs;
00494         for (int32_t k=0; k<degree && i+k<len; k++)
00495         {
00496             float64_t wd = wd_weights[k]*factor;
00497 
00498             const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00499             val[i]=h;
00500 
00501 #ifdef DEBUG_HASHEDWD
00502             SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
00503             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00504 #endif
00505             vec2[o+(h & mask)]+=wd;
00506             o+=partial_w_dim;
00507         }
00508         offs+=partial_w_dim*degree;
00509     }
00510 
00511     SG_FREE(val);
00512     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00513 }
00514 
00515 void CHashedWDFeaturesTransposed::set_wd_weights()
00516 {
00517     ASSERT(degree>0);
00518 
00519     mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00520     partial_w_dim=1<<m_hash_bits;
00521     w_dim=partial_w_dim*string_length*(degree-start_degree);
00522 
00523     wd_weights=SG_MALLOC(float64_t, degree);
00524 
00525     for (int32_t i=0; i<degree; i++)
00526         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00527 
00528     SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
00529             "dim=%d partial_dim=%d num=%d, len=%d\n", 
00530             degree, from_degree, alphabet_size, 
00531             w_dim, partial_w_dim, num_strings, string_length);
00532 }
00533 
00534 
00535 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
00536 {
00537     if (n==0)
00538     {
00539         normalization_const=0;
00540         for (int32_t i=0; i<degree; i++)
00541             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00542 
00543         normalization_const=CMath::sqrt(normalization_const);
00544     }
00545     else
00546         normalization_const=n;
00547 
00548     SG_DEBUG("normalization_const:%f\n", normalization_const);
00549 }
00550 
00551 CFeatures* CHashedWDFeaturesTransposed::duplicate() const
00552 {
00553     return new CHashedWDFeaturesTransposed(*this);
00554 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation