HashedWDFeaturesTransposed.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 
00011 #include <shogun/features/HashedWDFeaturesTransposed.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/lib/Signal.h>
00014 #include <shogun/base/Parallel.h>
00015 
00016 #ifdef HAVE_PTHREAD
00017 #include <pthread.h>
00018 #endif
00019 
00020 using namespace shogun;
00021 
00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00023 struct HASHEDWD_THREAD_PARAM
00024 {
00025     CHashedWDFeaturesTransposed* hf;
00026     int32_t* sub_index;
00027     float64_t* output;
00028     int32_t start;
00029     int32_t stop;
00030     float64_t* alphas;
00031     float64_t* vec;
00032     float64_t bias;
00033     bool progress;
00034     uint32_t* index;
00035 };
00036 #endif // DOXYGEN_SHOULD_SKIP_THIS
00037 
00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()
00039     :CDotFeatures()
00040 {
00041     SG_UNSTABLE(
00042         "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()",
00043         "\n");
00044 
00045     strings = NULL;
00046     transposed_strings = NULL;
00047 
00048     degree = 0;
00049     start_degree = 0;
00050     from_degree = 0;
00051     string_length = 0;
00052     num_strings = 0;
00053     alphabet_size = 0;
00054     w_dim = 0;
00055     partial_w_dim = 0;
00056     wd_weights = NULL;
00057     mask = 0;
00058     m_hash_bits = 0;
00059 
00060     normalization_const = 0.0;
00061 }
00062 
00063 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
00064         int32_t start_order, int32_t order, int32_t from_order,
00065         int32_t hash_bits) : CDotFeatures()
00066 {
00067     ASSERT(start_order>=0);
00068     ASSERT(start_order<order);
00069     ASSERT(order<=from_order);
00070     ASSERT(hash_bits>0);
00071     ASSERT(str);
00072     ASSERT(str->have_same_length());
00073     SG_REF(str);
00074 
00075     strings=str;
00076     int32_t transposed_num_feat=0;
00077     int32_t transposed_num_vec=0;
00078     transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);
00079 
00080     string_length=str->get_max_vector_length();
00081     num_strings=str->get_num_vectors();
00082     ASSERT(transposed_num_feat==num_strings);
00083     ASSERT(transposed_num_vec==string_length);
00084 
00085     CAlphabet* alpha=str->get_alphabet();
00086     alphabet_size=alpha->get_num_symbols();
00087     SG_UNREF(alpha);
00088 
00089     degree=order;
00090     start_degree=start_order;
00091     if (start_degree!=0)
00092         SG_NOTIMPLEMENTED;
00093     from_degree=from_order;
00094     m_hash_bits=hash_bits;
00095     set_wd_weights();
00096     set_normalization_const();
00097 }
00098 
00099 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig)
00100     : CDotFeatures(orig), strings(orig.strings), transposed_strings(orig.transposed_strings),
00101     degree(orig.degree), start_degree(orig.start_degree),
00102     from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00103     normalization_const(orig.normalization_const)
00104 {
00105     SG_REF(strings);
00106     string_length=strings->get_max_vector_length();
00107     num_strings=strings->get_num_vectors();
00108     CAlphabet* alpha=strings->get_alphabet();
00109     alphabet_size=alpha->get_num_symbols();
00110     SG_UNREF(alpha);
00111 
00112     set_wd_weights();
00113 }
00114 
00115 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
00116 {
00117     for (int32_t i=0; i<string_length; i++)
00118         SG_FREE(transposed_strings[i].string);
00119     SG_FREE(transposed_strings);
00120 
00121     SG_UNREF(strings);
00122     SG_FREE(wd_weights);
00123 }
00124 
00125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00126 {
00127     ASSERT(df);
00128     ASSERT(df->get_feature_type() == get_feature_type());
00129     ASSERT(df->get_feature_class() == get_feature_class());
00130     CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df;
00131 
00132     int32_t len1, len2;
00133     bool free_vec1, free_vec2;
00134 
00135     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00136     uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00137 
00138     ASSERT(len1==len2);
00139 
00140     float64_t sum=0.0;
00141 
00142     for (int32_t i=0; i<len1; i++)
00143     {
00144         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00145         {
00146             if (vec1[i+j]!=vec2[i+j])
00147                 break;
00148             if (j>=start_degree)
00149                 sum += wd_weights[j]*wd_weights[j];
00150         }
00151     }
00152     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00153     wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00154     return sum/CMath::sq(normalization_const);
00155 }
00156 
00157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00158 {
00159     if (vec2_len != w_dim)
00160         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00161 
00162     float64_t sum=0;
00163     int32_t len;
00164     bool free_vec1;
00165     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00166     uint32_t* val=SG_MALLOC(uint32_t, len);
00167 
00168     uint32_t offs=0;
00169 
00170     SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00171 
00172     for (int32_t i=0; i < len; i++)
00173     {
00174         uint32_t o=offs;
00175         uint32_t carry = 0;
00176         uint32_t chunk = 0;
00177 
00178         for (int32_t k=0; k<degree && i+k<len; k++)
00179         {
00180             const float64_t wd = wd_weights[k];
00181             chunk++;
00182             CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00183             uint32_t h =
00184                     CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00185 #ifdef DEBUG_HASHEDWD
00186             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00187 #endif
00188             sum+=vec2[o+(h & mask)]*wd;
00189             o+=partial_w_dim;
00190         }
00191         val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00192         offs+=partial_w_dim*degree;
00193     }
00194     SG_FREE(val);
00195     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00196 
00197     return sum/normalization_const;
00198 }
00199 
00200 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00201 {
00202     ASSERT(output);
00203     // write access is internally between output[start..stop] so the following
00204     // line is necessary to write to output[0...(stop-start-1)]
00205     output-=start;
00206     ASSERT(start>=0);
00207     ASSERT(start<stop);
00208     ASSERT(stop<=get_num_vectors());
00209     uint32_t* index=SG_MALLOC(uint32_t, stop);
00210 
00211     int32_t num_vectors=stop-start;
00212     ASSERT(num_vectors>0);
00213 
00214     int32_t num_threads=parallel->get_num_threads();
00215     ASSERT(num_threads>0);
00216 
00217     CSignal::clear_cancel();
00218 
00219     if (dim != w_dim)
00220         SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00221 
00222 #ifdef HAVE_PTHREAD
00223     if (num_threads < 2)
00224     {
00225 #endif
00226         HASHEDWD_THREAD_PARAM params;
00227         params.hf=this;
00228         params.sub_index=NULL;
00229         params.output=output;
00230         params.start=start;
00231         params.stop=stop;
00232         params.alphas=alphas;
00233         params.vec=vec;
00234         params.bias=b;
00235         params.progress=false; //true;
00236         params.index=index;
00237         dense_dot_range_helper((void*) &params);
00238 #ifdef HAVE_PTHREAD
00239     }
00240     else
00241     {
00242         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00243         HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
00244         int32_t step= num_vectors/num_threads;
00245 
00246         int32_t t;
00247 
00248         for (t=0; t<num_threads-1; t++)
00249         {
00250             params[t].hf = this;
00251             params[t].sub_index=NULL;
00252             params[t].output = output;
00253             params[t].start = start+t*step;
00254             params[t].stop = start+(t+1)*step;
00255             params[t].alphas=alphas;
00256             params[t].vec=vec;
00257             params[t].bias=b;
00258             params[t].progress = false;
00259             params[t].index=index;
00260             pthread_create(&threads[t], NULL,
00261                     CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
00262         }
00263 
00264         params[t].hf = this;
00265         params[t].sub_index=NULL;
00266         params[t].output = output;
00267         params[t].start = start+t*step;
00268         params[t].stop = stop;
00269         params[t].alphas=alphas;
00270         params[t].vec=vec;
00271         params[t].bias=b;
00272         params[t].progress = false; //true;
00273         params[t].index=index;
00274         CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);
00275 
00276         for (t=0; t<num_threads-1; t++)
00277             pthread_join(threads[t], NULL);
00278 
00279         SG_FREE(params);
00280         SG_FREE(threads);
00281     }
00282 #endif
00283     SG_FREE(index);
00284 
00285 #ifndef WIN32
00286         if ( CSignal::cancel_computations() )
00287             SG_INFO( "prematurely stopped.           \n");
00288 #endif
00289 }
00290 
00291 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00292 {
00293     ASSERT(sub_index);
00294     ASSERT(output);
00295 
00296     uint32_t* index=SG_MALLOC(uint32_t, num);
00297 
00298     int32_t num_threads=parallel->get_num_threads();
00299     ASSERT(num_threads>0);
00300 
00301     CSignal::clear_cancel();
00302 
00303     if (dim != w_dim)
00304         SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00305 
00306 #ifdef HAVE_PTHREAD
00307     if (num_threads < 2)
00308     {
00309 #endif
00310         HASHEDWD_THREAD_PARAM params;
00311         params.hf=this;
00312         params.sub_index=sub_index;
00313         params.output=output;
00314         params.start=0;
00315         params.stop=num;
00316         params.alphas=alphas;
00317         params.vec=vec;
00318         params.bias=b;
00319         params.progress=false; //true;
00320         params.index=index;
00321         dense_dot_range_helper((void*) &params);
00322 #ifdef HAVE_PTHREAD
00323     }
00324     else
00325     {
00326         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00327         HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
00328         int32_t step= num/num_threads;
00329 
00330         int32_t t;
00331 
00332         for (t=0; t<num_threads-1; t++)
00333         {
00334             params[t].hf = this;
00335             params[t].sub_index=sub_index;
00336             params[t].output = output;
00337             params[t].start = t*step;
00338             params[t].stop = (t+1)*step;
00339             params[t].alphas=alphas;
00340             params[t].vec=vec;
00341             params[t].bias=b;
00342             params[t].progress = false;
00343             params[t].index=index;
00344             pthread_create(&threads[t], NULL,
00345                     CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
00346         }
00347 
00348         params[t].hf = this;
00349         params[t].sub_index=sub_index;
00350         params[t].output = output;
00351         params[t].start = t*step;
00352         params[t].stop = num;
00353         params[t].alphas=alphas;
00354         params[t].vec=vec;
00355         params[t].bias=b;
00356         params[t].progress = false; //true;
00357         params[t].index=index;
00358         CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);
00359 
00360         for (t=0; t<num_threads-1; t++)
00361             pthread_join(threads[t], NULL);
00362 
00363         SG_FREE(params);
00364         SG_FREE(threads);
00365         SG_FREE(index);
00366     }
00367 #endif
00368 
00369 #ifndef WIN32
00370         if ( CSignal::cancel_computations() )
00371             SG_INFO( "prematurely stopped.           \n");
00372 #endif
00373 }
00374 
00375 void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
00376 {
00377     HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
00378     CHashedWDFeaturesTransposed* hf=par->hf;
00379     int32_t* sub_index=par->sub_index;
00380     float64_t* output=par->output;
00381     int32_t start=par->start;
00382     int32_t stop=par->stop;
00383     float64_t* alphas=par->alphas;
00384     float64_t* vec=par->vec;
00385     float64_t bias=par->bias;
00386     bool progress=par->progress;
00387     uint32_t* index=par->index;
00388     int32_t string_length=hf->string_length;
00389     int32_t degree=hf->degree;
00390     float64_t* wd_weights=hf->wd_weights;
00391     SGString<uint8_t>* transposed_strings=hf->transposed_strings;
00392     uint32_t mask=hf->mask;
00393     int32_t partial_w_dim=hf->partial_w_dim;
00394     float64_t normalization_const=hf->normalization_const;
00395 
00396     if (sub_index)
00397     {
00398         for (int32_t j=start; j<stop; j++)
00399             output[j]=0.0;
00400 
00401         uint32_t offs=0;
00402         for (int32_t i=0; i<string_length; i++)
00403         {
00404             uint32_t o=offs;
00405             for (int32_t k=0; k<degree && i+k<string_length; k++)
00406             {
00407                 const float64_t wd = wd_weights[k];
00408                 uint8_t* dim=transposed_strings[i+k].string;
00409                 uint32_t carry = 0;
00410                 uint32_t chunk = 0;
00411                 for (int32_t j=start; j<stop; j++)
00412                 {
00413                     uint8_t bval=dim[sub_index[j]];
00414                     if (k==0)
00415                         index[j] = 0xDEADBEAF;
00416 
00417                     CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);
00418 
00419                     chunk++;
00420                     uint32_t h =
00421                             CHash::FinalizeIncrementalMurmurHash3(
00422                                     index[j], carry, chunk);
00423 
00424                     output[j]+=vec[o + (h & mask)]*wd;
00425                     index[j] = h;
00426                 }
00427 
00428                 index[stop-1] =
00429                         CHash::FinalizeIncrementalMurmurHash3(
00430                                 index[stop-1], carry, chunk);
00431 
00432                 o+=partial_w_dim;
00433             }
00434             offs+=partial_w_dim*degree;
00435 
00436             if (progress)
00437                 hf->io->progress(i, 0,string_length, i);
00438         }
00439 
00440         for (int32_t j=start; j<stop; j++)
00441         {
00442             if (alphas)
00443                 output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
00444             else
00445                 output[j]=output[j]/normalization_const+bias;
00446         }
00447     }
00448     else
00449     {
00450         SGVector<float64_t>::fill_vector(&output[start], stop-start, 0.0);
00451 
00452         uint32_t offs=0;
00453         for (int32_t i=0; i<string_length; i++)
00454         {
00455             uint32_t o=offs;
00456             for (int32_t k=0; k<degree && i+k<string_length; k++)
00457             {
00458                 const float64_t wd = wd_weights[k];
00459                 uint8_t* dim=transposed_strings[i+k].string;
00460                 uint32_t carry = 0;
00461                 uint32_t chunk = 0;
00462 
00463                 for (int32_t j=start; j<stop; j++)
00464                 {
00465                     uint8_t bval=dim[sub_index[j]];
00466                     if (k==0)
00467                         index[j] = 0xDEADBEAF;
00468 
00469                     CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);
00470 
00471                     chunk++;
00472                     uint32_t h =
00473                             CHash::FinalizeIncrementalMurmurHash3(
00474                                     index[j], carry, chunk);
00475 
00476                     index[j] = h;
00477                     output[j]+=vec[o + (h & mask)]*wd;
00478                 }
00479 
00480                 index[stop-1] = CHash::FinalizeIncrementalMurmurHash3(
00481                         index[stop-1], carry, chunk);
00482 
00483                 o+=partial_w_dim;
00484             }
00485             offs+=partial_w_dim*degree;
00486 
00487             if (progress)
00488                 hf->io->progress(i, 0,string_length, i);
00489         }
00490 
00491         for (int32_t j=start; j<stop; j++)
00492         {
00493             if (alphas)
00494                 output[j]=output[j]*alphas[j]/normalization_const+bias;
00495             else
00496                 output[j]=output[j]/normalization_const+bias;
00497         }
00498     }
00499 
00500     return NULL;
00501 }
00502 
00503 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00504 {
00505     if (vec2_len != w_dim)
00506         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00507 
00508     int32_t len;
00509     bool free_vec1;
00510     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00511     uint32_t* val=SG_MALLOC(uint32_t, len);
00512 
00513     uint32_t offs=0;
00514     float64_t factor=alpha/normalization_const;
00515     if (abs_val)
00516         factor=CMath::abs(factor);
00517 
00518     SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00519 
00520     for (int32_t i=0; i<len; i++)
00521     {
00522         uint32_t o=offs;
00523         uint32_t carry = 0;
00524         uint32_t chunk = 0;
00525 
00526         for (int32_t k=0; k<degree && i+k<len; k++)
00527         {
00528             float64_t wd = wd_weights[k]*factor;
00529             chunk++;
00530             CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00531             uint32_t h =
00532                     CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00533 #ifdef DEBUG_HASHEDWD
00534             SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
00535             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o);
00536 #endif
00537             vec2[o+(h & mask)]+=wd;
00538             val[i] = h;
00539             o+=partial_w_dim;
00540         }
00541 
00542         val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00543         offs+=partial_w_dim*degree;
00544     }
00545 
00546     SG_FREE(val);
00547     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00548 }
00549 
00550 void CHashedWDFeaturesTransposed::set_wd_weights()
00551 {
00552     ASSERT(degree>0);
00553 
00554     mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00555     partial_w_dim=1<<m_hash_bits;
00556     w_dim=partial_w_dim*string_length*(degree-start_degree);
00557 
00558     wd_weights=SG_MALLOC(float64_t, degree);
00559 
00560     for (int32_t i=0; i<degree; i++)
00561         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00562 
00563     SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
00564             "dim=%d partial_dim=%d num=%d, len=%d\n",
00565             degree, from_degree, alphabet_size,
00566             w_dim, partial_w_dim, num_strings, string_length);
00567 }
00568 
00569 
00570 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
00571 {
00572     if (n==0)
00573     {
00574         normalization_const=0;
00575         for (int32_t i=0; i<degree; i++)
00576             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00577 
00578         normalization_const=CMath::sqrt(normalization_const);
00579     }
00580     else
00581         normalization_const=n;
00582 
00583     SG_DEBUG("normalization_const:%f\n", normalization_const);
00584 }
00585 
00586 CFeatures* CHashedWDFeaturesTransposed::duplicate() const
00587 {
00588     return new CHashedWDFeaturesTransposed(*this);
00589 }
00590 
00591 void* CHashedWDFeaturesTransposed::get_feature_iterator(int32_t vector_index)
00592 {
00593     SG_NOTIMPLEMENTED;
00594     return NULL;
00595 }
00596 
00597 bool CHashedWDFeaturesTransposed::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00598 {
00599     SG_NOTIMPLEMENTED;
00600     return NULL;
00601 }
00602 
00603 void CHashedWDFeaturesTransposed::free_feature_iterator(void* iterator)
00604 {
00605     SG_NOTIMPLEMENTED;
00606 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation