WDFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/WDFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 
00014 using namespace shogun;
00015 
00016 CWDFeatures::CWDFeatures(void) :CDotFeatures()
00017 {
00018     SG_UNSTABLE("CWDFeatures::CWDFeatures(void) :CDotFeatures()",
00019                 "\n");
00020 
00021     strings = NULL;
00022 
00023     degree = 0;
00024     from_degree = 0;
00025     string_length = 0;
00026     num_strings = 0;
00027     alphabet_size = 0;
00028     w_dim = 0;
00029     wd_weights = NULL;
00030     normalization_const = 0.0;
00031 }
00032 
00033 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00034         int32_t order, int32_t from_order) : CDotFeatures()
00035 {
00036     ASSERT(str);
00037     ASSERT(str->have_same_length());
00038     SG_REF(str);
00039 
00040     strings=str;
00041     string_length=str->get_max_vector_length();
00042     num_strings=str->get_num_vectors();
00043     CAlphabet* alpha=str->get_alphabet();
00044     alphabet_size=alpha->get_num_symbols();
00045     SG_UNREF(alpha);
00046 
00047     degree=order;
00048     from_degree=from_order;
00049     wd_weights=NULL;
00050     set_wd_weights();
00051     set_normalization_const();
00052 
00053 }
00054 
00055 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00056     : CDotFeatures(orig), strings(orig.strings),
00057     degree(orig.degree), from_degree(orig.from_degree),
00058     normalization_const(orig.normalization_const)
00059 {
00060     SG_REF(strings);
00061     string_length=strings->get_max_vector_length();
00062     num_strings=strings->get_num_vectors();
00063     CAlphabet* alpha=strings->get_alphabet();
00064     alphabet_size=alpha->get_num_symbols();
00065     SG_UNREF(alpha);
00066 
00067     wd_weights=NULL;
00068     set_wd_weights();
00069 }
00070 
00071 CWDFeatures::~CWDFeatures()
00072 {
00073     SG_UNREF(strings);
00074     SG_FREE(wd_weights);
00075 }
00076 
00077 float64_t CWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00078 {
00079     ASSERT(df);
00080     ASSERT(df->get_feature_type() == get_feature_type());
00081     ASSERT(df->get_feature_class() == get_feature_class());
00082     CWDFeatures* wdf = (CWDFeatures*) df;
00083 
00084     int32_t len1, len2;
00085     bool free_vec1, free_vec2;
00086 
00087     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00088     uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00089 
00090     ASSERT(len1==len2);
00091 
00092     float64_t sum=0.0;
00093 
00094     for (int32_t i=0; i<len1; i++)
00095     {
00096         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00097         {
00098             if (vec1[i+j]!=vec2[i+j])
00099                 break ;
00100             sum += wd_weights[j]*wd_weights[j];
00101         }
00102     }
00103     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00104     wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00105     return sum/CMath::sq(normalization_const);
00106 }
00107 
00108 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00109 {
00110     if (vec2_len != w_dim)
00111         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00112 
00113     float64_t sum=0;
00114     int32_t lim=CMath::min(degree, string_length);
00115     int32_t len;
00116     bool free_vec1;
00117     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00118     int32_t* val=SG_MALLOC(int32_t, len);
00119     CMath::fill_vector(val, len, 0);
00120 
00121     int32_t asize=alphabet_size;
00122     int32_t asizem1=1;
00123     int32_t offs=0;
00124 
00125     for (int32_t k=0; k<lim; k++)
00126     {
00127         float64_t wd = wd_weights[k];
00128 
00129         int32_t o=offs;
00130         for (int32_t i=0; i+k < len; i++) 
00131         {
00132             val[i]+=asizem1*vec[i+k];
00133             sum+=vec2[val[i]+o]*wd;
00134             o+=asize;
00135         }
00136         offs+=asize*len;
00137         asize*=alphabet_size;
00138         asizem1*=alphabet_size;
00139     }
00140     SG_FREE(val);
00141     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00142 
00143     return sum/normalization_const;
00144 }
00145 
00146 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00147 {
00148     if (vec2_len != w_dim)
00149         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00150 
00151     int32_t lim=CMath::min(degree, string_length);
00152     int32_t len;
00153     bool free_vec1;
00154     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00155     int32_t* val=SG_MALLOC(int32_t, len);
00156     CMath::fill_vector(val, len, 0);
00157 
00158     int32_t asize=alphabet_size;
00159     int32_t asizem1=1;
00160     int32_t offs=0;
00161 
00162     for (int32_t k=0; k<lim; k++)
00163     {
00164         float64_t wd = alpha*wd_weights[k]/normalization_const;
00165 
00166         if (abs_val)
00167             wd=CMath::abs(wd);
00168 
00169         int32_t o=offs;
00170         for (int32_t i=0; i+k < len; i++) 
00171         {
00172             val[i]+=asizem1*vec[i+k];
00173             vec2[val[i]+o]+=wd;
00174             o+=asize;
00175         }
00176         offs+=asize*len;
00177         asize*=alphabet_size;
00178         asizem1*=alphabet_size;
00179     }
00180     SG_FREE(val);
00181 
00182     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00183 }
00184 
00185 void CWDFeatures::set_wd_weights()
00186 {
00187     ASSERT(degree>0 && degree<=8);
00188     SG_FREE(wd_weights);
00189     wd_weights=SG_MALLOC(float64_t, degree);
00190     w_dim=0;
00191 
00192     for (int32_t i=0; i<degree; i++)
00193     {
00194         w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00195         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00196     }
00197     SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00198 }
00199 
00200 
00201 void CWDFeatures::set_normalization_const(float64_t n)
00202 {
00203     if (n==0)
00204     {
00205         normalization_const=0;
00206         for (int32_t i=0; i<degree; i++)
00207             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00208 
00209         normalization_const=CMath::sqrt(normalization_const);
00210     }
00211     else
00212         normalization_const=n;
00213 
00214     SG_DEBUG("normalization_const:%f\n", normalization_const);
00215 }
00216 
00217 void* CWDFeatures::get_feature_iterator(int32_t vector_index)
00218 {
00219     if (vector_index>=num_strings)
00220     {
00221         SG_ERROR("Index out of bounds (number of strings %d, you "
00222                 "requested %d)\n", num_strings, vector_index);
00223     }
00224 
00225     wd_feature_iterator* it=SG_MALLOC(wd_feature_iterator, 1);
00226 
00227     it->lim=CMath::min(degree, string_length);
00228     it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00229     it->vidx=vector_index;
00230 
00231     it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00232     it->val=SG_MALLOC(int32_t, it->vlen);
00233     CMath::fill_vector(it->val, it->vlen, 0);
00234 
00235     it->asize=alphabet_size;
00236     it->asizem1=1;
00237     it->offs=0;
00238     it->k=0;
00239     it->i=0;
00240     it->o=0;
00241 
00242     return it;
00243 }
00244 
00245 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00246 {
00247     wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00248 
00249     if (it->i + it->k >= it->vlen)
00250     {
00251         if (it->k < it->lim-1)
00252         {
00253             it->offs+=it->asize*it->vlen;
00254             it->asize*=alphabet_size;
00255             it->asizem1*=alphabet_size;
00256             it->k++;
00257             it->i=0;
00258             it->o=it->offs;
00259         }
00260         else
00261             return false;
00262     }
00263 
00264     int32_t i=it->i;
00265     int32_t k=it->k;
00266 #ifdef DEBUG_WDFEATURES
00267     SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1);
00268 #endif
00269 
00270     it->val[i]+=it->asizem1*it->vec[i+k];
00271     value=wd_weights[k]/normalization_const;
00272     index=it->val[i]+it->o;
00273 #ifdef DEBUG_WDFEATURES
00274     SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen);
00275 #endif
00276 
00277     it->o+=it->asize;
00278     it->i=i+1;
00279 
00280     return true;
00281 }
00282 
00283 void CWDFeatures::free_feature_iterator(void* iterator)
00284 {
00285     ASSERT(iterator);
00286     wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00287     strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00288     SG_FREE(it->val);
00289     SG_FREE(it);
00290 }
00291 
00292 CFeatures* CWDFeatures::duplicate() const
00293 {
00294     return new CWDFeatures(*this);
00295 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation