ImplicitWeightedSpecFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/ImplicitWeightedSpecFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 
00014 using namespace shogun;
00015 
00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(void)
00017     :CDotFeatures()
00018 {
00019     SG_UNSTABLE("CImplicitWeightedSpecFeatures::"
00020                 "CImplicitWeightedSpecFeatures(void)", "\n");
00021 
00022     strings = NULL;
00023     normalization_factors = NULL;
00024     num_strings = 0;
00025     alphabet_size = 0;
00026 
00027     degree = 0;
00028     spec_size = 0;
00029     spec_weights = 0;
00030 }
00031 
00032 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures()
00033 {
00034     ASSERT(str);
00035     strings=str;
00036     SG_REF(strings)
00037     normalization_factors=NULL;
00038     spec_weights=NULL;
00039     num_strings = str->get_num_vectors();
00040     alphabet_size = str->get_original_num_symbols();
00041     degree=str->get_order();
00042     set_wd_weights();
00043 
00044     SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size,
00045             spec_size, num_strings);
00046 
00047     if (normalize)
00048         compute_normalization_const();
00049 }
00050 
00051 void CImplicitWeightedSpecFeatures::compute_normalization_const()
00052 {
00053     float64_t* factors=SG_MALLOC(float64_t, num_strings);
00054 
00055     for (int32_t i=0; i<num_strings; i++)
00056         factors[i]=1.0/CMath::sqrt(dot(i, this, i));
00057 
00058     normalization_factors=factors;
00059     //CMath::display_vector(normalization_factors, num_strings, "n");
00060 }
00061 
00062 bool CImplicitWeightedSpecFeatures::set_wd_weights()
00063 {
00064     SG_FREE(spec_weights);
00065     spec_weights=SG_MALLOC(float64_t, degree);
00066 
00067     int32_t i;
00068     float64_t sum=0;
00069     spec_size=0;
00070 
00071     for (i=0; i<degree; i++)
00072     {
00073         spec_size+=CMath::pow(alphabet_size, i+1);
00074         spec_weights[i]=degree-i;
00075         sum+=spec_weights[i];
00076     }
00077     for (i=0; i<degree; i++)
00078         spec_weights[i]=CMath::sqrt(spec_weights[i]/sum);
00079 
00080     return spec_weights!=NULL;
00081 }
00082 
00083 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d)
00084 {
00085     ASSERT(d==degree);
00086 
00087     SG_FREE(spec_weights);
00088     spec_weights=SG_MALLOC(float64_t, degree);
00089     for (int32_t i=0; i<degree; i++)
00090         spec_weights[i]=CMath::sqrt(w[i]);
00091     return true;
00092 }
00093 
00094 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig), 
00095     num_strings(orig.num_strings), 
00096     alphabet_size(orig.alphabet_size), spec_size(orig.spec_size)
00097 {
00098     SG_NOTIMPLEMENTED;
00099     SG_REF(strings);
00100 }
00101 
00102 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures()
00103 {
00104     SG_UNREF(strings);
00105     SG_FREE(spec_weights);
00106     SG_FREE(normalization_factors);
00107 }
00108 
00109 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00110 {
00111     ASSERT(df);
00112     ASSERT(df->get_feature_type() == get_feature_type());
00113     ASSERT(df->get_feature_class() == get_feature_class());
00114     CImplicitWeightedSpecFeatures* sf = (CImplicitWeightedSpecFeatures*) df;
00115 
00116     ASSERT(vec_idx1 < num_strings);
00117     ASSERT(vec_idx2 < sf->get_num_vectors());
00118 
00119     int32_t len1=-1;
00120     int32_t len2=-1;
00121     bool free_vec1;
00122     bool free_vec2;
00123     uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00124     uint16_t* vec2=sf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00125 
00126     float64_t result=0;
00127     uint8_t mask=0;
00128 
00129     for (int32_t d=0; d<degree; d++)
00130     {
00131         mask = mask | (1 << (degree-d-1));
00132         uint16_t masked=strings->get_masked_symbols(0xffff, mask);
00133 
00134         int32_t left_idx=0;
00135         int32_t right_idx=0;
00136         float64_t weight=spec_weights[d]*spec_weights[d];
00137 
00138         while (left_idx < len1 && right_idx < len2)
00139         {
00140             uint16_t lsym=vec1[left_idx] & masked;
00141             uint16_t rsym=vec2[right_idx] & masked;
00142 
00143             if (lsym == rsym)
00144             {
00145                 int32_t old_left_idx=left_idx;
00146                 int32_t old_right_idx=right_idx;
00147 
00148                 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym)
00149                     left_idx++;
00150 
00151                 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym)
00152                     right_idx++;
00153 
00154                 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00155             }
00156             else if (lsym<rsym)
00157                 left_idx++;
00158             else
00159                 right_idx++;
00160         }
00161     }
00162 
00163     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00164     sf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00165 
00166     if (normalization_factors)
00167         return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2];
00168     else
00169         return result;
00170 }
00171 
00172 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00173 {
00174     ASSERT(vec2_len == spec_size);
00175     ASSERT(vec_idx1 < num_strings);
00176 
00177     float64_t result=0;
00178     int32_t len1=-1;
00179     bool free_vec1;
00180     uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00181 
00182     if (vec1 && len1>0)
00183     {
00184         for (int32_t j=0; j<len1; j++)
00185         {
00186             uint8_t mask=0;
00187             int32_t offs=0;
00188             uint16_t v=*vec1++;
00189 
00190             for (int32_t d=0; d<degree; d++)
00191             {
00192                 mask = mask | (1 << (degree-d-1));
00193                 int32_t idx=strings->get_masked_symbols(v, mask);
00194                 idx=strings->shift_symbol(idx, degree-d-1);
00195                 result += vec2[offs + idx]*spec_weights[d];
00196                 offs+=strings->shift_offset(1,d+1);
00197             }
00198         }
00199 
00200         strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00201 
00202         if (normalization_factors)
00203             result*=normalization_factors[vec_idx1];
00204     }
00205     else
00206         SG_ERROR("huh?\n");
00207 
00208     return result;
00209 }
00210 
00211 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00212 {
00213     int32_t len1=-1;
00214     bool free_vec1;
00215     uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00216 
00217     if (normalization_factors)
00218         alpha*=normalization_factors[vec_idx1];
00219 
00220     if (vec && len1>0)
00221     {
00222         for (int32_t j=0; j<len1; j++)
00223         {
00224             uint8_t mask=0;
00225             int32_t offs=0;
00226             for (int32_t d=0; d<degree; d++)
00227             {
00228                 mask = mask | (1 << (degree-d-1));
00229                 int32_t idx=strings->get_masked_symbols(vec[j], mask);
00230                 idx=strings->shift_symbol(idx, degree-d-1);
00231                 if (abs_val)
00232                     vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]);
00233                 else
00234                     vec2[offs + idx] += alpha*spec_weights[d];
00235                 offs+=strings->shift_offset(1,d+1);
00236             }
00237         }
00238     }
00239 
00240     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00241 }
00242 
00243 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const
00244 {
00245     return new CImplicitWeightedSpecFeatures(*this);
00246 }
00247 
00248 void* CImplicitWeightedSpecFeatures::get_feature_iterator(int32_t vector_index)
00249 {
00250     if (vector_index>=num_strings)
00251     {
00252         SG_ERROR("Index out of bounds (number of strings %d, you "
00253                 "requested %d)\n", num_strings, vector_index);
00254     }
00255 
00256     wspec_feature_iterator* it=SG_MALLOC(wspec_feature_iterator, 1);
00257     it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00258     it->vidx=vector_index;
00259 
00260     it->offs=0;
00261     it->d=0;
00262     it->j=0;
00263     it->mask=0;
00264     it->alpha=normalization_factors[vector_index];
00265 
00266     return it;
00267 }
00268 
00269 bool CImplicitWeightedSpecFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00270 {
00271     wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00272 
00273     if (it->d>=degree)
00274     {
00275         if (it->j < it->vlen-1)
00276         {
00277             it->j++;
00278             it->d=0;
00279             it->mask=0;
00280             it->offs=0;
00281         }
00282         else
00283             return false;
00284     }
00285 
00286     int32_t d=it->d;
00287 
00288     it->mask = it->mask | (1 << (degree-d-1));
00289     int32_t idx=strings->get_masked_symbols(it->vec[it->j], it->mask);
00290     idx=strings->shift_symbol(idx, degree-d-1);
00291     value=it->alpha*spec_weights[d];
00292     index=it->offs + idx;
00293     it->offs+=strings->shift_offset(1,d+1);
00294 
00295     it->d=d+1;
00296     return true;
00297 }
00298 
00299 void CImplicitWeightedSpecFeatures::free_feature_iterator(void* iterator)
00300 {
00301     ASSERT(iterator);
00302     wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00303     strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00304     SG_FREE(it);
00305 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation