WeightedCommWordStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/lib/common.h>
00012 #include <shogun/kernel/WeightedCommWordStringKernel.h>
00013 #include <shogun/features/StringFeatures.h>
00014 #include <shogun/io/SGIO.h>
00015 
00016 using namespace shogun;
00017 
00018 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel()
00019   : CCommWordStringKernel(0, false)
00020 {
00021     init();
00022 }
00023 
00024 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00025     int32_t size, bool us)
00026 : CCommWordStringKernel(size, us)
00027 {
00028     ASSERT(us==false);
00029     init();
00030 }
00031 
00032 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00033     CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us,
00034     int32_t size)
00035 : CCommWordStringKernel(size, us)
00036 {
00037     ASSERT(us==false);
00038     init();
00039 
00040     init(l,r);
00041 }
00042 
00043 CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel()
00044 {
00045     SG_FREE(weights);
00046 }
00047 
00048 bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r)
00049 {
00050     ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() ==
00051             ((CStringFeatures<uint16_t>*) r)->get_order());
00052     degree=((CStringFeatures<uint16_t>*) l)->get_order();
00053     set_wd_weights();
00054 
00055     CCommWordStringKernel::init(l,r);
00056     return init_normalizer();
00057 }
00058 
00059 void CWeightedCommWordStringKernel::cleanup()
00060 {
00061     SG_FREE(weights);
00062     weights=NULL;
00063 
00064     CCommWordStringKernel::cleanup();
00065 }
00066 
00067 bool CWeightedCommWordStringKernel::set_wd_weights()
00068 {
00069     SG_FREE(weights);
00070     weights=SG_MALLOC(float64_t, degree);
00071 
00072     int32_t i;
00073     float64_t sum=0;
00074     for (i=0; i<degree; i++)
00075     {
00076         weights[i]=degree-i;
00077         sum+=weights[i];
00078     }
00079     for (i=0; i<degree; i++)
00080         weights[i]=CMath::sqrt(weights[i]/sum);
00081 
00082     return weights!=NULL;
00083 }
00084 
00085 bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d)
00086 {
00087     ASSERT(d==degree);
00088 
00089     SG_FREE(weights);
00090     weights=SG_MALLOC(float64_t, degree);
00091     for (int32_t i=0; i<degree; i++)
00092         weights[i]=CMath::sqrt(w[i]);
00093     return true;
00094 }
00095   
00096 float64_t CWeightedCommWordStringKernel::compute_helper(
00097     int32_t idx_a, int32_t idx_b, bool do_sort)
00098 {
00099     int32_t alen, blen;
00100     bool free_avec, free_bvec;
00101 
00102     CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
00103     CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;
00104 
00105     uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec);
00106     uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec);
00107 
00108     uint16_t* avec=av;
00109     uint16_t* bvec=bv;
00110 
00111     if (do_sort)
00112     {
00113         if (alen>0)
00114         {
00115             avec=SG_MALLOC(uint16_t, alen);
00116             memcpy(avec, av, sizeof(uint16_t)*alen);
00117             CMath::radix_sort(avec, alen);
00118         }
00119         else
00120             avec=NULL;
00121 
00122         if (blen>0)
00123         {
00124             bvec=SG_MALLOC(uint16_t, blen);
00125             memcpy(bvec, bv, sizeof(uint16_t)*blen);
00126             CMath::radix_sort(bvec, blen);
00127         }
00128         else
00129             bvec=NULL;
00130     }
00131     else
00132     {
00133         if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) ||
00134                 (r->get_num_preprocessors() != r->get_num_preprocessed()))
00135         {
00136             SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
00137                     " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(),
00138                     r->get_num_preprocessed(), r->get_num_preprocessors());
00139         }
00140     }
00141 
00142     float64_t result=0;
00143     uint8_t mask=0;
00144 
00145     for (int32_t d=0; d<degree; d++)
00146     {
00147         mask = mask | (1 << (degree-d-1));
00148         uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask);
00149 
00150         int32_t left_idx=0;
00151         int32_t right_idx=0;
00152         float64_t weight=weights[d]*weights[d];
00153 
00154         while (left_idx < alen && right_idx < blen)
00155         {
00156             uint16_t lsym=avec[left_idx] & masked;
00157             uint16_t rsym=bvec[right_idx] & masked;
00158 
00159             if (lsym == rsym)
00160             {
00161                 int32_t old_left_idx=left_idx;
00162                 int32_t old_right_idx=right_idx;
00163 
00164                 while (left_idx<alen && (avec[left_idx] & masked) ==lsym)
00165                     left_idx++;
00166 
00167                 while (right_idx<blen && (bvec[right_idx] & masked) ==lsym)
00168                     right_idx++;
00169 
00170                 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00171             }
00172             else if (lsym<rsym)
00173                 left_idx++;
00174             else
00175                 right_idx++;
00176         }
00177     }
00178 
00179     if (do_sort)
00180     {
00181         SG_FREE(avec);
00182         SG_FREE(bvec);
00183     }
00184 
00185     l->free_feature_vector(av, idx_a, free_avec);
00186     r->free_feature_vector(bv, idx_b, free_bvec);
00187 
00188     return result;
00189 }
00190 
00191 void CWeightedCommWordStringKernel::add_to_normal(
00192     int32_t vec_idx, float64_t weight)
00193 {
00194     int32_t len=-1;
00195     bool free_vec;
00196     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs;
00197     uint16_t* vec=s->get_feature_vector(vec_idx, len, free_vec);
00198 
00199     if (len>0)
00200     {
00201         for (int32_t j=0; j<len; j++)
00202         {
00203             uint8_t mask=0;
00204             int32_t offs=0;
00205             for (int32_t d=0; d<degree; d++)
00206             {
00207                 mask = mask | (1 << (degree-d-1));
00208                 int32_t idx=s->get_masked_symbols(vec[j], mask);
00209                 idx=s->shift_symbol(idx, degree-d-1);
00210                 dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx);
00211                 offs+=s->shift_offset(1,d+1);
00212             }
00213         }
00214 
00215         set_is_initialized(true);
00216     }
00217 
00218     s->free_feature_vector(vec, vec_idx, free_vec);
00219 }
00220 
00221 void CWeightedCommWordStringKernel::merge_normal()
00222 {
00223     ASSERT(get_is_initialized());
00224     ASSERT(use_sign==false);
00225 
00226     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00227     uint32_t num_symbols=(uint32_t) s->get_num_symbols();
00228     int32_t dic_size=1<<(sizeof(uint16_t)*8);
00229     float64_t* dic=SG_MALLOC(float64_t, dic_size);
00230     memset(dic, 0, sizeof(float64_t)*dic_size);
00231 
00232     for (uint32_t sym=0; sym<num_symbols; sym++)
00233     {
00234         float64_t result=0;
00235         uint8_t mask=0;
00236         int32_t offs=0;
00237         for (int32_t d=0; d<degree; d++)
00238         {
00239             mask = mask | (1 << (degree-d-1));
00240             int32_t idx=s->get_masked_symbols(sym, mask);
00241             idx=s->shift_symbol(idx, degree-d-1);
00242             result += dictionary_weights[offs + idx];
00243             offs+=s->shift_offset(1,d+1);
00244         }
00245         dic[sym]=result;
00246     }
00247 
00248     init_dictionary(1<<(sizeof(uint16_t)*8));
00249     memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size);
00250     SG_FREE(dic);
00251 }
00252 
00253 float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i)
00254 { 
00255     if (!get_is_initialized())
00256         SG_ERROR( "CCommWordStringKernel optimization not initialized\n");
00257 
00258     ASSERT(use_sign==false);
00259 
00260     float64_t result=0;
00261     bool free_vec;
00262     int32_t len=-1;
00263     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00264     uint16_t* vec=s->get_feature_vector(i, len, free_vec);
00265 
00266     if (vec && len>0)
00267     {
00268         for (int32_t j=0; j<len; j++)
00269         {
00270             uint8_t mask=0;
00271             int32_t offs=0;
00272             for (int32_t d=0; d<degree; d++)
00273             {
00274                 mask = mask | (1 << (degree-d-1));
00275                 int32_t idx=s->get_masked_symbols(vec[j], mask);
00276                 idx=s->shift_symbol(idx, degree-d-1);
00277                 result += dictionary_weights[offs + idx]*weights[d];
00278                 offs+=s->shift_offset(1,d+1);
00279             }
00280         }
00281 
00282         result=normalizer->normalize_rhs(result, i);
00283     }
00284     s->free_feature_vector(vec, i, free_vec);
00285     return result;
00286 }
00287 
00288 float64_t* CWeightedCommWordStringKernel::compute_scoring(
00289     int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
00290     int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
00291 {
00292     if (do_init)
00293         CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas);
00294 
00295     int32_t dic_size=1<<(sizeof(uint16_t)*9);
00296     float64_t* dic=SG_MALLOC(float64_t, dic_size);
00297     memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size);
00298 
00299     merge_normal();
00300     float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat,
00301             num_sym, target, num_suppvec, IDX, alphas, false);
00302 
00303     init_dictionary(1<<(sizeof(uint16_t)*9));
00304     memcpy(dictionary_weights,dic,  sizeof(float64_t)*dic_size);
00305     SG_FREE(dic);
00306 
00307     return result;
00308 }
00309 
00310 void CWeightedCommWordStringKernel::init()
00311 {
00312     degree=0;
00313     weights=NULL;
00314 
00315     init_dictionary(1<<(sizeof(uint16_t)*9));
00316 
00317     m_parameters->add_vector(&weights, &degree, "weights",
00318             "weights for each of the subkernels of degree 1...d");
00319 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation