00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "lib/common.h"
00012 #include "kernel/WeightedCommWordStringKernel.h"
00013 #include "features/StringFeatures.h"
00014 #include "lib/io.h"
00015
00016 using namespace shogun;
00017
/** Default constructor.
 *
 * Hands the fixed arguments (0, false) to the CCommWordStringKernel
 * constructor and then resets this class' own members through init().
 */
CWeightedCommWordStringKernel::CWeightedCommWordStringKernel()
	: CCommWordStringKernel(0, false)
{
	this->init();
}
00023
00024 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00025 int32_t size, bool us)
00026 : CCommWordStringKernel(size, us)
00027 {
00028 ASSERT(us==false);
00029 init();
00030 }
00031
00032 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00033 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us,
00034 int32_t size)
00035 : CCommWordStringKernel(size, us)
00036 {
00037 ASSERT(us==false);
00038 init();
00039
00040 init(l,r);
00041 }
00042
/** Destructor: frees the weights array held by this object. */
CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel()
{
	delete[] this->weights;
}
00047
/** Binds the kernel to a pair of feature objects.
 *
 * Takes the degree from the order reported by the left-hand features,
 * rebuilds the default weights for that degree, runs the base-class
 * initialization and finally initializes the normalizer.
 *
 * @param l left-hand side features (used as CStringFeatures<uint16_t>)
 * @param r right-hand side features (used as CStringFeatures<uint16_t>)
 * @return the result of init_normalizer()
 */
bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r)
{
	// Both feature objects have to report the same order.
	ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() ==
			((CStringFeatures<uint16_t>*) r)->get_order());

	CStringFeatures<uint16_t>* left=(CStringFeatures<uint16_t>*) l;
	degree=left->get_order();
	set_wd_weights();

	// The base-class return value is not inspected here.
	CCommWordStringKernel::init(l,r);

	const bool normalizer_ok=init_normalizer();
	return normalizer_ok;
}
00058
/** Releases the weights array, leaves the pointer NULL, and then runs the
 * base-class cleanup. */
void CWeightedCommWordStringKernel::cleanup()
{
	float64_t* released=weights;
	weights=NULL;
	delete[] released;

	CCommWordStringKernel::cleanup();
}
00066
/** Replaces the weights with the default scheme for the current degree.
 *
 * Entry i receives sqrt((degree-i)/S), where S is the sum of (degree-i)
 * over all i in [0, degree).
 *
 * @return whether the weights pointer is non-NULL afterwards
 */
bool CWeightedCommWordStringKernel::set_wd_weights()
{
	delete[] weights;
	weights=new float64_t[degree];

	// Normalization constant: degree + (degree-1) + ... + 1.
	float64_t total=0;
	for (int32_t k=0; k<degree; k++)
		total+=float64_t(degree-k);

	// Store the square root of each normalized raw weight.
	for (int32_t k=0; k<degree; k++)
	{
		const float64_t raw=degree-k;
		weights[k]=CMath::sqrt(raw/total);
	}

	return weights!=NULL;
}
00084
/** Replaces the weights with the square roots of caller-supplied values.
 *
 * @param w array of at least d values; weights[i] becomes sqrt(w[i])
 * @param d number of values in w; asserted to equal the current degree
 * @return always true
 */
bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d)
{
	ASSERT(d==degree);

	// Build the replacement before releasing the current array: w is still
	// read safely if it aliases the current weights, and the member never
	// points at freed memory should the allocation throw.
	float64_t* fresh=new float64_t[degree];
	for (int32_t i=0; i<degree; i++)
		fresh[i]=CMath::sqrt(w[i]);

	delete[] weights;
	weights=fresh;
	return true;
}
00095
/** Returns a newly allocated, radix-sorted copy of src[0..len), or NULL when
 * len is not positive. The caller releases the result with delete[]. */
static uint16_t* wcwsk_sorted_copy(uint16_t* src, int32_t len)
{
	if (len<=0)
		return NULL;

	uint16_t* copy=new uint16_t[len];
	memcpy(copy, src, sizeof(uint16_t)*len);
	CMath::radix_sort(copy, len);
	return copy;
}

/** Computes the weighted kernel value between lhs vector idx_a and rhs
 * vector idx_b.
 *
 * For every d in [0, degree) a symbol mask is obtained from the lhs
 * features; both vectors are walked in lock-step and, for each run of equal
 * masked symbols, weights[d]^2 * (run length in a) * (run length in b) is
 * added to the result.
 *
 * @param idx_a index of the vector fetched from lhs
 * @param idx_b index of the vector fetched from rhs
 * @param do_sort if true, the walk operates on radix-sorted private copies;
 *        if false, the vectors are used as returned and an error is raised
 *        when the applied/registered preprocessor counts differ
 *        (NOTE(review): the lock-step walk presumably relies on the vectors
 *        already being sorted in that case - confirm)
 * @return the accumulated kernel value
 */
float64_t CWeightedCommWordStringKernel::compute_helper(
	int32_t idx_a, int32_t idx_b, bool do_sort)
{
	CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
	CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;

	int32_t alen, blen;
	bool free_avec, free_bvec;
	uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec);
	uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec);

	// By default work directly on the fetched vectors.
	uint16_t* avec=av;
	uint16_t* bvec=bv;

	if (!do_sort)
	{
		const bool lhs_incomplete=
			l->get_num_preproc() != l->get_num_preprocessed();

		if (lhs_incomplete ||
				(r->get_num_preproc() != r->get_num_preprocessed()))
		{
			SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
					" or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(),
					r->get_num_preprocessed(), r->get_num_preproc());
		}
	}
	else
	{
		// Sort private copies so the fetched vectors stay untouched.
		avec=wcwsk_sorted_copy(av, alen);
		bvec=wcwsk_sorted_copy(bv, blen);
	}

	float64_t result=0;
	uint8_t mask=0;

	for (int32_t d=0; d<degree; d++)
	{
		// Each round switches on one more mask bit, from bit degree-1 down.
		mask = mask | (1 << (degree-d-1));
		uint16_t masked=l->get_masked_symbols(0xffff, mask);

		const float64_t weight=weights[d]*weights[d];
		int32_t pos_a=0;
		int32_t pos_b=0;

		while (pos_a < alen && pos_b < blen)
		{
			uint16_t lsym=avec[pos_a] & masked;
			uint16_t rsym=bvec[pos_b] & masked;

			// Advance whichever side currently holds the smaller symbol.
			if (lsym<rsym)
			{
				pos_a++;
				continue;
			}
			if (lsym>rsym)
			{
				pos_b++;
				continue;
			}

			// Equal masked symbols: measure the run on both sides.
			const int32_t run_a_begin=pos_a;
			const int32_t run_b_begin=pos_b;

			while (pos_a<alen && (avec[pos_a] & masked) ==lsym)
				pos_a++;

			while (pos_b<blen && (bvec[pos_b] & masked) ==lsym)
				pos_b++;

			result+=weight*(pos_a-run_a_begin)*(pos_b-run_b_begin);
		}
	}

	if (do_sort)
	{
		delete[] avec;
		delete[] bvec;
	}

	l->free_feature_vector(av, idx_a, free_avec);
	r->free_feature_vector(bv, idx_b, free_bvec);

	return result;
}
00190
/** Adds one lhs vector, scaled by weight, into dictionary_weights.
 *
 * For every symbol of the vector and every d in [0, degree), the masked and
 * shifted symbol selects a dictionary slot (relative to a per-d offset) that
 * is incremented by normalize_lhs(weight*weights[d], vec_idx). When the
 * vector has positive length the kernel is marked as initialized.
 *
 * @param vec_idx index of the vector fetched from lhs
 * @param weight factor applied to this vector's contribution
 */
void CWeightedCommWordStringKernel::add_to_normal(
	int32_t vec_idx, float64_t weight)
{
	CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs;

	bool free_vec;
	int32_t len=-1;
	uint16_t* vec=s->get_feature_vector(vec_idx, len, free_vec);

	if (len>0)
	{
		for (int32_t pos=0; pos<len; pos++)
		{
			uint8_t mask=0;
			int32_t offs=0;

			for (int32_t d=0; d<degree; d++)
			{
				const int32_t shift=degree-d-1;

				// One more mask bit per round, from bit degree-1 down.
				mask = mask | (1 << shift);

				int32_t slot=s->get_masked_symbols(vec[pos], mask);
				slot=s->shift_symbol(slot, shift);

				dictionary_weights[offs + slot] += normalizer->normalize_lhs(weight*weights[d], vec_idx);

				// Move on to the dictionary region used for the next d.
				offs+=s->shift_offset(1,d+1);
			}
		}

		set_is_initialized(true);
	}

	s->free_feature_vector(vec, vec_idx, free_vec);
}
00220
/** Collapses the per-d dictionary regions into one value per symbol.
 *
 * For every symbol below the rhs features' symbol count, the entries that
 * the symbol selects for each d in [0, degree) are summed up. The
 * dictionary is then re-initialized with 1<<16 entries and filled with
 * those sums. Requires the kernel to be initialized and use_sign to be
 * false.
 */
void CWeightedCommWordStringKernel::merge_normal()
{
	ASSERT(get_is_initialized());
	ASSERT(use_sign==false);

	CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
	const uint32_t num_symbols=(uint32_t) s->get_num_symbols();

	// Zero-filled scratch table with one slot per 16-bit value.
	const int32_t dic_size=1<<(sizeof(uint16_t)*8);
	float64_t* dic=new float64_t[dic_size];
	memset(dic, 0, sizeof(float64_t)*dic_size);

	for (uint32_t sym=0; sym<num_symbols; sym++)
	{
		float64_t merged=0;
		uint8_t mask=0;
		int32_t offs=0;

		for (int32_t d=0; d<degree; d++)
		{
			const int32_t shift=degree-d-1;

			// One more mask bit per round, from bit degree-1 down.
			mask = mask | (1 << shift);

			int32_t slot=s->get_masked_symbols(sym, mask);
			slot=s->shift_symbol(slot, shift);

			merged += dictionary_weights[offs + slot];
			offs+=s->shift_offset(1,d+1);
		}

		dic[sym]=merged;
	}

	// Replace the dictionary by the merged table.
	init_dictionary(dic_size);
	memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size);
	delete[] dic;
}
00252
/** Evaluates rhs vector i against the accumulated dictionary.
 *
 * For every symbol of the vector and every d in [0, degree), the selected
 * dictionary entry times weights[d] is added up; the total is passed
 * through normalize_rhs. Raises an error when the optimization has not
 * been initialized and asserts that use_sign is false.
 *
 * @param i index of the vector fetched from rhs
 * @return the normalized sum, or 0 when the vector is NULL or empty
 */
float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i)
{
	if (!get_is_initialized())
		SG_ERROR( "CCommWordStringKernel optimization not initialized\n");

	ASSERT(use_sign==false);

	CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;

	int32_t len=-1;
	bool free_vec;
	uint16_t* vec=s->get_feature_vector(i, len, free_vec);

	float64_t result=0;
	const bool has_symbols=(vec!=NULL) && (len>0);

	if (has_symbols)
	{
		for (int32_t pos=0; pos<len; pos++)
		{
			uint8_t mask=0;
			int32_t offs=0;

			for (int32_t d=0; d<degree; d++)
			{
				const int32_t shift=degree-d-1;

				// One more mask bit per round, from bit degree-1 down.
				mask = mask | (1 << shift);

				int32_t slot=s->get_masked_symbols(vec[pos], mask);
				slot=s->shift_symbol(slot, shift);

				result += dictionary_weights[offs + slot]*weights[d];
				offs+=s->shift_offset(1,d+1);
			}
		}

		result=normalizer->normalize_rhs(result, i);
	}

	s->free_feature_vector(vec, i, free_vec);
	return result;
}
00287
/** Runs the base-class scoring on a merged dictionary and restores the
 * unmerged dictionary afterwards.
 *
 * The current dictionary (1<<18 entries) is saved, merge_normal() is
 * applied, CCommWordStringKernel::compute_scoring is called with do_init
 * forced to false, and the saved dictionary is copied back.
 *
 * @param max_degree forwarded to the base-class compute_scoring
 * @param num_feat forwarded by reference to the base-class compute_scoring
 * @param num_sym forwarded by reference to the base-class compute_scoring
 * @param target forwarded to the base-class compute_scoring
 * @param num_suppvec forwarded to init_optimization and compute_scoring
 * @param IDX forwarded to init_optimization and compute_scoring
 * @param alphas forwarded to init_optimization and compute_scoring
 * @param do_init if true, CCommWordStringKernel::init_optimization is
 *        called first
 * @return the pointer returned by the base-class compute_scoring
 */
float64_t* CWeightedCommWordStringKernel::compute_scoring(
	int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
	int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
{
	if (do_init)
		CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas);

	// Keep a copy of the unmerged dictionary; merge_normal() overwrites it.
	const int32_t dic_size=1<<(sizeof(uint16_t)*9);
	float64_t* saved=new float64_t[dic_size];
	memcpy(saved, dictionary_weights, sizeof(float64_t)*dic_size);

	merge_normal();

	float64_t* scores=CCommWordStringKernel::compute_scoring(max_degree, num_feat,
			num_sym, target, num_suppvec, IDX, alphas, false);

	// Bring the dictionary back to its size and contents from before the merge.
	init_dictionary(dic_size);
	memcpy(dictionary_weights, saved, sizeof(float64_t)*dic_size);
	delete[] saved;

	return scores;
}
00309
/** Resets the members owned by this class and registers the weights.
 *
 * Sets degree to 0 and weights to NULL, initializes the dictionary with
 * 1<<18 entries and registers the weights array (with degree as its
 * length variable) under the name "weights" in m_parameters.
 */
void CWeightedCommWordStringKernel::init()
{
	degree=0;
	weights=NULL;

	init_dictionary(1<<(sizeof(uint16_t)*9));

	// The second argument is the address of degree; the listing had it
	// garbled into a degree sign by HTML-entity decoding.
	m_parameters->add_vector(&weights, &degree, "weights",
			"weights for each of the subkernels of degree 1...d");
}