SHOGUN 4.2.0
WeightedCommWordStringKernel.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2009 Soeren Sonnenburg
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 */

#include <shogun/lib/common.h>
#include <shogun/kernel/string/WeightedCommWordStringKernel.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/io/SGIO.h>

using namespace shogun;

CWeightedCommWordStringKernel::CWeightedCommWordStringKernel()
: CCommWordStringKernel(0, false)
{
    init();
}

CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
    int32_t size, bool us)
: CCommWordStringKernel(size, us)
{
    ASSERT(us==false)
    init();
}

CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
    CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us,
    int32_t size)
: CCommWordStringKernel(size, us)
{
    ASSERT(us==false)
    init();

    init(l,r);
}

CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel()
{
    SG_FREE(weights);
}
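
// Initialize the kernel on a pair of word (uint16_t) string features. Both
// feature sets must have been built with the same k-mer order; that order
// becomes the number of weighted sub-kernels (degree), and default weights
// are assigned before the base class and the normalizer are initialized.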
bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r)
{
    ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() ==
            ((CStringFeatures<uint16_t>*) r)->get_order());
    degree=((CStringFeatures<uint16_t>*) l)->get_order();
    set_wd_weights();

    CCommWordStringKernel::init(l,r);
    return init_normalizer();
}

void CWeightedCommWordStringKernel::cleanup()
{
    SG_FREE(weights);
    weights=NULL;

    CCommWordStringKernel::cleanup();
}
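
// Default sub-kernel weights: degree d gets a weight proportional to
// (degree-d), the weights are normalized to sum to one, and their square
// roots are stored so that weights[d]*weights[d] is the effective weight.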
bool CWeightedCommWordStringKernel::set_wd_weights()
{
    SG_FREE(weights);
    weights=SG_MALLOC(float64_t, degree);

    int32_t i;
    float64_t sum=0;
    for (i=0; i<degree; i++)
    {
        weights[i]=degree-i;
        sum+=weights[i];
    }
    for (i=0; i<degree; i++)
        weights[i]=CMath::sqrt(weights[i]/sum);

    return weights!=NULL;
}
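
// User-supplied sub-kernel weights (one per degree); as with the defaults,
// the square roots of the given values are stored.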
bool CWeightedCommWordStringKernel::set_weights(SGVector<float64_t> w)
{
    ASSERT(w.vlen==degree)

    SG_FREE(weights);
    weights = w.vector;
    for (int32_t i=0; i<degree; i++)
        weights[i]=CMath::sqrt(weights[i]);
    return true;
}
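
// Kernel value between examples idx_a and idx_b: for each degree d only d+1
// of the k symbols packed into every 16-bit word are kept (via the mask) and
// the matching masked words of the two sorted vectors are counted, weighted
// by weights[d]^2. With do_sort=false the vectors must already have been
// sorted by a preprocessor.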
float64_t CWeightedCommWordStringKernel::compute_helper(
    int32_t idx_a, int32_t idx_b, bool do_sort)
{
    int32_t alen, blen;
    bool free_avec, free_bvec;

    CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) lhs;
    CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) rhs;

    uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec);
    uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec);

    uint16_t* avec=av;
    uint16_t* bvec=bv;

    if (do_sort)
    {
        if (alen>0)
        {
            avec=SG_MALLOC(uint16_t, alen);
            memcpy(avec, av, sizeof(uint16_t)*alen);
            CMath::radix_sort(avec, alen);
        }
        else
            avec=NULL;

        if (blen>0)
        {
            bvec=SG_MALLOC(uint16_t, blen);
            memcpy(bvec, bv, sizeof(uint16_t)*blen);
            CMath::radix_sort(bvec, blen);
        }
        else
            bvec=NULL;
    }
    else
    {
        if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) ||
                (r->get_num_preprocessors() != r->get_num_preprocessed()))
        {
            SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
                    " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(),
                    r->get_num_preprocessed(), r->get_num_preprocessors());
        }
    }

    float64_t result=0;
    uint8_t mask=0;

    for (int32_t d=0; d<degree; d++)
    {
        mask = mask | (1 << (degree-d-1));
        uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask);

        int32_t left_idx=0;
        int32_t right_idx=0;
        float64_t weight=weights[d]*weights[d];

        while (left_idx < alen && right_idx < blen)
        {
            uint16_t lsym=avec[left_idx] & masked;
            uint16_t rsym=bvec[right_idx] & masked;

            if (lsym == rsym)
            {
                int32_t old_left_idx=left_idx;
                int32_t old_right_idx=right_idx;

                while (left_idx<alen && (avec[left_idx] & masked) ==lsym)
                    left_idx++;

                while (right_idx<blen && (bvec[right_idx] & masked) ==lsym)
                    right_idx++;

                result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
            }
            else if (lsym<rsym)
                left_idx++;
            else
                right_idx++;
        }
    }

    if (do_sort)
    {
        SG_FREE(avec);
        SG_FREE(bvec);
    }

    l->free_feature_vector(av, idx_a, free_avec);
    r->free_feature_vector(bv, idx_b, free_bvec);

    return result;
}
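
// Linadd support: add example vec_idx with coefficient 'weight' to the normal
// vector by accumulating normalize_lhs(weight*weights[d]) into the per-degree
// slots of dictionary_weights for every masked word of the example.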
void CWeightedCommWordStringKernel::add_to_normal(
    int32_t vec_idx, float64_t weight)
{
    int32_t len=-1;
    bool free_vec;
    CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs;
    uint16_t* vec=s->get_feature_vector(vec_idx, len, free_vec);

    if (len>0)
    {
        for (int32_t j=0; j<len; j++)
        {
            uint8_t mask=0;
            int32_t offs=0;
            for (int32_t d=0; d<degree; d++)
            {
                mask = mask | (1 << (degree-d-1));
                int32_t idx=s->get_masked_symbols(vec[j], mask);
                idx=s->shift_symbol(idx, degree-d-1);
                dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx);
                offs+=s->shift_offset(1,d+1);
            }
        }

        set_is_initialized(true);
    }

    s->free_feature_vector(vec, vec_idx, free_vec);
}
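
// Collapse the per-degree entries accumulated in dictionary_weights into a
// single score per 16-bit symbol, so the unweighted base-class machinery can
// operate on the merged dictionary.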
void CWeightedCommWordStringKernel::merge_normal()
{
    ASSERT(get_is_initialized())
    ASSERT(use_sign==false)

    CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
    uint32_t num_symbols=(uint32_t) s->get_num_symbols();
    int32_t dic_size=1<<(sizeof(uint16_t)*8);
    float64_t* dic=SG_MALLOC(float64_t, dic_size);
    memset(dic, 0, sizeof(float64_t)*dic_size);

    for (uint32_t sym=0; sym<num_symbols; sym++)
    {
        float64_t result=0;
        uint8_t mask=0;
        int32_t offs=0;
        for (int32_t d=0; d<degree; d++)
        {
            mask = mask | (1 << (degree-d-1));
            int32_t idx=s->get_masked_symbols(sym, mask);
            idx=s->shift_symbol(idx, degree-d-1);
            result += dictionary_weights[offs + idx];
            offs+=s->shift_offset(1,d+1);
        }
        dic[sym]=result;
    }

    init_dictionary(1<<(sizeof(uint16_t)*8));
    memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size);
    SG_FREE(dic);
}
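
// Fast (linadd) evaluation against rhs example i: look up every masked word
// in dictionary_weights, apply the per-degree weight and normalize the sum
// for the rhs.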
float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i)
{
    if (!get_is_initialized())
        SG_ERROR("CCommWordStringKernel optimization not initialized\n")

    ASSERT(use_sign==false)

    float64_t result=0;
    bool free_vec;
    int32_t len=-1;
    CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
    uint16_t* vec=s->get_feature_vector(i, len, free_vec);

    if (vec && len>0)
    {
        for (int32_t j=0; j<len; j++)
        {
            uint8_t mask=0;
            int32_t offs=0;
            for (int32_t d=0; d<degree; d++)
            {
                mask = mask | (1 << (degree-d-1));
                int32_t idx=s->get_masked_symbols(vec[j], mask);
                idx=s->shift_symbol(idx, degree-d-1);
                result += dictionary_weights[offs + idx]*weights[d];
                offs+=s->shift_offset(1,d+1);
            }
        }

        result=normalizer->normalize_rhs(result, i);
    }
    s->free_feature_vector(vec, i, free_vec);
    return result;
}
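
// Scoring: temporarily save dictionary_weights, merge the per-degree entries
// via merge_normal(), delegate to CCommWordStringKernel::compute_scoring()
// and restore the saved dictionary afterwards.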
float64_t* CWeightedCommWordStringKernel::compute_scoring(
    int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
    int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
{
    if (do_init)
        CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas);

    int32_t dic_size=1<<(sizeof(uint16_t)*9);
    float64_t* dic=SG_MALLOC(float64_t, dic_size);
    memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size);

    merge_normal();
    float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat,
            num_sym, target, num_suppvec, IDX, alphas, false);

    init_dictionary(1<<(sizeof(uint16_t)*9));
    memcpy(dictionary_weights,dic, sizeof(float64_t)*dic_size);
    SG_FREE(dic);

    return result;
}
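
// Shared constructor code: no weights yet, a dictionary with 2^18 slots
// (room for the per-degree entries) and registration of 'weights' as a
// serializable model parameter.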
void CWeightedCommWordStringKernel::init()
{
    degree=0;
    weights=NULL;

    init_dictionary(1<<(sizeof(uint16_t)*9));

    m_parameters->add_vector(&weights, &degree, "weights",
            "weights for each of the subkernels of degree 1...d");
}
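
For orientation, here is a minimal usage sketch of how this kernel is typically wired up from the C++ API: raw DNA strings are embedded into 16-bit k-mer words, sorted with CSortWordString (needed because the base class evaluates compute_helper() with do_sort=false), and handed to the kernel with the default sub-kernel weights. This is an illustrative sketch written against the Shogun 4.x headers referenced above, not an excerpt from the official examples; the sequences, the k-mer order and the cache size are arbitrary.

#include <cstring>
#include <shogun/base/init.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/preprocessor/SortWordString.h>
#include <shogun/kernel/string/WeightedCommWordStringKernel.h>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    // two toy DNA sequences of equal length (arbitrary example data)
    const char* dna[]={"ACGTACGTAC", "ACGTTTGTAC"};
    const int32_t num=2, len=10;

    SGStringList<char> list(num, len);
    for (int32_t i=0; i<num; i++)
    {
        list.strings[i].string=SG_MALLOC(char, len);
        list.strings[i].slen=len;
        memcpy(list.strings[i].string, dna[i], len);
    }

    CStringFeatures<char>* char_feats=new CStringFeatures<char>(list, DNA);

    // embed the char strings into 16-bit words holding k-mers of order 3 (arbitrary)
    int32_t order=3;
    CStringFeatures<uint16_t>* word_feats=new CStringFeatures<uint16_t>(DNA);
    word_feats->obtain_from_char(char_feats, order-1, order, 0, false);

    // the comm-word kernels expect sorted word vectors
    word_feats->add_preprocessor(new CSortWordString());
    word_feats->apply_preprocessor();

    // weighted spectrum kernel with default (linearly decaying) sub-kernel
    // weights; 10 is an arbitrary kernel cache size
    CWeightedCommWordStringKernel* kernel=
        new CWeightedCommWordStringKernel(word_feats, word_feats, false, 10);

    SGMatrix<float64_t> km=kernel->get_kernel_matrix();
    km.display_matrix("K");

    SG_UNREF(kernel);
    exit_shogun();
    return 0;
}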