SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
WDFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Soeren Sonnenburg
8  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
12 #include <shogun/io/SGIO.h>
13 
14 using namespace shogun;
15 
// NOTE(review): the signature line of this body is absent from this listing;
// the SG_UNSTABLE message below names it as the default constructor.
// The body puts every member into an empty state: no strings attached,
// all sizes and degrees zero, no weight buffer.
17 {
18  SG_UNSTABLE("CWDFeatures::CWDFeatures() :CDotFeatures()",
19  "\n");
20 
// No string container is attached.
21  strings = NULL;
22 
23  degree = 0;
24  from_degree = 0;
25  string_length = 0;
26  num_strings = 0;
27  alphabet_size = 0;
28  w_dim = 0;
// No weight buffer is allocated; the destructor-side SG_FREE therefore sees NULL.
29  wd_weights = NULL;
30  normalization_const = 0.0;
31 }
32 
34  int32_t order, int32_t from_order) : CDotFeatures()
35 {
// Constructor taking a string container plus the two degree parameters.
// NOTE(review): the first line of the signature and listing lines 41-42, 44
// and 50-51 are absent from this extract; only the visible statements are
// described here.
// Preconditions: str is non-NULL and reports have_same_length().
36  ASSERT(str)
37  ASSERT(str->have_same_length())
// Take a reference on str before keeping the pointer in `strings`.
38  SG_REF(str);
39 
40  strings=str;
// The alphabet handle is fetched and released again; what is read from it
// in between is on a line missing from this listing — TODO confirm.
43  CAlphabet* alpha=str->get_alphabet();
45  SG_UNREF(alpha);
46 
47  degree=order;
48  from_degree=from_order;
// Start without a weight buffer.
49  wd_weights=NULL;
52 
53 }
54 
56  : CDotFeatures(orig), strings(orig.strings),
57  degree(orig.degree), from_degree(orig.from_degree),
58  normalization_const(orig.normalization_const)
59 {
// Copy constructor: shares orig's string container (pointer copy plus an
// extra reference) and copies degree, from_degree and normalization_const.
// NOTE(review): the signature line and listing lines 64-65, 67 and 79 are
// absent from this extract.
60  SG_REF(strings);
61 
62  if (strings)
63  {
// Alphabet handle is fetched and released; its use is on a missing line.
66  CAlphabet* alpha=strings->get_alphabet();
68  SG_UNREF(alpha);
69  }
70  else
71  {
// Without a string container all size fields are zeroed.
72  string_length = 0;
73  num_strings = 0;
74  alphabet_size = 0;
75  }
76 
// The weight buffer is not copied from orig; it starts as NULL here.
77  wd_weights=NULL;
// NOTE(review): the statement guarded by this condition (listing line 79)
// is missing from this extract — restore it from the upstream file.
78  if (degree>0)
80 }
81 
// NOTE(review): the signature line and listing line 84 are absent from this
// extract; presumably this is the destructor — confirm against upstream.
// Visible effect: releases the wd_weights buffer.
83 {
85  SG_FREE(wd_weights);
86 }
87 
/**
 * Dot product between string vec_idx1 of this object and string vec_idx2 of df.
 *
 * For every start position i the two strings are compared symbol by symbol.
 * Each further matching symbol j (j < degree) adds wd_weights[j]^2 to the sum;
 * the run for that start position ends at the first mismatch. The total is
 * divided by the square of normalization_const.
 *
 * @param vec_idx1 index of the string fetched from this object's `strings`
 * @param df       other feature object, must be non-NULL; it is cast to
 *                 CWDFeatures* (NOTE(review): listing lines 91-92 are absent
 *                 from this extract, so any type check is not visible here)
 * @param vec_idx2 index of the string fetched from df's `strings`
 * @return the weighted match sum divided by CMath::sq(normalization_const)
 */
88 float64_t CWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
89 {
90  ASSERT(df)
93  CWDFeatures* wdf = (CWDFeatures*) df;
94 
95  int32_t len1, len2;
96  bool free_vec1, free_vec2;
97 
98  uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
99  uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
100 
// Both strings must have the same length; the loops below index vec2 with
// bounds derived from len1 only.
101  ASSERT(len1==len2)
102 
103  float64_t sum=0.0;
104 
105  for (int32_t i=0; i<len1; i++)
106  {
// j is the 0-based match order at start position i.
107  for (int32_t j=0; (i+j<len1) && (j<degree); j++)
108  {
109  if (vec1[i+j]!=vec2[i+j])
110  break ;
// Only this object's weights are used, for both factors.
111  sum += wd_weights[j]*wd_weights[j];
112  }
113  }
// Hand both vectors back to their containers.
114  strings->free_feature_vector(vec1, vec_idx1, free_vec1);
115  wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
116  return sum/CMath::sq(normalization_const);
117 }
118 
119 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
120 {
121  if (vec2_len != w_dim)
122  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
123 
124  float64_t sum=0;
125  int32_t lim=CMath::min(degree, string_length);
126  int32_t len;
127  bool free_vec1;
128  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
129  int32_t* val=SG_MALLOC(int32_t, len);
130  SGVector<int32_t>::fill_vector(val, len, 0);
131 
132  int32_t asize=alphabet_size;
133  int32_t asizem1=1;
134  int32_t offs=0;
135 
136  for (int32_t k=0; k<lim; k++)
137  {
138  float64_t wd = wd_weights[k];
139 
140  int32_t o=offs;
141  for (int32_t i=0; i+k < len; i++)
142  {
143  val[i]+=asizem1*vec[i+k];
144  sum+=vec2[val[i]+o]*wd;
145  o+=asize;
146  }
147  offs+=asize*len;
148  asize*=alphabet_size;
149  asizem1*=alphabet_size;
150  }
151  SG_FREE(val);
152  strings->free_feature_vector(vec, vec_idx1, free_vec1);
153 
154  return sum/normalization_const;
155 }
156 
/**
 * Add the WD feature representation of one string to a dense vector.
 *
 * For each order k (0 <= k < min(degree, string_length)) and each start
 * position i with i+k < len, `wd` is added to one selected entry of vec2.
 *
 * NOTE(review): the declaration of `wd` (listing line 175) is absent from
 * this extract, so how it is derived — and where `alpha` enters — is not
 * visible here; restore the line from the upstream file.
 *
 * @param alpha    not referenced in the visible lines (see note above)
 * @param vec_idx1 index of the string fetched from `strings`
 * @param vec2     dense vector that is updated in place
 * @param vec2_len length of vec2; SG_ERROR is raised if it differs from w_dim
 * @param abs_val  if true, the absolute value of `wd` is added
 */
157 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
158 {
159  if (vec2_len != w_dim)
160  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
161 
162  int32_t lim=CMath::min(degree, string_length);
163  int32_t len;
164  bool free_vec1;
165  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
// val[i] accumulates sum_{j<=k} alphabet_size^j * vec[i+j] as k grows.
166  int32_t* val=SG_MALLOC(int32_t, len);
167  SGVector<int32_t>::fill_vector(val, len, 0);
168 
// asize: entries per position at the current order; asizem1: alphabet_size^k;
// offs: first index in vec2 belonging to the current order.
169  int32_t asize=alphabet_size;
170  int32_t asizem1=1;
171  int32_t offs=0;
172 
173  for (int32_t k=0; k<lim; k++)
174  {
176 
177  if (abs_val)
178  wd=CMath::abs(wd);
179 
180  int32_t o=offs;
181  for (int32_t i=0; i+k < len; i++)
182  {
183  val[i]+=asizem1*vec[i+k];
184  vec2[val[i]+o]+=wd;
185  o+=asize;
186  }
// Skip all len position blocks of this order, then grow to the next order.
187  offs+=asize*len;
188  asize*=alphabet_size;
189  asizem1*=alphabet_size;
190  }
191  SG_FREE(val);
192 
193  strings->free_feature_vector(vec, vec_idx1, free_vec1);
194 }
195 
// NOTE(review): the signature line and listing line 205 are absent from this
// extract. Visible effect: reallocates wd_weights with `degree` entries and
// fills entry i with sqrt(2*(from_degree-i) / (from_degree*(from_degree+1))).
// w_dim is reset to 0 here; any further update of w_dim is not visible
// (presumably on the missing line — TODO confirm).
197 {
// Only degrees 1..8 are accepted.
198  ASSERT(degree>0 && degree<=8)
// Drop any previous buffer before allocating the new one.
199  SG_FREE(wd_weights);
200  wd_weights=SG_MALLOC(float64_t, degree);
201  w_dim=0;
202 
203  for (int32_t i=0; i<degree; i++)
204  {
// NOTE(review): the divisor is zero when from_degree is 0 — no guard is visible.
206  wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
207  }
208  SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length)
209 }
210 
211 
// NOTE(review): the signature line and listing lines 216, 218, 220 and 223
// are absent from this extract, so the statements that actually assign
// normalization_const are not visible. What can be read: the case n==0 is
// handled by a loop over i in [0, degree), any other n by a single statement,
// and the resulting normalization_const is logged.
213 {
214  if (n==0)
215  {
217  for (int32_t i=0; i<degree; i++)
219 
221  }
222  else
224 
225  SG_DEBUG("normalization_const:%f\n", normalization_const)
226 }
227 
228 void* CWDFeatures::get_feature_iterator(int32_t vector_index)
229 {
230  if (vector_index>=num_strings)
231  {
232  SG_ERROR("Index out of bounds (number of strings %d, you "
233  "requested %d)\n", num_strings, vector_index);
234  }
235 
236  wd_feature_iterator* it=SG_MALLOC(wd_feature_iterator, 1);
237 
238  it->lim=CMath::min(degree, string_length);
239  it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
240  it->vidx=vector_index;
241 
242  it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree);
243  it->val=SG_MALLOC(int32_t, it->vlen);
244  SGVector<int32_t>::fill_vector(it->val, it->vlen, 0);
245 
246  it->asize=alphabet_size;
247  it->asizem1=1;
248  it->offs=0;
249  it->k=0;
250  it->i=0;
251  it->o=0;
252 
253  return it;
254 }
255 
/**
 * Advance a wd_feature_iterator by one feature.
 *
 * Positions i are visited for the current order k while i+k < vlen. When the
 * positions of an order are exhausted, the iterator moves on to order k+1
 * (as long as k < lim-1) and restarts at position 0; otherwise iteration ends.
 *
 * NOTE(review): listing line 282 is absent from this extract; `value` is not
 * assigned in the visible lines — restore the line from the upstream file.
 *
 * @param index    receives val[i] + o for the current position and order
 * @param value    see note above
 * @param iterator pointer to a wd_feature_iterator (cast without a NULL check)
 * @return true if `index` was produced, false when iteration is finished
 */
256 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
257 {
258  wd_feature_iterator* it=(wd_feature_iterator*) iterator;
259 
// All start positions of the current order have been consumed.
260  if (it->i + it->k >= it->vlen)
261  {
262  if (it->k < it->lim-1)
263  {
// Move to the next order: skip the vlen position blocks of this order,
// grow the block size and the place value, and restart at position 0.
264  it->offs+=it->asize*it->vlen;
265  it->asize*=alphabet_size;
266  it->asizem1*=alphabet_size;
267  it->k++;
268  it->i=0;
269  it->o=it->offs;
270  }
271  else
272  return false;
273  }
274 
275  int32_t i=it->i;
276  int32_t k=it->k;
277 #ifdef DEBUG_WDFEATURES
278  SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1)
279 #endif
280 
// Extend the accumulated value at position i by the symbol at i+k.
281  it->val[i]+=it->asizem1*it->vec[i+k];
283  index=it->val[i]+it->o;
284 #ifdef DEBUG_WDFEATURES
285  SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen)
286 #endif
287 
// Step to the block of the next position.
288  it->o+=it->asize;
289  it->i=i+1;
290 
291  return true;
292 }
293 
// NOTE(review): the signature line of this body is absent from this listing.
// Visible effect: releases everything held by a wd_feature_iterator — the
// string goes back to `strings` (one free_feature_vector call), then the val
// buffer and the iterator itself are freed. `iterator` must be non-NULL.
295 {
296  ASSERT(iterator)
297  wd_feature_iterator* it=(wd_feature_iterator*) iterator;
298  strings->free_feature_vector(it->vec, it->vidx, it->vfree);
299  SG_FREE(it->val);
300  SG_FREE(it);
301 }
302 
// NOTE(review): the signature line of this body is absent from this listing.
// Returns a new heap-allocated CWDFeatures built with the copy constructor.
304 {
305  return new CWDFeatures(*this);
306 }
307 
// NOTE(review): the signature line of this body is absent from this listing.
// Returns the stored w_dim member unchanged.
309 {
310  return w_dim;
311 }
312 
// NOTE(review): the signature line of this body is absent from this listing;
// `num` is presumably its parameter — confirm against upstream.
// Fetches string `num` only to learn its length, hands it straight back, and
// returns degree * length.
314 {
// vlen is overwritten by get_feature_vector; -1 is only the initial value.
315  int32_t vlen=-1;
316  bool free_vec;
317  uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
318  strings->free_feature_vector(vec, num, free_vec);
319  return degree*vlen;
320 }
321 
// NOTE(review): the signature line of this body is absent from this listing.
// Always returns the constant F_UNKNOWN.
323 {
324  return F_UNKNOWN;
325 }
326 
// NOTE(review): the signature line of this body is absent from this listing.
// Always returns the constant C_WD.
328 {
329  return C_WD;
330 }
331 
// NOTE(review): the signature line of this body is absent from this listing.
// Returns the stored num_strings member unchanged.
333 {
334  return num_strings;
335 }
336 
// NOTE(review): the signature line of this body is absent from this listing.
// Returns the stored normalization_const member unchanged.
338 {
339  return normalization_const;
340 }
341 
// NOTE(review): the signature line of this body is absent from this listing;
// `weights` is presumably its parameter — confirm against upstream.
// Copies `degree` entries from weights.vector into the existing wd_weights
// buffer. weights.vlen must equal degree; wd_weights is not (re)allocated here.
343 {
344  ASSERT(weights.vlen==degree)
345 
346  for (int32_t i=0; i<degree; i++)
347  wd_weights[i]=weights.vector[i];
348 }
349 

SHOGUN Machine Learning Toolbox - Documentation