SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
WDFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Soeren Sonnenburg
8  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
12 #include <shogun/io/SGIO.h>
13 
14 using namespace shogun;
15 
17 {
18  SG_UNSTABLE("CWDFeatures::CWDFeatures() :CDotFeatures()",
19  "\n");
20 
21  strings = NULL;
22 
23  degree = 0;
24  from_degree = 0;
25  string_length = 0;
26  num_strings = 0;
27  alphabet_size = 0;
28  w_dim = 0;
29  wd_weights = NULL;
30  normalization_const = 0.0;
31 }
32 
34  int32_t order, int32_t from_order) : CDotFeatures()
35 {
36  ASSERT(str);
37  ASSERT(str->have_same_length());
38  SG_REF(str);
39 
40  strings=str;
43  CAlphabet* alpha=str->get_alphabet();
45  SG_UNREF(alpha);
46 
47  degree=order;
48  from_degree=from_order;
49  wd_weights=NULL;
52 
53 }
54 
56  : CDotFeatures(orig), strings(orig.strings),
57  degree(orig.degree), from_degree(orig.from_degree),
58  normalization_const(orig.normalization_const)
59 {
60  SG_REF(strings);
63  CAlphabet* alpha=strings->get_alphabet();
65  SG_UNREF(alpha);
66 
67  wd_weights=NULL;
69 }
70 
72 {
75 }
76 
77 float64_t CWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
78 {
79  ASSERT(df);
82  CWDFeatures* wdf = (CWDFeatures*) df;
83 
84  int32_t len1, len2;
85  bool free_vec1, free_vec2;
86 
87  uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
88  uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
89 
90  ASSERT(len1==len2);
91 
92  float64_t sum=0.0;
93 
94  for (int32_t i=0; i<len1; i++)
95  {
96  for (int32_t j=0; (i+j<len1) && (j<degree); j++)
97  {
98  if (vec1[i+j]!=vec2[i+j])
99  break ;
100  sum += wd_weights[j]*wd_weights[j];
101  }
102  }
103  strings->free_feature_vector(vec1, vec_idx1, free_vec1);
104  wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
105  return sum/CMath::sq(normalization_const);
106 }
107 
108 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
109 {
110  if (vec2_len != w_dim)
111  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
112 
113  float64_t sum=0;
114  int32_t lim=CMath::min(degree, string_length);
115  int32_t len;
116  bool free_vec1;
117  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
118  int32_t* val=SG_MALLOC(int32_t, len);
119  SGVector<int32_t>::fill_vector(val, len, 0);
120 
121  int32_t asize=alphabet_size;
122  int32_t asizem1=1;
123  int32_t offs=0;
124 
125  for (int32_t k=0; k<lim; k++)
126  {
127  float64_t wd = wd_weights[k];
128 
129  int32_t o=offs;
130  for (int32_t i=0; i+k < len; i++)
131  {
132  val[i]+=asizem1*vec[i+k];
133  sum+=vec2[val[i]+o]*wd;
134  o+=asize;
135  }
136  offs+=asize*len;
137  asize*=alphabet_size;
138  asizem1*=alphabet_size;
139  }
140  SG_FREE(val);
141  strings->free_feature_vector(vec, vec_idx1, free_vec1);
142 
143  return sum/normalization_const;
144 }
145 
146 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
147 {
148  if (vec2_len != w_dim)
149  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
150 
151  int32_t lim=CMath::min(degree, string_length);
152  int32_t len;
153  bool free_vec1;
154  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
155  int32_t* val=SG_MALLOC(int32_t, len);
156  SGVector<int32_t>::fill_vector(val, len, 0);
157 
158  int32_t asize=alphabet_size;
159  int32_t asizem1=1;
160  int32_t offs=0;
161 
162  for (int32_t k=0; k<lim; k++)
163  {
165 
166  if (abs_val)
167  wd=CMath::abs(wd);
168 
169  int32_t o=offs;
170  for (int32_t i=0; i+k < len; i++)
171  {
172  val[i]+=asizem1*vec[i+k];
173  vec2[val[i]+o]+=wd;
174  o+=asize;
175  }
176  offs+=asize*len;
177  asize*=alphabet_size;
178  asizem1*=alphabet_size;
179  }
180  SG_FREE(val);
181 
182  strings->free_feature_vector(vec, vec_idx1, free_vec1);
183 }
184 
186 {
187  ASSERT(degree>0 && degree<=8);
190  w_dim=0;
191 
192  for (int32_t i=0; i<degree; i++)
193  {
195  wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
196  }
197  SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
198 }
199 
200 
202 {
203  if (n==0)
204  {
206  for (int32_t i=0; i<degree; i++)
208 
210  }
211  else
213 
214  SG_DEBUG("normalization_const:%f\n", normalization_const);
215 }
216 
217 void* CWDFeatures::get_feature_iterator(int32_t vector_index)
218 {
219  if (vector_index>=num_strings)
220  {
221  SG_ERROR("Index out of bounds (number of strings %d, you "
222  "requested %d)\n", num_strings, vector_index);
223  }
224 
225  wd_feature_iterator* it=SG_MALLOC(wd_feature_iterator, 1);
226 
227  it->lim=CMath::min(degree, string_length);
228  it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
229  it->vidx=vector_index;
230 
231  it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree);
232  it->val=SG_MALLOC(int32_t, it->vlen);
233  SGVector<int32_t>::fill_vector(it->val, it->vlen, 0);
234 
235  it->asize=alphabet_size;
236  it->asizem1=1;
237  it->offs=0;
238  it->k=0;
239  it->i=0;
240  it->o=0;
241 
242  return it;
243 }
244 
245 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
246 {
247  wd_feature_iterator* it=(wd_feature_iterator*) iterator;
248 
249  if (it->i + it->k >= it->vlen)
250  {
251  if (it->k < it->lim-1)
252  {
253  it->offs+=it->asize*it->vlen;
254  it->asize*=alphabet_size;
255  it->asizem1*=alphabet_size;
256  it->k++;
257  it->i=0;
258  it->o=it->offs;
259  }
260  else
261  return false;
262  }
263 
264  int32_t i=it->i;
265  int32_t k=it->k;
266 #ifdef DEBUG_WDFEATURES
267  SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1);
268 #endif
269 
270  it->val[i]+=it->asizem1*it->vec[i+k];
272  index=it->val[i]+it->o;
273 #ifdef DEBUG_WDFEATURES
274  SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen);
275 #endif
276 
277  it->o+=it->asize;
278  it->i=i+1;
279 
280  return true;
281 }
282 
284 {
285  ASSERT(iterator);
286  wd_feature_iterator* it=(wd_feature_iterator*) iterator;
287  strings->free_feature_vector(it->vec, it->vidx, it->vfree);
288  SG_FREE(it->val);
289  SG_FREE(it);
290 }
291 
293 {
294  return new CWDFeatures(*this);
295 }
296 
298 {
299  return w_dim;
300 }
301 
303 {
304  int32_t vlen=-1;
305  bool free_vec;
306  uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
307  strings->free_feature_vector(vec, num, free_vec);
308  return degree*vlen;
309 }
310 
312 {
313  return F_UNKNOWN;
314 }
315 
317 {
318  return C_WD;
319 }
320 
322 {
323  return num_strings;
324 }
325 
326 int32_t CWDFeatures::get_size() const
327 {
328  return sizeof(float64_t);
329 }
330 
332 {
333  return normalization_const;
334 }
335 
337 {
338  ASSERT(weights.vlen==degree);
339 
340  for (int32_t i=0; i<degree; i++)
341  wd_weights[i]=weights.vector[i];
342 }
343 

SHOGUN Machine Learning Toolbox - Documentation