SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashedWDFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2010 Soeren Sonnenburg
8  * Copyright (C) 2010 Berlin Institute of Technology
9  */
10 
12 #include <shogun/io/SGIO.h>
13 
14 using namespace shogun;
15 
17 {
18  SG_UNSTABLE("CHashedWDFeatures::CHashedWDFeatures()", "\n")
19 
20  strings = NULL;
21 
22  degree = 0;
23  start_degree = 0;
24  from_degree = 0;
25  string_length = 0;
26  num_strings = 0;
27  alphabet_size = 0;
28  w_dim = 0;
29  partial_w_dim = 0;
30  wd_weights = NULL;
31  mask = 0;
32  m_hash_bits = 0;
33 
34  normalization_const = 0.0;
35 }
36 
38  int32_t start_order, int32_t order, int32_t from_order,
39  int32_t hash_bits) : CDotFeatures()
40 {
41  ASSERT(start_order>=0)
42  ASSERT(start_order<order)
43  ASSERT(order<=from_order)
44  ASSERT(hash_bits>0)
45  ASSERT(str)
46  ASSERT(str->have_same_length())
47  SG_REF(str);
48 
49  strings=str;
52  CAlphabet* alpha=str->get_alphabet();
54  SG_UNREF(alpha);
55 
56  degree=order;
57  start_degree=start_order;
58  from_degree=from_order;
59  m_hash_bits=hash_bits;
62 }
63 
65  : CDotFeatures(orig), strings(orig.strings),
66  degree(orig.degree), start_degree(orig.start_degree),
67  from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
68  normalization_const(orig.normalization_const)
69 {
70 
71 
72  SG_REF(strings);
73  if (strings)
74  {
77  CAlphabet* alpha=strings->get_alphabet();
79  SG_UNREF(alpha);
80  }
81  else
82  {
83  string_length = 0;
84  num_strings = 0;
85  alphabet_size = 0;
86  }
87 
88  if (degree>0)
90 }
91 
93 {
95  SG_FREE(wd_weights);
96 }
97 
98 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
99 {
100  ASSERT(df)
104 
105  int32_t len1, len2;
106  bool free_vec1, free_vec2;
107 
108  uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
109  uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
110 
111  ASSERT(len1==len2)
112 
113  float64_t sum=0.0;
114 
115  for (int32_t i=0; i<len1; i++)
116  {
117  for (int32_t j=0; (i+j<len1) && (j<degree); j++)
118  {
119  if (vec1[i+j]!=vec2[i+j])
120  break;
121  if (j>=start_degree)
122  sum += wd_weights[j]*wd_weights[j];
123  }
124  }
125  strings->free_feature_vector(vec1, vec_idx1, free_vec1);
126  wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
127  return sum/CMath::sq(normalization_const);
128 }
129 
130 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
131 {
132  if (vec2_len != w_dim)
133  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
134 
135  float64_t sum=0;
136  int32_t lim=CMath::min(degree, string_length);
137  int32_t len;
138  bool free_vec1;
139  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
140  uint32_t* val=SG_MALLOC(uint32_t, len);
141 
142  uint32_t offs=0;
143 
144  if (start_degree>0)
145  {
146  // compute hash for strings of length start_degree-1
147  for (int32_t i=0; i+start_degree < len; i++)
148  val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
149  }
150  else
151  SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
152 
153  for (int32_t k=start_degree; k<lim; k++)
154  {
155  float64_t wd = wd_weights[k];
156 
157  uint32_t o=offs;
158  uint32_t carry = 0;
159  uint32_t chunk = 0;
160 
161  for (int32_t i=0; i+k < len; i++)
162  {
163  chunk++;
164  CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
165  uint32_t h =
166  CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
167 #ifdef DEBUG_HASHEDWD
168  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
169 #endif
170  sum+=vec2[o+(h & mask)]*wd;
171  val[i] = h;
172  o+=partial_w_dim;
173  }
174  val[len-k-1] =
175  CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
176  offs+=partial_w_dim*len;
177  }
178  SG_FREE(val);
179  strings->free_feature_vector(vec, vec_idx1, free_vec1);
180 
181  return sum/normalization_const;
182 }
183 
184 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
185 {
186  if (vec2_len != w_dim)
187  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
188 
189  int32_t lim=CMath::min(degree, string_length);
190  int32_t len;
191  bool free_vec1;
192  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
193  uint32_t* val=SG_MALLOC(uint32_t, len);
194 
195  uint32_t offs=0;
196 
197  if (start_degree>0)
198  {
199  // compute hash for strings of length start_degree-1
200  for (int32_t i=0; i+start_degree < len; i++)
201  val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
202  }
203  else
204  SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
205 
206  for (int32_t k=start_degree; k<lim; k++)
207  {
209 
210  if (abs_val)
211  wd=CMath::abs(wd);
212 
213  uint32_t o=offs;
214  uint32_t carry = 0;
215  uint32_t chunk = 0;
216 
217  for (int32_t i=0; i+k < len; i++)
218  {
219  chunk++;
220  CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
221  uint32_t h = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
222 
223 #ifdef DEBUG_HASHEDWD
224  SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h)
225  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
226 #endif
227  vec2[o+(h & mask)]+=wd;
228  val[i] = h;
229  o+=partial_w_dim;
230  }
231  val[len-k-1] =
232  CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
233 
234  offs+=partial_w_dim*len;
235  }
236 
237  SG_FREE(val);
238  strings->free_feature_vector(vec, vec_idx1, free_vec1);
239 }
240 
242 {
243  ASSERT(degree>0)
244 
245  mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
248 
249  wd_weights=SG_MALLOC(float64_t, degree);
250 
251  for (int32_t i=0; i<degree; i++)
252  wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
253 
254  SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, "
255  "dim=%d partial_dim=%d num=%d, len=%d\n",
256  degree, from_degree, alphabet_size,
258 }
259 
260 
262 {
263  if (n==0)
264  {
266  for (int32_t i=0; i<degree; i++)
268 
270  }
271  else
273 
274  SG_DEBUG("normalization_const:%f\n", normalization_const)
275 }
276 
278 {
279  return new CHashedWDFeatures(*this);
280 }
281 
282 
284 {
285  int32_t vlen=-1;
286  bool free_vec;
287  uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
288  strings->free_feature_vector(vec, num, free_vec);
289  return degree*vlen;
290 }
291 
292 void* CHashedWDFeatures::get_feature_iterator(int32_t vector_index)
293 {
295  return NULL;
296 }
297 
298 bool CHashedWDFeatures::get_next_feature(int32_t& index, float64_t& value,
299  void* iterator)
300 {
302  return false;
303 }
304 
306 {
308 }

SHOGUN Machine Learning Toolbox - Documentation