SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashedWDFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2010 Soeren Sonnenburg
8  * Copyright (C) 2010 Berlin Institute of Technology
9  */
10 
12 #include <shogun/io/SGIO.h>
13 
14 using namespace shogun;
15 
17 {
18  SG_UNSTABLE("CHashedWDFeatures::CHashedWDFeatures()", "\n")
19 
20  strings = NULL;
21 
22  degree = 0;
23  start_degree = 0;
24  from_degree = 0;
25  string_length = 0;
26  num_strings = 0;
27  alphabet_size = 0;
28  w_dim = 0;
29  partial_w_dim = 0;
30  wd_weights = NULL;
31  mask = 0;
32  m_hash_bits = 0;
33 
34  normalization_const = 0.0;
35 }
36 
38  int32_t start_order, int32_t order, int32_t from_order,
39  int32_t hash_bits) : CDotFeatures()
40 {
41  ASSERT(start_order>=0)
42  ASSERT(start_order<order)
43  ASSERT(order<=from_order)
44  ASSERT(hash_bits>0)
45  ASSERT(str)
46  ASSERT(str->have_same_length())
47  SG_REF(str);
48 
49  strings=str;
52  CAlphabet* alpha=str->get_alphabet();
54  SG_UNREF(alpha);
55 
56  degree=order;
57  start_degree=start_order;
58  from_degree=from_order;
59  m_hash_bits=hash_bits;
62 }
63 
65  : CDotFeatures(orig), strings(orig.strings),
66  degree(orig.degree), start_degree(orig.start_degree),
67  from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
68  normalization_const(orig.normalization_const)
69 {
70  SG_REF(strings);
73  CAlphabet* alpha=strings->get_alphabet();
75  SG_UNREF(alpha);
76 
78 }
79 
81 {
83  SG_FREE(wd_weights);
84 }
85 
86 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
87 {
88  ASSERT(df)
92 
93  int32_t len1, len2;
94  bool free_vec1, free_vec2;
95 
96  uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
97  uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
98 
99  ASSERT(len1==len2)
100 
101  float64_t sum=0.0;
102 
103  for (int32_t i=0; i<len1; i++)
104  {
105  for (int32_t j=0; (i+j<len1) && (j<degree); j++)
106  {
107  if (vec1[i+j]!=vec2[i+j])
108  break;
109  if (j>=start_degree)
110  sum += wd_weights[j]*wd_weights[j];
111  }
112  }
113  strings->free_feature_vector(vec1, vec_idx1, free_vec1);
114  wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
115  return sum/CMath::sq(normalization_const);
116 }
117 
118 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
119 {
120  if (vec2_len != w_dim)
121  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
122 
123  float64_t sum=0;
124  int32_t lim=CMath::min(degree, string_length);
125  int32_t len;
126  bool free_vec1;
127  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
128  uint32_t* val=SG_MALLOC(uint32_t, len);
129 
130  uint32_t offs=0;
131 
132  if (start_degree>0)
133  {
134  // compute hash for strings of length start_degree-1
135  for (int32_t i=0; i+start_degree < len; i++)
136  val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
137  }
138  else
139  SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
140 
141  for (int32_t k=start_degree; k<lim; k++)
142  {
143  float64_t wd = wd_weights[k];
144 
145  uint32_t o=offs;
146  uint32_t carry = 0;
147  uint32_t chunk = 0;
148 
149  for (int32_t i=0; i+k < len; i++)
150  {
151  chunk++;
152  CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
153  uint32_t h =
154  CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
155 #ifdef DEBUG_HASHEDWD
156  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
157 #endif
158  sum+=vec2[o+(h & mask)]*wd;
159  val[i] = h;
160  o+=partial_w_dim;
161  }
162  val[len-k-1] =
163  CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
164  offs+=partial_w_dim*len;
165  }
166  SG_FREE(val);
167  strings->free_feature_vector(vec, vec_idx1, free_vec1);
168 
169  return sum/normalization_const;
170 }
171 
172 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
173 {
174  if (vec2_len != w_dim)
175  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
176 
177  int32_t lim=CMath::min(degree, string_length);
178  int32_t len;
179  bool free_vec1;
180  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
181  uint32_t* val=SG_MALLOC(uint32_t, len);
182 
183  uint32_t offs=0;
184 
185  if (start_degree>0)
186  {
187  // compute hash for strings of length start_degree-1
188  for (int32_t i=0; i+start_degree < len; i++)
189  val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
190  }
191  else
192  SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
193 
194  for (int32_t k=start_degree; k<lim; k++)
195  {
197 
198  if (abs_val)
199  wd=CMath::abs(wd);
200 
201  uint32_t o=offs;
202  uint32_t carry = 0;
203  uint32_t chunk = 0;
204 
205  for (int32_t i=0; i+k < len; i++)
206  {
207  chunk++;
208  CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
209  uint32_t h = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
210 
211 #ifdef DEBUG_HASHEDWD
212  SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h)
213  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
214 #endif
215  vec2[o+(h & mask)]+=wd;
216  val[i] = h;
217  o+=partial_w_dim;
218  }
219  val[len-k-1] =
220  CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
221 
222  offs+=partial_w_dim*len;
223  }
224 
225  SG_FREE(val);
226  strings->free_feature_vector(vec, vec_idx1, free_vec1);
227 }
228 
230 {
231  ASSERT(degree>0)
232 
233  mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
236 
237  wd_weights=SG_MALLOC(float64_t, degree);
238 
239  for (int32_t i=0; i<degree; i++)
240  wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
241 
242  SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, "
243  "dim=%d partial_dim=%d num=%d, len=%d\n",
244  degree, from_degree, alphabet_size,
246 }
247 
248 
250 {
251  if (n==0)
252  {
254  for (int32_t i=0; i<degree; i++)
256 
258  }
259  else
261 
262  SG_DEBUG("normalization_const:%f\n", normalization_const)
263 }
264 
266 {
267  return new CHashedWDFeatures(*this);
268 }
269 
270 
272 {
273  int32_t vlen=-1;
274  bool free_vec;
275  uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
276  strings->free_feature_vector(vec, num, free_vec);
277  return degree*vlen;
278 }
279 
280 void* CHashedWDFeatures::get_feature_iterator(int32_t vector_index)
281 {
283  return NULL;
284 }
285 
286 bool CHashedWDFeatures::get_next_feature(int32_t& index, float64_t& value,
287  void* iterator)
288 {
290  return false;
291 }
292 
294 {
296 }

SHOGUN Machine Learning Toolbox - Documentation