SHOGUN 4.1.0 — generated documentation page for HashedWDFeaturesTransposed.cpp.
NOTE: this is a source listing extracted from the Doxygen HTML output; the
navigation bar was dropped, and source lines that were rendered as hyperlinks
(function signatures, calls to documented symbols) are missing from the listing.
Each remaining line is prefixed with its original source line number.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2010 Soeren Sonnenburg
8  * Copyright (C) 2010 Berlin Institute of Technology
9  */
10 
12 #include <shogun/io/SGIO.h>
13 #include <shogun/lib/Signal.h>
14 #include <shogun/base/Parallel.h>
15 
16 #ifdef HAVE_PTHREAD
17 #include <pthread.h>
18 #endif
19 
20 using namespace shogun;
21 
22 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Parameter bundle handed to dense_dot_range_helper(), either directly
// (single-threaded path) or via pthread_create() (one struct per worker).
// NOTE(review): extraction dropped original line 25, which presumably
// declared the owning-features pointer member (`params.hf` is assigned
// below in dense_dot_range/dense_dot_range_subset) — confirm against the
// original source.
23 struct HASHEDWD_THREAD_PARAM
24 {
26  int32_t* sub_index;   // optional subset of vector indices (NULL = plain [start,stop) range)
27  float64_t* output;    // result buffer written at positions [start,stop)
28  int32_t start;        // first position (inclusive) handled by this worker
29  int32_t stop;         // one past the last position handled by this worker
30  float64_t* alphas;    // optional per-vector scaling factors (may be NULL)
31  float64_t* vec;       // dense weight vector of dimension w_dim
32  float64_t bias;       // bias added to every output entry
33  bool progress;        // when true, report progress via io->progress()
34  uint32_t* index;      // shared scratch buffer holding incremental hash state per vector
35 };
36 #endif // DOXYGEN_SHOULD_SKIP_THIS
37 
// Default constructor: puts the object into an empty, unusable state
// (all pointers NULL, all sizes/degrees zero). Intended for serialization.
// NOTE(review): extraction dropped the constructor signature line (orig 38)
// and the head of the SG_UNSTABLE(...) invocation (orig 41) whose string
// arguments remain on lines 42-43 below.
39  :CDotFeatures()
40 {
42  "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()",
43  "\n");
44 
45  strings = NULL;
46  transposed_strings = NULL;
47 
48  degree = 0;
49  start_degree = 0;
50  from_degree = 0;
51  string_length = 0;
52  num_strings = 0;
53  alphabet_size = 0;
54  w_dim = 0;
55  partial_w_dim = 0;
56  wd_weights = NULL;
57  mask = 0;
58  m_hash_bits = 0;
59 
60  normalization_const = 0.0;
61 }
62 
// Main constructor: wraps byte string features `str` (refcounted via SG_REF)
// and precomputes the transposed representation used by the dense-dot range
// helpers. Degrees must satisfy 0 <= start_order < order <= from_order and
// hash_bits > 0; all strings must have equal length.
// NOTE(review): extraction dropped the signature line (orig 62-63), the
// assignments of string_length/num_strings (orig 79-81), the
// alphabet_size=alpha->get_num_symbols() line (orig 86), the body of the
// `if (start_degree!=0)` branch (orig 92, presumably SG_NOTIMPLEMENTED),
// and the trailing set_wd_weights()/set_normalization_const() calls
// (orig 95-96) — confirm against the original source.
64  int32_t start_order, int32_t order, int32_t from_order,
65  int32_t hash_bits) : CDotFeatures()
66 {
67  ASSERT(start_order>=0)
68  ASSERT(start_order<order)
69  ASSERT(order<=from_order)
70  ASSERT(hash_bits>0)
71  ASSERT(str)
72  ASSERT(str->have_same_length())
73  SG_REF(str);
74 
75  strings=str;
76  int32_t transposed_num_feat=0;
77  int32_t transposed_num_vec=0;
78  transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);
79 
82  ASSERT(transposed_num_feat==num_strings)
83  ASSERT(transposed_num_vec==string_length)
84 
85  CAlphabet* alpha=str->get_alphabet();
87  SG_UNREF(alpha);
88 
89  degree=order;
90  start_degree=start_order;
91  if (start_degree!=0)
93  from_degree=from_order;
94  m_hash_bits=hash_bits;
97 }
98 
// Copy constructor: shares the underlying string features (refcounted),
// copies all scalar configuration, and recomputes the derived weight table
// via set_wd_weights().
// NOTE(review): extraction dropped the signature line (orig 99) as well as
// orig 106-107 and 109 — presumably the string_length/num_strings
// assignments and alphabet_size=alpha->get_num_symbols(); confirm against
// the original source.
100  : CDotFeatures(orig), strings(orig.strings), transposed_strings(orig.transposed_strings),
101  degree(orig.degree), start_degree(orig.start_degree),
102  from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
103  normalization_const(orig.normalization_const)
104 {
105  SG_REF(strings);
108  CAlphabet* alpha=strings->get_alphabet();
110  SG_UNREF(alpha);
111 
112  set_wd_weights();
113 }
114 
// Destructor: releases the per-position transposed string buffers, the
// transposed-strings array itself, the refcount held on `strings`, and the
// weight table. (The signature line, orig 115, was dropped in extraction.)
116 {
117  for (int32_t i=0; i<string_length; i++)
118  SG_FREE(transposed_strings[i].string);
119  SG_FREE(transposed_strings);
120 
121  SG_UNREF(strings);
122  SG_FREE(wd_weights);
123 }
124 
// Weighted-degree dot product between vector vec_idx1 of this object and
// vector vec_idx2 of `df`: sums wd_weights[j]^2 over every matching prefix
// position j (j >= start_degree) of each substring comparison, then
// normalizes by normalization_const^2.
// NOTE(review): extraction dropped orig 128-131, which presumably asserted
// matching feature type/class and declared
//   CHashedWDFeaturesTransposed* wdf=(CHashedWDFeaturesTransposed*) df;
// (`wdf` is used below but never declared in the visible text) — confirm
// against the original source.
125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
126 {
127  ASSERT(df)
131 
132  int32_t len1, len2;
133  bool free_vec1, free_vec2;
134 
135  uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
136  uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
137 
138  ASSERT(len1==len2)
139 
140  float64_t sum=0.0;
141 
142  for (int32_t i=0; i<len1; i++)
143  {
144  for (int32_t j=0; (i+j<len1) && (j<degree); j++)
145  {
// stop at the first mismatch: only common prefixes of the two
// substrings starting at i contribute
146  if (vec1[i+j]!=vec2[i+j])
147  break;
148  if (j>=start_degree)
149  sum += wd_weights[j]*wd_weights[j];
150  }
151  }
152  strings->free_feature_vector(vec1, vec_idx1, free_vec1);
153  wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
154  return sum/CMath::sq(normalization_const);
155 }
156 
157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
158 {
159  if (vec2_len != w_dim)
160  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
161 
162  float64_t sum=0;
163  int32_t len;
164  bool free_vec1;
165  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
166  uint32_t* val=SG_MALLOC(uint32_t, len);
167 
168  uint32_t offs=0;
169 
170  SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
171 
172  for (int32_t i=0; i < len; i++)
173  {
174  uint32_t o=offs;
175  uint32_t carry = 0;
176  uint32_t chunk = 0;
177 
178  for (int32_t k=0; k<degree && i+k<len; k++)
179  {
180  const float64_t wd = wd_weights[k];
181  chunk++;
182  CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
183  uint32_t h =
184  CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
185 #ifdef DEBUG_HASHEDWD
186  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h)
187 #endif
188  sum+=vec2[o+(h & mask)]*wd;
189  o+=partial_w_dim;
190  }
191  val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
192  offs+=partial_w_dim*degree;
193  }
194  SG_FREE(val);
195  strings->free_feature_vector(vec, vec_idx1, free_vec1);
196 
197  return sum/normalization_const;
198 }
199 
// Compute dense dot products for all vectors in [start,stop), writing
// alphas[j]*dot(j,vec)/normalization_const + b into output[j]. Work is
// split across parallel->get_num_threads() pthreads when available.
// NOTE(review): extraction dropped orig 216-217 (presumably
// CSignal::clear_cancel()), orig 261 and 274 (the dense_dot_range_helper
// thread-entry argument of pthread_create and the main-thread helper call),
// and orig 286 (presumably `if (CSignal::cancel_computations())` guarding
// the SG_INFO below) — confirm against the original source.
200 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
201 {
202  ASSERT(output)
203  // write access is internally between output[start..stop] so the following
204  // line is necessary to write to output[0...(stop-start-1)]
205  output-=start;
206  ASSERT(start>=0)
207  ASSERT(start<stop)
208  ASSERT(stop<=get_num_vectors())
// scratch buffer shared by all workers: incremental hash state per vector
209  uint32_t* index=SG_MALLOC(uint32_t, stop);
210 
211  int32_t num_vectors=stop-start;
212  ASSERT(num_vectors>0)
213 
214  int32_t num_threads=parallel->get_num_threads();
215  ASSERT(num_threads>0)
216 
218 
219  if (dim != w_dim)
220  SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim)
221 
222 #ifdef HAVE_PTHREAD
223  if (num_threads < 2)
224  {
225 #endif
// single-threaded path: run the helper inline on the whole range
226  HASHEDWD_THREAD_PARAM params;
227  params.hf=this;
228  params.sub_index=NULL;
229  params.output=output;
230  params.start=start;
231  params.stop=stop;
232  params.alphas=alphas;
233  params.vec=vec;
234  params.bias=b;
235  params.progress=false; //true;
236  params.index=index;
237  dense_dot_range_helper((void*) &params);
238 #ifdef HAVE_PTHREAD
239  }
240  else
241  {
// multithreaded path: num_threads-1 workers plus this thread, each
// handling a contiguous slice of `step` vectors (remainder goes to
// the final, main-thread slice)
242  pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
243  HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
244  int32_t step= num_vectors/num_threads;
245 
246  int32_t t;
247 
248  for (t=0; t<num_threads-1; t++)
249  {
250  params[t].hf = this;
251  params[t].sub_index=NULL;
252  params[t].output = output;
253  params[t].start = start+t*step;
254  params[t].stop = start+(t+1)*step;
255  params[t].alphas=alphas;
256  params[t].vec=vec;
257  params[t].bias=b;
258  params[t].progress = false;
259  params[t].index=index;
260  pthread_create(&threads[t], NULL,
262  }
263 
264  params[t].hf = this;
265  params[t].sub_index=NULL;
266  params[t].output = output;
267  params[t].start = start+t*step;
268  params[t].stop = stop;
269  params[t].alphas=alphas;
270  params[t].vec=vec;
271  params[t].bias=b;
272  params[t].progress = false; //true;
273  params[t].index=index;
275 
276  for (t=0; t<num_threads-1; t++)
277  pthread_join(threads[t], NULL);
278 
279  SG_FREE(params);
280  SG_FREE(threads);
281  }
282 #endif
283  SG_FREE(index);
284 
285 #ifndef WIN32
287  SG_INFO("prematurely stopped. \n")
288 #endif
289 }
290 
// Like dense_dot_range(), but only for the `num` vectors whose indices are
// listed in sub_index; output[j] corresponds to sub_index[j]. Work is split
// across pthreads when available.
// NOTE(review): extraction dropped orig 301 (presumably
// CSignal::clear_cancel()), orig 345 and 358 (the dense_dot_range_helper
// thread-entry argument / main-thread helper call), and orig 370
// (presumably the cancel_computations() guard).
// NOTE(review): unlike dense_dot_range(), SG_FREE(index) here appears only
// inside the multithreaded branch (orig line 365) — the single-threaded
// path seems to leak `index`; confirm against the original source and fix
// there if so.
291 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
292 {
293  ASSERT(sub_index)
294  ASSERT(output)
295 
// scratch buffer shared by all workers: incremental hash state per entry
296  uint32_t* index=SG_MALLOC(uint32_t, num);
297 
298  int32_t num_threads=parallel->get_num_threads();
299  ASSERT(num_threads>0)
300 
302 
303  if (dim != w_dim)
304  SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim)
305 
306 #ifdef HAVE_PTHREAD
307  if (num_threads < 2)
308  {
309 #endif
// single-threaded path: run the helper inline on [0,num)
310  HASHEDWD_THREAD_PARAM params;
311  params.hf=this;
312  params.sub_index=sub_index;
313  params.output=output;
314  params.start=0;
315  params.stop=num;
316  params.alphas=alphas;
317  params.vec=vec;
318  params.bias=b;
319  params.progress=false; //true;
320  params.index=index;
321  dense_dot_range_helper((void*) &params);
322 #ifdef HAVE_PTHREAD
323  }
324  else
325  {
// multithreaded path: slice [0,num) into `step`-sized chunks, last
// chunk (with remainder) handled by the main thread
326  pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
327  HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
328  int32_t step= num/num_threads;
329 
330  int32_t t;
331 
332  for (t=0; t<num_threads-1; t++)
333  {
334  params[t].hf = this;
335  params[t].sub_index=sub_index;
336  params[t].output = output;
337  params[t].start = t*step;
338  params[t].stop = (t+1)*step;
339  params[t].alphas=alphas;
340  params[t].vec=vec;
341  params[t].bias=b;
342  params[t].progress = false;
343  params[t].index=index;
344  pthread_create(&threads[t], NULL,
346  }
347 
348  params[t].hf = this;
349  params[t].sub_index=sub_index;
350  params[t].output = output;
351  params[t].start = t*step;
352  params[t].stop = num;
353  params[t].alphas=alphas;
354  params[t].vec=vec;
355  params[t].bias=b;
356  params[t].progress = false; //true;
357  params[t].index=index;
359 
360  for (t=0; t<num_threads-1; t++)
361  pthread_join(threads[t], NULL);
362 
363  SG_FREE(params);
364  SG_FREE(threads);
365  SG_FREE(index);
366  }
367 #endif
368 
369 #ifndef WIN32
371  SG_INFO("prematurely stopped. \n")
372 #endif
373 }
374 
// Worker routine (also the pthread entry point) that computes dense dot
// products for the [start,stop) slice described by the HASHEDWD_THREAD_PARAM
// argument. Iterates the TRANSPOSED string matrix position-by-position so
// the incremental per-vector hash states in `index` are advanced in lockstep
// for all vectors of the slice (cache-friendly column access).
// NOTE(review): extraction dropped the signature line (orig 375, presumably
// `void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)`),
// the local aliases at orig 390-391/394 (wd_weights, normalization_const,
// transposed_strings referenced below), and the
// `CHash::FinalizeIncrementalMurmurHash3(` call heads at orig 421, 429,
// 473 and 480 — confirm against the original source.
376 {
377  HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
378  CHashedWDFeaturesTransposed* hf=par->hf;
379  int32_t* sub_index=par->sub_index;
380  float64_t* output=par->output;
381  int32_t start=par->start;
382  int32_t stop=par->stop;
383  float64_t* alphas=par->alphas;
384  float64_t* vec=par->vec;
385  float64_t bias=par->bias;
386  bool progress=par->progress;
387  uint32_t* index=par->index;
388  int32_t string_length=hf->string_length;
389  int32_t degree=hf->degree;
392  uint32_t mask=hf->mask;
393  int32_t partial_w_dim=hf->partial_w_dim;
395 
396  if (sub_index)
397  {
// subset mode: output[j] accumulates the score of vector sub_index[j]
398  for (int32_t j=start; j<stop; j++)
399  output[j]=0.0;
400 
401  uint32_t offs=0;
402  for (int32_t i=0; i<string_length; i++)
403  {
404  uint32_t o=offs;
405  for (int32_t k=0; k<degree && i+k<string_length; k++)
406  {
407  const float64_t wd = wd_weights[k];
408  uint8_t* dim=transposed_strings[i+k].string;
409  uint32_t carry = 0;
410  uint32_t chunk = 0;
411  for (int32_t j=start; j<stop; j++)
412  {
413  uint8_t bval=dim[sub_index[j]];
// (re)seed the incremental hash state at order k==0
414  if (k==0)
415  index[j] = 0xDEADBEAF;
416 
417  CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);
418 
419  chunk++;
420  uint32_t h =
422  index[j], carry, chunk);
423 
424  output[j]+=vec[o + (h & mask)]*wd;
425  index[j] = h;
426  }
427 
428  index[stop-1] =
430  index[stop-1], carry, chunk);
431 
432  o+=partial_w_dim;
433  }
434  offs+=partial_w_dim*degree;
435 
436  if (progress)
437  hf->io->progress(i, 0,string_length, i);
438  }
439 
// final scaling: optional alpha weight, normalization, bias
440  for (int32_t j=start; j<stop; j++)
441  {
442  if (alphas)
443  output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
444  else
445  output[j]=output[j]/normalization_const+bias;
446  }
447  }
448  else
449  {
// dense mode: output[j] accumulates the score of vector j directly
450  SGVector<float64_t>::fill_vector(&output[start], stop-start, 0.0);
451 
452  uint32_t offs=0;
453  for (int32_t i=0; i<string_length; i++)
454  {
455  uint32_t o=offs;
456  for (int32_t k=0; k<degree && i+k<string_length; k++)
457  {
458  const float64_t wd = wd_weights[k];
459  uint8_t* dim=transposed_strings[i+k].string;
460  uint32_t carry = 0;
461  uint32_t chunk = 0;
462 
463  for (int32_t j=start; j<stop; j++)
464  {
// NOTE(review): sub_index is NULL on this branch, so
// dim[sub_index[j]] would dereference NULL; this line almost
// certainly should read `dim[j]` (or the hyperlinked original
// differed) — confirm against the original source.
465  uint8_t bval=dim[sub_index[j]];
466  if (k==0)
467  index[j] = 0xDEADBEAF;
468 
469  CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);
470 
471  chunk++;
472  uint32_t h =
474  index[j], carry, chunk);
475 
476  index[j] = h;
477  output[j]+=vec[o + (h & mask)]*wd;
478  }
479 
481  index[stop-1], carry, chunk);
482 
483  o+=partial_w_dim;
484  }
485  offs+=partial_w_dim*degree;
486 
487  if (progress)
488  hf->io->progress(i, 0,string_length, i);
489  }
490 
// final scaling: optional alpha weight, normalization, bias
491  for (int32_t j=start; j<stop; j++)
492  {
493  if (alphas)
494  output[j]=output[j]*alphas[j]/normalization_const+bias;
495  else
496  output[j]=output[j]/normalization_const+bias;
497  }
498  }
499 
500  return NULL;
501 }
502 
503 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
504 {
505  if (vec2_len != w_dim)
506  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
507 
508  int32_t len;
509  bool free_vec1;
510  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
511  uint32_t* val=SG_MALLOC(uint32_t, len);
512 
513  uint32_t offs=0;
514  float64_t factor=alpha/normalization_const;
515  if (abs_val)
516  factor=CMath::abs(factor);
517 
518  SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
519 
520  for (int32_t i=0; i<len; i++)
521  {
522  uint32_t o=offs;
523  uint32_t carry = 0;
524  uint32_t chunk = 0;
525 
526  for (int32_t k=0; k<degree && i+k<len; k++)
527  {
528  float64_t wd = wd_weights[k]*factor;
529  chunk++;
530  CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
531  uint32_t h =
532  CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
533 #ifdef DEBUG_HASHEDWD
534  SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h)
535  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
536 #endif
537  vec2[o+(h & mask)]+=wd;
538  val[i] = h;
539  o+=partial_w_dim;
540  }
541 
542  val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
543  offs+=partial_w_dim*degree;
544  }
545 
546  SG_FREE(val);
547  strings->free_feature_vector(vec, vec_idx1, free_vec1);
548 }
549 
// Recompute the per-order weight table wd_weights[0..degree-1] using the
// standard WD-kernel weighting sqrt(2*(from_degree-i)/(from_degree*(from_degree+1))),
// and derive the hash mask from m_hash_bits.
// NOTE(review): extraction dropped the signature line (orig 550) and
// orig 555-556 — presumably the partial_w_dim/w_dim computation from
// m_hash_bits, string_length and degree (both are referenced in the
// SG_DEBUG arguments, whose tail at orig 566 is also missing).
551 {
552  ASSERT(degree>0)
553 
// mask selects the low m_hash_bits bits of a finalized hash
554  mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
557 
558  wd_weights=SG_MALLOC(float64_t, degree);
559 
560  for (int32_t i=0; i<degree; i++)
561  wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
562 
563  SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
564  "dim=%d partial_dim=%d num=%d, len=%d\n",
565  degree, from_degree, alphabet_size,
567 }
568 
569 
// Set the normalization constant: when n==0 it is derived from the weight
// table (presumably sqrt of the summed squared wd_weights scaled by
// string_length — the computation lines were dropped in extraction),
// otherwise it is set to n directly.
// NOTE(review): extraction dropped the signature line (orig 570) and
// orig 574, 576-578 and 581 (the accumulation loop body, the sqrt, and the
// else-branch assignment) — confirm against the original source.
572  if (n==0)
573  {
575  for (int32_t i=0; i<degree; i++)
577 
579  }
580  else
582 
583  SG_DEBUG("normalization_const:%f\n", normalization_const)
584 }
585 
// Return a deep-ish copy of this feature object via the copy constructor
// (string features are shared by refcount, scalars copied, weights rebuilt).
// (The signature line, orig 586, was dropped in extraction — presumably
// `CFeatures* CHashedWDFeaturesTransposed::duplicate() const`.)
588  return new CHashedWDFeaturesTransposed(*this);
589 }
590 
// Feature iteration is not supported for this feature class; always NULL.
// (Orig 591/593 — the signature and presumably SG_NOTIMPLEMENTED — were
// dropped in extraction.)
594  return NULL;
595 }
596 
// Feature iteration is not supported for this feature class; always false.
// (Orig 599 — presumably SG_NOTIMPLEMENTED — was dropped in extraction.)
597 bool CHashedWDFeaturesTransposed::get_next_feature(int32_t& index, float64_t& value, void* iterator)
598 {
600  return false;
601 }
602 
// Feature iteration is not supported; nothing to free.
// (Orig 603/605 — the signature and presumably SG_NOTIMPLEMENTED — were
// dropped in extraction.)
604 {
606 }
virtual int32_t get_max_vector_length()
SGVector< ST > get_feature_vector(int32_t num)
#define SG_INFO(...)
Definition: SGIO.h:118
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:223
virtual void free_feature_iterator(void *iterator)
int32_t get_num_threads() const
Definition: Parallel.cpp:64
virtual int32_t get_num_vectors() const
static T sq(T x)
Definition: Math.h:450
#define SG_ERROR(...)
Definition: SGIO.h:129
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
static uint32_t FinalizeIncrementalMurmurHash3(uint32_t h, uint32_t carry, uint32_t total_length)
Definition: Hash.cpp:376
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Parallel * parallel
Definition: SGObject.h:372
Features that support dot products among other operations.
Definition: DotFeatures.h:44
#define SG_REF(x)
Definition: SGObject.h:51
#define SG_PRINT(...)
Definition: SGIO.h:137
virtual void * get_feature_iterator(int32_t vector_index)
#define ASSERT(x)
Definition: SGIO.h:201
CStringFeatures< ST > * get_transposed()
static void clear_cancel()
Definition: Signal.cpp:129
double float64_t
Definition: common.h:50
virtual EFeatureClass get_feature_class() const
int32_t get_num_symbols() const
Definition: Alphabet.h:139
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
virtual EFeatureClass get_feature_class() const =0
static bool cancel_computations()
Definition: Signal.h:86
bool have_same_length(int32_t len=-1)
#define SG_UNREF(x)
Definition: SGObject.h:52
#define SG_DEBUG(...)
Definition: SGIO.h:107
static void IncrementalMurmurHash3(uint32_t *hash, uint32_t *carry, uint8_t *data, int32_t len)
Definition: Hash.cpp:371
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
void progress(float64_t current_val, float64_t min_val=0.0, float64_t max_val=1.0, int32_t decimals=1, const char *prefix="PROGRESS:\t")
Definition: SGIO.cpp:147
Features that compute the Weighted Degreee Kernel feature space explicitly.
virtual EFeatureType get_feature_type() const
virtual void dense_dot_range_subset(int32_t *sub_index, int32_t num, float64_t *output, float64_t *alphas, float64_t *vec, int32_t dim, float64_t b)
static float32_t sqrt(float32_t x)
Definition: Math.h:459
#define SG_UNSTABLE(func,...)
Definition: SGIO.h:132
virtual EFeatureType get_feature_type() const =0
virtual void dense_dot_range(float64_t *output, int32_t start, int32_t stop, float64_t *alphas, float64_t *vec, int32_t dim, float64_t b)
static T abs(T a)
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation