SHOGUN  6.1.3
HashedWDFeaturesTransposed.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2010 Soeren Sonnenburg
 * Copyright (C) 2010 Berlin Institute of Technology
 */

#include <shogun/base/Parallel.h>
#include <shogun/base/progress.h>
#include <shogun/features/HashedWDFeaturesTransposed.h>
#include <shogun/io/SGIO.h>
#include <shogun/lib/Signal.h>

#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif

using namespace shogun;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
/* Per-thread arguments for dense_dot_range_helper(). */
struct HASHEDWD_THREAD_PARAM
{
    CHashedWDFeaturesTransposed* hf;
    int32_t* sub_index;
    float64_t* output;
    int32_t start;
    int32_t stop;
    float64_t* alphas;
    float64_t* vec;
    float64_t bias;
    bool progress;
    PRange<int32_t>* progress_bar;
    uint32_t* index;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()
    : CDotFeatures()
{
    SG_UNSTABLE(
        "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()",
        "\n");

    strings = NULL;
    transposed_strings = NULL;

    degree = 0;
    start_degree = 0;
    from_degree = 0;
    string_length = 0;
    num_strings = 0;
    alphabet_size = 0;
    w_dim = 0;
    partial_w_dim = 0;
    wd_weights = NULL;
    mask = 0;
    m_hash_bits = 0;

    normalization_const = 0.0;
}

CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
    int32_t start_order, int32_t order, int32_t from_order,
    int32_t hash_bits) : CDotFeatures()
{
    ASSERT(start_order>=0)
    ASSERT(start_order<order)
    ASSERT(order<=from_order)
    ASSERT(hash_bits>0)
    ASSERT(str)
    ASSERT(str->have_same_length())
    SG_REF(str);

    strings=str;
    int32_t transposed_num_feat=0;
    int32_t transposed_num_vec=0;
    transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);

    string_length=str->get_max_vector_length();
    num_strings=str->get_num_vectors();
    ASSERT(transposed_num_feat==num_strings)
    ASSERT(transposed_num_vec==string_length)

    CAlphabet* alpha=str->get_alphabet();
    alphabet_size=alpha->get_num_symbols();
    SG_UNREF(alpha);

    degree=order;
    start_degree=start_order;
    if (start_degree!=0)
        SG_NOTIMPLEMENTED
    from_degree=from_order;
    m_hash_bits=hash_bits;
    set_wd_weights();
    set_normalization_const();
}
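
/* Usage sketch (hypothetical driver code, not part of this file; assumes a
 * CStringFeatures<uint8_t> object holding equal-length strings):
 *
 *   CStringFeatures<uint8_t>* strs = new CStringFeatures<uint8_t>(RAWBYTE);
 *   // ... fill strs with equal-length strings ...
 *   CHashedWDFeaturesTransposed* feats =
 *       new CHashedWDFeaturesTransposed(strs, 0, 8, 8, 12);
 *   // arguments: start_order=0, order=8, from_order=8, hash_bits=12
 *   // ... use feats->dense_dot(), feats->add_to_dense_vec(), ...
 *   SG_UNREF(feats);
 */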

CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(
    const CHashedWDFeaturesTransposed& orig)
    : CDotFeatures(orig), strings(orig.strings),
    transposed_strings(orig.transposed_strings),
    degree(orig.degree), start_degree(orig.start_degree),
    from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
    normalization_const(orig.normalization_const)
{
    SG_REF(strings);
    string_length=strings->get_max_vector_length();
    num_strings=strings->get_num_vectors();
    CAlphabet* alpha=strings->get_alphabet();
    alphabet_size=alpha->get_num_symbols();
    SG_UNREF(alpha);

    set_wd_weights();
}

CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
{
    for (int32_t i=0; i<string_length; i++)
        SG_FREE(transposed_strings[i].string);
    SG_FREE(transposed_strings);

    SG_UNREF(strings);
    SG_FREE(wd_weights);
}
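
/* Weighted degree (WD) kernel value between two strings, computed directly
 * on the raw symbols: at every position i, substrings that keep matching up
 * to order j (< degree) contribute wd_weights[j]^2; the inner loop stops at
 * the first mismatch. No hashing is involved here, and the result is
 * divided by the squared normalization constant. */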
float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
{
    ASSERT(df)
    ASSERT(df->get_feature_type() == get_feature_type())
    ASSERT(df->get_feature_class() == get_feature_class())
    CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df;

    int32_t len1, len2;
    bool free_vec1, free_vec2;

    uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
    uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);

    ASSERT(len1==len2)

    float64_t sum=0.0;

    for (int32_t i=0; i<len1; i++)
    {
        for (int32_t j=0; (i+j<len1) && (j<degree); j++)
        {
            if (vec1[i+j]!=vec2[i+j])
                break;
            if (j>=start_degree)
                sum += wd_weights[j]*wd_weights[j];
        }
    }
    strings->free_feature_vector(vec1, vec_idx1, free_vec1);
    wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
    return sum/CMath::sq(normalization_const);
}
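
/* Dot product with a dense vector via the explicit hashed feature map: for
 * each position i and order k, the k-mer vec[i..i+k] is hashed incrementally
 * with MurmurHash3 and (h & mask) selects one of the partial_w_dim buckets
 * reserved for that (position, order) block of vec2, whose weight is then
 * accumulated scaled by wd_weights[k]. */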
float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
{
    if (vec2_len != w_dim)
        SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)

    float64_t sum=0;
    int32_t len;
    bool free_vec1;
    uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
    uint32_t* val=SG_MALLOC(uint32_t, len);

    uint32_t offs=0;

    SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);

    for (int32_t i=0; i < len; i++)
    {
        uint32_t o=offs;
        uint32_t carry = 0;
        uint32_t chunk = 0;

        for (int32_t k=0; k<degree && i+k<len; k++)
        {
            const float64_t wd = wd_weights[k];
            chunk++;
            CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
            uint32_t h =
                CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
#ifdef DEBUG_HASHEDWD
            SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h)
#endif
            sum+=vec2[o+(h & mask)]*wd;
            o+=partial_w_dim;
        }
        val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
        offs+=partial_w_dim*degree;
    }
    SG_FREE(val);
    strings->free_feature_vector(vec, vec_idx1, free_vec1);

    return sum/normalization_const;
}
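
/* Dense dot products for a contiguous range [start, stop) of vectors. Work
 * is split across pthreads when parallel->get_num_threads() > 1; each thread
 * runs dense_dot_range_helper() on its own sub-range while sharing the
 * scratch 'index' buffer (the sub-ranges touch disjoint entries). */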
void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
{
    ASSERT(output)
    // write access is internally between output[start..stop] so the following
    // line is necessary to write to output[0...(stop-start-1)]
    output-=start;
    ASSERT(start>=0)
    ASSERT(start<stop)
    ASSERT(stop<=get_num_vectors())
    uint32_t* index=SG_MALLOC(uint32_t, stop);

    int32_t num_vectors=stop-start;
    ASSERT(num_vectors>0)

    // TODO: port to use OpenMP backend instead of pthread
#ifdef HAVE_PTHREAD
    int32_t num_threads=parallel->get_num_threads();
#else
    int32_t num_threads=1;
#endif
    ASSERT(num_threads>0)

    if (dim != w_dim)
        SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim)

    if (num_threads < 2)
    {
        HASHEDWD_THREAD_PARAM params;
        auto pb = progress(range(start, stop), *this->io);
        params.hf=this;
        params.sub_index=NULL;
        params.output=output;
        params.start=start;
        params.stop=stop;
        params.alphas=alphas;
        params.vec=vec;
        params.bias=b;
        params.progress=false; //true;
        params.progress_bar = &pb;
        params.index = index;
        dense_dot_range_helper((void*) &params);
        pb.complete();
    }
#ifdef HAVE_PTHREAD
    else
    {
        pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
        HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
        auto pb = progress(range(start, stop), *this->io);
        int32_t step= num_vectors/num_threads;

        int32_t t;

        for (t=0; t<num_threads-1; t++)
        {
            params[t].hf = this;
            params[t].sub_index=NULL;
            params[t].output = output;
            params[t].start = start+t*step;
            params[t].stop = start+(t+1)*step;
            params[t].alphas=alphas;
            params[t].vec=vec;
            params[t].bias=b;
            params[t].progress = false;
            params[t].progress_bar = &pb;
            params[t].index=index;
            pthread_create(&threads[t], NULL,
                CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*) &params[t]);
        }

        params[t].hf = this;
        params[t].sub_index=NULL;
        params[t].output = output;
        params[t].start = start+t*step;
        params[t].stop = stop;
        params[t].alphas=alphas;
        params[t].vec=vec;
        params[t].bias=b;
        params[t].progress = false; //true;
        params[t].progress_bar = &pb;
        params[t].index=index;
        dense_dot_range_helper((void*) &params[t]);

        for (t=0; t<num_threads-1; t++)
            pthread_join(threads[t], NULL);

        SG_FREE(params);
        SG_FREE(threads);
    }
#endif
    SG_FREE(index);
}

void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int32_t num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
{
    ASSERT(sub_index)
    ASSERT(output)

    uint32_t* index=SG_MALLOC(uint32_t, num);

    // TODO: port to use OpenMP backend instead of pthread
#ifdef HAVE_PTHREAD
    int32_t num_threads=parallel->get_num_threads();
#else
    int32_t num_threads=1;
#endif
    ASSERT(num_threads>0)

    if (dim != w_dim)
        SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim)

    if (num_threads < 2)
    {
        HASHEDWD_THREAD_PARAM params;
        auto pb = progress(range(num), *this->io);
        params.hf=this;
        params.sub_index=sub_index;
        params.output=output;
        params.start=0;
        params.stop=num;
        params.alphas=alphas;
        params.vec=vec;
        params.bias=b;
        params.progress=false; //true;
        params.progress_bar = &pb;
        params.index=index;
        dense_dot_range_helper((void*) &params);
        pb.complete();
    }
#ifdef HAVE_PTHREAD
    else
    {
        pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
        HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
        int32_t step= num/num_threads;
        auto pb = progress(range(num), *this->io);
        int32_t t;

        for (t=0; t<num_threads-1; t++)
        {
            params[t].hf = this;
            params[t].sub_index=sub_index;
            params[t].output = output;
            params[t].start = t*step;
            params[t].stop = (t+1)*step;
            params[t].alphas=alphas;
            params[t].vec=vec;
            params[t].bias=b;
            params[t].progress = false;
            params[t].progress_bar = &pb;
            params[t].index=index;
            pthread_create(&threads[t], NULL,
                CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*) &params[t]);
        }

        params[t].hf = this;
        params[t].sub_index=sub_index;
        params[t].output = output;
        params[t].start = t*step;
        params[t].stop = num;
        params[t].alphas=alphas;
        params[t].vec=vec;
        params[t].bias=b;
        params[t].progress = false; //true;
        params[t].progress_bar = &pb;
        params[t].index=index;
        dense_dot_range_helper((void*) &params[t]);

        for (t=0; t<num_threads-1; t++)
            pthread_join(threads[t], NULL);

        pb.complete();
        SG_FREE(params);
        SG_FREE(threads);
        SG_FREE(index);
    }
#endif
}
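
/* Thread worker: thanks to the transposed string layout, the outer loops run
 * over string position i and order k, while the innermost loop sweeps over
 * the requested vectors j, hashing one symbol per vector and accumulating
 * into output[j]. This keeps memory access sequential over
 * transposed_strings[i+k].string. */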
void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
{
    HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
    CHashedWDFeaturesTransposed* hf=par->hf;
    int32_t* sub_index=par->sub_index;
    float64_t* output=par->output;
    int32_t start=par->start;
    int32_t stop=par->stop;
    float64_t* alphas=par->alphas;
    float64_t* vec=par->vec;
    float64_t bias=par->bias;
    bool progress=par->progress;
    auto pb = par->progress_bar;
    uint32_t* index=par->index;
    int32_t string_length=hf->string_length;
    int32_t degree=hf->degree;
    float64_t* wd_weights=hf->wd_weights;
    SGString<uint8_t>* transposed_strings=hf->transposed_strings;
    uint32_t mask=hf->mask;
    int32_t partial_w_dim=hf->partial_w_dim;
    float64_t normalization_const=hf->normalization_const;

    if (sub_index)
    {
        for (int32_t j=start; j<stop; j++)
            output[j]=0.0;

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t carry = 0;
                uint32_t chunk = 0;
                for (int32_t j=start; j<stop; j++)
                {
                    uint8_t bval=dim[sub_index[j]];
                    if (k==0)
                        index[j] = 0xDEADBEAF;

                    CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);

                    chunk++;
                    uint32_t h =
                        CHash::FinalizeIncrementalMurmurHash3(
                            index[j], carry, chunk);

                    output[j]+=vec[o + (h & mask)]*wd;
                    index[j] = h;
                }

                index[stop-1] =
                    CHash::FinalizeIncrementalMurmurHash3(
                        index[stop-1], carry, chunk);

                o+=partial_w_dim;
            }
            offs+=partial_w_dim*degree;

            if (progress)
                pb->print_progress();
        }

        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }
    else
    {
        SGVector<float64_t>::fill_vector(&output[start], stop-start, 0.0);

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t carry = 0;
                uint32_t chunk = 0;

                for (int32_t j=start; j<stop; j++)
                {
                    uint8_t bval=dim[j];
                    if (k==0)
                        index[j] = 0xDEADBEAF;

                    CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);

                    chunk++;
                    uint32_t h =
                        CHash::FinalizeIncrementalMurmurHash3(
                            index[j], carry, chunk);

                    index[j] = h;
                    output[j]+=vec[o + (h & mask)]*wd;
                }

                index[stop-1] =
                    CHash::FinalizeIncrementalMurmurHash3(
                        index[stop-1], carry, chunk);

                o+=partial_w_dim;
            }
            offs+=partial_w_dim*degree;

            if (progress)
                pb->print_progress();
        }

        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[j]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }

    return NULL;
}
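
/* Scatter alpha * phi(x[vec_idx1]) into the dense vector vec2: every hashed
 * k-mer of the string adds alpha/normalization_const * wd_weights[k] (or its
 * absolute value when abs_val is set) to the bucket it hashes to. */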
void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
{
    if (vec2_len != w_dim)
        SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)

    int32_t len;
    bool free_vec1;
    uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
    uint32_t* val=SG_MALLOC(uint32_t, len);

    uint32_t offs=0;
    float64_t factor=alpha/normalization_const;
    if (abs_val)
        factor=CMath::abs(factor);

    SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);

    for (int32_t i=0; i<len; i++)
    {
        uint32_t o=offs;
        uint32_t carry = 0;
        uint32_t chunk = 0;

        for (int32_t k=0; k<degree && i+k<len; k++)
        {
            float64_t wd = wd_weights[k]*factor;
            chunk++;
            CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
            uint32_t h =
                CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
#ifdef DEBUG_HASHEDWD
            SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h)
            SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
#endif
            vec2[o+(h & mask)]+=wd;
            val[i] = h;
            o+=partial_w_dim;
        }

        val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
        offs+=partial_w_dim*degree;
    }

    SG_FREE(val);
    strings->free_feature_vector(vec, vec_idx1, free_vec1);
}
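
/* Weighted degree kernel weights: wd_weights[i] =
 * sqrt(2*(from_degree-i)/(from_degree*(from_degree+1))), i.e. linearly
 * decreasing weight over the orders; when degree equals from_degree the
 * squared weights sum to 1. Also derives mask, partial_w_dim and w_dim from
 * m_hash_bits. */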
void CHashedWDFeaturesTransposed::set_wd_weights()
{
    ASSERT(degree>0)

    mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
    partial_w_dim=1<<m_hash_bits;
    w_dim=partial_w_dim*string_length*(degree-start_degree);

    wd_weights=SG_MALLOC(float64_t, degree);

    for (int32_t i=0; i<degree; i++)
        wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));

    SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
        "dim=%d partial_dim=%d num=%d, len=%d\n",
        degree, from_degree, alphabet_size,
        w_dim, partial_w_dim, num_strings, string_length);
}
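
/* With n == 0 the normalization constant defaults to the WD self-similarity
 * of a fully matching string of length string_length,
 * sqrt(sum_i (string_length - i) * wd_weights[i]^2); otherwise the supplied
 * value n is used directly. */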
void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
{
    if (n==0)
    {
        normalization_const=0;
        for (int32_t i=0; i<degree; i++)
            normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];

        normalization_const = CMath::sqrt(normalization_const);
    }
    else
        normalization_const=n;

    SG_DEBUG("normalization_const:%f\n", normalization_const)
}

CFeatures* CHashedWDFeaturesTransposed::duplicate() const
{
    return new CHashedWDFeaturesTransposed(*this);
}

void* CHashedWDFeaturesTransposed::get_feature_iterator(int32_t vector_index)
{
    SG_NOTIMPLEMENTED
    return NULL;
}

bool CHashedWDFeaturesTransposed::get_next_feature(int32_t& index, float64_t& value, void* iterator)
{
    SG_NOTIMPLEMENTED
    return false;
}

void CHashedWDFeaturesTransposed::free_feature_iterator(void* iterator)
{
    SG_NOTIMPLEMENTED
}