SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringFeatures.cpp
Go to the documentation of this file.
5 #include <shogun/io/SGIO.h>
8 
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <dirent.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <unistd.h>
15 
16 
17 namespace shogun
18 {
19 
21 {
22  init();
23  alphabet=new CAlphabet();
24 }
25 
27 {
28  init();
29 
30  alphabet=new CAlphabet(alpha);
34 }
35 
/** Constructs string features from a string list over a built-in alphabet type.
 *
 * A new CAlphabet is created from `alpha` and the strings of `string_list`
 * are passed on to set_features().
 *
 * NOTE(review): the embedded listing numbers jump from 41 to 45 below, so
 * some statements of the original body appear to be missing from this
 * extract - confirm against the real source before relying on this text.
 *
 * @param string_list strings, their count and maximum length
 * @param alpha alphabet type used to construct the CAlphabet
 */
36 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
37 : CFeatures(0)
38 {
39  init();
40 
41  alphabet=new CAlphabet(alpha);
45  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
46 }
47 
/** Constructs string features from a string list and an existing alphabet object.
 *
 * A new CAlphabet is constructed from the passed `alpha` pointer (the
 * pointer itself is not stored here) and the strings of `string_list` are
 * passed on to set_features().
 *
 * NOTE(review): the embedded listing numbers jump from 53 to 57 below, so
 * some statements of the original body appear to be missing from this
 * extract - confirm against the real source.
 *
 * @param string_list strings, their count and maximum length
 * @param alpha alphabet object the new alphabet is created from
 */
48 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
49 : CFeatures(0)
50 {
51  init();
52 
53  alphabet=new CAlphabet(alpha);
57  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
58 }
59 
61 : CFeatures(0)
62 {
63  init();
64 
65  ASSERT(alpha)
66  SG_REF(alpha);
67  alphabet=alpha;
70 }
71 
/** Copy constructor.
 *
 * Scalar members are copied in the initializer list; preprocess_on_get is
 * set to false and feature_cache to NULL rather than copied. Copying an
 * object that holds a single_string is rejected by the ASSERT below.
 *
 * NOTE(review): this extract is incomplete - the embedded listing numbers
 * skip 87, 104, 107, 110 and 111, and the `for` loop over the 256 mask
 * entries has lost its body. Do not treat the text below as compilable.
 * NOTE(review): init() runs after the member initializers; confirm it does
 * not reset the values that were just copied.
 *
 * @param orig object to copy from
 */
72 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
73 : CFeatures(orig), num_vectors(orig.num_vectors),
74  single_string(orig.single_string),
75  length_of_single_string(orig.length_of_single_string),
76  max_string_length(orig.max_string_length),
77  num_symbols(orig.num_symbols),
78  original_num_symbols(orig.original_num_symbols),
79  order(orig.order), preprocess_on_get(false),
80  feature_cache(NULL)
81 {
82  init();
83 
84  ASSERT(orig.single_string == NULL) //not implemented
85 
// the alphabet pointer is shared with orig, not cloned
86  alphabet=orig.alphabet;
88 
// deep copy: every string gets its own buffer of slen elements
89  if (orig.features)
90  {
91  features=SG_MALLOC(SGString<ST>, orig.num_vectors);
92 
93  for (int32_t i=0; i<num_vectors; i++)
94  {
95  features[i].string=SG_MALLOC(ST, orig.features[i].slen);
96  features[i].slen=orig.features[i].slen;
97  memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
98  }
99  }
100 
// a 256-entry mask table is allocated when orig has one
101  if (orig.symbol_mask_table)
102  {
103  symbol_mask_table=SG_MALLOC(ST, 256);
105 
// NOTE(review): loop body (listing line 107) is missing from this extract
106  for (int32_t i=0; i<256; i++)
108  }
109 
112 }
113 
/** Constructs string features by loading them from a file.
 *
 * All members are initialised to an empty state, a new CAlphabet of type
 * `alpha` is created and referenced, and load() is invoked on `loader`.
 *
 * NOTE(review): the embedded listing numbers jump from 123 to 126 below, so
 * some statements of the original body appear to be missing from this
 * extract.
 *
 * @param loader file object handed to load()
 * @param alpha alphabet type used to construct the CAlphabet
 */
114 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
115 : CFeatures(), num_vectors(0),
116  features(NULL), single_string(NULL), length_of_single_string(0),
117  max_string_length(0), order(0),
118  preprocess_on_get(false), feature_cache(NULL)
119 {
120  init();
121 
122  alphabet=new CAlphabet(alpha);
123  SG_REF(alphabet);
126  load(loader);
127 }
128 
130 {
131  cleanup();
132 
133  SG_UNREF(alphabet);
134 }
135 
136 template<class ST> void CStringFeatures<ST>::cleanup()
137 {
138  remove_all_subsets();
139 
140  if (single_string)
141  {
142  SG_FREE(single_string);
143  single_string=NULL;
144  }
145  else
146  cleanup_feature_vectors(0, num_vectors-1);
147 
148  /*
149  if (single_string)
150  {
151  SG_FREE(single_string);
152  single_string=NULL;
153  }
154  else
155  cleanup_feature_vectors(0, num_vectors-1);
156  */
157 
158  num_vectors=0;
159  SG_FREE(features);
160  SG_FREE(symbol_mask_table);
161  features=NULL;
162  symbol_mask_table=NULL;
163 
164  /* start with a fresh alphabet, but instead of emptying the histogram
165  * create a new object (to leave the alphabet object alone if it is used
166  * by others)
167  */
168  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
169  SG_UNREF(alphabet);
170  alphabet=alpha;
171  SG_REF(alphabet);
172 }
173 
174 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
175 {
176  ASSERT(num<get_num_vectors())
177 
178  if (features)
179  {
180  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
181  SG_FREE(features[real_num].string);
182  features[real_num].string=NULL;
183  features[real_num].slen=0;
184 
185  determine_maximum_string_length();
186  }
187 }
188 
189 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
190 {
191  if (features && get_num_vectors())
192  {
193  ASSERT(start<get_num_vectors())
194  ASSERT(stop<get_num_vectors())
195 
196  for (int32_t i=start; i<=stop; i++)
197  {
198  int32_t real_num=m_subset_stack->subset_idx_conversion(i);
199  SG_FREE(features[real_num].string);
200  features[real_num].string=NULL;
201  features[real_num].slen=0;
202  }
203  determine_maximum_string_length();
204  }
205 }
206 
207 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
208 
209 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
210 
212 {
213  SG_REF(alphabet);
214  return alphabet;
215 }
216 
217 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
218 {
219  return new CStringFeatures<ST>(*this);
220 }
221 
223 {
224  ASSERT(features)
225  if (num>=get_num_vectors())
226  {
227  SG_ERROR("Index out of bounds (number of strings %d, you "
228  "requested %d)\n", get_num_vectors(), num);
229  }
230 
231  int32_t l;
232  bool free_vec;
233  ST* vec=get_feature_vector(num, l, free_vec);
234  ST* dst=SG_MALLOC(ST, l);
235  memcpy(dst, vec, l*sizeof(ST));
236  free_feature_vector(vec, num, free_vec);
237  return SGVector<ST>(dst, l, true);
238 }
239 
240 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
241 {
242  ASSERT(features)
243 
244  if (m_subset_stack->has_subsets())
245  SG_ERROR("A subset is set, cannot set feature vector\n")
246 
247  if (num>=num_vectors)
248  {
249  SG_ERROR("Index out of bounds (number of strings %d, you "
250  "requested %d)\n", num_vectors, num);
251  }
252 
253  if (vector.vlen<=0)
254  SG_ERROR("String has zero or negative length\n")
255 
256  cleanup_feature_vector(num);
257  features[num].slen=vector.vlen;
258  features[num].string=SG_MALLOC(ST, vector.vlen);
259  memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
260 
261  determine_maximum_string_length();
262 }
263 
265 {
266  preprocess_on_get=true;
267 }
268 
270 {
271  preprocess_on_get=false;
272 }
273 
274 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
275 {
276  ASSERT(features)
277  if (num>=get_num_vectors())
278  SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors())
279 
280  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
281 
282  if (!preprocess_on_get)
283  {
284  dofree=false;
285  len=features[real_num].slen;
286  return features[real_num].string;
287  }
288  else
289  {
290  SG_DEBUG("computing feature vector!\n")
291  ST* feat=compute_feature_vector(num, len);
292  dofree=true;
293 
294  if (get_num_preprocessors())
295  {
296  ST* tmp_feat_before=feat;
297 
298  for (int32_t i=0; i<get_num_preprocessors(); i++)
299  {
300  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
301  feat=p->apply_to_string(tmp_feat_before, len);
302  SG_UNREF(p);
303  SG_FREE(tmp_feat_before);
304  tmp_feat_before=feat;
305  }
306  }
307  // TODO: implement caching
308  return feat;
309  }
310 }
311 
313 {
314  int32_t num_feat;
315  int32_t num_vec;
316  SGString<ST>* s=get_transposed(num_feat, num_vec);
317  SGStringList<ST> string_list;
318  string_list.strings = s;
319  string_list.num_strings = num_vec;
320  string_list.max_string_length = num_feat;
321 
322  return new CStringFeatures<ST>(string_list, alphabet);
323 }
324 
325 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
326 {
327  num_feat=get_num_vectors();
328  num_vec=get_max_vector_length();
329  ASSERT(have_same_length())
330 
331  SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
332  int64_t(num_feat)*num_vec);
333 
334  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
335 
336  for (int32_t i=0; i<num_vec; i++)
337  {
338  sf[i].string=SG_MALLOC(ST, num_feat);
339  sf[i].slen=num_feat;
340  }
341 
342  for (int32_t i=0; i<num_feat; i++)
343  {
344  int32_t len=0;
345  bool free_vec=false;
346  ST* vec=get_feature_vector(i, len, free_vec);
347 
348  for (int32_t j=0; j<num_vec; j++)
349  sf[j].string[i]=vec[j];
350 
351  free_feature_vector(vec, i, free_vec);
352  }
353  return sf;
354 }
355 
356 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
357 {
358  if (num>=get_num_vectors())
359  {
360  SG_ERROR(
361  "Trying to access string[%d] but num_str=%d\n", num,
362  get_num_vectors());
363  }
364 
365  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
366 
367  if (feature_cache)
368  feature_cache->unlock_entry(real_num);
369 
370  if (dofree)
371  SG_FREE(feat_vec);
372 }
373 
374 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
375 {
376  if (num>=get_num_vectors())
377  {
378  SG_ERROR(
379  "Trying to access string[%d] but num_str=%d\n", num,
380  get_num_vectors());
381  }
382 
383  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
384 
385  if (feature_cache)
386  feature_cache->unlock_entry(real_num);
387 }
388 
389 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
390 {
391  ASSERT(vec_num<get_num_vectors())
392 
393  int32_t len;
394  bool free_vec;
395  ST* vec=get_feature_vector(vec_num, len, free_vec);
396  ASSERT(feat_num<len)
397  ST result=vec[feat_num];
398  free_feature_vector(vec, vec_num, free_vec);
399 
400  return result;
401 }
402 
403 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
404 {
405  ASSERT(vec_num<get_num_vectors())
406 
407  int32_t len;
408  bool free_vec;
409  ST* vec=get_feature_vector(vec_num, len, free_vec);
410  free_feature_vector(vec, vec_num, free_vec);
411  return len;
412 }
413 
414 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
415 {
416  return max_string_length;
417 }
418 
419 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
420 {
421  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
422 }
423 
424 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
425 
426 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
427 
428 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
429 
430 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
431 
432 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
433 {
434  ASSERT(symbol_mask_table)
435  return symbol_mask_table[mask] & symbol;
436 }
437 
438 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
439 {
440  ASSERT(alphabet)
441  return (offset << (amount*alphabet->get_num_bits()));
442 }
443 
444 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
445 {
446  ASSERT(alphabet)
447  return (symbol >> (amount*alphabet->get_num_bits()));
448 }
449 
450 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
451  EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
452 {
453  remove_all_subsets();
454 
455  size_t blocksize=1024*1024;
456  size_t required_blocksize=0;
457  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
458  uint8_t* overflow=NULL;
459  int32_t overflow_len=0;
460 
461  cleanup();
462 
463  CAlphabet* alpha=new CAlphabet(ascii_alphabet);
464  CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
465 
466  FILE* f=fopen(fname, "ro");
467 
468  if (f)
469  {
470  num_vectors=0;
471  max_string_length=0;
472 
473  SG_INFO("counting line numbers in file %s\n", fname)
474  size_t block_offs=0;
475  size_t old_block_offs=0;
476  fseek(f, 0, SEEK_END);
477  size_t fsize=ftell(f);
478  rewind(f);
479 
480  if (blocksize>fsize)
481  blocksize=fsize;
482 
483  SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize)
484 
485  size_t sz=blocksize;
486  while (sz == blocksize)
487  {
488  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
489  for (size_t i=0; i<sz; i++)
490  {
491  block_offs++;
492  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
493  {
494  num_vectors++;
495  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
496  old_block_offs=block_offs;
497  }
498  }
499  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t")
500  }
501 
502  SG_INFO("found %d strings\n", num_vectors)
503  SG_FREE(dummy);
504  blocksize=required_blocksize;
505  dummy=SG_MALLOC(uint8_t, blocksize);
506  overflow=SG_MALLOC(uint8_t, blocksize);
507  features=SG_MALLOC(SGString<ST>, num_vectors);
508 
509  rewind(f);
510  sz=blocksize;
511  int32_t lines=0;
512  while (sz == blocksize)
513  {
514  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
515 
516  size_t old_sz=0;
517  for (size_t i=0; i<sz; i++)
518  {
519  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
520  {
521  int32_t len=i-old_sz;
522  //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz)
523  max_string_length=CMath::max(max_string_length, len+overflow_len);
524 
525  features[lines].slen=len;
526  features[lines].string=SG_MALLOC(ST, len);
527 
528  if (remap_to_bin)
529  {
530  for (int32_t j=0; j<overflow_len; j++)
531  features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
532  for (int32_t j=0; j<len; j++)
533  features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
534  alpha->add_string_to_histogram(&dummy[old_sz], len);
535  alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
536  }
537  else
538  {
539  for (int32_t j=0; j<overflow_len; j++)
540  features[lines].string[j]=overflow[j];
541  for (int32_t j=0; j<len; j++)
542  features[lines].string[j+overflow_len]=dummy[old_sz+j];
543  alpha->add_string_to_histogram(&dummy[old_sz], len);
544  alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
545  }
546 
547  // clear overflow
548  overflow_len=0;
549 
550  //CMath::display_vector(features[lines].string, len);
551  old_sz=i+1;
552  lines++;
553  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t")
554  }
555  }
556  for (size_t i=old_sz; i<sz; i++)
557  overflow[i-old_sz]=dummy[i];
558 
559  overflow_len=sz-old_sz;
560  }
561 
562  if (alpha->check_alphabet_size() && alpha->check_alphabet())
563  {
564  SG_INFO("file successfully read\n")
565  SG_INFO("max_string_length=%d\n", max_string_length)
566  SG_INFO("num_strings=%d\n", num_vectors)
567  }
568  fclose(f);
569  }
570 
571  SG_FREE(dummy);
572 
573  SG_UNREF(alphabet);
574 
575  if (remap_to_bin)
576  alphabet=alpha_bin;
577  else
578  alphabet=alpha;
579  SG_REF(alphabet);
580  num_symbols=alphabet->get_num_symbols();
581 }
582 
583 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
584 {
585  remove_all_subsets();
586 
587  int32_t i=0;
588  uint64_t len=0;
589  uint64_t offs=0;
590  int32_t num=0;
591  int32_t max_len=0;
592 
593  CMemoryMappedFile<char> f(fname);
594 
595  while (true)
596  {
597  char* s=f.get_line(len, offs);
598  if (!s)
599  break;
600 
601  if (len>0 && s[0]=='>')
602  num++;
603  }
604 
605  if (num==0)
606  SG_ERROR("No fasta hunks (lines starting with '>') found\n")
607 
608  cleanup();
609  SG_UNREF(alphabet);
610  alphabet=new CAlphabet(DNA);
611  num_symbols=alphabet->get_num_symbols();
612 
613  SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
614  offs=0;
615 
616  for (i=0;i<num; i++)
617  {
618  uint64_t id_len=0;
619  char* id=f.get_line(id_len, offs);
620 
621  char* fasta=f.get_line(len, offs);
622  char* s=fasta;
623  int32_t fasta_len=0;
624  int32_t spanned_lines=0;
625 
626  while (true)
627  {
628  if (!s || len==0)
629  SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len)
630 
631  if (s[0]=='>' || offs==f.get_size())
632  {
633  offs-=len+1; // seek to beginning
634  if (offs==f.get_size())
635  {
636  SG_DEBUG("at EOF\n")
637  fasta_len+=len;
638  }
639 
640  len=fasta_len-spanned_lines;
641  strings[i].string=SG_MALLOC(ST, len);
642  strings[i].slen=len;
643 
644  ST* str=strings[i].string;
645  int32_t idx=0;
646  SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines)
647 
648  for (int32_t j=0; j<fasta_len; j++)
649  {
650  if (fasta[j]=='\n')
651  continue;
652 
653  ST c=(ST) fasta[j];
654 
655  if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
656  c=(ST) 'A';
657 
658  if (uint64_t(idx)>=len)
659  SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
660  str[idx++]=c;
661  }
662  max_len=CMath::max(max_len, strings[i].slen);
663 
664 
665  break;
666  }
667 
668  spanned_lines++;
669  fasta_len+=len+1; // including '\n'
670  s=f.get_line(len, offs);
671  }
672  }
673  return set_features(strings, num, max_len);
674 }
675 
676 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
677  bool ignore_invalid, bool bitremap_in_single_string)
678 {
679  remove_all_subsets();
680 
681  CMemoryMappedFile<char> f(fname);
682 
683  int32_t i=0;
684  uint64_t len=0;
685  uint64_t offs=0;
686 
687  int32_t num=f.get_num_lines();
688  int32_t max_len=0;
689 
690  if (num%4)
691  SG_ERROR("Number of lines must be divisible by 4 in fastq files\n")
692  num/=4;
693 
694  cleanup();
695  SG_UNREF(alphabet);
696  alphabet=new CAlphabet(DNA);
697 
698  SGString<ST>* strings;
699 
700  ST* str=NULL;
701  if (bitremap_in_single_string)
702  {
703  strings=SG_MALLOC(SGString<ST>, 1);
704  strings[0].string=SG_MALLOC(ST, num);
705  strings[0].slen=num;
706  f.get_line(len, offs);
707  f.get_line(len, offs);
708  order=len;
709  max_len=num;
710  offs=0;
711  original_num_symbols=alphabet->get_num_symbols();
712  str=SG_MALLOC(ST, len);
713  }
714  else
715  strings=SG_MALLOC(SGString<ST>, num);
716 
717  for (i=0;i<num; i++)
718  {
719  if (!f.get_line(len, offs))
720  SG_ERROR("Error reading 'read' identifier in line %d", 4*i)
721 
722  char* s=f.get_line(len, offs);
723  if (!s || len==0)
724  SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len)
725 
726  if (bitremap_in_single_string)
727  {
728  if (len!=(uint64_t) order)
729  SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
730  for (int32_t j=0; j<order; j++)
731  str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
732 
733  strings[0].string[i]=embed_word(str, order);
734  }
735  else
736  {
737  strings[i].string=SG_MALLOC(ST, len);
738  strings[i].slen=len;
739  str=strings[i].string;
740 
741  if (ignore_invalid)
742  {
743  for (uint64_t j=0; j<len; j++)
744  {
745  if (alphabet->is_valid((uint8_t) s[j]))
746  str[j]= (ST) s[j];
747  else
748  str[j]= (ST) 'A';
749  }
750  }
751  else
752  {
753  for (uint64_t j=0; j<len; j++)
754  str[j]= (ST) s[j];
755  }
756  max_len=CMath::max(max_len, (int32_t) len);
757  }
758 
759 
760  if (!f.get_line(len, offs))
761  SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2)
762 
763  if (!f.get_line(len, offs))
764  SG_ERROR("Error reading 'read' quality in line %d", 4*i+3)
765  }
766 
767  if (bitremap_in_single_string)
768  num=1;
769 
770  num_vectors=num;
771  max_string_length=max_len;
772  features=strings;
773 
774  return true;
775 }
776 
777 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
778 {
779  remove_all_subsets();
780 
781  struct dirent **namelist;
782  int32_t n;
783 
784  SGIO::set_dirname(dirname);
785 
786  SG_DEBUG("dirname '%s'\n", dirname)
787 
788  n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
789  if (n <= 0)
790  {
791  SG_ERROR("error calling scandir - no files found\n")
792  return false;
793  }
794  else
795  {
796  SGString<ST>* strings=NULL;
797 
798  int32_t num=0;
799  int32_t max_len=-1;
800 
801  //usually n==num_vec, but it might not in race conditions
802  //(file perms modified, file erased)
803  strings=SG_MALLOC(SGString<ST>, n);
804 
805  for (int32_t i=0; i<n; i++)
806  {
807  char* fname=SGIO::concat_filename(namelist[i]->d_name);
808 
809  struct stat s;
810  off_t filesize=0;
811 
812  if (!stat(fname, &s) && s.st_size>0)
813  {
814  filesize=s.st_size/sizeof(ST);
815 
816  FILE* f=fopen(fname, "ro");
817  if (f)
818  {
819  ST* str=SG_MALLOC(ST, filesize);
820  SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize)
821  if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
822  SG_ERROR("failed to read file\n")
823  strings[num].string=str;
824  strings[num].slen=filesize;
825  max_len=CMath::max(max_len, strings[num].slen);
826 
827  num++;
828  fclose(f);
829  }
830  }
831  else
832  SG_ERROR("empty or non readable file \'%s\'\n", fname)
833 
834  SG_FREE(namelist[i]);
835  }
836  SG_FREE(namelist);
837 
838  if (num>0 && strings)
839  {
840  set_features(strings, num, max_len);
841  return true;
842  }
843  }
844  return false;
845 }
846 
848 {
849  set_features(feats.strings, feats.num_strings, feats.max_string_length);
850 }
851 
852 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
853 {
854  if (m_subset_stack->has_subsets())
855  SG_ERROR("Cannot call set_features() with subset.\n")
856 
857  if (p_features)
858  {
859  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
860 
861  //compute histogram for char/byte
862  for (int32_t i=0; i<p_num_vectors; i++)
863  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
864 
865  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
866  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
867 
868  if (alpha->check_alphabet_size() && alpha->check_alphabet())
869  {
870  cleanup();
871  SG_UNREF(alphabet);
872 
873  alphabet=alpha;
874  SG_REF(alphabet);
875 
876  // TODO remove copying
877  features = SG_MALLOC(SGString<ST>,p_num_vectors);
878  memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors);
879  num_vectors = p_num_vectors;
880  max_string_length = p_max_string_length;
881 
882  return true;
883  }
884  else
885  SG_UNREF(alpha);
886  }
887 
888  return false;
889 }
890 
892 {
893  ASSERT(sf)
894 
895  if (m_subset_stack->has_subsets())
896  SG_ERROR("Cannot call set_features() with subset.\n")
897 
898  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
899 
900  index_t sf_num_str=sf->get_num_vectors();
901  for (int32_t i=0; i<sf_num_str; i++)
902  {
903  int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
904  int32_t length=sf->features[real_i].slen;
905  new_features[i].string=SG_MALLOC(ST, length);
906  memcpy(new_features[i].string, sf->features[real_i].string, length);
907  new_features[i].slen=length;
908  }
909  return append_features(new_features, sf_num_str,
910  sf->max_string_length);
911 }
912 
913 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
914 {
915  if (m_subset_stack->has_subsets())
916  SG_ERROR("Cannot call set_features() with subset.\n")
917 
918  if (!features)
919  return set_features(p_features, p_num_vectors, p_max_string_length);
920 
921  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
922 
923  //compute histogram for char/byte
924  for (int32_t i=0; i<p_num_vectors; i++)
925  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
926 
927  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
928  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
929 
930  if (alpha->check_alphabet_size() && alpha->check_alphabet())
931  {
932  SG_UNREF(alpha);
933  for (int32_t i=0; i<p_num_vectors; i++)
934  alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
935 
936  int32_t old_num_vectors=num_vectors;
937  num_vectors=old_num_vectors+p_num_vectors;
938  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
939 
940  for (int32_t i=0; i<num_vectors; i++)
941  {
942  if (i<old_num_vectors)
943  {
944  new_features[i].string=features[i].string;
945  new_features[i].slen=features[i].slen;
946  }
947  else
948  {
949  new_features[i].string=p_features[i-old_num_vectors].string;
950  new_features[i].slen=p_features[i-old_num_vectors].slen;
951  }
952  }
953  SG_FREE(features);
954  SG_FREE(p_features); // free now obsolete features
955 
956  this->features=new_features;
957  max_string_length=CMath::max(max_string_length, p_max_string_length);
958 
959  return true;
960  }
961  SG_UNREF(alpha);
962 
963  return false;
964 }
965 
967 {
968  SGStringList<ST> sl(NULL,0,0,false);
969 
970  sl.strings=get_features(sl.num_strings, sl.max_string_length);
971  return sl;
972 }
973 
974 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
975 {
976  if (m_subset_stack->has_subsets())
977  SG_ERROR("get features() is not possible on subset")
978 
979  num_str=num_vectors;
980  max_str_len=max_string_length;
981  return features;
982 }
983 
984 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
985 {
986  ASSERT(num_vectors>0)
987 
988  num_str=get_num_vectors();
989  max_str_len=max_string_length;
990  SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
991 
992  for (int32_t i=0; i<num_str; i++)
993  {
994  int32_t len;
995  bool free_vec;
996  ST* vec=get_feature_vector(i, len, free_vec);
997  new_feat[i].string=SG_MALLOC(ST, len);
998  new_feat[i].slen=len;
999  memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
1000  free_feature_vector(vec, i, free_vec);
1001  }
1002 
1003  return new_feat;
1004 }
1005 
1006 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
1007 {
1008  int32_t num_vec;
1009  int32_t max_str_len;
1010  *dst=copy_features(num_vec, max_str_len);
1011  *num_str=num_vec;
1012 }
1013 
1014 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
1015 {
1016  remove_all_subsets();
1017 
1018  FILE* file=NULL;
1019 
1020  if (!(file=fopen(src, "r")))
1021  return false;
1022  cleanup();
1023 
1024  // header shogun v0
1025  char id[4];
1026  if (fread(&id[0], sizeof(char), 1, file)!=1)
1027  SG_ERROR("failed to read header")
1028  ASSERT(id[0]=='S')
1029  if (fread(&id[1], sizeof(char), 1, file)!=1)
1030  SG_ERROR("failed to read header")
1031  ASSERT(id[1]=='G')
1032  if (fread(&id[2], sizeof(char), 1, file)!=1)
1033  SG_ERROR("failed to read header")
1034  ASSERT(id[2]=='V')
1035  if (fread(&id[3], sizeof(char), 1, file)!=1)
1036  SG_ERROR("failed to read header")
1037  ASSERT(id[3]=='0')
1038 
1039  //compression type
1040  uint8_t c;
1041  if (fread(&c, sizeof(uint8_t), 1, file)!=1)
1042  SG_ERROR("failed to read compression type")
1043  CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
1044  //alphabet
1045  uint8_t a;
1046  delete alphabet;
1047  if (fread(&a, sizeof(uint8_t), 1, file)!=1)
1048  SG_ERROR("failed to read compression alphabet")
1049  alphabet=new CAlphabet((EAlphabet) a);
1050  // number of vectors
1051  if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
1052  SG_ERROR("failed to read compression number of vectors")
1053  ASSERT(num_vectors>0)
1054  // maximum string length
1055  if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
1056  SG_ERROR("failed to read maximum string length")
1057  ASSERT(max_string_length>0)
1058 
1059  features=SG_MALLOC(SGString<ST>, num_vectors);
1060 
1061  // vectors
1062  for (int32_t i=0; i<num_vectors; i++)
1063  {
1064  // vector len compressed
1065  int32_t len_compressed;
1066  if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
1067  SG_ERROR("failed to read vector length compressed")
1068  // vector len uncompressed
1069  int32_t len_uncompressed;
1070  if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
1071  SG_ERROR("failed to read vector length uncompressed")
1072 
1073  // vector raw data
1074  if (decompress)
1075  {
1076  features[i].string=SG_MALLOC(ST, len_uncompressed);
1077  features[i].slen=len_uncompressed;
1078  uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1079  if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
1080  SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed)
1081  uint64_t uncompressed_size=len_uncompressed;
1082  uncompressed_size*=sizeof(ST);
1083  compressor->decompress(compressed, len_compressed,
1084  (uint8_t*) features[i].string, uncompressed_size);
1085  SG_FREE(compressed);
1086  ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST))
1087  }
1088  else
1089  {
1090  int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
1091  features[i].string=SG_MALLOC(ST, len_compressed+offs);
1092  features[i].slen=len_compressed+offs;
1093  int32_t* feat32ptr=((int32_t*) (features[i].string));
1094  memset(features[i].string, 0, offs*sizeof(ST));
1095  feat32ptr[0]=(int32_t) len_compressed;
1096  feat32ptr[1]=(int32_t) len_uncompressed;
1097  uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
1098  if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1099  SG_ERROR("failed to read uncompressed data")
1100  }
1101  }
1102 
1103  delete compressor;
1104  fclose(file);
1105 
1106  return false;
1107 }
1108 
1109 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
1110 {
1111  if (m_subset_stack->has_subsets())
1112  SG_ERROR("save_compressed() is not possible on subset")
1113 
1114  FILE* file=NULL;
1115 
1116  if (!(file=fopen(dest, "wb")))
1117  return false;
1118 
1119  CCompressor* compressor= new CCompressor(compression);
1120 
1121  // header shogun v0
1122  const char* id="SGV0";
1123  fwrite(&id[0], sizeof(char), 1, file);
1124  fwrite(&id[1], sizeof(char), 1, file);
1125  fwrite(&id[2], sizeof(char), 1, file);
1126  fwrite(&id[3], sizeof(char), 1, file);
1127 
1128  //compression type
1129  uint8_t c=(uint8_t) compression;
1130  fwrite(&c, sizeof(uint8_t), 1, file);
1131  //alphabet
1132  uint8_t a=(uint8_t) alphabet->get_alphabet();
1133  fwrite(&a, sizeof(uint8_t), 1, file);
1134  // number of vectors
1135  fwrite(&num_vectors, sizeof(int32_t), 1, file);
1136  // maximum string length
1137  fwrite(&max_string_length, sizeof(int32_t), 1, file);
1138 
1139  // vectors
1140  for (int32_t i=0; i<num_vectors; i++)
1141  {
1142  int32_t len=-1;
1143  bool vfree;
1144  ST* vec=get_feature_vector(i, len, vfree);
1145 
1146  uint8_t* compressed=NULL;
1147  uint64_t compressed_size=0;
1148 
1149  compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
1150  compressed, compressed_size, level);
1151 
1152  int32_t len_compressed=(int32_t) compressed_size;
1153  // vector len compressed in bytes
1154  fwrite(&len_compressed, sizeof(int32_t), 1, file);
1155  // vector len uncompressed in number of elements of type ST
1156  fwrite(&len, sizeof(int32_t), 1, file);
1157  // vector raw data
1158  fwrite(compressed, compressed_size, 1, file);
1159  SG_FREE(compressed);
1160 
1161  free_feature_vector(vec, i, vfree);
1162  }
1163 
1164  delete compressor;
1165  fclose(file);
1166  return true;
1167 }
1168 
1169 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
1170 {
1171  SG_DEBUG("force: %d\n", force_preprocessing)
1172 
1173  for (int32_t i=0; i<get_num_preprocessors(); i++)
1174  {
1175  if ( (!is_preprocessed(i) || force_preprocessing) )
1176  {
1177  set_preprocessed(i);
1178  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
1179  SG_INFO("preprocessing using preproc %s\n", p->get_name())
1180 
1181  if (!p->apply_to_string_features(this))
1182  {
1183  SG_UNREF(p);
1184  return false;
1185  }
1186  else
1187  SG_UNREF(p);
1188  }
1189  }
1190  return true;
1191 }
1192 
1193 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
1194 {
1195  if (m_subset_stack->has_subsets())
1197 
1198  ASSERT(step_size>0)
1199  ASSERT(window_size>0)
1200  ASSERT(num_vectors==1 || single_string)
1201  ASSERT(max_string_length>=window_size ||
1202  (single_string && length_of_single_string>=window_size));
1203 
1204  //in case we are dealing with a single remapped string
1205  //allow remapping
1206  if (single_string)
1207  num_vectors= (length_of_single_string-window_size)/step_size + 1;
1208  else if (num_vectors==1)
1209  {
1210  num_vectors= (max_string_length-window_size)/step_size + 1;
1211  length_of_single_string=max_string_length;
1212  }
1213 
1214  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1215  int32_t offs=0;
1216  for (int32_t i=0; i<num_vectors; i++)
1217  {
1218  f[i].string=&features[0].string[offs+skip];
1219  f[i].slen=window_size-skip;
1220  offs+=step_size;
1221  }
1222  single_string=features[0].string;
1223  SG_FREE(features);
1224  features=f;
1225  max_string_length=window_size-skip;
1226 
1227  return num_vectors;
1228 }
1229 
1230 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
1231  int32_t skip)
1232 {
1233  if (m_subset_stack->has_subsets())
1235 
1236  ASSERT(positions)
1237  ASSERT(window_size>0)
1238  ASSERT(num_vectors==1 || single_string)
1239  ASSERT(max_string_length>=window_size ||
1240  (single_string && length_of_single_string>=window_size));
1241 
1242  num_vectors= positions->get_num_elements();
1243  ASSERT(num_vectors>0)
1244 
1245  int32_t len;
1246 
1247  //in case we are dealing with a single remapped string
1248  //allow remapping
1249  if (single_string)
1250  len=length_of_single_string;
1251  else
1252  {
1253  single_string=features[0].string;
1254  len=max_string_length;
1255  length_of_single_string=max_string_length;
1256  }
1257 
1258  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1259  for (int32_t i=0; i<num_vectors; i++)
1260  {
1261  int32_t p=positions->get_element(i);
1262 
1263  if (p>=0 && p<=len-window_size)
1264  {
1265  f[i].string=&features[0].string[p+skip];
1266  f[i].slen=window_size-skip;
1267  }
1268  else
1269  {
1270  num_vectors=1;
1271  max_string_length=len;
1272  features[0].slen=len;
1273  single_string=NULL;
1274  SG_FREE(f);
1275  SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1276  window_size, i, p, len);
1277  return -1;
1278  }
1279  }
1280 
1281  SG_FREE(features);
1282  features=f;
1283  max_string_length=window_size-skip;
1284 
1285  return num_vectors;
1286 }
1287 
1288 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1289 {
1290  return obtain_from_char_features(sf, start, p_order, gap, rev);
1291 }
1292 
1293 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
1294 {
1295  if (len!=-1)
1296  {
1297  if (len!=max_string_length)
1298  return false;
1299  }
1300  len=max_string_length;
1301 
1302  index_t num_str=get_num_vectors();
1303  for (int32_t i=0; i<num_str; i++)
1304  {
1305  if (get_vector_length(i)!=len)
1306  return false;
1307  }
1308 
1309  return true;
1310 }
1311 
1312 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
1313 {
1314  if (m_subset_stack->has_subsets())
1316 
1317  ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
1318 
1319  order=p_order;
1320  original_num_symbols=alphabet->get_num_symbols();
1321  int32_t max_val=alphabet->get_num_bits();
1322 
1323  if (p_order>1)
1324  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1325  else
1326  num_symbols=original_num_symbols;
1327 
1328  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
1329 
1330  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1331  SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
1332 
1333  ST mask=0;
1334  for (int32_t i=0; i<p_order*max_val; i++)
1335  mask= (mask<<1) | ((ST) 1);
1336 
1337  for (int32_t i=0; i<num_vectors; i++)
1338  {
1339  int32_t len=features[i].slen;
1340 
1341  if (len < p_order)
1342  SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order)
1343 
1344  ST* str=features[i].string;
1345 
1346  // convert first word
1347  for (int32_t j=0; j<p_order; j++)
1348  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1349  str[0]=embed_word(&str[0], p_order);
1350 
1351  // convert the rest
1352  int32_t idx=0;
1353  for (int32_t j=p_order; j<len; j++)
1354  {
1355  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1356  str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1357  idx++;
1358  }
1359 
1360  features[i].slen=len-p_order+1;
1361  }
1362 
1363  compute_symbol_mask_table(max_val);
1364 }
1365 
1366 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
1367 {
1368  if (m_subset_stack->has_subsets())
1370 
1371  SG_FREE(symbol_mask_table);
1372  symbol_mask_table=SG_MALLOC(ST, 256);
1373  symbol_mask_table_len=256;
1374 
1375  uint64_t mask=0;
1376  for (int32_t i=0; i< (int64_t) max_val; i++)
1377  mask=(mask<<1) | 1;
1378 
1379  for (int32_t i=0; i<256; i++)
1380  {
1381  uint8_t bits=(uint8_t) i;
1382  symbol_mask_table[i]=0;
1383 
1384  for (int32_t j=0; j<8; j++)
1385  {
1386  if (bits & 1)
1387  symbol_mask_table[i]|=mask<<(max_val*j);
1388 
1389  bits>>=1;
1390  }
1391  }
1392 }
1393 
1394 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
1395 {
1396  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1397 
1398  ST mask=0;
1399  for (uint32_t i=0; i<nbits; i++)
1400  mask=(mask<<1) | (ST) 1;
1401 
1402  for (int32_t i=0; i<len; i++)
1403  {
1404  ST w=(word & mask);
1405  seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1406  word>>=nbits;
1407  }
1408 }
1409 
1410 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
1411 {
1412  ST value=(ST) 0;
1413  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1414  for (int32_t i=0; i<len; i++)
1415  {
1416  value<<=nbits;
1417  value|=seq[i];
1418  }
1419 
1420  return value;
1421 }
1422 
1424 {
1425  max_string_length=0;
1426  index_t num_str=get_num_vectors();
1427 
1428  for (int32_t i=0; i<num_str; i++)
1429  {
1430  max_string_length=CMath::max(max_string_length,
1431  features[m_subset_stack->subset_idx_conversion(i)].slen);
1432  }
1433 }
1434 
1436 {
1437  int32_t l=str.slen;
1438  ST* s=SG_MALLOC(ST, l+1);
1439  memcpy(s, str.string, sizeof(ST)*l);
1440  s[l]='\0';
1441  return s;
1442 }
1443 
1444 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
1445 {
1446  ASSERT(features)
1447  ASSERT(num<get_num_vectors())
1448 
1449  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1450 
1451 
1452  features[real_num].slen=len ;
1453  features[real_num].string=string ;
1454 
1455  max_string_length=CMath::max(len, max_string_length);
1456 }
1457 
1458 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
1459 {
1460  int32_t nsym=get_num_symbols();
1461  int32_t slen=get_max_vector_length();
1462  int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
1463  float64_t* h= SG_MALLOC(float64_t, sz);
1464  memset(h, 0, sz);
1465 
1466  float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
1467  memset(h_normalizer, 0, slen*sizeof(float64_t));
1468  int32_t num_str=get_num_vectors();
1469  for (int32_t i=0; i<num_str; i++)
1470  {
1471  int32_t len;
1472  bool free_vec;
1473  ST* vec=get_feature_vector(i, len, free_vec);
1474  for (int32_t j=0; j<len; j++)
1475  {
1476  h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1477  h_normalizer[j]++;
1478  }
1479  free_feature_vector(vec, i, free_vec);
1480  }
1481 
1482  if (normalize)
1483  {
1484  for (int32_t i=0; i<slen; i++)
1485  {
1486  for (int32_t j=0; j<nsym; j++)
1487  {
1488  if (h_normalizer && h_normalizer[i])
1489  h[int64_t(i)*nsym+j]/=h_normalizer[i];
1490  }
1491  }
1492  }
1493  SG_FREE(h_normalizer);
1494 
1495  *hist=h;
1496  *rows=nsym;
1497  *cols=slen;
1498 }
1499 
1500 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
1501 {
1502  ASSERT(rows == get_num_symbols())
1503  cleanup();
1504  float64_t* randoms=SG_MALLOC(float64_t, cols);
1505  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
1506 
1507  for (int32_t i=0; i<num_vec; i++)
1508  {
1509  sf[i].string=SG_MALLOC(ST, cols);
1510  sf[i].slen=cols;
1511 
1512  SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
1513 
1514  for (int32_t j=0; j<cols; j++)
1515  {
1516  float64_t lik=hist[int64_t(j)*rows+0];
1517 
1518  int32_t c;
1519  for (c=0; c<rows-1; c++)
1520  {
1521  if (randoms[j]<=lik)
1522  break;
1523  lik+=hist[int64_t(j)*rows+c+1];
1524  }
1525  sf[i].string[j]=alphabet->remap_to_char(c);
1526  }
1527  }
1528  SG_FREE(randoms);
1529  set_features(sf, num_vec, cols);
1530 }
1531 
1532 /*
1533 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
1534 {
1535  int *s;
1536  int32_t nStr=get_num_vectors();
1537 
1538  int32_t nfeat=0;
1539  for (int32_t i=0; i < nStr; ++i)
1540  nfeat += get_vector_length[i] - d1 -d2;
1541  SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
1542  int32_t c=0;
1543  for (int32_t i=0; i < nStr; ++i)
1544  {
1545  int32_t len;
1546  bool free_vec;
1547  ST* S=get_feature_vector(vec_num, len, free_vec);
1548  free_feature_vector(vec, vec_num, free_vec);
1549  int32_t n=len - d1 - d2;
1550  s=S[i];
1551  for (int32_t j=0; j < n; ++j)
1552  {
1553  F[c].feature1=s[j];
1554  F[c].feature2=s[j+d1];
1555  F[c].feature3=s[j+d1+d2];
1556  F[c].group=i;
1557  c++;
1558  }
1559  }
1560  ASSERT(nfeat==c)
1561  return F;
1562 }
1563 
1564 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
1565 {
1566  int i, j;
1567  int n, nfeat;
1568  int *group;
1569  int *features;
1570  int *s;
1571  int c;
1572  SSKFeatures *F;
1573 
1574  nfeat=0;
1575  for (i=0; i < nStr; ++i)
1576  nfeat += len[i] - d1;
1577  group=(int *)SG_MALLOC(nfeat*sizeof(int));
1578  features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
1579  c=0;
1580  for (i=0; i < nStr; ++i)
1581  {
1582  n=len[i] - d1;
1583  s=S[i];
1584  for (j=0; j < n; ++j)
1585  {
1586  features[c]=s[j];
1587  features[c+nfeat]=s[j+d1];
1588  group[c]=i;
1589  c++;
1590  }
1591  }
1592  if (nfeat!=c)
1593  printf("Something is wrong...\n");
1594  F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
1595  (*F).features=features;
1596  (*F).group=group;
1597  (*F).n=nfeat;
1598  return F;
1599 }
1600 */
1601 
1603  SGVector<index_t> indices)
1604 {
1605  /* string list to create new CStringFeatures from */
1606  SGStringList<ST> list_copy(indices.vlen, max_string_length);
1607 
1608  /* copy all features */
1609  for (index_t i=0; i<indices.vlen; ++i)
1610  {
1611  /* index with respect to possible subset */
1612  index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
1613 
1614  /* copy string */
1615  SGString<ST> current_string=features[real_idx];
1616  SGString<ST> string_copy(current_string.slen);
1617  memcpy(string_copy.string, current_string.string,
1618  current_string.slen*sizeof(ST));
1619  list_copy.strings[i]=string_copy;
1620  }
1621 
1622  /* create copy instance */
1623  CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
1624 
1625  /* max string length may have changed */
1627 
1628  /* keep things from original features (otherwise assertions in x-val) */
1629  result->order=order;
1631 
1632  SG_REF(result);
1633 
1634  return result;
1635 }
1636 
1638 {
1639  /* max string length has to be updated */
1640  determine_maximum_string_length();
1641 }
1642 
1643 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
1644 {
1645  ASSERT(features && num<get_num_vectors())
1646 
1647  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1648 
1649  len=features[real_num].slen;
1650  if (len<=0)
1651  return NULL;
1652 
1653  ST* target=SG_MALLOC(ST, len);
1654  memcpy(target, features[real_num].string, len*sizeof(ST));
1655  return target;
1656 }
1657 
1658 template<class ST> void CStringFeatures<ST>::init()
1659 {
1660  set_generic<ST>();
1661 
1662  alphabet=NULL;
1663  num_vectors=0;
1664  features=NULL;
1665  single_string=NULL;
1666  length_of_single_string=0;
1667  max_string_length=0;
1668  order=0;
1669  preprocess_on_get=false;
1670  feature_cache=NULL;
1671  symbol_mask_table=NULL;
1672  symbol_mask_table_len=0;
1673  num_symbols=0.0;
1674  original_num_symbols=0;
1675 
1676  m_parameters->add((CSGObject**) &alphabet, "alphabet");
1677  m_parameters->add_vector(&features, &num_vectors, "features",
1678  "This contains the array of features.");
1679  m_parameters->add_vector(&single_string,
1680  &length_of_single_string,
1681  "single_string",
1682  "Created by sliding window.");
1683  m_parameters->add(&max_string_length, "max_string_length",
1684  "Length of longest string.");
1685  m_parameters->add(&num_symbols, "num_symbols",
1686  "Number of used symbols.");
1687  m_parameters->add(&original_num_symbols, "original_num_symbols",
1688  "Original number of used symbols.");
1689  m_parameters->add(&order, "order",
1690  "Order used in higher order mapping.");
1691  m_parameters->add(&preprocess_on_get, "preprocess_on_get",
1692  "Preprocess on-the-fly?");
1693 
1694  m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask_table", "Symbol mask table - using in higher order mapping");
1695 }
1696 
1702 {
1703  return F_BOOL;
1704 }
1705 
1711 {
1712  return F_CHAR;
1713 }
1714 
1720 {
1721  return F_BYTE;
1722 }
1723 
1729 {
1730  return F_SHORT;
1731 }
1732 
1738 {
1739  return F_WORD;
1740 }
1741 
1747 {
1748  return F_INT;
1749 }
1750 
1756 {
1757  return F_UINT;
1758 }
1759 
1765 {
1766  return F_LONG;
1767 }
1768 
1774 {
1775  return F_ULONG;
1776 }
1777 
1783 {
1784  return F_SHORTREAL;
1785 }
1786 
1792 {
1793  return F_DREAL;
1794 }
1795 
1801 {
1802  return F_LONGREAL;
1803 }
1804 
1805 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
1806 {
1807  return symbol;
1808 }
1810 {
1811  return symbol;
1812 }
1814 {
1815  return symbol;
1816 }
1818 {
1819  return symbol;
1820 }
1821 
1822 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
1823 {
1824  return false;
1825 }
1827 {
1828  return 0;
1829 }
1831 {
1832  return 0;
1833 }
1835 {
1836  return 0;
1837 }
1838 
1839 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
1840 {
1841  return symbol;
1842 }
1844 {
1845  return symbol;
1846 }
1848 {
1849  return symbol;
1850 }
1852 {
1853  return symbol;
1854 }
1855 
1856 #ifndef SUNOS
1857 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1858 {
1859  return false;
1860 }
1861 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1862 {
1863  return false;
1864 }
1865 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1866 {
1867  return false;
1868 }
1869 #endif
1870 
1871 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
1872 {
1873 }
1874 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
1875 {
1876 }
1877 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
1878 {
1879 }
1880 
1882 {
1883 }
1885 {
1886 }
1888 {
1889 }
1890 
1892 {
1893  return 0;
1894 }
1896 {
1897  return 0;
1898 }
1900 {
1901  return 0;
1902 }
1903 
1904 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
1905 {
1906 }
1907 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
1908 {
1909 }
1910 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
1911 {
1912 }
1913 #define LOAD(f_load, sg_type) \
1914 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1915 { \
1916  SG_INFO("loading...\n") \
1917  \
1918  SG_SET_LOCALE_C; \
1919  SGString<sg_type>* strs; \
1920  int32_t num_str; \
1921  int32_t max_len; \
1922  loader->f_load(strs, num_str, max_len); \
1923  set_features(strs, num_str, max_len); \
1924  SG_RESET_LOCALE; \
1925 }
1926 
1927 LOAD(get_string_list, bool)
1928 LOAD(get_string_list, char)
1929 LOAD(get_string_list, int8_t)
1930 LOAD(get_string_list, uint8_t)
1931 LOAD(get_string_list, int16_t)
1932 LOAD(get_string_list, uint16_t)
1933 LOAD(get_string_list, int32_t)
1934 LOAD(get_string_list, uint32_t)
1935 LOAD(get_string_list, int64_t)
1936 LOAD(get_string_list, uint64_t)
1937 LOAD(get_string_list, float32_t)
1938 LOAD(get_string_list, float64_t)
1939 LOAD(get_string_list, floatmax_t)
1940 #undef LOAD
1941 
1942 #define SAVE(f_write, sg_type) \
1943 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1944 { \
1945  if (m_subset_stack->has_subsets()) \
1946  SG_ERROR("save() is not possible on subset") \
1947  SG_SET_LOCALE_C; \
1948  ASSERT(writer) \
1949  writer->f_write(features, num_vectors); \
1950  SG_RESET_LOCALE; \
1951 }
1952 
1953 SAVE(set_string_list, bool)
1954 SAVE(set_string_list, char)
1955 SAVE(set_string_list, int8_t)
1956 SAVE(set_string_list, uint8_t)
1957 SAVE(set_string_list, int16_t)
1958 SAVE(set_string_list, uint16_t)
1959 SAVE(set_string_list, int32_t)
1960 SAVE(set_string_list, uint32_t)
1961 SAVE(set_string_list, int64_t)
1962 SAVE(set_string_list, uint64_t)
1963 SAVE(set_string_list, float32_t)
1964 SAVE(set_string_list, float64_t)
1965 SAVE(set_string_list, floatmax_t)
1966 #undef SAVE
1967 
1968 template <class ST> template <class CT>
1970  int32_t p_order, int32_t gap, bool rev)
1971 {
1972  remove_all_subsets();
1973  ASSERT(sf)
1974 
1975  CAlphabet* alpha=sf->get_alphabet();
1976  ASSERT(alpha->get_num_symbols_in_histogram() > 0)
1977 
1978  this->order=p_order;
1979  cleanup();
1980 
1981  num_vectors=sf->get_num_vectors();
1982  ASSERT(num_vectors>0)
1983  max_string_length=sf->get_max_vector_length()-start;
1984  features=SG_MALLOC(SGString<ST>, num_vectors);
1985 
1986  SG_DEBUG("%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
1987  alpha->get_num_symbols_in_histogram());
1988 
1989  for (int32_t i=0; i<num_vectors; i++)
1990  {
1991  int32_t len=-1;
1992  bool vfree;
1993  CT* c=sf->get_feature_vector(i, len, vfree);
1994  ASSERT(!vfree) // won't work when preprocessors are attached
1995 
1996  features[i].string=SG_MALLOC(ST, len);
1997  features[i].slen=len;
1998 
1999  ST* str=features[i].string;
2000  for (int32_t j=0; j<len; j++)
2001  str[j]=(ST) alpha->remap_to_bin(c[j]);
2002  }
2003 
2004  original_num_symbols=alpha->get_num_symbols();
2005  int32_t max_val=alpha->get_num_bits();
2006 
2007  SG_UNREF(alpha);
2008 
2009  if (p_order>1)
2010  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
2011  else
2012  num_symbols=original_num_symbols;
2013  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
2014 
2015  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
2016  {
2017  SG_ERROR("symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
2018  return false;
2019  }
2020 
2021  SG_DEBUG("translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST))
2022  for (int32_t line=0; line<num_vectors; line++)
2023  {
2024  int32_t len=0;
2025  bool vfree;
2026  ST* fv=get_feature_vector(line, len, vfree);
2027  ASSERT(!vfree) // won't work when preprocessors are attached
2028 
2029  if (rev)
2030  CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2031  else
2032  CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2033 
2034  /* fix the length of the string -- hacky */
2035  features[line].slen-=start+gap ;
2036  if (features[line].slen<0)
2037  features[line].slen=0 ;
2038  }
2039 
2040  compute_symbol_mask_table(max_val);
2041 
2042  return true;
2043 }
2044 
2045 template class CStringFeatures<bool>;
2046 template class CStringFeatures<char>;
2047 template class CStringFeatures<int8_t>;
2048 template class CStringFeatures<uint8_t>;
2049 template class CStringFeatures<int16_t>;
2050 template class CStringFeatures<uint16_t>;
2051 template class CStringFeatures<int32_t>;
2052 template class CStringFeatures<uint32_t>;
2053 template class CStringFeatures<int64_t>;
2054 template class CStringFeatures<uint64_t>;
2055 template class CStringFeatures<float32_t>;
2056 template class CStringFeatures<float64_t>;
2057 template class CStringFeatures<floatmax_t>;
2058 
2059 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2060 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2061 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2062 
2063 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2064 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2065 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2066 }

SHOGUN Machine Learning Toolbox - Documentation