SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringFeatures.cpp
Go to the documentation of this file.
5 #include <shogun/io/SGIO.h>
8 
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <dirent.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <unistd.h>
15 
16 
17 namespace shogun
18 {
19 
21 {
22  init();
23  alphabet=new CAlphabet();
24 }
25 
27 {
28  init();
29 
30  alphabet=new CAlphabet(alpha);
34 }
35 
// Constructs string features from an SGStringList: runs init(), creates a new
// CAlphabet of type `alpha`, and passes the list's strings to set_features().
36 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
37 : CFeatures(0)
38 {
39  init();
40 
41  alphabet=new CAlphabet(alpha);
// NOTE(review): the embedded listing numbers jump from 41 to 45 here, so some
// statements of this constructor appear to be missing from this view.
45  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
46 }
47 
// Constructs string features from an SGStringList: runs init(), builds a new
// CAlphabet object from the given `alpha` pointer, and passes the list's
// strings to set_features().
48 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
49 : CFeatures(0)
50 {
51  init();
52 
53  alphabet=new CAlphabet(alpha);
// NOTE(review): the embedded listing numbers jump from 53 to 57 here, so some
// statements of this constructor appear to be missing from this view.
57  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
58 }
59 
61 : CFeatures(0)
62 {
63  init();
64 
65  ASSERT(alpha);
66  SG_REF(alpha);
67  alphabet=alpha;
70 }
71 
// Copy constructor: copies the scalar members of `orig` in the initializer
// list, shares orig's alphabet pointer, and deep-copies every string of
// orig.features as well as the 256-entry symbol mask table when present.
// Copying an object that holds a single_string is not implemented (ASSERT).
72 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
73 : CFeatures(orig), num_vectors(orig.num_vectors),
74  single_string(orig.single_string),
75  length_of_single_string(orig.length_of_single_string),
76  max_string_length(orig.max_string_length),
77  num_symbols(orig.num_symbols),
78  original_num_symbols(orig.original_num_symbols),
79  order(orig.order), preprocess_on_get(false),
80  feature_cache(NULL)
81 {
// NOTE(review): init() runs after the initializer list; whether it resets any
// of the members copied above cannot be told from this view - confirm.
82  init();
83 
84  ASSERT(orig.single_string == NULL); //not implemented
85 
86  alphabet=orig.alphabet;
// NOTE(review): listing line 87 is missing from this view.
88 
89  if (orig.features)
90  {
// NOTE(review): listing line 91 is missing from this view; `features` is
// written below, so its allocation presumably happened there - confirm.
92 
93  for (int32_t i=0; i<num_vectors; i++)
94  {
// each string gets its own buffer so the copy does not alias orig's memory
95  features[i].string=SG_MALLOC(ST, orig.features[i].slen);
96  features[i].slen=orig.features[i].slen;
97  memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
98  }
99  }
100 
101  if (orig.symbol_mask_table)
102  {
103  symbol_mask_table=SG_MALLOC(ST, 256);
104  for (int32_t i=0; i<256; i++)
// NOTE(review): listing line 105 (the loop body) is missing from this view.
106  }
107 
// NOTE(review): listing lines 108-109 are missing from this view.
110 }
111 
// Constructs string features by loading them from `loader`: members start out
// empty, a new CAlphabet of type `alpha` is created and referenced, and
// load(loader) is called.
112 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
113 : CFeatures(loader), num_vectors(0),
114  features(NULL), single_string(NULL), length_of_single_string(0),
115  max_string_length(0), order(0),
116  symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
117 {
118  init();
119 
120  alphabet=new CAlphabet(alpha);
121  SG_REF(alphabet);
// NOTE(review): the embedded listing numbers jump from 121 to 124 here, so
// some statements appear to be missing from this view.
124  load(loader);
125 }
126 
128 {
129  cleanup();
130 
131  SG_UNREF(alphabet);
132 }
133 
134 template<class ST> void CStringFeatures<ST>::cleanup()
135 {
136  remove_all_subsets();
137 
138  if (single_string)
139  {
140  SG_FREE(single_string);
141  single_string=NULL;
142  }
143  else
144  cleanup_feature_vectors(0, num_vectors-1);
145 
146  num_vectors=0;
147  SG_FREE(features);
148  SG_FREE(symbol_mask_table);
149  features=NULL;
150  symbol_mask_table=NULL;
151 
152  /* start with a fresh alphabet, but instead of emptying the histogram
153  * create a new object (to leave the alphabet object alone if it is used
154  * by others)
155  */
156  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
157  SG_UNREF(alphabet);
158  alphabet=alpha;
159  SG_REF(alphabet);
160 }
161 
162 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
163 {
164  ASSERT(num<get_num_vectors());
165 
166  if (features)
167  {
168  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
169  SG_FREE(features[real_num].string);
170  features[real_num].string=NULL;
171  features[real_num].slen=0;
172 
173  determine_maximum_string_length();
174  }
175 }
176 
177 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
178 {
179  if (features && get_num_vectors())
180  {
181  ASSERT(start<get_num_vectors());
182  ASSERT(stop<get_num_vectors());
183 
184  for (int32_t i=start; i<=stop; i++)
185  {
186  int32_t real_num=m_subset_stack->subset_idx_conversion(i);
187  SG_FREE(features[real_num].string);
188  features[real_num].string=NULL;
189  features[real_num].slen=0;
190  }
191  determine_maximum_string_length();
192  }
193 }
194 
195 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
196 
197 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
198 
200 {
201  SG_REF(alphabet);
202  return alphabet;
203 }
204 
205 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
206 {
207  return new CStringFeatures<ST>(*this);
208 }
209 
211 {
212  ASSERT(features);
213  if (num>=get_num_vectors())
214  {
215  SG_ERROR("Index out of bounds (number of strings %d, you "
216  "requested %d)\n", get_num_vectors(), num);
217  }
218 
219  int32_t l;
220  bool free_vec;
221  ST* vec=get_feature_vector(num, l, free_vec);
222  ST* dst=SG_MALLOC(ST, l);
223  memcpy(dst, vec, l*sizeof(ST));
224  free_feature_vector(vec, num, free_vec);
225  return SGVector<ST>(dst, l, true);
226 }
227 
228 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
229 {
230  ASSERT(features);
231 
232  if (m_subset_stack->has_subsets())
233  SG_ERROR("A subset is set, cannot set feature vector\n");
234 
235  if (num>=num_vectors)
236  {
237  SG_ERROR("Index out of bounds (number of strings %d, you "
238  "requested %d)\n", num_vectors, num);
239  }
240 
241  if (vector.vlen<=0)
242  SG_ERROR("String has zero or negative length\n");
243 
244  cleanup_feature_vector(num);
245  features[num].slen=vector.vlen;
246  features[num].string=SG_MALLOC(ST, vector.vlen);
247  memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
248 
249  determine_maximum_string_length();
250 }
251 
253 {
254  preprocess_on_get=true;
255 }
256 
258 {
259  preprocess_on_get=false;
260 }
261 
262 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
263 {
264  ASSERT(features);
265  if (num>=get_num_vectors())
266  SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors());
267 
268  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
269 
270  if (!preprocess_on_get)
271  {
272  dofree=false;
273  len=features[real_num].slen;
274  return features[real_num].string;
275  }
276  else
277  {
278  SG_DEBUG( "computing feature vector!\n") ;
279  ST* feat=compute_feature_vector(num, len);
280  dofree=true;
281 
282  if (get_num_preprocessors())
283  {
284  ST* tmp_feat_before=feat;
285 
286  for (int32_t i=0; i<get_num_preprocessors(); i++)
287  {
288  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
289  feat=p->apply_to_string(tmp_feat_before, len);
290  SG_UNREF(p);
291  SG_FREE(tmp_feat_before);
292  tmp_feat_before=feat;
293  }
294  }
295  // TODO: implement caching
296  return feat;
297  }
298 }
299 
301 {
302  int32_t num_feat;
303  int32_t num_vec;
304  SGString<ST>* s=get_transposed(num_feat, num_vec);
305  SGStringList<ST> string_list;
306  string_list.strings = s;
307  string_list.num_strings = num_vec;
308  string_list.max_string_length = num_feat;
309 
310  return new CStringFeatures<ST>(string_list, alphabet);
311 }
312 
313 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
314 {
315  num_feat=get_num_vectors();
316  num_vec=get_max_vector_length();
317  ASSERT(have_same_length());
318 
319  SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
320  int64_t(num_feat)*num_vec);
321 
322  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
323 
324  for (int32_t i=0; i<num_vec; i++)
325  {
326  sf[i].string=SG_MALLOC(ST, num_feat);
327  sf[i].slen=num_feat;
328  }
329 
330  for (int32_t i=0; i<num_feat; i++)
331  {
332  int32_t len=0;
333  bool free_vec=false;
334  ST* vec=get_feature_vector(i, len, free_vec);
335 
336  for (int32_t j=0; j<num_vec; j++)
337  sf[j].string[i]=vec[j];
338 
339  free_feature_vector(vec, i, free_vec);
340  }
341  return sf;
342 }
343 
344 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
345 {
346  if (num>=get_num_vectors())
347  {
348  SG_ERROR(
349  "Trying to access string[%d] but num_str=%d\n", num,
350  get_num_vectors());
351  }
352 
353  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
354 
355  if (feature_cache)
356  feature_cache->unlock_entry(real_num);
357 
358  if (dofree)
359  SG_FREE(feat_vec);
360 }
361 
362 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
363 {
364  if (num>=get_num_vectors())
365  {
366  SG_ERROR(
367  "Trying to access string[%d] but num_str=%d\n", num,
368  get_num_vectors());
369  }
370 
371  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
372 
373  if (feature_cache)
374  feature_cache->unlock_entry(real_num);
375 }
376 
377 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
378 {
379  ASSERT(vec_num<get_num_vectors());
380 
381  int32_t len;
382  bool free_vec;
383  ST* vec=get_feature_vector(vec_num, len, free_vec);
384  ASSERT(feat_num<len);
385  ST result=vec[feat_num];
386  free_feature_vector(vec, vec_num, free_vec);
387 
388  return result;
389 }
390 
391 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
392 {
393  ASSERT(vec_num<get_num_vectors());
394 
395  int32_t len;
396  bool free_vec;
397  ST* vec=get_feature_vector(vec_num, len, free_vec);
398  free_feature_vector(vec, vec_num, free_vec);
399  return len;
400 }
401 
402 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
403 {
404  return max_string_length;
405 }
406 
407 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
408 {
409  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
410 }
411 
412 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
413 
414 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
415 
416 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
417 
418 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
419 
420 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
421 {
422  ASSERT(symbol_mask_table);
423  return symbol_mask_table[mask] & symbol;
424 }
425 
426 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
427 {
428  ASSERT(alphabet);
429  return (offset << (amount*alphabet->get_num_bits()));
430 }
431 
432 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
433 {
434  ASSERT(alphabet);
435  return (symbol >> (amount*alphabet->get_num_bits()));
436 }
437 
438 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
439  EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
440 {
441  remove_all_subsets();
442 
443  size_t blocksize=1024*1024;
444  size_t required_blocksize=0;
445  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
446  uint8_t* overflow=NULL;
447  int32_t overflow_len=0;
448 
449  cleanup();
450 
451  CAlphabet* alpha=new CAlphabet(ascii_alphabet);
452  CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
453 
454  FILE* f=fopen(fname, "ro");
455 
456  if (f)
457  {
458  num_vectors=0;
459  max_string_length=0;
460 
461  SG_INFO("counting line numbers in file %s\n", fname);
462  size_t block_offs=0;
463  size_t old_block_offs=0;
464  fseek(f, 0, SEEK_END);
465  size_t fsize=ftell(f);
466  rewind(f);
467 
468  if (blocksize>fsize)
469  blocksize=fsize;
470 
471  SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
472 
473  size_t sz=blocksize;
474  while (sz == blocksize)
475  {
476  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
477  for (size_t i=0; i<sz; i++)
478  {
479  block_offs++;
480  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
481  {
482  num_vectors++;
483  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
484  old_block_offs=block_offs;
485  }
486  }
487  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
488  }
489 
490  SG_INFO("found %d strings\n", num_vectors);
491  SG_FREE(dummy);
492  blocksize=required_blocksize;
493  dummy=SG_MALLOC(uint8_t, blocksize);
494  overflow=SG_MALLOC(uint8_t, blocksize);
495  features=SG_MALLOC(SGString<ST>, num_vectors);
496 
497  rewind(f);
498  sz=blocksize;
499  int32_t lines=0;
500  while (sz == blocksize)
501  {
502  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
503 
504  size_t old_sz=0;
505  for (size_t i=0; i<sz; i++)
506  {
507  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
508  {
509  int32_t len=i-old_sz;
510  //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
511  max_string_length=CMath::max(max_string_length, len+overflow_len);
512 
513  features[lines].slen=len;
514  features[lines].string=SG_MALLOC(ST, len);
515 
516  if (remap_to_bin)
517  {
518  for (int32_t j=0; j<overflow_len; j++)
519  features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
520  for (int32_t j=0; j<len; j++)
521  features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
522  alpha->add_string_to_histogram(&dummy[old_sz], len);
523  alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
524  }
525  else
526  {
527  for (int32_t j=0; j<overflow_len; j++)
528  features[lines].string[j]=overflow[j];
529  for (int32_t j=0; j<len; j++)
530  features[lines].string[j+overflow_len]=dummy[old_sz+j];
531  alpha->add_string_to_histogram(&dummy[old_sz], len);
532  alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
533  }
534 
535  // clear overflow
536  overflow_len=0;
537 
538  //CMath::display_vector(features[lines].string, len);
539  old_sz=i+1;
540  lines++;
541  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
542  }
543  }
544  for (size_t i=old_sz; i<sz; i++)
545  overflow[i-old_sz]=dummy[i];
546 
547  overflow_len=sz-old_sz;
548  }
549 
550  if (alpha->check_alphabet_size() && alpha->check_alphabet())
551  {
552  SG_INFO("file successfully read\n");
553  SG_INFO("max_string_length=%d\n", max_string_length);
554  SG_INFO("num_strings=%d\n", num_vectors);
555  }
556  fclose(f);
557  }
558 
559  SG_FREE(dummy);
560 
561  SG_UNREF(alphabet);
562 
563  if (remap_to_bin)
564  alphabet=alpha_bin;
565  else
566  alphabet=alpha;
567  SG_REF(alphabet);
568  num_symbols=alphabet->get_num_symbols();
569 }
570 
// Loads strings from a FASTA file through a memory mapped view of it.
// A first scan counts the lines starting with '>' (one per entry); a second
// scan reads, per entry, the identifier line and then collects the following
// lines until the next '>' line or the end of the file. The alphabet is
// replaced by a DNA alphabet and the result is installed with set_features().
//   fname          - file to read
//   ignore_invalid - if true, symbols rejected by alphabet->is_valid() are
//                    stored as 'A'
// Returns the result of set_features().
571 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
572 {
573  remove_all_subsets();
574 
575  int32_t i=0;
576  uint64_t len=0;
577  uint64_t offs=0;
578  int32_t num=0;
579  int32_t max_len=0;
580 
581  CMemoryMappedFile<char> f(fname);
582 
// pass 1: count the entries (lines whose first character is '>')
583  while (true)
584  {
585  char* s=f.get_line(len, offs);
586  if (!s)
587  break;
588 
589  if (len>0 && s[0]=='>')
590  num++;
591  }
592 
593  if (num==0)
594  SG_ERROR("No fasta hunks (lines starting with '>') found\n");
595 
596  cleanup();
597  SG_UNREF(alphabet);
598  alphabet=new CAlphabet(DNA);
599  num_symbols=alphabet->get_num_symbols();
600 
601  SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
// restart reading from the beginning of the file for pass 2
602  offs=0;
603 
604  for (i=0;i<num; i++)
605  {
606  uint64_t id_len=0;
607  char* id=f.get_line(id_len, offs);
608 
// `fasta` marks the start of the entry's sequence data; fasta_len accumulates
// the byte count of the lines consumed so far including their '\n'
609  char* fasta=f.get_line(len, offs);
610  char* s=fasta;
611  int32_t fasta_len=0;
612  int32_t spanned_lines=0;
613 
614  while (true)
615  {
616  if (!s || len==0)
617  SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
618 
619  if (s[0]=='>' || offs==f.get_size())
620  {
621  offs-=len+1; // seek to beginning
// NOTE(review): offs was just decreased by len+1, so this comparison with
// f.get_size() looks like it can no longer match the end-of-file case tested
// above; the semantics of get_line()/offs are not visible here - confirm.
622  if (offs==f.get_size())
623  {
624  SG_DEBUG("at EOF\n");
625  fasta_len+=len;
626  }
627 
// the newline characters (one per spanned line) are not stored
628  len=fasta_len-spanned_lines;
629  strings[i].string=SG_MALLOC(ST, len);
630  strings[i].slen=len;
631 
632  ST* str=strings[i].string;
633  int32_t idx=0;
634  SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
635 
// copy the sequence data, skipping the embedded newlines
636  for (int32_t j=0; j<fasta_len; j++)
637  {
638  if (fasta[j]=='\n')
639  continue;
640 
641  ST c=(ST) fasta[j];
642 
643  if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
644  c=(ST) 'A';
645 
// guards against writing past the buffer allocated above
646  if (uint64_t(idx)>=len)
647  SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
648  str[idx++]=c;
649  }
650  max_len=CMath::max(max_len, strings[i].slen);
651 
652 
653  break;
654  }
655 
656  spanned_lines++;
657  fasta_len+=len+1; // including '\n'
658  s=f.get_line(len, offs);
659  }
660  }
661  return set_features(strings, num, max_len);
662 }
663 
664 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
665  bool ignore_invalid, bool bitremap_in_single_string)
666 {
667  remove_all_subsets();
668 
669  CMemoryMappedFile<char> f(fname);
670 
671  int32_t i=0;
672  uint64_t len=0;
673  uint64_t offs=0;
674 
675  int32_t num=f.get_num_lines();
676  int32_t max_len=0;
677 
678  if (num%4)
679  SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
680  num/=4;
681 
682  cleanup();
683  SG_UNREF(alphabet);
684  alphabet=new CAlphabet(DNA);
685 
686  SGString<ST>* strings;
687 
688  ST* str=NULL;
689  if (bitremap_in_single_string)
690  {
691  strings=SG_MALLOC(SGString<ST>, 1);
692  strings[0].string=SG_MALLOC(ST, num);
693  strings[0].slen=num;
694  f.get_line(len, offs);
695  f.get_line(len, offs);
696  order=len;
697  max_len=num;
698  offs=0;
699  original_num_symbols=alphabet->get_num_symbols();
700  str=SG_MALLOC(ST, len);
701  }
702  else
703  strings=SG_MALLOC(SGString<ST>, num);
704 
705  for (i=0;i<num; i++)
706  {
707  if (!f.get_line(len, offs))
708  SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
709 
710  char* s=f.get_line(len, offs);
711  if (!s || len==0)
712  SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
713 
714  if (bitremap_in_single_string)
715  {
716  if (len!=(uint64_t) order)
717  SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
718  for (int32_t j=0; j<order; j++)
719  str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
720 
721  strings[0].string[i]=embed_word(str, order);
722  }
723  else
724  {
725  strings[i].string=SG_MALLOC(ST, len);
726  strings[i].slen=len;
727  str=strings[i].string;
728 
729  if (ignore_invalid)
730  {
731  for (uint64_t j=0; j<len; j++)
732  {
733  if (alphabet->is_valid((uint8_t) s[j]))
734  str[j]= (ST) s[j];
735  else
736  str[j]= (ST) 'A';
737  }
738  }
739  else
740  {
741  for (uint64_t j=0; j<len; j++)
742  str[j]= (ST) s[j];
743  }
744  max_len=CMath::max(max_len, (int32_t) len);
745  }
746 
747 
748  if (!f.get_line(len, offs))
749  SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
750 
751  if (!f.get_line(len, offs))
752  SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
753  }
754 
755  if (bitremap_in_single_string)
756  num=1;
757 
758  num_vectors=num;
759  max_string_length=max_len;
760  features=strings;
761 
762  return true;
763 }
764 
765 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
766 {
767  remove_all_subsets();
768 
769  struct dirent **namelist;
770  int32_t n;
771 
772  SGIO::set_dirname(dirname);
773 
774  SG_DEBUG("dirname '%s'\n", dirname);
775 
776  n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
777  if (n <= 0)
778  {
779  SG_ERROR("error calling scandir - no files found\n");
780  return false;
781  }
782  else
783  {
784  SGString<ST>* strings=NULL;
785 
786  int32_t num=0;
787  int32_t max_len=-1;
788 
789  //usually n==num_vec, but it might not in race conditions
790  //(file perms modified, file erased)
791  strings=SG_MALLOC(SGString<ST>, n);
792 
793  for (int32_t i=0; i<n; i++)
794  {
795  char* fname=SGIO::concat_filename(namelist[i]->d_name);
796 
797  struct stat s;
798  off_t filesize=0;
799 
800  if (!stat(fname, &s) && s.st_size>0)
801  {
802  filesize=s.st_size/sizeof(ST);
803 
804  FILE* f=fopen(fname, "ro");
805  if (f)
806  {
807  ST* str=SG_MALLOC(ST, filesize);
808  SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
809  if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
810  SG_ERROR("failed to read file\n");
811  strings[num].string=str;
812  strings[num].slen=filesize;
813  max_len=CMath::max(max_len, strings[num].slen);
814 
815  num++;
816  fclose(f);
817  }
818  }
819  else
820  SG_ERROR("empty or non readable file \'%s\'\n", fname);
821 
822  SG_FREE(namelist[i]);
823  }
824  SG_FREE(namelist);
825 
826  if (num>0 && strings)
827  {
828  set_features(strings, num, max_len);
829  return true;
830  }
831  }
832  return false;
833 }
834 
836 {
837  set_features(feats.strings, feats.num_strings, feats.max_string_length);
838 }
839 
840 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
841 {
842  if (m_subset_stack->has_subsets())
843  SG_ERROR("Cannot call set_features() with subset.\n");
844 
845  if (p_features)
846  {
847  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
848 
849  //compute histogram for char/byte
850  for (int32_t i=0; i<p_num_vectors; i++)
851  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
852 
853  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
854  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
855 
856  if (alpha->check_alphabet_size() && alpha->check_alphabet())
857  {
858  cleanup();
859  SG_UNREF(alphabet);
860 
861  alphabet=alpha;
862  SG_REF(alphabet);
863 
864  features=p_features;
865  num_vectors=p_num_vectors;
866  max_string_length=p_max_string_length;
867 
868  return true;
869  }
870  else
871  SG_UNREF(alpha);
872  }
873 
874  return false;
875 }
876 
878 {
879  ASSERT(sf);
880 
881  if (m_subset_stack->has_subsets())
882  SG_ERROR("Cannot call set_features() with subset.\n");
883 
884  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
885 
886  index_t sf_num_str=sf->get_num_vectors();
887  for (int32_t i=0; i<sf_num_str; i++)
888  {
889  int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
890  int32_t length=sf->features[real_i].slen;
891  new_features[i].string=SG_MALLOC(ST, length);
892  memcpy(new_features[i].string, sf->features[real_i].string, length);
893  new_features[i].slen=length;
894  }
895  return append_features(new_features, sf_num_str,
896  sf->max_string_length);
897 }
898 
899 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
900 {
901  if (m_subset_stack->has_subsets())
902  SG_ERROR("Cannot call set_features() with subset.\n");
903 
904  if (!features)
905  return set_features(p_features, p_num_vectors, p_max_string_length);
906 
907  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
908 
909  //compute histogram for char/byte
910  for (int32_t i=0; i<p_num_vectors; i++)
911  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
912 
913  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
914  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
915 
916  if (alpha->check_alphabet_size() && alpha->check_alphabet())
917  {
918  SG_UNREF(alpha);
919  for (int32_t i=0; i<p_num_vectors; i++)
920  alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
921 
922  int32_t old_num_vectors=num_vectors;
923  num_vectors=old_num_vectors+p_num_vectors;
924  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
925 
926  for (int32_t i=0; i<num_vectors; i++)
927  {
928  if (i<old_num_vectors)
929  {
930  new_features[i].string=features[i].string;
931  new_features[i].slen=features[i].slen;
932  }
933  else
934  {
935  new_features[i].string=p_features[i-old_num_vectors].string;
936  new_features[i].slen=p_features[i-old_num_vectors].slen;
937  }
938  }
939  SG_FREE(features);
940  SG_FREE(p_features); // free now obsolete features
941 
942  this->features=new_features;
943  max_string_length=CMath::max(max_string_length, p_max_string_length);
944 
945  return true;
946  }
947  SG_UNREF(alpha);
948 
949  return false;
950 }
951 
953 {
954  SGStringList<ST> sl;
955 
956  sl.strings=get_features(sl.num_strings, sl.max_string_length);
957  return sl;
958 }
959 
960 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
961 {
962  if (m_subset_stack->has_subsets())
963  SG_ERROR("get features() is not possible on subset");
964 
965  num_str=num_vectors;
966  max_str_len=max_string_length;
967  return features;
968 }
969 
970 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
971 {
972  ASSERT(num_vectors>0);
973 
974  num_str=get_num_vectors();
975  max_str_len=max_string_length;
976  SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
977 
978  for (int32_t i=0; i<num_str; i++)
979  {
980  int32_t len;
981  bool free_vec;
982  ST* vec=get_feature_vector(i, len, free_vec);
983  new_feat[i].string=SG_MALLOC(ST, len);
984  new_feat[i].slen=len;
985  memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
986  free_feature_vector(vec, i, free_vec);
987  }
988 
989  return new_feat;
990 }
991 
992 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
993 {
994  int32_t num_vec;
995  int32_t max_str_len;
996  *dst=copy_features(num_vec, max_str_len);
997  *num_str=num_vec;
998 }
999 
1000 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
1001 {
1002  remove_all_subsets();
1003 
1004  FILE* file=NULL;
1005 
1006  if (!(file=fopen(src, "r")))
1007  return false;
1008  cleanup();
1009 
1010  // header shogun v0
1011  char id[4];
1012  if (fread(&id[0], sizeof(char), 1, file)!=1)
1013  SG_ERROR("failed to read header");
1014  ASSERT(id[0]=='S');
1015  if (fread(&id[1], sizeof(char), 1, file)!=1)
1016  SG_ERROR("failed to read header");
1017  ASSERT(id[1]=='G');
1018  if (fread(&id[2], sizeof(char), 1, file)!=1)
1019  SG_ERROR("failed to read header");
1020  ASSERT(id[2]=='V');
1021  if (fread(&id[3], sizeof(char), 1, file)!=1)
1022  SG_ERROR("failed to read header");
1023  ASSERT(id[3]=='0');
1024 
1025  //compression type
1026  uint8_t c;
1027  if (fread(&c, sizeof(uint8_t), 1, file)!=1)
1028  SG_ERROR("failed to read compression type");
1029  CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
1030  //alphabet
1031  uint8_t a;
1032  delete alphabet;
1033  if (fread(&a, sizeof(uint8_t), 1, file)!=1)
1034  SG_ERROR("failed to read compression alphabet");
1035  alphabet=new CAlphabet((EAlphabet) a);
1036  // number of vectors
1037  if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
1038  SG_ERROR("failed to read compression number of vectors");
1039  ASSERT(num_vectors>0);
1040  // maximum string length
1041  if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
1042  SG_ERROR("failed to read maximum string length");
1043  ASSERT(max_string_length>0);
1044 
1045  features=SG_MALLOC(SGString<ST>, num_vectors);
1046 
1047  // vectors
1048  for (int32_t i=0; i<num_vectors; i++)
1049  {
1050  // vector len compressed
1051  int32_t len_compressed;
1052  if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
1053  SG_ERROR("failed to read vector length compressed");
1054  // vector len uncompressed
1055  int32_t len_uncompressed;
1056  if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
1057  SG_ERROR("failed to read vector length uncompressed");
1058 
1059  // vector raw data
1060  if (decompress)
1061  {
1062  features[i].string=SG_MALLOC(ST, len_uncompressed);
1063  features[i].slen=len_uncompressed;
1064  uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1065  if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
1066  SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
1067  uint64_t uncompressed_size=len_uncompressed;
1068  uncompressed_size*=sizeof(ST);
1069  compressor->decompress(compressed, len_compressed,
1070  (uint8_t*) features[i].string, uncompressed_size);
1071  SG_FREE(compressed);
1072  ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
1073  }
1074  else
1075  {
1076  int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
1077  features[i].string=SG_MALLOC(ST, len_compressed+offs);
1078  features[i].slen=len_compressed+offs;
1079  int32_t* feat32ptr=((int32_t*) (features[i].string));
1080  memset(features[i].string, 0, offs*sizeof(ST));
1081  feat32ptr[0]=(int32_t) len_compressed;
1082  feat32ptr[1]=(int32_t) len_uncompressed;
1083  uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
1084  if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1085  SG_ERROR("failed to read uncompressed data");
1086  }
1087  }
1088 
1089  delete compressor;
1090  fclose(file);
1091 
1092  return false;
1093 }
1094 
1095 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
1096 {
1097  if (m_subset_stack->has_subsets())
1098  SG_ERROR("save_compressed() is not possible on subset");
1099 
1100  FILE* file=NULL;
1101 
1102  if (!(file=fopen(dest, "wb")))
1103  return false;
1104 
1105  CCompressor* compressor= new CCompressor(compression);
1106 
1107  // header shogun v0
1108  const char* id="SGV0";
1109  fwrite(&id[0], sizeof(char), 1, file);
1110  fwrite(&id[1], sizeof(char), 1, file);
1111  fwrite(&id[2], sizeof(char), 1, file);
1112  fwrite(&id[3], sizeof(char), 1, file);
1113 
1114  //compression type
1115  uint8_t c=(uint8_t) compression;
1116  fwrite(&c, sizeof(uint8_t), 1, file);
1117  //alphabet
1118  uint8_t a=(uint8_t) alphabet->get_alphabet();
1119  fwrite(&a, sizeof(uint8_t), 1, file);
1120  // number of vectors
1121  fwrite(&num_vectors, sizeof(int32_t), 1, file);
1122  // maximum string length
1123  fwrite(&max_string_length, sizeof(int32_t), 1, file);
1124 
1125  // vectors
1126  for (int32_t i=0; i<num_vectors; i++)
1127  {
1128  int32_t len=-1;
1129  bool vfree;
1130  ST* vec=get_feature_vector(i, len, vfree);
1131 
1132  uint8_t* compressed=NULL;
1133  uint64_t compressed_size=0;
1134 
1135  compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
1136  compressed, compressed_size, level);
1137 
1138  int32_t len_compressed=(int32_t) compressed_size;
1139  // vector len compressed in bytes
1140  fwrite(&len_compressed, sizeof(int32_t), 1, file);
1141  // vector len uncompressed in number of elements of type ST
1142  fwrite(&len, sizeof(int32_t), 1, file);
1143  // vector raw data
1144  fwrite(compressed, compressed_size, 1, file);
1145  SG_FREE(compressed);
1146 
1147  free_feature_vector(vec, i, vfree);
1148  }
1149 
1150  delete compressor;
1151  fclose(file);
1152  return true;
1153 }
1154 
1155 template<class ST> int32_t CStringFeatures<ST>::get_size() const { return sizeof(ST); }
1156 
1157 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
1158 {
1159  SG_DEBUG( "force: %d\n", force_preprocessing);
1160 
1161  for (int32_t i=0; i<get_num_preprocessors(); i++)
1162  {
1163  if ( (!is_preprocessed(i) || force_preprocessing) )
1164  {
1165  set_preprocessed(i);
1166  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
1167  SG_INFO( "preprocessing using preproc %s\n", p->get_name());
1168 
1169  if (!p->apply_to_string_features(this))
1170  {
1171  SG_UNREF(p);
1172  return false;
1173  }
1174  else
1175  SG_UNREF(p);
1176  }
1177  }
1178  return true;
1179 }
1180 
1181 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
1182 {
1183  if (m_subset_stack->has_subsets())
1185 
1186  ASSERT(step_size>0);
1187  ASSERT(window_size>0);
1188  ASSERT(num_vectors==1 || single_string);
1189  ASSERT(max_string_length>=window_size ||
1190  (single_string && length_of_single_string>=window_size));
1191 
1192  //in case we are dealing with a single remapped string
1193  //allow remapping
1194  if (single_string)
1195  num_vectors= (length_of_single_string-window_size)/step_size + 1;
1196  else if (num_vectors==1)
1197  {
1198  num_vectors= (max_string_length-window_size)/step_size + 1;
1199  length_of_single_string=max_string_length;
1200  }
1201 
1202  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1203  int32_t offs=0;
1204  for (int32_t i=0; i<num_vectors; i++)
1205  {
1206  f[i].string=&features[0].string[offs+skip];
1207  f[i].slen=window_size-skip;
1208  offs+=step_size;
1209  }
1210  single_string=features[0].string;
1211  SG_FREE(features);
1212  features=f;
1213  max_string_length=window_size-skip;
1214 
1215  return num_vectors;
1216 }
1217 
1218 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
1219  int32_t skip)
1220 {
1221  if (m_subset_stack->has_subsets())
1223 
1224  ASSERT(positions);
1225  ASSERT(window_size>0);
1226  ASSERT(num_vectors==1 || single_string);
1227  ASSERT(max_string_length>=window_size ||
1228  (single_string && length_of_single_string>=window_size));
1229 
1230  num_vectors= positions->get_num_elements();
1231  ASSERT(num_vectors>0);
1232 
1233  int32_t len;
1234 
1235  //in case we are dealing with a single remapped string
1236  //allow remapping
1237  if (single_string)
1238  len=length_of_single_string;
1239  else
1240  {
1241  single_string=features[0].string;
1242  len=max_string_length;
1243  length_of_single_string=max_string_length;
1244  }
1245 
1246  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1247  for (int32_t i=0; i<num_vectors; i++)
1248  {
1249  int32_t p=positions->get_element(i);
1250 
1251  if (p>=0 && p<=len-window_size)
1252  {
1253  f[i].string=&features[0].string[p+skip];
1254  f[i].slen=window_size-skip;
1255  }
1256  else
1257  {
1258  num_vectors=1;
1259  max_string_length=len;
1260  features[0].slen=len;
1261  single_string=NULL;
1262  SG_FREE(f);
1263  SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1264  window_size, i, p, len);
1265  return -1;
1266  }
1267  }
1268 
1269  SG_FREE(features);
1270  features=f;
1271  max_string_length=window_size-skip;
1272 
1273  return num_vectors;
1274 }
1275 
1276 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1277 {
1278  return obtain_from_char_features(sf, start, p_order, gap, rev);
1279 }
1280 
1281 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
1282 {
1283  if (len!=-1)
1284  {
1285  if (len!=max_string_length)
1286  return false;
1287  }
1288  len=max_string_length;
1289 
1290  index_t num_str=get_num_vectors();
1291  for (int32_t i=0; i<num_str; i++)
1292  {
1293  if (get_vector_length(i)!=len)
1294  return false;
1295  }
1296 
1297  return true;
1298 }
1299 
1300 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
1301 {
1302  if (m_subset_stack->has_subsets())
1304 
1305  ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
1306 
1307  order=p_order;
1308  original_num_symbols=alphabet->get_num_symbols();
1309  int32_t max_val=alphabet->get_num_bits();
1310 
1311  if (p_order>1)
1312  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1313  else
1314  num_symbols=original_num_symbols;
1315 
1316  SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
1317 
1318  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1319  SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
1320 
1321  ST mask=0;
1322  for (int32_t i=0; i<p_order*max_val; i++)
1323  mask= (mask<<1) | ((ST) 1);
1324 
1325  for (int32_t i=0; i<num_vectors; i++)
1326  {
1327  int32_t len=features[i].slen;
1328 
1329  if (len < p_order)
1330  SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
1331 
1332  ST* str=features[i].string;
1333 
1334  // convert first word
1335  for (int32_t j=0; j<p_order; j++)
1336  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1337  str[0]=embed_word(&str[0], p_order);
1338 
1339  // convert the rest
1340  int32_t idx=0;
1341  for (int32_t j=p_order; j<len; j++)
1342  {
1343  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1344  str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1345  idx++;
1346  }
1347 
1348  features[i].slen=len-p_order+1;
1349  }
1350 
1351  compute_symbol_mask_table(max_val);
1352 }
1353 
1354 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
1355 {
1356  if (m_subset_stack->has_subsets())
1358 
1359  SG_FREE(symbol_mask_table);
1360  symbol_mask_table=SG_MALLOC(ST, 256);
1361 
1362  uint64_t mask=0;
1363  for (int32_t i=0; i< (int64_t) max_val; i++)
1364  mask=(mask<<1) | 1;
1365 
1366  for (int32_t i=0; i<256; i++)
1367  {
1368  uint8_t bits=(uint8_t) i;
1369  symbol_mask_table[i]=0;
1370 
1371  for (int32_t j=0; j<8; j++)
1372  {
1373  if (bits & 1)
1374  symbol_mask_table[i]|=mask<<(max_val*j);
1375 
1376  bits>>=1;
1377  }
1378  }
1379 }
1380 
1381 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
1382 {
1383  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1384 
1385  ST mask=0;
1386  for (uint32_t i=0; i<nbits; i++)
1387  mask=(mask<<1) | (ST) 1;
1388 
1389  for (int32_t i=0; i<len; i++)
1390  {
1391  ST w=(word & mask);
1392  seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1393  word>>=nbits;
1394  }
1395 }
1396 
1397 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
1398 {
1399  ST value=(ST) 0;
1400  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1401  for (int32_t i=0; i<len; i++)
1402  {
1403  value<<=nbits;
1404  value|=seq[i];
1405  }
1406 
1407  return value;
1408 }
1409 
1411 {
1412  max_string_length=0;
1413  index_t num_str=get_num_vectors();
1414 
1415  for (int32_t i=0; i<num_str; i++)
1416  {
1417  max_string_length=CMath::max(max_string_length,
1418  features[m_subset_stack->subset_idx_conversion(i)].slen);
1419  }
1420 }
1421 
// NOTE(review): the declaration line of this body (listing line 1422) is
// missing from this listing; from the body it takes a string `str` with
// .slen/.string members and returns an ST* - confirm the exact signature.
1423 {
1424  int32_t l=str.slen;
1425  ST* s=SG_MALLOC(ST, l+1); // one extra element for the terminator
1426  memcpy(s, str.string, sizeof(ST)*l);
1427  s[l]='\0'; // terminate the copy
1428  return s; // buffer is not freed here
1429 }
1430 
1431 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
1432 {
1433  ASSERT(features);
1434  ASSERT(num<get_num_vectors());
1435 
1436  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1437 
1438 
1439  features[real_num].slen=len ;
1440  features[real_num].string=string ;
1441 
1442  max_string_length=CMath::max(len, max_string_length);
1443 }
1444 
1445 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
1446 {
1447  int32_t nsym=get_num_symbols();
1448  int32_t slen=get_max_vector_length();
1449  int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
1450  float64_t* h= SG_MALLOC(float64_t, sz);
1451  memset(h, 0, sz);
1452 
1453  float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
1454  memset(h_normalizer, 0, slen*sizeof(float64_t));
1455  int32_t num_str=get_num_vectors();
1456  for (int32_t i=0; i<num_str; i++)
1457  {
1458  int32_t len;
1459  bool free_vec;
1460  ST* vec=get_feature_vector(i, len, free_vec);
1461  for (int32_t j=0; j<len; j++)
1462  {
1463  h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1464  h_normalizer[j]++;
1465  }
1466  free_feature_vector(vec, i, free_vec);
1467  }
1468 
1469  if (normalize)
1470  {
1471  for (int32_t i=0; i<slen; i++)
1472  {
1473  for (int32_t j=0; j<nsym; j++)
1474  {
1475  if (h_normalizer && h_normalizer[i])
1476  h[int64_t(i)*nsym+j]/=h_normalizer[i];
1477  }
1478  }
1479  }
1480  SG_FREE(h_normalizer);
1481 
1482  *hist=h;
1483  *rows=nsym;
1484  *cols=slen;
1485 }
1486 
1487 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
1488 {
1489  ASSERT(rows == get_num_symbols());
1490  cleanup();
1491  float64_t* randoms=SG_MALLOC(float64_t, cols);
1492  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
1493 
1494  for (int32_t i=0; i<num_vec; i++)
1495  {
1496  sf[i].string=SG_MALLOC(ST, cols);
1497  sf[i].slen=cols;
1498 
1499  SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
1500 
1501  for (int32_t j=0; j<cols; j++)
1502  {
1503  float64_t lik=hist[int64_t(j)*rows+0];
1504 
1505  int32_t c;
1506  for (c=0; c<rows-1; c++)
1507  {
1508  if (randoms[j]<=lik)
1509  break;
1510  lik+=hist[int64_t(j)*rows+c+1];
1511  }
1512  sf[i].string[j]=alphabet->remap_to_char(c);
1513  }
1514  }
1515  SG_FREE(randoms);
1516  set_features(sf, num_vec, cols);
1517 }
1518 
1519 /*
1520 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
1521 {
1522  int *s;
1523  int32_t nStr=get_num_vectors();
1524 
1525  int32_t nfeat=0;
1526  for (int32_t i=0; i < nStr; ++i)
1527  nfeat += get_vector_length[i] - d1 -d2;
1528  SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
1529  int32_t c=0;
1530  for (int32_t i=0; i < nStr; ++i)
1531  {
1532  int32_t len;
1533  bool free_vec;
1534  ST* S=get_feature_vector(vec_num, len, free_vec);
1535  free_feature_vector(vec, vec_num, free_vec);
1536  int32_t n=len - d1 - d2;
1537  s=S[i];
1538  for (int32_t j=0; j < n; ++j)
1539  {
1540  F[c].feature1=s[j];
1541  F[c].feature2=s[j+d1];
1542  F[c].feature3=s[j+d1+d2];
1543  F[c].group=i;
1544  c++;
1545  }
1546  }
1547  ASSERT(nfeat==c);
1548  return F;
1549 }
1550 
1551 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
1552 {
1553  int i, j;
1554  int n, nfeat;
1555  int *group;
1556  int *features;
1557  int *s;
1558  int c;
1559  SSKFeatures *F;
1560 
1561  nfeat=0;
1562  for (i=0; i < nStr; ++i)
1563  nfeat += len[i] - d1;
1564  group=(int *)SG_MALLOC(nfeat*sizeof(int));
1565  features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
1566  c=0;
1567  for (i=0; i < nStr; ++i)
1568  {
1569  n=len[i] - d1;
1570  s=S[i];
1571  for (j=0; j < n; ++j)
1572  {
1573  features[c]=s[j];
1574  features[c+nfeat]=s[j+d1];
1575  group[c]=i;
1576  c++;
1577  }
1578  }
1579  if (nfeat!=c)
1580  printf("Something is wrong...\n");
1581  F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
1582  (*F).features=features;
1583  (*F).group=group;
1584  (*F).n=nfeat;
1585  return F;
1586 }
1587 */
1588 
// NOTE(review): the declaration line of this body (listing line 1589) is
// missing from this listing; the body reads `indices.vlen` and
// `indices.vector[i]` and returns a newly created CStringFeatures -
// confirm the exact signature.
1590 {
1591  /* string list to create new CStringFeatures from */
1592  SGStringList<ST> list_copy(indices.vlen, max_string_length);
1593 
1594  /* copy all features */
1595  for (index_t i=0; i<indices.vlen; ++i)
1596  {
1597  /* index with respect to possible subset */
1598  index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
1599 
1600  /* copy string */
1601  SGString<ST> current_string=features[real_idx];
1602  SGString<ST> string_copy(current_string.slen);
1603  memcpy(string_copy.string, current_string.string,
1604  current_string.slen*sizeof(ST));
1605  list_copy.strings[i]=string_copy;
1606  }
1607 
1608  /* create copy instance */
1609  CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
1610 
1611  /* max string length may have changed */
// NOTE(review): the statement belonging to the comment above (listing line
// 1612) is missing from this listing; presumably it recomputes the maximum
// string length of `result` - confirm.
1613 
1614  SG_REF(result); // the returned object carries one reference taken here
1615 
1616  return result;
1617 }
1618 
// NOTE(review): the declaration line of this body (listing line 1619) is
// missing from this listing; judging by the comment inside it is run after
// the subset changed - confirm the exact signature.
1620 {
1621  /* max string length has to be updated */
1622  determine_maximum_string_length();
1623 }
1624 
1625 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
1626 {
1627  ASSERT(features && num<get_num_vectors());
1628 
1629  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1630 
1631  len=features[real_num].slen;
1632  if (len<=0)
1633  return NULL;
1634 
1635  ST* target=SG_MALLOC(ST, len);
1636  memcpy(target, features[real_num].string, len*sizeof(ST));
1637  return target;
1638 }
1639 
1640 template<class ST> void CStringFeatures<ST>::init()
1641 {
1642  set_generic<ST>();
1643 
1644  alphabet=NULL;
1645  num_vectors=0;
1646  features=NULL;
1647  single_string=NULL;
1648  length_of_single_string=0;
1649  max_string_length=0;
1650  order=0;
1651  symbol_mask_table=0;
1652  preprocess_on_get=false;
1653  feature_cache=NULL;
1654 
1655  m_parameters->add((CSGObject**) &alphabet, "alphabet");
1656  m_parameters->add_vector(&features, &num_vectors, "features",
1657  "This contains the array of features.");
1658  m_parameters->add_vector(&single_string,
1659  &length_of_single_string,
1660  "single_string",
1661  "Created by sliding window.");
1662  m_parameters->add(&max_string_length, "max_string_length",
1663  "Length of longest string.");
1664  m_parameters->add(&num_symbols, "num_symbols",
1665  "Number of used symbols.");
1666  m_parameters->add(&original_num_symbols, "original_num_symbols",
1667  "Original number of used symbols.");
1668  m_parameters->add(&order, "order",
1669  "Order used in higher order mapping.");
1670  m_parameters->add(&preprocess_on_get, "preprocess_on_get",
1671  "Preprocess on-the-fly?");
1672 
1673  /* TODO M_PARAMETERS->ADD?
1674  * /// order used in higher order mapping
1675  * ST* symbol_mask_table;
1676  */
1677 }
1678 
// NOTE(review): the declaration lines of the twelve bodies below are missing
// from this listing (each body is preceded by a gap in the listing numbers).
// Each body returns one F_* constant; presumably these are per-element-type
// specializations reporting the feature type - confirm names and signatures.
1684 {
1685  return F_BOOL;
1686 }
1687 
1693 {
1694  return F_CHAR;
1695 }
1696 
1702 {
1703  return F_BYTE;
1704 }
1705 
1711 {
1712  return F_SHORT;
1713 }
1714 
1720 {
1721  return F_WORD;
1722 }
1723 
1729 {
1730  return F_INT;
1731 }
1732 
1738 {
1739  return F_UINT;
1740 }
1741 
1747 {
1748  return F_LONG;
1749 }
1750 
1756 {
1757  return F_ULONG;
1758 }
1759 
1765 {
1766  return F_SHORTREAL;
1767 }
1768 
1774 {
1775  return F_DREAL;
1776 }
1777 
1783 {
1784  return F_LONGREAL;
1785 }
1786 
// bool specialization: the mask is ignored and the symbol is returned unchanged.
1787 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
1788 {
1789  return symbol;
1790 }
// NOTE(review): the declaration lines of the next three bodies (listing
// lines 1791, 1795, 1799) are missing from this listing; presumably they are
// the float32_t/float64_t/floatmax_t specializations of get_masked_symbols -
// confirm. Each returns `symbol` unchanged.
1792 {
1793  return symbol;
1794 }
1796 {
1797  return symbol;
1798 }
1800 {
1801  return symbol;
1802 }
1803 
// bool specialization: both arguments are ignored, the result is always false.
1804 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
1805 {
1806  return false;
1807 }
// NOTE(review): the declaration lines of the next three bodies (listing
// lines 1808, 1812, 1816) are missing from this listing; presumably they are
// the float32_t/float64_t/floatmax_t specializations of shift_offset -
// confirm. Each returns 0.
1809 {
1810  return 0;
1811 }
1813 {
1814  return 0;
1815 }
1817 {
1818  return 0;
1819 }
1820 
// bool specialization: the amount is ignored and the symbol is returned unchanged.
1821 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
1822 {
1823  return symbol;
1824 }
// NOTE(review): the declaration lines of the next three bodies (listing
// lines 1825, 1829, 1833) are missing from this listing; presumably they are
// the float32_t/float64_t/floatmax_t specializations of shift_symbol -
// confirm. Each returns `symbol` unchanged.
1826 {
1827  return symbol;
1828 }
1830 {
1831  return symbol;
1832 }
1834 {
1835  return symbol;
1836 }
1837 
1838 #ifndef SUNOS
1839 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1840 {
1841  return false;
1842 }
1843 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1844 {
1845  return false;
1846 }
1847 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1848 {
1849  return false;
1850 }
1851 #endif
1852 
1853 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
1854 {
1855 }
1856 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
1857 {
1858 }
1859 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
1860 {
1861 }
1862 
// NOTE(review): the declaration lines of the six bodies below (listing lines
// 1863, 1866, 1869 and 1873, 1877, 1881) are missing from this listing.
// Presumably they are further float32_t/float64_t/floatmax_t specializations
// (three that do nothing, three that return 0) - confirm names and signatures.
1864 {
1865 }
1867 {
1868 }
1870 {
1871 }
1872 
1874 {
1875  return 0;
1876 }
1878 {
1879  return 0;
1880 }
1882 {
1883  return 0;
1884 }
1885 
1886 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
1887 {
1888 }
1889 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
1890 {
1891 }
1892 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
1893 {
1894 }
1895 #define LOAD(f_load, sg_type) \
1896 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1897 { \
1898  SG_INFO( "loading...\n"); \
1899  \
1900  SG_SET_LOCALE_C; \
1901  SGString<sg_type>* strs; \
1902  int32_t num_str; \
1903  int32_t max_len; \
1904  loader->f_load(strs, num_str, max_len); \
1905  set_features(strs, num_str, max_len); \
1906  SG_RESET_LOCALE; \
1907 }
1908 
1909 LOAD(get_string_list, bool)
1910 LOAD(get_string_list, char)
1911 LOAD(get_int8_string_list, int8_t)
1912 LOAD(get_string_list, uint8_t)
1913 LOAD(get_string_list, int16_t)
1914 LOAD(get_string_list, uint16_t)
1915 LOAD(get_string_list, int32_t)
1916 LOAD(get_uint_string_list, uint32_t)
1917 LOAD(get_long_string_list, int64_t)
1918 LOAD(get_ulong_string_list, uint64_t)
1919 LOAD(get_string_list, float32_t)
1920 LOAD(get_string_list, float64_t)
1921 LOAD(get_longreal_string_list, floatmax_t)
1922 #undef LOAD
1923 
1924 #define SAVE(f_write, sg_type) \
1925 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1926 { \
1927  if (m_subset_stack->has_subsets()) \
1928  SG_ERROR("save() is not possible on subset"); \
1929  SG_SET_LOCALE_C; \
1930  ASSERT(writer); \
1931  writer->f_write(features, num_vectors); \
1932  SG_RESET_LOCALE; \
1933 }
1934 
1935 SAVE(set_string_list, bool)
1936 SAVE(set_string_list, char)
1937 SAVE(set_int8_string_list, int8_t)
1938 SAVE(set_string_list, uint8_t)
1939 SAVE(set_string_list, int16_t)
1940 SAVE(set_string_list, uint16_t)
1941 SAVE(set_string_list, int32_t)
1942 SAVE(set_uint_string_list, uint32_t)
1943 SAVE(set_long_string_list, int64_t)
1944 SAVE(set_ulong_string_list, uint64_t)
1945 SAVE(set_string_list, float32_t)
1946 SAVE(set_string_list, float64_t)
1947 SAVE(set_longreal_string_list, floatmax_t)
1948 #undef SAVE
1949 
1950 template <class ST> template <class CT>
1952  int32_t p_order, int32_t gap, bool rev)
1953 {
1954  remove_all_subsets();
1955  ASSERT(sf);
1956 
1957  CAlphabet* alpha=sf->get_alphabet();
1958  ASSERT(alpha->get_num_symbols_in_histogram() > 0);
1959 
1960  this->order=p_order;
1961  cleanup();
1962 
1963  num_vectors=sf->get_num_vectors();
1964  ASSERT(num_vectors>0);
1965  max_string_length=sf->get_max_vector_length()-start;
1966  features=SG_MALLOC(SGString<ST>, num_vectors);
1967 
1968  SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
1969  alpha->get_num_symbols_in_histogram());
1970 
1971  for (int32_t i=0; i<num_vectors; i++)
1972  {
1973  int32_t len=-1;
1974  bool vfree;
1975  CT* c=sf->get_feature_vector(i, len, vfree);
1976  ASSERT(!vfree); // won't work when preprocessors are attached
1977 
1978  features[i].string=SG_MALLOC(ST, len);
1979  features[i].slen=len;
1980 
1981  ST* str=features[i].string;
1982  for (int32_t j=0; j<len; j++)
1983  str[j]=(ST) alpha->remap_to_bin(c[j]);
1984  }
1985 
1986  original_num_symbols=alpha->get_num_symbols();
1987  int32_t max_val=alpha->get_num_bits();
1988 
1989  SG_UNREF(alpha);
1990 
1991  if (p_order>1)
1992  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1993  else
1994  num_symbols=original_num_symbols;
1995  SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
1996 
1997  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1998  {
1999  SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
2000  return false;
2001  }
2002 
2003  SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
2004  for (int32_t line=0; line<num_vectors; line++)
2005  {
2006  int32_t len=0;
2007  bool vfree;
2008  ST* fv=get_feature_vector(line, len, vfree);
2009  ASSERT(!vfree); // won't work when preprocessors are attached
2010 
2011  if (rev)
2012  CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2013  else
2014  CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2015 
2016  /* fix the length of the string -- hacky */
2017  features[line].slen-=start+gap ;
2018  if (features[line].slen<0)
2019  features[line].slen=0 ;
2020  }
2021 
2022  compute_symbol_mask_table(max_val);
2023 
2024  return true;
2025 }
2026 
2027 template class CStringFeatures<bool>;
2028 template class CStringFeatures<char>;
2029 template class CStringFeatures<int8_t>;
2030 template class CStringFeatures<uint8_t>;
2031 template class CStringFeatures<int16_t>;
2032 template class CStringFeatures<uint16_t>;
2033 template class CStringFeatures<int32_t>;
2034 template class CStringFeatures<uint32_t>;
2035 template class CStringFeatures<int64_t>;
2036 template class CStringFeatures<uint64_t>;
2037 template class CStringFeatures<float32_t>;
2038 template class CStringFeatures<float64_t>;
2039 template class CStringFeatures<floatmax_t>;
2040 
2041 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2042 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2043 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2044 
2045 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2046 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2047 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2048 }

SHOGUN Machine Learning Toolbox - Documentation