SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringFeatures.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Written (W) 2011-2012 Heiko Strathmann
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
13 #ifndef _CSTRINGFEATURES__H__
14 #define _CSTRINGFEATURES__H__
15 
16 #include <shogun/lib/common.h>
17 #include <shogun/lib/Cache.h>
19 #include <shogun/lib/Compressor.h>
20 #include <shogun/io/File.h>
21 
24 
25 namespace shogun
26 {
27 class CAlphabet;
28 template <class T> class CDynamicArray;
29 class CFile;
30 template <class T> class SGString;
31 
32 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Plain aggregate of three int fields: feature1, feature2 and group.
 * Declared inside the DOXYGEN_SHOULD_SKIP_THIS guard, so it is excluded from
 * the generated API documentation.
 * NOTE(review): the "SSK" prefix presumably refers to a subsequence string
 * kernel, and the two features presumably form a pair tagged with a group id
 * -- nothing in this listing confirms either; verify against the users of
 * this struct.
 */
33 struct SSKDoubleFeature
34 {
35  int feature1;
36  int feature2;
37  int group;
38 };
39 
40 struct SSKTripleFeature
41 {
42  int feature1;
43  int feature2;
44  int feature3;
45  int group;
46 };
47 #endif
48 
/**
 * Class template CStringFeatures<ST>: a CFeatures subclass parameterised on
 * the element type ST of its strings.
 *
 * The declarations below exchange data as SGString<ST>, SGStringList<ST>,
 * SGVector<ST> or raw ST* buffers, and an alphabet (EAlphabet value or
 * CAlphabet pointer) is supplied at construction.
 *
 * NOTE(review): the number at the start of every line comes from the rendered
 * source listing and skips values (e.g. 74 -> 79 -> 85 -> 90), so some
 * declarations and all original doc comments are not visible here. The
 * comments added below are derived from the signatures only; anything marked
 * "presumably" must be confirmed against the implementation.
 */
72 template <class ST> class CStringFeatures : public CFeatures
73 {
74  public:
79 
85 
    /** Construct from a string list and an alphabet given as an EAlphabet value. */
90  CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
91 
    /** Construct from a string list and an alphabet given as a CAlphabet pointer. */
96  CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
97 
    /** Construct from an alphabet object only; no strings are passed in. */
102  CStringFeatures(CAlphabet* alpha);
103 
    /** Copy constructor. */
105  CStringFeatures(const CStringFeatures & orig);
106 
    /** Construct from a CFile loader; the alphabet defaults to DNA. */
112  CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
113 
    /** Virtual destructor. */
114  virtual ~CStringFeatures();
115 
    /** Virtual cleanup hook (presumably releases the stored strings -- TODO confirm). */
121  virtual void cleanup();
122 
    /** Cleanup for the single vector with index num. */
129  virtual void cleanup_feature_vector(int32_t num);
130 
    /**
     * Cleanup for a range of vectors given by start and stop.
     * NOTE(review): whether stop is inclusive is not visible here.
     */
138  virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
139 
    /** @return the EFeatureClass value describing this object. */
144  virtual EFeatureClass get_feature_class() const;
145 
    /** @return the EFeatureType value describing this object. */
150  virtual EFeatureType get_feature_type() const;
151 
157 
    /**
     * @return a CFeatures pointer (presumably a copy of this object; ownership
     *         of the result is not visible here -- TODO confirm).
     */
162  virtual CFeatures* duplicate() const;
163 
    /** @return vector number num as an SGVector<ST>. */
170  SGVector<ST> get_feature_vector(int32_t num);
171 
    /** Set vector number num from the given SGVector<ST>. */
179  void set_feature_vector(SGVector<ST> vector, int32_t num);
180 
184 
189 
    /**
     * Raw-pointer accessor for vector number num.
     * @param len    output: written through the reference.
     * @param dofree output: written through the reference; presumably tells
     *               the caller whether the buffer has to be released via
     *               free_feature_vector() -- TODO confirm.
     */
200  ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
201 
209 
    /**
     * @param num_feat output reference.
     * @param num_vec  output reference.
     * @return an SGString<ST> array (presumably the transposed string matrix;
     *         ownership of the result is not visible here -- TODO confirm).
     */
223  SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
224 
    /** Counterpart of the raw-pointer get_feature_vector(); takes back feat_vec, num and dofree. */
233  void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
234 
    /** Counterpart of the SGVector get_feature_vector(); takes back feat_vec and num. */
242  void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
243 
    /** @return a single ST element addressed by vector index vec_num and position feat_num. */
252  virtual ST get_feature(int32_t vec_num, int32_t feat_num);
253 
    /** @return the length of vector vec_num. */
261  virtual int32_t get_vector_length(int32_t vec_num);
262 
    /** @return the maximum vector length (presumably over all stored vectors). */
269  virtual int32_t get_max_vector_length();
270 
    /** @return the number of vectors. */
272  virtual int32_t get_num_vectors() const;
273 
281 
290 
291  // these functions are necessary to find out about a former conversion process
292 
298 
    /** @return an int32_t order (presumably the protected member `order` below -- TODO confirm). */
303  int32_t get_order();
304 
    /** Takes a symbol and an 8-bit mask, returns an ST; the masking rule is not visible here. */
312  ST get_masked_symbols(ST symbol, uint8_t mask);
313 
    /** Takes an offset and a shift amount, returns an ST; shift direction/unit not visible here. */
320  ST shift_offset(ST offset, int32_t amount);
321 
    /** Takes a symbol and a shift amount, returns an ST; shift direction/unit not visible here. */
328  ST shift_symbol(ST symbol, int32_t amount);
329 
    /** Load from the given CFile object. */
334  virtual void load(CFile* loader);
335 
    /**
     * Load from the file named fname.
     * Defaults: remap_to_bin=true, ascii_alphabet=DNA, binary_alphabet=RAWDNA.
     */
346  void load_ascii_file(char* fname, bool remap_to_bin=true,
347  EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
348 
    /** Load from the file named fname; ignore_invalid defaults to false. Returns a bool status. */
357  bool load_fasta_file(const char* fname, bool ignore_invalid=false);
358 
    /**
     * Load from the file named fname. Both flags default to false.
     * Returns a bool status.
     */
368  bool load_fastq_file(const char* fname,
369  bool ignore_invalid=false, bool bitremap_in_single_string=false);
370 
    /** Load using the directory name dirname. Returns a bool status. */
378  bool load_from_directory(char* dirname);
379 
    /** Set the features from an SGStringList<ST>. */
385  void set_features(SGStringList<ST> feats);
386 
    /**
     * Set the features from a raw SGString<ST> array together with its vector
     * count and maximum string length. Returns a bool status.
     * NOTE(review): whether the array is copied or adopted is not visible here.
     */
396  bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
397  int32_t p_max_string_length);
398 
408 
    /**
     * Same parameter list as the raw-array set_features(); by name this appends
     * instead of replacing. Returns a bool status.
     */
421  bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
422  int32_t p_max_string_length);
423 
428 
    /**
     * @param num_str     output reference.
     * @param max_str_len output reference.
     * @return an SGString<ST> array; NOTE(review): presumably the internal
     *         storage rather than a copy (compare copy_features) -- TODO confirm.
     */
437  virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
438 
    /** Same signature as get_features(num_str, max_str_len); by name returns a copy. */
447  virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
448 
    /** Pointer-out-parameter variant: writes through dst and num_str. */
456  virtual void get_features(SGString<ST>** dst, int32_t* num_str);
457 
    /** Save via the given CFile object. */
464  virtual void save(CFile* writer);
465 
    /** Load from src; the decompress flag has no default. Returns a bool status. */
474  virtual bool load_compressed(char* src, bool decompress);
475 
    /** Save to dest using the given E_COMPRESSION_TYPE and level. Returns a bool status. */
485  virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
486 
    /** force_preprocessing defaults to false. Returns a bool status. */
492  virtual bool apply_preprocessor(bool force_preprocessing=false);
493 
    /**
     * Takes a window size and step size; skip defaults to 0.
     * @return an int32_t (presumably the resulting number of vectors -- TODO confirm).
     */
506  int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
507 
    /**
     * Takes a window size and a CDynamicArray<int32_t> of positions; skip
     * defaults to 0.
     * @return an int32_t (presumably the resulting number of vectors -- TODO confirm).
     */
518  int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
519  int32_t skip=0);
520 
    /**
     * Takes a CStringFeatures<char> source plus start, p_order, gap and rev.
     * Returns a bool status.
     */
534  bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
535  int32_t p_order, int32_t gap, bool rev);
536 
    /**
     * Member template with the same parameters as obtain_from_char(), but the
     * source's element type CT is a template parameter. Returns a bool status.
     */
548  template <class CT>
549  bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
550  int32_t p_order, int32_t gap, bool rev);
551 
    /**
     * len defaults to -1.
     * NOTE(review): presumably -1 means "compare against each other rather
     * than against a fixed length" -- TODO confirm.
     */
561  bool have_same_length(int32_t len=-1);
562 
    /** Takes an order p_order; the embedding itself is defined elsewhere. */
568  void embed_features(int32_t p_order);
569 
    /** Takes a 64-bit maximum value max_val; the table it fills is not visible here. */
576  void compute_symbol_mask_table(int64_t max_val);
577 
    /** Takes a word, a uint8_t buffer seq and a length len; seq is presumably the output. */
584  void unembed_word(ST word, uint8_t* seq, int32_t len);
585 
    /** Takes an ST sequence of length len and returns a single ST. */
591  ST embed_word(ST* seq, int32_t len);
592 
598 
607 
    /**
     * Raw-buffer overload: set vector num from string of length len.
     * Note the parameter order differs from the SGVector overload (num first).
     */
616  virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
617 
    /**
     * Writes through hist, rows and cols (all pointer out-parameters);
     * normalize defaults to true.
     * NOTE(review): ownership of *hist is not visible here.
     */
622  virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
623  bool normalize=true);
624 
    /** Takes a histogram buffer with its rows/cols and a vector count num_vec. */
629  virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
630  int32_t num_vec);
631 
    /** Takes an index vector and returns a CFeatures pointer; ownership not visible here. */
640  virtual CFeatures* copy_subset(SGVector<index_t> indices);
641 
    /** @return the string literal "StringFeatures". */
643  virtual const char* get_name() const { return "StringFeatures"; }
644 
    /** Virtual hook; by name, called after the subset changed -- TODO confirm caller. */
646  virtual void subset_changed_post();
647 
648  protected:
    /**
     * Protected virtual hook taking a vector index and a length output reference.
     * @return an ST buffer; ownership is not visible here.
     */
659  virtual ST* compute_feature_vector(int32_t num, int32_t& len);
660 
661  private:
    /** Private helper; presumably shared initialisation for the constructors -- TODO confirm. */
662  void init();
663 
664  protected:
665 
668 
    /** Protected int32_t member (by name, the number of vectors). */
670  int32_t num_vectors;
671 
674 
677 
680 
683 
686 
689 
    /** Protected int32_t member (by name, the order; see get_order()). */
691  int32_t order;
692 
695 
698 
701 
704 };
705 }
706 #endif // _CSTRINGFEATURES__H__

SHOGUN Machine Learning Toolbox - Documentation