SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
StringFeatures.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Written (W) 2011-2012 Heiko Strathmann
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
13 #ifndef _CSTRINGFEATURES__H__
14 #define _CSTRINGFEATURES__H__
15 
16 #include <shogun/lib/config.h>
17 
18 #include <shogun/lib/common.h>
19 #include <shogun/lib/Cache.h>
21 #include <shogun/lib/Compressor.h>
22 #include <shogun/io/File.h>
23 
26 #include <shogun/lib/SGString.h>
27 
28 namespace shogun
29 {
30 class CAlphabet;  // forward declarations: the class below only names these types in signatures
31 template <class T> class CDynamicArray;
32 class CFile;  // NOTE(review): shogun/io/File.h is already included above, so this one looks redundant -- confirm
33 template <class T> class SGString;  // NOTE(review): shogun/lib/SGString.h is already included above -- confirm whether still needed
34 template <class T> class SGStringList;
35 
36 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Plain aggregate of three int fields. It sits inside the
// "#ifndef DOXYGEN_SHOULD_SKIP_THIS" region, so it is hidden from the
// generated documentation. Nothing in this listing uses it; the meaning of
// the fields is not visible here.
37 struct SSKDoubleFeature
38 {
39  int feature1;  // first feature value (semantics not shown in this file)
40  int feature2;  // second feature value
41  int group;  // group value; NOTE(review): presumably a group id for the feature pair -- confirm against users of this struct
42 };
43 
// Plain aggregate of four int fields: the same layout as SSKDoubleFeature
// with one additional feature field. Also inside the
// DOXYGEN_SHOULD_SKIP_THIS region; not used anywhere in this listing.
44 struct SSKTripleFeature
45 {
46  int feature1;  // first feature value (semantics not shown in this file)
47  int feature2;  // second feature value
48  int feature3;  // third feature value
49  int group;  // group value; NOTE(review): presumably a group id for the feature triple -- confirm
50 };
51 #endif
52 
// CStringFeatures<ST>: class template derived from CFeatures whose element
// type is the template parameter ST. Only declarations are shown here (the
// single inline body is get_name()); member definitions live elsewhere.
// NOTE(review): the listing's own line numbers have gaps, and the
// cross-reference index at the bottom of this page names members that do
// not appear below, so this view of the class is incomplete.
76 template <class ST> class CStringFeatures : public CFeatures
77 {
78  public:
81  // NOTE(review): numbering jumps 78 -> 81 -> 87 -> 93; whatever was on the skipped lines is not shown in this view.
87  // Constructor taking a string list (by value) and an alphabet given as an EAlphabet enumerator.
93  CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
94  // Overload taking the alphabet as a CAlphabet pointer instead of an enumerator.
100  CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
101  // Constructor taking only an alphabet pointer (no string data argument).
106  CStringFeatures(CAlphabet* alpha);
107  // Copy constructor.
112  CStringFeatures(const CStringFeatures& orig);
113  // Constructor taking a CFile loader; alpha defaults to DNA (letters A,C,G,T per the index on this page).
119  CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
120  // Virtual destructor.
122  virtual ~CStringFeatures();
123  // Cleanup entry points: whole object, a single vector index, or an index range.
129  virtual void cleanup();
130 
137  virtual void cleanup_feature_vector(int32_t num);
138  // NOTE(review): whether stop is inclusive cannot be told from the declaration -- check the definition.
146  virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
147  // Const accessors returning the EFeatureClass / EFeatureType enumerators for this class.
152  virtual EFeatureClass get_feature_class() const;
153 
158  virtual EFeatureType get_feature_type() const;
159  // Returns a CAlphabet pointer. NOTE(review): ownership / reference counting of the result is not visible here.
164  CAlphabet* get_alphabet();
165  // Returns a CFeatures pointer; presumably a copy of this object -- confirm in the definition.
170  virtual CFeatures* duplicate() const;
171  // Vector access by index through SGVector<ST>.
179  SGVector<ST> get_feature_vector(int32_t num);
180  // Note the argument order (vector, num); the raw-pointer overload further down uses (num, string, len).
188  void set_feature_vector(SGVector<ST> vector, int32_t num);
189 
192 
197  // Raw-pointer access: len and dofree are written through references. Presumably paired with free_feature_vector(ST*, num, dofree) -- confirm.
208  ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
209  // get_transposed, object form: returns a CStringFeatures<ST> pointer.
216  CStringFeatures<ST>* get_transposed();
217  // get_transposed, raw form: returns an SGString<ST> pointer and reports two counts through num_feat / num_vec.
231  SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
232  // Counterpart of the raw-pointer get_feature_vector overload; takes the dofree flag that overload outputs.
241  void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
242  // Counterpart of the SGVector get_feature_vector overload.
250  void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
251  // Single-element access addressed by vector index and feature index.
260  virtual ST get_feature(int32_t vec_num, int32_t feat_num);
261  // Length query for the vector at index vec_num.
269  virtual int32_t get_vector_length(int32_t vec_num);
270 
277  virtual int32_t get_max_vector_length();
278 
280  virtual int32_t get_num_vectors() const;
281 
289  // NOTE(review): the index on this page lists floatmax_t get_max_num_symbols() and
298  // floatmax_t get_original_num_symbols(), but neither declaration is visible in this listing.
299  // these functions are necessary to find out about a former conversion process
300 
306  // Presumably returns the 'order' member declared near the end of the class -- confirm.
311  int32_t get_order();
312  // Symbol-level helpers taking and returning ST values; their arithmetic is defined elsewhere.
320  ST get_masked_symbols(ST symbol, uint8_t mask);
321 
328  ST shift_offset(ST offset, int32_t amount);
329 
336  ST shift_symbol(ST symbol, int32_t amount);
337  // Load through a CFile object.
342  virtual void load(CFile* loader);
343  // Defaults: remap_to_bin=true, ascii_alphabet=DNA, binary_alphabet=RAWDNA. NOTE(review): fname is a non-const char*, unlike the fasta/fastq loaders below.
354  void load_ascii_file(char* fname, bool remap_to_bin=true,
355  EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
356  // Returns bool; ignore_invalid defaults to false.
365  bool load_fasta_file(const char* fname, bool ignore_invalid=false);
366  // Returns bool; both flags default to false.
376  bool load_fastq_file(const char* fname,
377  bool ignore_invalid=false, bool bitremap_in_single_string=false);
378  // NOTE(review): dirname is a non-const char* -- same const inconsistency as load_ascii_file.
386  bool load_from_directory(char* dirname);
387  // set_features, list form.
393  void set_features(SGStringList<ST> feats);
394  // set_features, raw form: array pointer plus vector count and maximum string length; returns bool.
404  bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
405  int32_t p_max_string_length);
406  // append_features, object form; returns bool.
415  bool append_features(CStringFeatures<ST>* sf);
416  // append_features, raw form with the same argument layout as the raw set_features overload.
429  bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
430  int32_t p_max_string_length);
431  // get_features, list form (returned by value).
435  SGStringList<ST> get_features();
436  // get_features, raw form: returns an SGString<ST> pointer and writes num_str / max_str_len through references.
445  virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
446  // Same signature shape as above. NOTE(review): the name suggests the result is a copy the caller owns -- confirm.
455  virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
456  // get_features, out-parameter form: results are written through dst and num_str.
464  virtual void get_features(SGString<ST>** dst, int32_t* num_str);
465  // Save through a CFile object.
472  virtual void save(CFile* writer);
473  // NOTE(review): src is a non-const char*.
482  virtual bool load_compressed(char* src, bool decompress);
483  // NOTE(review): dest is a non-const char*; the valid range of level is not shown here.
493  virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
494  // NOTE(review): default argument on a virtual function -- defaults are chosen by the static type at the call site, so overrides should repeat the same default.
500  virtual bool apply_preprocessor(bool force_preprocessing=false);
501  // Returns int32_t; skip defaults to 0.
514  int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
515  // Returns int32_t; positions is a CDynamicArray<int32_t> pointer, skip defaults to 0.
526  int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
527  int32_t skip=0);
528  // Source is fixed to CStringFeatures<char>.
542  bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
543  int32_t p_order, int32_t gap, bool rev);
544  // Member template with the same argument list, but the source element type CT is a template parameter.
556  template <class CT>
557  bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
558  int32_t p_order, int32_t gap, bool rev);
559  // len defaults to -1. NOTE(review): the meaning of the -1 sentinel is not visible here.
569  bool have_same_length(int32_t len=-1);
570 
576  void embed_features(int32_t p_order);
577 
584  void compute_symbol_mask_table(int64_t max_val);
585  // Output is written to the caller-supplied uint8_t buffer seq with length len.
592  void unembed_word(ST word, uint8_t* seq, int32_t len);
593  // Takes a sequence of len ST values and returns a single ST.
599  ST embed_word(ST* seq, int32_t len);
600 
606  // Static helper returning an ST pointer. NOTE(review): presumably a fresh allocation the caller must release -- confirm.
614  static ST* get_zero_terminated_string_copy(SGString<ST> str);
615  // Raw-pointer overload; argument order is (num, string, len), unlike the SGVector overload above.
624  virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
625  // Results are written through hist / rows / cols; normalize defaults to true (default argument on a virtual, see apply_preprocessor).
630  virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
631  bool normalize=true);
632  // Takes a float64_t array with its rows / cols and a vector count.
637  virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
638  int32_t num_vec);
639  // Returns a CFeatures pointer for the given index vector.
648  virtual CFeatures* copy_subset(SGVector<index_t> indices);
649  // Returns the string literal "StringFeatures".
651  virtual const char* get_name() const { return "StringFeatures"; }
652 
654  virtual void subset_changed_post();
655 
656  protected:
667  virtual ST* compute_feature_vector(int32_t num, int32_t& len);
668 
669  private:
670  void init();
671 
672  protected:
675  // NOTE(review): the index on this page also lists data members features, feature_cache, num_symbols, original_num_symbols, symbol_mask_table, symbol_mask_table_len, preprocess_on_get and length_of_single_string; only num_vectors and order are visible in this listing.
677  int32_t num_vectors;
678 
681 
684 
687 
690 
693 
696  // order used in higher order mapping (description taken from the index on this page)
698  int32_t order;
699 
702 
705 
708 
711 };
712 }
713 #endif // _CSTRINGFEATURES__H__
void set_feature_vector(SGVector< ST > vector, int32_t num)
int32_t length_of_single_string
length of prior single string
int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
virtual int32_t get_max_vector_length()
SGVector< ST > get_feature_vector(int32_t num)
virtual void load(CFile *loader)
virtual CFeatures * duplicate() const
floatmax_t num_symbols
number of used symbols
DNA - letters A,C,G,T.
Definition: Alphabet.h:26
bool load_from_directory(char *dirname)
bool obtain_from_char(CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
virtual CFeatures * copy_subset(SGVector< index_t > indices)
ST shift_offset(ST offset, int32_t amount)
RAWDNA - letters 0,1,2,3.
Definition: Alphabet.h:29
virtual void subset_changed_post()
void unembed_word(ST word, uint8_t *seq, int32_t len)
EAlphabet
Alphabet of charfeatures/observations.
Definition: Alphabet.h:23
virtual SGString< ST > * copy_features(int32_t &num_str, int32_t &max_str_len)
floatmax_t get_max_num_symbols()
bool load_fasta_file(const char *fname, bool ignore_invalid=false)
SGString< ST > * features
virtual int32_t get_num_vectors() const
bool append_features(CStringFeatures< ST > *sf)
CFeatures(int32_t size=0)
Definition: Features.cpp:23
virtual void create_random(float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec)
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
CCache< ST > * feature_cache
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
floatmax_t get_original_num_symbols()
bool preprocess_on_get
preprocess on-the-fly?
virtual bool load_compressed(char *src, bool decompress)
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
void load_ascii_file(char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
virtual bool apply_preprocessor(bool force_preprocessing=false)
ST get_masked_symbols(ST symbol, uint8_t mask)
int32_t symbol_mask_table_len
length of symbol_mask_table
CStringFeatures< ST > * get_transposed()
int32_t order
order used in higher order mapping
ST embed_word(ST *seq, int32_t len)
int32_t obtain_by_position_list(int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0)
bool load_fastq_file(const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false)
double float64_t
Definition: common.h:50
floatmax_t original_num_symbols
original number of used symbols (before higher order mapping)
long double floatmax_t
Definition: common.h:51
virtual EFeatureClass get_feature_class() const
virtual bool save_compressed(char *dest, E_COMPRESSION_TYPE compression, int level)
virtual ST * compute_feature_vector(int32_t num, int32_t &len)
virtual ST get_feature(int32_t vec_num, int32_t feat_num)
void compute_symbol_mask_table(int64_t max_val)
virtual EFeatureType get_feature_type() const
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
bool have_same_length(int32_t len=-1)
virtual void cleanup_feature_vector(int32_t num)
E_COMPRESSION_TYPE
Definition: Compressor.h:21
virtual const char * get_name() const
all classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void save(CFile *writer)
SGStringList< ST > get_features()
ST shift_symbol(ST symbol, int32_t amount)
void embed_features(int32_t p_order)
virtual void get_histogram(float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true)
virtual void cleanup_feature_vectors(int32_t start, int32_t stop)
bool obtain_from_char_features(CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
ST * symbol_mask_table
table of symbol masks (see compute_symbol_mask_table)
void set_features(SGStringList< ST > feats)
virtual int32_t get_vector_length(int32_t vec_num)
static ST * get_zero_terminated_string_copy(SGString< ST > str)

SHOGUN Machine Learning Toolbox - Documentation