00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _CSTRINGFEATURES__H__
00014 #define _CSTRINGFEATURES__H__
00015
00016 #include <shogun/lib/common.h>
00017 #include <shogun/lib/Cache.h>
00018 #include <shogun/lib/DynamicArray.h>
00019 #include <shogun/lib/Compressor.h>
00020 #include <shogun/io/File.h>
00021
00022 #include <shogun/features/Features.h>
00023 #include <shogun/features/Alphabet.h>
00024
00025 namespace shogun
00026 {
// Forward declarations of types that the declarations below refer to
// (as pointers, or as parameter types in member-function declarations).
// The leading listing line numbers that preceded each declaration were
// extraction residue and have been removed so the lines are valid C++.
//
// NOTE(review): SGStringList<ST> is also used by CStringFeatures but is not
// forward-declared here; presumably one of the included headers provides
// it - confirm.
class CAlphabet;
template <class T> class CDynamicArray;
class CFile;
template <class T> class SGString;
00031
#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Helper aggregates hidden from the generated documentation.
// The listing line numbers that prefixed every line (and turned the
// #ifndef/#endif into non-directives) have been removed.

/**
 * Plain aggregate of two feature values plus a group value.
 *
 * NOTE(review): nothing in this header uses the struct, so the meaning of
 * the fields beyond their names cannot be confirmed from here.
 */
struct SSKDoubleFeature
{
    int feature1;  ///< first feature of the pair
    int feature2;  ///< second feature of the pair
    int group;     ///< group value associated with the pair
};

/**
 * Plain aggregate of three feature values plus a group value.
 *
 * NOTE(review): nothing in this header uses the struct, so the meaning of
 * the fields beyond their names cannot be confirmed from here.
 */
struct SSKTripleFeature
{
    int feature1;  ///< first feature of the triple
    int feature2;  ///< second feature of the triple
    int feature3;  ///< third feature of the triple
    int group;     ///< group value associated with the triple
};
#endif
00048
00072 template <class ST> class CStringFeatures : public CFeatures
00073 {
00074 public:
00078 CStringFeatures();
00079
00084 CStringFeatures(EAlphabet alpha);
00085
00090 CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
00091
00096 CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
00097
00102 CStringFeatures(CAlphabet* alpha);
00103
00105 CStringFeatures(const CStringFeatures & orig);
00106
00112 CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
00113
00114 virtual ~CStringFeatures();
00115
00121 virtual void cleanup();
00122
00129 virtual void cleanup_feature_vector(int32_t num);
00130
00138 virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
00139
00144 virtual EFeatureClass get_feature_class() const;
00145
00150 virtual EFeatureType get_feature_type() const;
00151
00156 CAlphabet* get_alphabet();
00157
00162 virtual CFeatures* duplicate() const;
00163
00170 SGVector<ST> get_feature_vector(int32_t num);
00171
00179 void set_feature_vector(SGVector<ST> vector, int32_t num);
00180
00183 void enable_on_the_fly_preprocessing();
00184
00188 void disable_on_the_fly_preprocessing();
00189
00200 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
00201
00208 CStringFeatures<ST>* get_transposed();
00209
00223 SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
00224
00233 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
00234
00242 void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
00243
00252 virtual ST get_feature(int32_t vec_num, int32_t feat_num);
00253
00261 virtual int32_t get_vector_length(int32_t vec_num);
00262
00269 virtual int32_t get_max_vector_length();
00270
00272 virtual int32_t get_num_vectors() const;
00273
00280 floatmax_t get_num_symbols();
00281
00289 floatmax_t get_max_num_symbols();
00290
00291
00292
00297 floatmax_t get_original_num_symbols();
00298
00303 int32_t get_order();
00304
00312 ST get_masked_symbols(ST symbol, uint8_t mask);
00313
00320 ST shift_offset(ST offset, int32_t amount);
00321
00328 ST shift_symbol(ST symbol, int32_t amount);
00329
00334 virtual void load(CFile* loader);
00335
00346 void load_ascii_file(char* fname, bool remap_to_bin=true,
00347 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
00348
00357 bool load_fasta_file(const char* fname, bool ignore_invalid=false);
00358
00368 bool load_fastq_file(const char* fname,
00369 bool ignore_invalid=false, bool bitremap_in_single_string=false);
00370
00378 bool load_from_directory(char* dirname);
00379
00385 void set_features(SGStringList<ST> feats);
00386
00396 bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
00397 int32_t p_max_string_length);
00398
00407 bool append_features(CStringFeatures<ST>* sf);
00408
00421 bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
00422 int32_t p_max_string_length);
00423
00427 SGStringList<ST> get_features();
00428
00437 virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
00438
00447 virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
00448
00456 virtual void get_features(SGString<ST>** dst, int32_t* num_str);
00457
00464 virtual void save(CFile* writer);
00465
00474 virtual bool load_compressed(char* src, bool decompress);
00475
00485 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
00486
00491 virtual int32_t get_size() const;
00492
00498 virtual bool apply_preprocessor(bool force_preprocessing=false);
00499
00512 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
00513
00524 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
00525 int32_t skip=0);
00526
00540 bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
00541 int32_t p_order, int32_t gap, bool rev);
00542
00554 template <class CT>
00555 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
00556 int32_t p_order, int32_t gap, bool rev);
00557
00567 bool have_same_length(int32_t len=-1);
00568
00574 void embed_features(int32_t p_order);
00575
00582 void compute_symbol_mask_table(int64_t max_val);
00583
00590 void unembed_word(ST word, uint8_t* seq, int32_t len);
00591
00597 ST embed_word(ST* seq, int32_t len);
00598
00603 void determine_maximum_string_length();
00604
00612 static ST* get_zero_terminated_string_copy(SGString<ST> str);
00613
00622 virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
00623
00628 virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
00629 bool normalize=true);
00630
00635 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
00636 int32_t num_vec);
00637
00646 virtual CFeatures* copy_subset(SGVector<index_t> indices);
00647
00649 virtual const char* get_name() const { return "StringFeatures"; }
00650
00652 virtual void subset_changed_post();
00653
00654 protected:
00665 virtual ST* compute_feature_vector(int32_t num, int32_t& len);
00666
00667 private:
00668 void init();
00669
00670 protected:
00671
00673 CAlphabet* alphabet;
00674
00676 int32_t num_vectors;
00677
00679 SGString<ST>* features;
00680
00682 ST* single_string;
00683
00685 int32_t length_of_single_string;
00686
00688 int32_t max_string_length;
00689
00691 floatmax_t num_symbols;
00692
00694 floatmax_t original_num_symbols;
00695
00697 int32_t order;
00698
00700 ST* symbol_mask_table;
00701
00703 int32_t symbol_mask_table_len;
00704
00706 bool preprocess_on_get;
00707
00709 CCache<ST>* feature_cache;
00710 };
00711 }
00712 #endif // _CSTRINGFEATURES__H__