00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _CSTRINGFEATURES__H__
00014 #define _CSTRINGFEATURES__H__
00015
00016 #include <shogun/lib/common.h>
00017 #include <shogun/lib/Cache.h>
00018 #include <shogun/lib/DynamicArray.h>
00019 #include <shogun/lib/Compressor.h>
00020 #include <shogun/io/File.h>
00021
00022 #include <shogun/features/Features.h>
00023 #include <shogun/features/Alphabet.h>
00024
00025 namespace shogun
00026 {
// Forward declarations of types that appear in the CStringFeatures interface
// below: CAlphabet and CFile are used through pointers, CDynamicArray<int32_t>
// through a pointer parameter, and SGString<ST> as the per-string container.
00027 class CAlphabet;
00028 template <class T> class CDynamicArray;
00029 class CFile;
00030 template <class T> class SGString;
00031
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Plain aggregate of two int feature values plus an int group value.
// It sits inside the DOXYGEN_SHOULD_SKIP_THIS preprocessor guard.
// NOTE(review): nothing in this header uses the struct, so the meaning of the
// individual fields cannot be determined from here -- confirm at the use sites.
00033 struct SSKDoubleFeature
00034 {
00035 int feature1; // first feature value
00036 int feature2; // second feature value
00037 int group;    // group value; semantics not visible in this header
00038 };
00039
// Plain aggregate of three int feature values plus an int group value; the
// three-feature counterpart of the two-feature struct declared just before it.
// It sits inside the DOXYGEN_SHOULD_SKIP_THIS preprocessor guard.
// NOTE(review): nothing in this header uses the struct, so the meaning of the
// individual fields cannot be determined from here -- confirm at the use sites.
00040 struct SSKTripleFeature
00041 {
00042 int feature1; // first feature value
00043 int feature2; // second feature value
00044 int feature3; // third feature value
00045 int group;    // group value; semantics not visible in this header
00046 };
00047 #endif
00048
/**
 * Class template CStringFeatures: a CFeatures subclass parameterised on the
 * element type ST of its strings.
 *
 * The state declared at the end of the class consists of a pointer to
 * SGString<ST> entries (features) with a count (num_vectors), a separate
 * single_string buffer with its length, a CAlphabet pointer, symbol-count and
 * order bookkeeping, a symbol mask table, a preprocessing flag and a
 * CCache<ST> pointer.
 *
 * Only declarations are visible in this header. The member comments below
 * describe what the signatures show; anything introduced with "presumably"
 * is inferred from names only and must be confirmed against the
 * implementation before being relied upon.
 */
00071 template <class ST> class CStringFeatures : public CFeatures
00072 {
00073 public:
/** Default constructor. */
00077 CStringFeatures();
00078
/** Constructs from an alphabet given as an EAlphabet value.
 * @param alpha alphabet selector
 */
00083 CStringFeatures(EAlphabet alpha);
00084
/** Constructs from a string list (passed by value) and an EAlphabet value.
 * @param string_list strings to construct from
 * @param alpha alphabet selector
 */
00089 CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
00090
/** Constructs from a string list (passed by value) and a CAlphabet object.
 * @param string_list strings to construct from
 * @param alpha alphabet object; ownership convention not visible here
 */
00095 CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
00096
/** Constructs from a CAlphabet object only.
 * @param alpha alphabet object; ownership convention not visible here
 */
00101 CStringFeatures(CAlphabet* alpha);
00102
/** Copy constructor.
 * NOTE(review): whether the strings are deep-copied is not visible here.
 */
00104 CStringFeatures(const CStringFeatures & orig);
00105
/** Constructs from a CFile; the alphabet defaults to DNA.
 * Presumably the strings are read through load() -- confirm.
 * @param loader file object to read from
 * @param alpha alphabet selector (default DNA)
 */
00111 CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
00112
/** Virtual destructor. */
00113 virtual ~CStringFeatures();
00114
/** Cleanup hook; presumably releases the stored strings -- confirm. */
00120 virtual void cleanup();
00121
/** Single-vector cleanup.
 * @param num presumably the index of the vector to clean up -- confirm
 */
00128 virtual void cleanup_feature_vector(int32_t num);
00129
/** Range cleanup.
 * @param start first index of the range
 * @param stop end index of the range
 * NOTE(review): whether stop is inclusive is not visible here.
 */
00137 virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
00138
/** @return the EFeatureClass value reported by this class */
00143 virtual EFeatureClass get_feature_class();
00144
/** @return the EFeatureType value reported by this class; presumably
 * depends on ST -- confirm
 */
00149 virtual EFeatureType get_feature_type();
00150
/** @return a CAlphabet pointer, presumably the alphabet member.
 * NOTE(review): the ownership / reference-count convention for the returned
 * pointer is not visible here.
 */
00155 CAlphabet* get_alphabet();
00156
/** @return a CFeatures pointer; presumably a newly created copy of this
 * object -- confirm
 */
00161 virtual CFeatures* duplicate() const;
00162
/** Vector access returning an SGVector<ST> by value.
 * @param num index of the requested vector
 * @return the vector as SGVector<ST>
 */
00169 SGVector<ST> get_feature_vector(int32_t num);
00170
/** Vector assignment from an SGVector<ST>.
 * @param vector new content
 * @param num index of the vector to set
 * NOTE(review): whether the data is copied or adopted is not visible here.
 */
00178 void set_feature_vector(SGVector<ST> vector, int32_t num);
00179
/** Presumably sets the preprocess_on_get flag -- confirm. */
00182 void enable_on_the_fly_preprocessing();
00183
/** Presumably clears the preprocess_on_get flag -- confirm. */
00187 void disable_on_the_fly_preprocessing();
00188
/** Vector access returning a raw ST pointer.
 * len and dofree are non-const references, i.e. they can be written by the
 * call.
 * @param num index of the requested vector
 * @param len presumably receives the length of the vector -- confirm
 * @param dofree presumably tells the caller whether the buffer has to be
 *        released through free_feature_vector() -- confirm
 * @return pointer to the vector data
 */
00199 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
00200
/** @return a CStringFeatures<ST> pointer; presumably a new object holding
 * the transposed strings -- confirm
 */
00207 CStringFeatures<ST>* get_transposed();
00208
/** Transposition returning raw SGString<ST> entries.
 * Both parameters are non-const references, i.e. they can be written by the
 * call.
 * @param num_feat presumably receives the number of features -- confirm
 * @param num_vec presumably receives the number of vectors -- confirm
 * @return pointer to SGString<ST> entries; ownership not visible here
 */
00222 SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
00223
/** Counterpart of the raw-pointer get_feature_vector() overload.
 * @param feat_vec pointer previously handed out
 * @param num index of the vector
 * @param dofree presumably the flag reported by get_feature_vector() --
 *        confirm
 */
00232 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
00233
/** Counterpart of the SGVector-returning get_feature_vector() overload.
 * @param feat_vec vector previously handed out
 * @param num index of the vector
 */
00241 void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
00242
/** Single element access.
 * @param vec_num index of the vector
 * @param feat_num index of the element inside that vector
 * @return the element as ST
 */
00251 virtual ST get_feature(int32_t vec_num, int32_t feat_num);
00252
/** @param vec_num index of the vector
 * @return length of that vector
 */
00260 virtual int32_t get_vector_length(int32_t vec_num);
00261
/** @return maximum vector length; presumably the max_string_length
 * member -- confirm
 */
00268 virtual int32_t get_max_vector_length();
00269
/** @return number of vectors; presumably the num_vectors member, possibly
 * adjusted by an active subset -- confirm
 */
00271 virtual int32_t get_num_vectors() const;
00272
/** @return number of symbols as floatmax_t; presumably the num_symbols
 * member -- confirm
 */
00279 floatmax_t get_num_symbols();
00280
/** @return maximum number of symbols as floatmax_t.
 * NOTE(review): how the maximum is derived is not visible here.
 */
00288 floatmax_t get_max_num_symbols();
00289
00290
00291
/** @return original number of symbols as floatmax_t; presumably the
 * original_num_symbols member -- confirm
 */
00296 floatmax_t get_original_num_symbols();
00297
/** @return the order as int32_t; presumably the order member -- confirm */
00302 int32_t get_order();
00303
/** Symbol masking helper.
 * @param symbol symbol to mask
 * @param mask 8-bit mask; presumably an index into symbol_mask_table --
 *        confirm
 * @return the masked symbol as ST
 */
00311 ST get_masked_symbols(ST symbol, uint8_t mask);
00312
/** Shift helper operating on an offset value.
 * @param offset value to shift
 * @param amount shift amount.
 *        NOTE(review): unit and direction of the shift are not visible here.
 * @return the shifted value as ST
 */
00319 ST shift_offset(ST offset, int32_t amount);
00320
/** Shift helper operating on a symbol value.
 * @param symbol value to shift
 * @param amount shift amount.
 *        NOTE(review): unit and direction of the shift are not visible here.
 * @return the shifted value as ST
 */
00327 ST shift_symbol(ST symbol, int32_t amount);
00328
/** Loads from a CFile.
 * @param loader file object to read from
 */
00333 virtual void load(CFile* loader);
00334
/** Loads from an ASCII file named by fname.
 * @param fname file name
 * @param remap_to_bin remap flag (default true); presumably converts from
 *        ascii_alphabet to binary_alphabet -- confirm
 * @param ascii_alphabet alphabet selector (default DNA)
 * @param binary_alphabet alphabet selector (default RAWDNA)
 */
00345 void load_ascii_file(char* fname, bool remap_to_bin=true,
00346 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
00347
/** Loads from a FASTA file named by fname.
 * @param fname file name
 * @param ignore_invalid flag (default false); presumably controls how
 *        invalid content is handled -- confirm
 * @return bool; presumably true on success -- confirm
 */
00356 bool load_fasta_file(const char* fname, bool ignore_invalid=false);
00357
/** Loads from a FASTQ file named by fname.
 * @param fname file name
 * @param ignore_invalid flag (default false); presumably controls how
 *        invalid content is handled -- confirm
 * @param bitremap_in_single_string flag (default false).
 *        NOTE(review): effect not visible here.
 * @return bool; presumably true on success -- confirm
 */
00367 bool load_fastq_file(const char* fname,
00368 bool ignore_invalid=false, bool bitremap_in_single_string=false);
00369
/** Loads from a directory named by dirname.
 * NOTE(review): how directory entries map to strings is not visible here.
 * @param dirname directory name
 * @return bool; presumably true on success -- confirm
 */
00377 bool load_from_directory(char* dirname);
00378
/** Sets the strings from an SGStringList<ST> passed by value.
 * @param feats new strings
 */
00384 void set_features(SGStringList<ST> feats);
00385
/** Sets the strings from raw SGString<ST> entries.
 * @param p_features pointer to the entries.
 *        NOTE(review): whether they are copied or adopted is not visible here.
 * @param p_num_vectors number of entries
 * @param p_max_string_length maximum string length among the entries
 * @return bool; presumably true on success -- confirm
 */
00395 bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
00396 int32_t p_max_string_length);
00397
/** Appends the strings of another CStringFeatures<ST> object.
 * @param sf object whose strings are appended
 * @return bool; presumably true on success -- confirm
 */
00406 bool append_features(CStringFeatures<ST>* sf);
00407
/** Appends raw SGString<ST> entries.
 * @param p_features pointer to the entries.
 *        NOTE(review): whether they are copied or adopted is not visible here.
 * @param p_num_vectors number of entries
 * @param p_max_string_length maximum string length among the entries
 * @return bool; presumably true on success -- confirm
 */
00420 bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
00421 int32_t p_max_string_length);
00422
/** @return the strings as an SGStringList<ST> by value */
00426 SGStringList<ST> get_features();
00427
/** Raw access to the strings.
 * Both parameters are non-const references, i.e. they can be written by the
 * call.
 * @param num_str presumably receives the number of strings -- confirm
 * @param max_str_len presumably receives the maximum string length --
 *        confirm
 * @return pointer to SGString<ST> entries; presumably the internal array
 *         rather than a copy (compare copy_features()) -- confirm
 */
00436 virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
00437
/** Same signature as the two-reference get_features() overload.
 * @param num_str presumably receives the number of strings -- confirm
 * @param max_str_len presumably receives the maximum string length --
 *        confirm
 * @return pointer to SGString<ST> entries; presumably a copy owned by the
 *         caller -- confirm
 */
00446 virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
00447
/** Pointer-out-parameter variant of get_features().
 * @param dst presumably receives a pointer to SGString<ST> entries --
 *        confirm ownership
 * @param num_str presumably receives the number of strings -- confirm
 */
00455 virtual void get_features(SGString<ST>** dst, int32_t* num_str);
00456
/** Saves to a CFile.
 * @param writer file object to write to
 */
00463 virtual void save(CFile* writer);
00464
/** Loads from a compressed source.
 * @param src presumably the name of the file to read -- confirm
 * @param decompress flag.
 *        NOTE(review): what happens when it is false is not visible here.
 * @return bool; presumably true on success -- confirm
 */
00473 virtual bool load_compressed(char* src, bool decompress);
00474
/** Saves in compressed form.
 * @param dest presumably the name of the file to write -- confirm
 * @param compression compression type selector
 * @param level compression level; valid range not visible here
 * @return bool; presumably true on success -- confirm
 */
00484 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
00485
/** @return a size as int32_t.
 * NOTE(review): what is measured, and in which unit, is not visible here.
 */
00490 virtual int32_t get_size();
00491
/** Applies preprocessing.
 * @param force_preprocessing flag (default false).
 *        NOTE(review): the condition it overrides is not visible here.
 * @return bool; presumably true on success -- confirm
 */
00497 virtual bool apply_preprocessor(bool force_preprocessing=false);
00498
/** Sliding-window extraction.
 * Presumably derives the strings as windows over single_string -- confirm.
 * @param window_size window length
 * @param step_size distance between consecutive windows
 * @param skip default 0.
 *        NOTE(review): meaning not visible here.
 * @return int32_t; presumably the number of resulting strings -- confirm
 */
00511 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
00512
/** Position-list extraction.
 * Presumably derives the strings as windows over single_string starting at
 * the given positions -- confirm.
 * @param window_size window length
 * @param positions dynamic array of int32_t positions
 * @param skip default 0.
 *        NOTE(review): meaning not visible here.
 * @return int32_t; presumably the number of resulting strings -- confirm
 */
00523 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
00524 int32_t skip=0);
00525
/** Builds this object from char string features.
 * Takes the same trailing parameters as obtain_from_char_features().
 * @param sf source features with char elements
 * @param start start value.
 *        NOTE(review): meaning not visible here.
 * @param p_order order value
 * @param gap gap value
 * @param rev flag; presumably "reversed" -- confirm
 * @return bool; presumably true on success -- confirm
 */
00539 bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
00540 int32_t p_order, int32_t gap, bool rev);
00541
/** Member template generalising the source element type to CT.
 * @param sf source features with CT elements
 * @param start start value.
 *        NOTE(review): meaning not visible here.
 * @param p_order order value
 * @param gap gap value
 * @param rev flag; presumably "reversed" -- confirm
 * @return bool; presumably true on success -- confirm
 */
00553 template <class CT>
00554 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
00555 int32_t p_order, int32_t gap, bool rev);
00556
/** Length uniformity check.
 * @param len default -1; presumably -1 means "compare the strings with each
 *        other" and any other value "compare against len" -- confirm
 * @return bool result of the check
 */
00566 bool have_same_length(int32_t len=-1);
00567
/** Embedding step over the stored strings.
 * NOTE(review): the transformation applied is not visible here.
 * @param p_order order value
 */
00573 void embed_features(int32_t p_order);
00574
/** Presumably fills symbol_mask_table -- confirm.
 * @param max_val 64-bit maximum value.
 *        NOTE(review): meaning not visible here.
 */
00581 void compute_symbol_mask_table(int64_t max_val);
00582
/** Presumably the inverse of embed_word() -- confirm.
 * @param word value of type ST to unpack
 * @param seq byte buffer; presumably written by the call -- confirm
 * @param len length associated with seq
 */
00589 void unembed_word(ST word, uint8_t* seq, int32_t len);
00590
/** Presumably packs a sequence into a single ST value -- confirm.
 * @param seq sequence of ST values
 * @param len length of seq
 * @return the resulting ST value
 */
00596 ST embed_word(ST* seq, int32_t len);
00597
/** Presumably recomputes the max_string_length member -- confirm. */
00602 void determine_maximum_string_length();
00603
/** Static helper working on a single SGString<ST> passed by value.
 * @param str source string
 * @return ST pointer; presumably a newly allocated, zero-terminated copy
 *         that the caller has to release -- confirm
 */
00611 static ST* get_zero_terminated_string_copy(SGString<ST> str);
00612
/** Vector assignment from a raw buffer.
 * @param num index of the vector to set
 * @param string pointer to the new content.
 *        NOTE(review): whether it is copied or adopted is not visible here.
 * @param len length of the new content
 */
00621 virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
00622
/** Histogram computation with pointer out-parameters.
 * @param hist presumably receives a float64_t buffer -- confirm ownership
 * @param rows presumably receives the number of rows -- confirm
 * @param cols presumably receives the number of columns -- confirm
 * @param normalize flag (default true)
 */
00627 virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
00628 bool normalize=true);
00629
/** Presumably generates random strings from a histogram -- confirm.
 * @param hist float64_t buffer
 * @param rows number of rows of hist
 * @param cols number of columns of hist
 * @param num_vec presumably the number of strings to generate -- confirm
 */
00634 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
00635 int32_t num_vec);
00636
/** Subset selection.
 * @param indices vector of index_t values
 * @return a CFeatures pointer; presumably a new object restricted to the
 *         given indices -- confirm
 */
00645 virtual CFeatures* copy_subset(SGVector<index_t> indices);
00646
/** @return the string literal "StringFeatures" */
00648 inline virtual const char* get_name() const { return "StringFeatures"; }
00649
/** Hook; presumably invoked after the active subset changed -- confirm. */
00651 virtual void subset_changed_post();
00652
00653 protected:
/** Protected virtual hook.
 * len is a non-const reference, i.e. it can be written by the call.
 * @param num index of the vector
 * @param len presumably receives the length of the vector -- confirm
 * @return pointer to ST data; ownership not visible here
 */
00664 virtual ST* compute_feature_vector(int32_t num, int32_t& len);
00665
00666 private:
/** Private initialisation helper; presumably shared by the constructors --
 * confirm.
 */
00667 void init();
00668
00669 protected:
00670
/** Alphabet object (pointer; ownership not visible here). */
00672 CAlphabet* alphabet;
00673
/** Number of vectors. */
00675 int32_t num_vectors;
00676
/** Pointer to SGString<ST> entries; presumably an array with num_vectors
 * elements -- confirm.
 */
00678 SGString<ST>* features;
00679
/** ST buffer holding a single string.
 * NOTE(review): presumably the source for the obtain_by_* methods --
 * confirm.
 */
00681 ST* single_string;
00682
/** Length associated with single_string. */
00684 int32_t length_of_single_string;
00685
/** Maximum string length. */
00687 int32_t max_string_length;
00688
/** Number of symbols. */
00690 floatmax_t num_symbols;
00691
/** Original number of symbols. */
00693 floatmax_t original_num_symbols;
00694
/** Order value. */
00696 int32_t order;
00697
/** Symbol mask table (pointer to ST values). */
00699 ST* symbol_mask_table;
00700
/** Flag; presumably toggled by enable_/disable_on_the_fly_preprocessing()
 * -- confirm.
 */
00702 bool preprocess_on_get;
00703
/** Pointer to a CCache<ST>.
 * NOTE(review): what is cached is not visible here.
 */
00705 CCache<ST>* feature_cache;
00706 };
00707 }
00708 #endif // _CSTRINGFEATURES__H__