StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Subset support written (W) 2011 Heiko Strathmann
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef _CSTRINGFEATURES__H__
00014 #define _CSTRINGFEATURES__H__
00015 
00016 #include <shogun/lib/common.h>
00017 #include <shogun/lib/Cache.h>
00018 #include <shogun/lib/DynamicArray.h>
00019 #include <shogun/lib/Compressor.h>
00020 #include <shogun/io/File.h>
00021 
00022 #include <shogun/features/Features.h>
00023 #include <shogun/features/Alphabet.h>
00024 
00025 namespace shogun
00026 {
/* Forward declarations of types that the declarations below use by pointer
 * or as template arguments (CAlphabet*, CDynamicArray<int32_t>*, CFile*,
 * SGString<ST>*).
 * NOTE(review): Alphabet.h, DynamicArray.h and File.h are already included
 * above, so some of these are presumably redundant - confirm before removing. */
00027 class CAlphabet;
00028 template <class T> class CDynamicArray;
00029 class CFile;
00030 template <class T> class SGString;
00031 
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/* Plain aggregate of two integer feature fields and an integer group field.
 * Declared inside the DOXYGEN_SHOULD_SKIP_THIS guard, so it is excluded from
 * the generated documentation.
 * NOTE(review): nothing in this header uses the struct; presumably feature1
 * and feature2 are feature indices and "SSK" names a string kernel - confirm
 * against the code that consumes it. */
00033 struct SSKDoubleFeature
00034 {
00035     int feature1;
00036     int feature2;
00037     int group;
00038 };
00039 
/* Plain aggregate of three integer feature fields and an integer group
 * field; the three-field counterpart of SSKDoubleFeature. Declared inside
 * the DOXYGEN_SHOULD_SKIP_THIS guard, so it is excluded from the generated
 * documentation.
 * NOTE(review): nothing in this header uses the struct; presumably the
 * feature fields are feature indices - confirm against the consuming code. */
00040 struct SSKTripleFeature
00041 {
00042     int feature1;
00043     int feature2;
00044     int feature3;
00045     int group;
00046 };
00047 #endif
00048 
/**
 * String feature container, templated on the symbol type ST and derived
 * from CFeatures. The declarations group into construction, per-vector
 * access, file loading/saving, symbol and alphabet queries, and the
 * obtain_* / embed_* helpers.
 *
 * NOTE(review): this text is a Doxygen source listing. The five-digit prefix
 * on every line is listing chrome, and the original doc comments were elided
 * by the renderer. The notes below are derived from the visible signatures
 * only; anything marked "presumably" must be confirmed in the implementation.
 */
00071 template <class ST> class CStringFeatures : public CFeatures
00072 {
00073     public:
              /** Default constructor. */
00077         CStringFeatures();
00078 
              /** Constructs from an alphabet given as an EAlphabet enum value. */
00083         CStringFeatures(EAlphabet alpha);
00084 
              /** Constructs from a string list plus an EAlphabet enum value. */
00089         CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
00090 
              /** Constructs from a string list plus a CAlphabet object pointer. */
00095         CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
00096 
              /** Constructs from a CAlphabet object pointer. */
00101         CStringFeatures(CAlphabet* alpha);
00102 
              /** Copy constructor. */
00104         CStringFeatures(const CStringFeatures & orig);
00105 
              /** Constructs from a CFile loader; the alphabet defaults to DNA. */
00111         CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
00112 
              /** Virtual destructor. */
00113         virtual ~CStringFeatures();
00114 
              /** Presumably releases the held feature data - confirm. */
00120         virtual void cleanup();
00121 
              /** Per-vector cleanup for the vector with index num
               *  (exact effect not visible here). */
00128         virtual void cleanup_feature_vector(int32_t num);
00129 
              /** Range variant of cleanup_feature_vector.
               *  NOTE(review): whether stop is inclusive is not visible here. */
00137         virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
00138 
              /** @return the EFeatureClass value of this feature object. */
00143         virtual EFeatureClass get_feature_class();
00144 
              /** @return the EFeatureType value of this feature object. */
00149         virtual EFeatureType get_feature_type();
00150 
              /** @return a CAlphabet pointer; presumably the alphabet member.
               *  NOTE(review): confirm whether the caller receives a reference. */
00155         CAlphabet* get_alphabet();
00156 
              /** @return a CFeatures pointer; presumably a copy of this object. */
00161         virtual CFeatures* duplicate() const;
00162 
              /** SGVector-returning accessor for the vector with index num. */
00169         SGVector<ST> get_feature_vector(int32_t num);
00170 
              /** SGVector-taking setter for the vector with index num. */
00178         void set_feature_vector(SGVector<ST> vector, int32_t num);
00179 
              /** Presumably sets preprocess_on_get - confirm. */
00182         void enable_on_the_fly_preprocessing();
00183 
              /** Presumably clears preprocess_on_get - confirm. */
00187         void disable_on_the_fly_preprocessing();
00188 
              /** Raw-pointer accessor for the vector with index num. len and
               *  dofree are non-const references, presumably outputs: the
               *  vector length and whether the caller must release the buffer
               *  through free_feature_vector - confirm. */
00199         ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
00200 
              /** @return a CStringFeatures<ST> pointer; presumably a newly
               *  created transposed copy - confirm ownership. */
00207         CStringFeatures<ST>* get_transposed();
00208 
              /** Raw variant of get_transposed; num_feat and num_vec are
               *  non-const references, presumably outputs. */
00222         SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
00223 
              /** Counterpart of the raw-pointer get_feature_vector; takes back
               *  the pointer, the vector index and the dofree flag. */
00232         void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
00233 
              /** Counterpart of the SGVector-returning get_feature_vector. */
00241         void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
00242 
              /** @return a single ST value addressed by vector index vec_num
               *  and position feat_num. */
00251         virtual ST get_feature(int32_t vec_num, int32_t feat_num);
00252 
              /** @return an int32_t length for the vector with index vec_num. */
00260         virtual int32_t get_vector_length(int32_t vec_num);
00261 
              /** @return int32_t; presumably max_string_length - confirm. */
00268         virtual int32_t get_max_vector_length();
00269 
              /** @return int32_t vector count; this is a const member.
               *  NOTE(review): whether an active subset changes the count is
               *  not visible here. */
00271         virtual int32_t get_num_vectors() const;
00272 
              /** @return floatmax_t; presumably num_symbols - confirm. */
00279         floatmax_t get_num_symbols();
00280 
              /** @return floatmax_t upper bound on the number of symbols
               *  (derivation not visible here). */
00288         floatmax_t get_max_num_symbols();
00289 
00290         // these functions are necessary to find out about a former conversion process
00291 
              /** @return floatmax_t; presumably original_num_symbols - confirm. */
00296         floatmax_t get_original_num_symbols();
00297 
              /** @return int32_t; presumably the order member - confirm. */
00302         int32_t get_order();
00303 
              /** Takes a symbol and an 8-bit mask, returns an ST; presumably
               *  looks up symbol_mask_table - confirm. */
00311         ST get_masked_symbols(ST symbol, uint8_t mask);
00312 
              /** Takes an ST offset and an int32_t amount, returns an ST
               *  (shift direction and unit not visible here). */
00319         ST shift_offset(ST offset, int32_t amount);
00320 
              /** Takes an ST symbol and an int32_t amount, returns an ST
               *  (shift direction and unit not visible here). */
00327         ST shift_symbol(ST symbol, int32_t amount);
00328 
              /** Loads features through the given CFile loader. */
00333         virtual void load(CFile* loader);
00334 
              /** Loads from the file named fname. Defaults: remap_to_bin=true,
               *  ascii_alphabet=DNA, binary_alphabet=RAWDNA. */
00345         void load_ascii_file(char* fname, bool remap_to_bin=true,
00346                 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
00347 
              /** Loads from the FASTA file named fname; ignore_invalid defaults
               *  to false. @return bool, presumably success - confirm. */
00356         bool load_fasta_file(const char* fname, bool ignore_invalid=false);
00357 
              /** Loads from the FASTQ file named fname; both flags default to
               *  false. @return bool, presumably success - confirm. */
00367         bool load_fastq_file(const char* fname,
00368                 bool ignore_invalid=false, bool bitremap_in_single_string=false);
00369 
              /** Loads from the directory named dirname. @return bool,
               *  presumably success - confirm. */
00377         bool load_from_directory(char* dirname);
00378 
              /** Sets the features from an SGStringList passed by value. */
00384         void set_features(SGStringList<ST> feats);
00385 
              /** Sets the features from a raw SGString array, its vector count
               *  and its maximum string length.
               *  NOTE(review): ownership of p_features is not visible here. */
00395         bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
00396                 int32_t p_max_string_length);
00397 
              /** Appends the features of another CStringFeatures<ST> object. */
00406         bool append_features(CStringFeatures<ST>* sf);
00407 
              /** Appends features from a raw SGString array, its vector count
               *  and its maximum string length.
               *  NOTE(review): ownership of p_features is not visible here. */
00420         bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
00421                 int32_t p_max_string_length);
00422 
              /** @return the features as an SGStringList<ST> by value. */
00426         SGStringList<ST> get_features();
00427 
              /** @return an SGString<ST> pointer; num_str and max_str_len are
               *  non-const references, presumably outputs. Presumably returns
               *  the internal array without copying (contrast copy_features) -
               *  confirm. */
00436         virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
00437 
              /** Same signature shape as the two-argument get_features;
               *  presumably returns a caller-owned copy - confirm. */
00446         virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
00447 
              /** Pointer-output variant: writes through dst and num_str. */
00455         virtual void get_features(SGString<ST>** dst, int32_t* num_str);
00456 
              /** Saves features through the given CFile writer. */
00463         virtual void save(CFile* writer);
00464 
              /** Loads from the path src; the decompress flag presumably selects
               *  eager decompression - confirm. @return bool. */
00473         virtual bool load_compressed(char* src, bool decompress);
00474 
              /** Saves to the path dest using the given E_COMPRESSION_TYPE and
               *  integer level. @return bool. */
00484         virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
00485 
              /** @return int32_t; presumably the size of one symbol of type ST -
               *  confirm. */
00490         virtual int32_t get_size();
00491 
              /** Applies the attached preprocessor(s); force_preprocessing
               *  defaults to false. @return bool. */
00497         virtual bool apply_preprocessor(bool force_preprocessing=false);
00498 
              /** Takes a window size, a step size and a skip (default 0).
               *  @return int32_t; presumably the number of vectors obtained from
               *  the stored single string - confirm. */
00511         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
00512 
              /** Like obtain_by_sliding_window, but takes an array of positions
               *  instead of a step size. */
00523         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
00524                 int32_t skip=0);
00525 
              /** Takes char-typed string features plus start, order, gap and rev
               *  parameters. @return bool. */
00539         bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
00540                 int32_t p_order, int32_t gap, bool rev);
00541 
              /** Member template generalising obtain_from_char to a source
               *  symbol type CT. */
00553         template <class CT>
00554             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
00555                     int32_t p_order, int32_t gap, bool rev);
00556 
              /** @return bool; len defaults to -1.
               *  NOTE(review): the meaning of -1 is not visible here. */
00566         bool have_same_length(int32_t len=-1);
00567 
              /** Takes an order p_order (effect not visible here). */
00573         void embed_features(int32_t p_order);
00574 
              /** Takes a 64-bit max_val; presumably fills symbol_mask_table -
               *  confirm. */
00581         void compute_symbol_mask_table(int64_t max_val);
00582 
              /** Takes a word of type ST and writes into the byte buffer seq of
               *  length len; presumably the inverse of embed_word - confirm. */
00589         void unembed_word(ST word, uint8_t* seq, int32_t len);
00590 
              /** Takes len symbols from seq and returns a single ST value. */
00596         ST embed_word(ST* seq, int32_t len);
00597 
              /** Presumably recomputes max_string_length - confirm. */
00602         void determine_maximum_string_length();
00603 
              /** Static helper: takes an SGString<ST> by value and returns an ST
               *  pointer; by its name a zero-terminated copy, presumably
               *  caller-owned - confirm. */
00611         static ST* get_zero_terminated_string_copy(SGString<ST> str);
00612 
              /** Raw-pointer setter for the vector with index num.
               *  NOTE(review): the argument order (num first) differs from the
               *  SGVector overload above. */
00621         virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
00622 
              /** Writes a float64_t histogram and its dimensions through hist,
               *  rows and cols; normalize defaults to true. */
00627         virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
00628                 bool normalize=true);
00629 
              /** Takes a histogram with its dimensions and a vector count
               *  num_vec; presumably generates random strings from it -
               *  confirm. */
00634         virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
00635                 int32_t num_vec);
00636 
              /** Takes a vector of indices and returns a CFeatures pointer. */
00645         virtual CFeatures* copy_subset(SGVector<index_t> indices);
00646 
              /** @return the literal name "StringFeatures". */
00648         inline virtual const char* get_name() const { return "StringFeatures"; }
00649 
              /** Presumably a hook called after the active subset changes -
               *  confirm. */
00651         virtual void subset_changed_post();
00652 
00653     protected:
              /** Overridable hook that returns an ST buffer for the vector with
               *  index num; len is a non-const reference, presumably an
               *  output. */
00664         virtual ST* compute_feature_vector(int32_t num, int32_t& len);
00665 
00666     private:
              /** Private initialisation helper; presumably shared by the
               *  constructors - confirm. */
00667         void init();
00668 
00669     protected:
00670 
              /** Alphabet object associated with the strings. */
00672         CAlphabet* alphabet;
00673 
              /** Number of string vectors. */
00675         int32_t num_vectors;
00676 
              /** Array of strings. */
00678         SGString<ST>* features;
00679 
              /** Single string; presumably the source for the obtain_by_*
               *  functions - confirm. */
00681         ST* single_string;
00682 
              /** Length of single_string. */
00684         int32_t length_of_single_string;
00685 
              /** Maximum string length. */
00687         int32_t max_string_length;
00688 
              /** Number of symbols. */
00690         floatmax_t num_symbols;
00691 
              /** Original number of symbols; see the "former conversion process"
               *  remark next to get_original_num_symbols. */
00693         floatmax_t original_num_symbols;
00694 
              /** Order (see get_order / embed_features). */
00696         int32_t order;
00697 
              /** Symbol mask table (see compute_symbol_mask_table). */
00699         ST* symbol_mask_table;
00700 
              /** Flag for on-the-fly preprocessing on access. */
00702         bool preprocess_on_get;
00703 
              /** Cache of ST elements; presumably holds computed feature
               *  vectors - confirm. */
00705         CCache<ST>* feature_cache;
00706 };
00707 }
00708 #endif // _CSTRINGFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation