StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Written (W) 2011-2012 Heiko Strathmann
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef _CSTRINGFEATURES__H__
00014 #define _CSTRINGFEATURES__H__
00015 
00016 #include <shogun/lib/common.h>
00017 #include <shogun/lib/Cache.h>
00018 #include <shogun/lib/DynamicArray.h>
00019 #include <shogun/lib/Compressor.h>
00020 #include <shogun/io/File.h>
00021 
00022 #include <shogun/features/Features.h>
00023 #include <shogun/features/Alphabet.h>
00024 
00025 namespace shogun
00026 {
00027 class CAlphabet;
00028 template <class T> class CDynamicArray;
00029 class CFile;
00030 template <class T> class SGString;
00031 
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Plain aggregate of two integer feature fields and one integer group field.
 *
 * Declared inside the DOXYGEN_SHOULD_SKIP_THIS guard, so it is excluded from
 * the generated API documentation.
 *
 * NOTE(review): this header does not define what the fields mean or who
 * consumes the struct -- confirm against its users before relying on it.
 */
00033 struct SSKDoubleFeature
00034 {
00035     int feature1;  // first feature value
00036     int feature2;  // second feature value
00037     int group;     // group value (semantics not visible here)
00038 };
00039 
/**
 * Plain aggregate of three integer feature fields and one integer group field.
 *
 * Declared inside the DOXYGEN_SHOULD_SKIP_THIS guard, so it is excluded from
 * the generated API documentation.
 *
 * NOTE(review): this header does not define what the fields mean or who
 * consumes the struct -- confirm against its users before relying on it.
 */
00040 struct SSKTripleFeature
00041 {
00042     int feature1;  // first feature value
00043     int feature2;  // second feature value
00044     int feature3;  // third feature value
00045     int group;     // group value (semantics not visible here)
00046 };
00047 #endif
00048 
/**
 * Feature container for strings whose elements have type ST.
 *
 * Derives from CFeatures. The protected data members show that an instance
 * holds an array of SGString<ST> (features) together with its element count
 * (num_vectors), an alphabet pointer, symbol/order bookkeeping, a symbol mask
 * table and a feature cache pointer.
 *
 * NOTE(review): only declarations are visible in this header listing. The
 * per-member notes below are derived from the signatures; anything about
 * ownership, reference counting, error handling or exact semantics is marked
 * as an assumption and must be confirmed against the implementation.
 */
00072 template <class ST> class CStringFeatures : public CFeatures
00073 {
00074     public:
              /** Default constructor. */
00078         CStringFeatures();
00079 
              /** Constructs from an alphabet given as an EAlphabet enum value. */
00084         CStringFeatures(EAlphabet alpha);
00085 
              /** Constructs from a list of strings and an EAlphabet enum value. */
00090         CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
00091 
              /** Constructs from a list of strings and an alphabet object. */
00096         CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
00097 
              /** Constructs from an alphabet object. */
00102         CStringFeatures(CAlphabet* alpha);
00103 
              /** Copy constructor. */
00105         CStringFeatures(const CStringFeatures & orig);
00106 
              /**
               * Constructs from a file object.
               *
               * @param loader file object (presumably read via load() -- confirm)
               * @param alpha  alphabet enum value; defaults to DNA
               */
00112         CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
00113 
              /** Virtual destructor. */
00114         virtual ~CStringFeatures();
00115 
              /**
               * Cleanup hook without arguments.
               * NOTE(review): presumably releases all stored strings -- confirm.
               */
00121         virtual void cleanup();
00122 
              /**
               * Cleanup hook for a single vector.
               *
               * @param num index of the vector
               */
00129         virtual void cleanup_feature_vector(int32_t num);
00130 
              /**
               * Cleanup hook for a range of vectors.
               *
               * @param start first index of the range
               * @param stop  last index of the range
               *              (NOTE(review): inclusive vs. exclusive not visible here)
               */
00138         virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
00139 
              /** @return the EFeatureClass value of this object */
00144         virtual EFeatureClass get_feature_class() const;
00145 
              /** @return the EFeatureType value of this object */
00150         virtual EFeatureType get_feature_type() const;
00151 
              /**
               * @return pointer to an alphabet object
               * NOTE(review): whether the reference count is increased for the
               * caller is not visible here -- confirm.
               */
00156         CAlphabet* get_alphabet();
00157 
              /** @return a CFeatures pointer (presumably a copy of this object -- confirm) */
00162         virtual CFeatures* duplicate() const;
00163 
              /**
               * Vector accessor returning an SGVector.
               *
               * @param num index of the requested vector
               * @return the vector as SGVector<ST>
               */
00170         SGVector<ST> get_feature_vector(int32_t num);
00171 
              /**
               * Sets a vector from an SGVector.
               *
               * @param vector new contents
               * @param num    index of the vector to set
               */
00179         void set_feature_vector(SGVector<ST> vector, int32_t num);
00180 
              /** Presumably sets preprocess_on_get to true -- confirm. */
00183         void enable_on_the_fly_preprocessing();
00184 
              /** Presumably sets preprocess_on_get to false -- confirm. */
00188         void disable_on_the_fly_preprocessing();
00189 
              /**
               * Vector accessor returning a raw pointer.
               *
               * @param num    index of the requested vector
               * @param len    non-const reference (presumably receives the length)
               * @param dofree non-const reference (presumably tells the caller
               *               whether the buffer has to be released through
               *               free_feature_vector() -- confirm)
               * @return pointer to the vector data
               */
00200         ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
00201 
              /** @return a CStringFeatures<ST> pointer holding the transposed data */
00208         CStringFeatures<ST>* get_transposed();
00209 
              /**
               * Transposition returning a raw SGString array.
               *
               * @param num_feat non-const reference (presumably receives the
               *                 number of features of the result)
               * @param num_vec  non-const reference (presumably receives the
               *                 number of vectors of the result)
               * @return SGString<ST> array
               *         (NOTE(review): ownership of the array not visible here)
               */
00223         SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
00224 
              /**
               * Counterpart of the raw-pointer get_feature_vector() overload.
               *
               * @param feat_vec pointer previously obtained for vector num
               * @param num      index of the vector
               * @param dofree   flag as produced by get_feature_vector()
               */
00233         void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
00234 
              /**
               * Counterpart of the SGVector get_feature_vector() overload.
               *
               * @param feat_vec vector previously obtained for index num
               * @param num      index of the vector
               */
00242         void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
00243 
              /**
               * Single-element accessor.
               *
               * @param vec_num  index of the vector
               * @param feat_num index of the element inside that vector
               * @return the element of type ST
               */
00252         virtual ST get_feature(int32_t vec_num, int32_t feat_num);
00253 
              /**
               * @param vec_num index of the vector
               * @return length of that vector
               */
00261         virtual int32_t get_vector_length(int32_t vec_num);
00262 
              /** @return maximum vector length (see max_string_length) */
00269         virtual int32_t get_max_vector_length();
00270 
              /** @return number of vectors */
00272         virtual int32_t get_num_vectors() const;
00273 
              /** @return number of symbols as floatmax_t (see num_symbols) */
00280         floatmax_t get_num_symbols();
00281 
              /**
               * @return maximum number of symbols as floatmax_t
               * NOTE(review): how the maximum is derived is not visible here.
               */
00289         floatmax_t get_max_num_symbols();
00290 
00291         // these functions are necessary to find out about a former conversion process
00292 
              /** @return original number of symbols (see original_num_symbols) */
00297         floatmax_t get_original_num_symbols();
00298 
              /** @return the order (see order) */
00303         int32_t get_order();
00304 
              /**
               * Applies a mask to a symbol.
               *
               * @param symbol symbol to mask
               * @param mask   8-bit mask (presumably combined with
               *               symbol_mask_table -- confirm)
               * @return masked symbol
               */
00312         ST get_masked_symbols(ST symbol, uint8_t mask);
00313 
              /**
               * Shifts an offset.
               *
               * @param offset value to shift
               * @param amount shift amount
               *               (NOTE(review): unit and direction not visible here)
               * @return shifted value
               */
00320         ST shift_offset(ST offset, int32_t amount);
00321 
              /**
               * Shifts a symbol.
               *
               * @param symbol value to shift
               * @param amount shift amount
               *               (NOTE(review): unit and direction not visible here)
               * @return shifted value
               */
00328         ST shift_symbol(ST symbol, int32_t amount);
00329 
              /**
               * Loads from a file object.
               *
               * @param loader file object to read from
               */
00334         virtual void load(CFile* loader);
00335 
              /**
               * Loads from an ASCII file.
               *
               * @param fname           file name
               * @param remap_to_bin    defaults to true (presumably remaps from
               *                        ascii_alphabet to binary_alphabet -- confirm)
               * @param ascii_alphabet  defaults to DNA
               * @param binary_alphabet defaults to RAWDNA
               */
00346         void load_ascii_file(char* fname, bool remap_to_bin=true,
00347                 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
00348 
              /**
               * Loads from a FASTA file.
               *
               * @param fname          file name
               * @param ignore_invalid defaults to false
               * @return bool (presumably true on success -- confirm)
               */
00357         bool load_fasta_file(const char* fname, bool ignore_invalid=false);
00358 
              /**
               * Loads from a FASTQ file.
               *
               * @param fname                     file name
               * @param ignore_invalid            defaults to false
               * @param bitremap_in_single_string defaults to false
               * @return bool (presumably true on success -- confirm)
               */
00368         bool load_fastq_file(const char* fname,
00369                 bool ignore_invalid=false, bool bitremap_in_single_string=false);
00370 
              /**
               * Loads from a directory.
               *
               * @param dirname directory name
               * @return bool (presumably true on success -- confirm)
               */
00378         bool load_from_directory(char* dirname);
00379 
              /**
               * Sets the stored strings from a string list.
               *
               * @param feats string list
               */
00385         void set_features(SGStringList<ST> feats);
00386 
              /**
               * Sets the stored strings from a raw array.
               *
               * @param p_features          array of strings
               * @param p_num_vectors       number of entries in p_features
               * @param p_max_string_length maximum string length
               * @return bool (presumably true on success -- confirm)
               * NOTE(review): whether the array is copied or adopted is not
               * visible here.
               */
00396         bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
00397                 int32_t p_max_string_length);
00398 
              /**
               * Appends the strings of another features object.
               *
               * @param sf features object to append from
               * @return bool (presumably true on success -- confirm)
               */
00407         bool append_features(CStringFeatures<ST>* sf);
00408 
              /**
               * Appends strings from a raw array.
               *
               * @param p_features          array of strings
               * @param p_num_vectors       number of entries in p_features
               * @param p_max_string_length maximum string length
               * @return bool (presumably true on success -- confirm)
               */
00421         bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
00422                 int32_t p_max_string_length);
00423 
              /** @return the strings as SGStringList<ST> */
00427         SGStringList<ST> get_features();
00428 
              /**
               * Raw-array accessor.
               *
               * @param num_str     non-const reference (presumably receives the
               *                    number of strings)
               * @param max_str_len non-const reference (presumably receives the
               *                    maximum string length)
               * @return SGString<ST> array
               *         (NOTE(review): presumably the internal array, in contrast
               *         to copy_features() -- confirm)
               */
00437         virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
00438 
              /**
               * Same signature as the raw-array get_features() overload.
               *
               * @param num_str     non-const reference (presumably receives the
               *                    number of strings)
               * @param max_str_len non-const reference (presumably receives the
               *                    maximum string length)
               * @return SGString<ST> array
               *         (NOTE(review): by its name a copy -- confirm who frees it)
               */
00447         virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
00448 
              /**
               * Accessor using pointer out-parameters.
               *
               * @param dst     receives a pointer to an SGString<ST> array
               * @param num_str receives the number of strings
               */
00456         virtual void get_features(SGString<ST>** dst, int32_t* num_str);
00457 
              /**
               * Saves to a file object.
               *
               * @param writer file object to write to
               */
00464         virtual void save(CFile* writer);
00465 
              /**
               * Loads from a compressed source.
               *
               * @param src        source name
               * @param decompress flag (NOTE(review): presumably chooses between
               *                   decompressing now or keeping the data compressed
               *                   -- confirm)
               * @return bool (presumably true on success -- confirm)
               */
00474         virtual bool load_compressed(char* src, bool decompress);
00475 
              /**
               * Saves in compressed form.
               *
               * @param dest        destination name
               * @param compression compression type
               * @param level       compression level
               * @return bool (presumably true on success -- confirm)
               */
00485         virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
00486 
              /**
               * @return a size as int32_t
               * NOTE(review): presumably the size of a single element of type ST
               * -- confirm.
               */
00491         virtual int32_t get_size() const;
00492 
              /**
               * Applies a preprocessor.
               *
               * @param force_preprocessing defaults to false
               * @return bool (presumably true on success -- confirm)
               */
00498         virtual bool apply_preprocessor(bool force_preprocessing=false);
00499 
              /**
               * Obtains strings by a sliding window.
               *
               * @param window_size size of the window
               * @param step_size   step between consecutive windows
               * @param skip        defaults to 0
               * @return int32_t (presumably the number of resulting vectors
               *         -- confirm)
               */
00512         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
00513 
              /**
               * Obtains strings from a list of positions.
               *
               * @param window_size size of the window
               * @param positions   dynamic array of positions
               * @param skip        defaults to 0
               * @return int32_t (presumably the number of resulting vectors
               *         -- confirm)
               */
00524         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
00525                 int32_t skip=0);
00526 
              /**
               * Obtains this object's contents from char string features.
               *
               * @param sf      source features of element type char
               * @param start   start value
               * @param p_order order value
               * @param gap     gap value
               * @param rev     reverse flag
               * @return bool (presumably true on success -- confirm)
               * NOTE(review): the exact meaning of start/p_order/gap/rev is not
               * visible in this header.
               */
00540         bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
00541                 int32_t p_order, int32_t gap, bool rev);
00542 
              /**
               * Same parameters as obtain_from_char(), but the element type CT
               * of the source features is a template parameter.
               */
00554         template <class CT>
00555             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
00556                     int32_t p_order, int32_t gap, bool rev);
00557 
              /**
               * Length check over the stored strings.
               *
               * @param len defaults to -1 (NOTE(review): presumably -1 means
               *            "compare against each other" rather than against a
               *            fixed length -- confirm)
               * @return bool result of the check
               */
00567         bool have_same_length(int32_t len=-1);
00568 
              /**
               * Embeds the features.
               *
               * @param p_order order value
               */
00574         void embed_features(int32_t p_order);
00575 
              /**
               * Computes the symbol mask table (see symbol_mask_table).
               *
               * @param max_val maximum value
               */
00582         void compute_symbol_mask_table(int64_t max_val);
00583 
              /**
               * Counterpart of embed_word(), judging by the name.
               *
               * @param word word of type ST
               * @param seq  output buffer of uint8_t
               * @param len  length (presumably of seq -- confirm)
               */
00590         void unembed_word(ST word, uint8_t* seq, int32_t len);
00591 
              /**
               * Embeds a sequence into a single word.
               *
               * @param seq sequence of ST elements
               * @param len length of seq
               * @return a single ST value
               */
00597         ST embed_word(ST* seq, int32_t len);
00598 
              /** Presumably recomputes max_string_length from the stored strings -- confirm. */
00603         void determine_maximum_string_length();
00604 
              /**
               * Static helper; does not need an instance.
               *
               * @param str string to copy
               * @return pointer to ST data; by its name a zero-terminated copy
               *         (NOTE(review): caller presumably owns the buffer -- confirm)
               */
00612         static ST* get_zero_terminated_string_copy(SGString<ST> str);
00613 
              /**
               * Sets a vector from a raw pointer.
               *
               * @param num    index of the vector to set
               * @param string pointer to the new contents
               * @param len    length of string
               * NOTE(review): whether string is copied or adopted is not visible
               * here.
               */
00622         virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
00623 
              /**
               * Computes a histogram.
               *
               * @param hist      receives a pointer to float64_t data
               * @param rows      receives the number of rows
               * @param cols      receives the number of columns
               * @param normalize defaults to true
               */
00628         virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
00629                 bool normalize=true);
00630 
              /**
               * Creates random contents from a histogram.
               *
               * @param hist    float64_t data
               * @param rows    number of rows of hist
               * @param cols    number of columns of hist
               * @param num_vec number of vectors (presumably to generate -- confirm)
               */
00635         virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
00636                 int32_t num_vec);
00637 
              /**
               * Copies a subset selected by index.
               *
               * @param indices vector of indices
               * @return CFeatures pointer (presumably a new object holding the
               *         selected vectors -- confirm)
               */
00646         virtual CFeatures* copy_subset(SGVector<index_t> indices);
00647 
              /** @return the constant string "StringFeatures" */
00649         virtual const char* get_name() const { return "StringFeatures"; }
00650 
              /**
               * Virtual hook.
               * NOTE(review): by its name invoked after a subset change -- confirm
               * against CFeatures.
               */
00652         virtual void subset_changed_post();
00653 
00654     protected:
              /**
               * Virtual hook computing a vector.
               *
               * @param num index of the vector
               * @param len non-const reference (presumably receives the length)
               * @return pointer to ST data
               */
00665         virtual ST* compute_feature_vector(int32_t num, int32_t& len);
00666 
00667     private:
              /** Private helper (presumably shared initialisation for the constructors -- confirm). */
00668         void init();
00669 
00670     protected:
00671 
              /** alphabet object */
00673         CAlphabet* alphabet;
00674 
              /** number of string vectors */
00676         int32_t num_vectors;
00677 
              /** array of strings */
00679         SGString<ST>* features;
00680 
              /** raw buffer for a single string (NOTE(review): usage not visible here) */
00682         ST* single_string;
00683 
              /** length belonging to single_string */
00685         int32_t length_of_single_string;
00686 
              /** maximum string length */
00688         int32_t max_string_length;
00689 
              /** number of symbols */
00691         floatmax_t num_symbols;
00692 
              /** original number of symbols */
00694         floatmax_t original_num_symbols;
00695 
              /** order */
00697         int32_t order;
00698 
              /** symbol mask table */
00700         ST* symbol_mask_table;
00701         
              /** length belonging to symbol_mask_table */
00703         int32_t symbol_mask_table_len;
00704 
              /** flag for on-the-fly preprocessing (see enable/disable_on_the_fly_preprocessing) */
00706         bool preprocess_on_get;
00707 
              /** feature cache */
00709         CCache<ST>* feature_cache;
00710 };
00711 }
00712 #endif // _CSTRINGFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation