StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Subset support written (W) 2011 Heiko Strathmann
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef _CSTRINGFEATURES__H__
00014 #define _CSTRINGFEATURES__H__
00015 
00016 #include <shogun/lib/common.h>
00017 #include <shogun/io/SGIO.h>
00018 #include <shogun/lib/Cache.h>
00019 #include <shogun/lib/DynamicArray.h>
00020 #include <shogun/io/File.h>
00021 #include <shogun/io/MemoryMappedFile.h>
00022 #include <shogun/mathematics/Math.h>
00023 #include <shogun/lib/Compressor.h>
00024 #include <shogun/base/Parameter.h>
00025 
00026 #include <shogun/preprocessor/Preprocessor.h>
00027 #include <shogun/preprocessor/StringPreprocessor.h>
00028 #include <shogun/features/Features.h>
00029 #include <shogun/features/Alphabet.h>
00030 
00031 #include <sys/types.h>
00032 #include <sys/stat.h>
00033 #include <dirent.h>
00034 #include <stdio.h>
00035 #include <stdlib.h>
00036 #include <unistd.h>
00037 
00038 namespace shogun
00039 {
00040 class CCompressor;
      // Forward declarations. CAlphabet, EAlphabet, CFile, CMemoryMappedFile,
      // CMath, CStringPreprocessor and SGString are all used by
      // CStringFeatures further down in this header.
      // NOTE(review): the two enum lines declare enums without an underlying
      // type, which standard C++ does not allow for forward declarations -
      // presumably this relies on a compiler extension; confirm.
00041 enum E_COMPRESSION_TYPE;
00042 class CAlphabet;
00043 enum EAlphabet;
00044 template <class T> class CDynamicArray;
00045 class CFile;
00046 template <class T> class CMemoryMappedFile;
00047 class CMath;
00048 template <class ST> class CStringPreprocessor;
00049 template <class T> class SGString;
00050 
00051 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00052 struct SSKDoubleFeature
      // Plain record of two integer feature values and an integer group.
      // Excluded from the generated documentation by the surrounding
      // DOXYGEN_SHOULD_SKIP_THIS guard.
      // NOTE(review): the meaning of the fields is not visible in this header.
00053 {
00054     int feature1;
00055     int feature2;
00056     int group;
00057 };
00058 
00059 struct SSKTripleFeature
      // Plain record of three integer feature values and an integer group.
      // Excluded from the generated documentation by the surrounding
      // DOXYGEN_SHOULD_SKIP_THIS guard.
      // NOTE(review): the meaning of the fields is not visible in this header.
00060 {
00061     int feature1;
00062     int feature2;
00063     int feature3;
00064     int group;
00065 };
00066 #endif
00067 
00090 template <class ST> class CStringFeatures : public CFeatures
00091 {
00092     public:
00096         CStringFeatures() : CFeatures(0)
00097         {
00098             init();
00099             alphabet=new CAlphabet();
00100         }
00101 
00106         CStringFeatures(EAlphabet alpha) : CFeatures(0)
00107         {
00108             init();
00109 
00110             alphabet=new CAlphabet(alpha);
00111             SG_REF(alphabet);
00112             num_symbols=alphabet->get_num_symbols();
00113             original_num_symbols=num_symbols;
00114         }
00115 
00120         CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00121         : CFeatures(0)
00122         {
00123             init();
00124 
00125             alphabet=new CAlphabet(alpha);
00126             SG_REF(alphabet);
00127             num_symbols=alphabet->get_num_symbols();
00128             original_num_symbols=num_symbols;
00129             set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00130         }
00131 
00136         CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00137         : CFeatures(0)
00138         {
00139             init();
00140 
00141             alphabet=new CAlphabet(alpha);
00142             SG_REF(alphabet);
00143             num_symbols=alphabet->get_num_symbols();
00144             original_num_symbols=num_symbols;
00145             set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00146         }
00147 
00152         CStringFeatures(CAlphabet* alpha)
00153         : CFeatures(0)
00154         {
00155             init();
00156 
00157             ASSERT(alpha);
00158             SG_REF(alpha);
00159             alphabet=alpha;
00160             num_symbols=alphabet->get_num_symbols();
00161             original_num_symbols=num_symbols;
00162         }
00163 
00165         CStringFeatures(const CStringFeatures & orig)
00166         : CFeatures(orig), num_vectors(orig.num_vectors),
00167             single_string(orig.single_string),
00168             length_of_single_string(orig.length_of_single_string),
00169             max_string_length(orig.max_string_length),
00170             num_symbols(orig.num_symbols),
00171             original_num_symbols(orig.original_num_symbols),
00172             order(orig.order), preprocess_on_get(false),
00173             feature_cache(NULL)
00174         {
00175             init();
00176 
00177             ASSERT(orig.single_string == NULL); //not implemented
00178 
00179             alphabet=orig.alphabet;
00180             SG_REF(alphabet);
00181 
00182             if (orig.features)
00183             {
00184                 features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00185 
00186                 for (int32_t i=0; i<num_vectors; i++)
00187                 {
00188                     features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00189                     features[i].slen=orig.features[i].slen;
00190                     memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00191                 }
00192             }
00193 
00194             if (orig.symbol_mask_table)
00195             {
00196                 symbol_mask_table=SG_MALLOC(ST, 256);
00197                 for (int32_t i=0; i<256; i++)
00198                     symbol_mask_table[i]=orig.symbol_mask_table[i];
00199             }
00200 
00201             m_subset=orig.m_subset->duplicate();
00202         }
00203 
00209         CStringFeatures(CFile* loader, EAlphabet alpha=DNA)
00210         : CFeatures(loader), num_vectors(0),
00211           features(NULL), single_string(NULL), length_of_single_string(0),
00212           max_string_length(0), order(0),
00213           symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00214         {
00215             init();
00216 
00217             alphabet=new CAlphabet(alpha);
00218             SG_REF(alphabet);
00219             num_symbols=alphabet->get_num_symbols();
00220             original_num_symbols=num_symbols;
00221             load(loader);
00222         }
00223 
00224         virtual ~CStringFeatures()
00225         {
00226             cleanup();
00227 
00228             SG_UNREF(alphabet);
00229         }
00230 
00236         virtual void cleanup()
00237         {
00238             remove_subset();
00239 
00240             if (single_string)
00241             {
00242                 SG_FREE(single_string);
00243                 single_string=NULL;
00244             }
00245             else
00246             {
00247                 for (int32_t i=0; i<num_vectors; i++)
00248                     cleanup_feature_vector(i);
00249             }
00250 
00251             num_vectors=0;
00252             SG_FREE(features);
00253             SG_FREE(symbol_mask_table);
00254             features=NULL;
00255             symbol_mask_table=NULL;
00256 
00257             /* start with a fresh alphabet, but instead of emptying the histogram
00258              * create a new object (to leave the alphabet object alone if it is used
00259              * by others)
00260              */
00261             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00262             SG_UNREF(alphabet);
00263             alphabet=alpha;
00264             SG_REF(alphabet);
00265         }
00266 
00273         virtual void cleanup_feature_vector(int32_t num)
00274         {
00275             ASSERT(num<get_num_vectors());
00276 
00277             if (features)
00278             {
00279                 int32_t real_num=subset_idx_conversion(num);
00280                 SG_FREE(features[real_num].string);
00281                 features[real_num].string=NULL;
00282                 features[real_num].slen=0;
00283 
00284                 determine_maximum_string_length();
00285             }
00286         }
00287 
00292         inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00293 
00298         inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00299 
00304         inline CAlphabet* get_alphabet()
00305         {
00306             SG_REF(alphabet);
00307             return alphabet;
00308         }
00309 
00314         virtual CFeatures* duplicate() const
00315         {
00316             return new CStringFeatures<ST>(*this);
00317         }
00318 
00325         SGVector<ST> get_feature_vector(int32_t num)
00326         {
00327             ASSERT(features);
00328             if (num>=get_num_vectors())
00329             {
00330                 SG_ERROR("Index out of bounds (number of strings %d, you "
00331                         "requested %d)\n", get_num_vectors(), num);
00332             }
00333 
00334             int32_t l;
00335             bool free_vec;
00336             ST* vec=get_feature_vector(num, l, free_vec);
00337             ST* dst=SG_MALLOC(ST, l);
00338             memcpy(dst, vec, l*sizeof(ST));
00339             free_feature_vector(vec, num, free_vec);
00340             return SGVector<ST>(dst, l);
00341         }
00342 
00350         void set_feature_vector(SGVector<ST> vector, int32_t num)
00351         {
00352             ASSERT(features);
00353 
00354             if (m_subset)
00355                 SG_ERROR("A subset is set, cannot set feature vector\n");
00356 
00357             if (num>=num_vectors)
00358             {
00359                 SG_ERROR("Index out of bounds (number of strings %d, you "
00360                         "requested %d)\n", num_vectors, num);
00361             }
00362 
00363             if (vector.vlen<=0)
00364                 SG_ERROR("String has zero or negative length\n");
00365 
00366             cleanup_feature_vector(num);
00367             features[num].slen=vector.vlen;
00368             features[num].string=SG_MALLOC(ST, vector.vlen);
00369             memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00370 
00371             determine_maximum_string_length();
00372         }
00373 
00376         void enable_on_the_fly_preprocessing()
00377         {
00378             preprocess_on_get=true;
00379         }
00380 
00384         void disable_on_the_fly_preprocessing()
00385         {
00386             preprocess_on_get=false;
00387         }
00388 
00399         ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00400         {
00401             ASSERT(features);
00402             ASSERT(num<get_num_vectors());
00403 
00404 
00405             int32_t real_num=subset_idx_conversion(num);
00406 
00407             if (!preprocess_on_get)
00408             {
00409                 dofree=false;
00410                 len=features[real_num].slen;
00411                 return features[real_num].string;
00412             }
00413             else
00414             {
00415                 SG_DEBUG( "computing feature vector!\n") ;
00416                 ST* feat=compute_feature_vector(num, len);
00417                 dofree=true;
00418 
00419                 if (get_num_preprocessors())
00420                 {
00421                     ST* tmp_feat_before=feat;
00422 
00423                     for (int32_t i=0; i<get_num_preprocessors(); i++)
00424                     {
00425                         CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00426                         feat=p->apply_to_string(tmp_feat_before, len);
00427                         SG_UNREF(p);
00428                         SG_FREE(tmp_feat_before);
00429                         tmp_feat_before=feat;
00430                     }
00431                 }
00432                 // TODO: implement caching
00433                 return feat;
00434             }
00435         }
00436 
00443         CStringFeatures<ST>* get_transposed()
00444         {
00445             int32_t num_feat;
00446             int32_t num_vec;
00447             SGString<ST>* s=get_transposed(num_feat, num_vec);
00448             SGStringList<ST> string_list;
00449             string_list.strings = s;
00450             string_list.num_strings = num_vec;
00451             string_list.max_string_length = num_feat;
00452 
00453             return new CStringFeatures<ST>(string_list, alphabet);
00454         }
00455 
00469         SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00470         {
00471             num_feat=get_num_vectors();
00472             num_vec=get_max_vector_length();
00473             ASSERT(have_same_length());
00474 
00475             SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00476                     int64_t(num_feat)*num_vec);
00477 
00478             SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00479 
00480             for (int32_t i=0; i<num_vec; i++)
00481             {
00482                 sf[i].string=SG_MALLOC(ST, num_feat);
00483                 sf[i].slen=num_feat;
00484             }
00485 
00486             for (int32_t i=0; i<num_feat; i++)
00487             {
00488                 int32_t len=0;
00489                 bool free_vec=false;
00490                 ST* vec=get_feature_vector(i, len, free_vec);
00491 
00492                 for (int32_t j=0; j<num_vec; j++)
00493                     sf[j].string[i]=vec[j];
00494 
00495                 free_feature_vector(vec, i, free_vec);
00496             }
00497             return sf;
00498         }
00499 
00508         void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00509         {
00510             if (num>=get_num_vectors())
00511             {
00512                 SG_ERROR(
00513                     "Trying to access string[%d] but num_str=%d\n", num,
00514                     get_num_vectors());
00515             }
00516 
00517             int32_t real_num=subset_idx_conversion(num);
00518 
00519             if (feature_cache)
00520                 feature_cache->unlock_entry(real_num);
00521 
00522             if (dofree)
00523                 SG_FREE(feat_vec);
00524         }
00525 
00533         void free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00534         {
00535             if (num>=get_num_vectors())
00536             {
00537                 SG_ERROR(
00538                     "Trying to access string[%d] but num_str=%d\n", num,
00539                     get_num_vectors());
00540             }
00541 
00542             int32_t real_num=subset_idx_conversion(num);
00543 
00544             if (feature_cache)
00545                 feature_cache->unlock_entry(real_num);
00546 
00547             if (feat_vec.do_free)
00548                 SG_FREE(feat_vec.vector);
00549         }
00550 
00559         virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00560         {
00561             ASSERT(vec_num<get_num_vectors());
00562 
00563             int32_t len;
00564             bool free_vec;
00565             ST* vec=get_feature_vector(vec_num, len, free_vec);
00566             ASSERT(feat_num<len);
00567             ST result=vec[feat_num];
00568             free_feature_vector(vec, vec_num, free_vec);
00569 
00570             return result;
00571         }
00572 
00580         virtual inline int32_t get_vector_length(int32_t vec_num)
00581         {
00582             ASSERT(vec_num<get_num_vectors());
00583 
00584             int32_t len;
00585             bool free_vec;
00586             ST* vec=get_feature_vector(vec_num, len, free_vec);
00587             free_feature_vector(vec, vec_num, free_vec);
00588             return len;
00589         }
00590 
00597         virtual inline int32_t get_max_vector_length()
00598         {
00599             return max_string_length;
00600         }
00601 
00603         virtual inline int32_t get_num_vectors() const
00604         {
00605             return m_subset ? m_subset->get_size() : num_vectors;
00606         }
00607 
00614         inline floatmax_t get_num_symbols() { return num_symbols; }
00615 
00623         inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00624 
00625         // these functions are necessary to find out about a former conversion process
00626 
00631         inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00632 
00637         inline int32_t get_order() { return order; }
00638 
00646         inline ST get_masked_symbols(ST symbol, uint8_t mask)
00647         {
00648             ASSERT(symbol_mask_table);
00649             return symbol_mask_table[mask] & symbol;
00650         }
00651 
00658         inline ST shift_offset(ST offset, int32_t amount)
00659         {
00660             ASSERT(alphabet);
00661             return (offset << (amount*alphabet->get_num_bits()));
00662         }
00663 
00670         inline ST shift_symbol(ST symbol, int32_t amount)
00671         {
00672             ASSERT(alphabet);
00673             return (symbol >> (amount*alphabet->get_num_bits()));
00674         }
00675 
00680         virtual inline void load(CFile* loader); // declaration only; invoked by the CFile constructor above, definition is outside this excerpt
00681 
00692         void load_ascii_file(char* fname, bool remap_to_bin=true,
00693                 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00694         {
00695             remove_subset();
00696 
00697             size_t blocksize=1024*1024;
00698             size_t required_blocksize=0;
00699             uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00700             uint8_t* overflow=NULL;
00701             int32_t overflow_len=0;
00702 
00703             cleanup();
00704 
00705             CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00706             CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00707 
00708             FILE* f=fopen(fname, "ro");
00709 
00710             if (f)
00711             {
00712                 num_vectors=0;
00713                 max_string_length=0;
00714 
00715                 SG_INFO("counting line numbers in file %s\n", fname);
00716                 size_t block_offs=0;
00717                 size_t old_block_offs=0;
00718                 fseek(f, 0, SEEK_END);
00719                 size_t fsize=ftell(f);
00720                 rewind(f);
00721 
00722                 if (blocksize>fsize)
00723                     blocksize=fsize;
00724 
00725                 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00726 
00727                 size_t sz=blocksize;
00728                 while (sz == blocksize)
00729                 {
00730                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00731                     for (size_t i=0; i<sz; i++)
00732                     {
00733                         block_offs++;
00734                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00735                         {
00736                             num_vectors++;
00737                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00738                             old_block_offs=block_offs;
00739                         }
00740                     }
00741                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00742                 }
00743 
00744                 SG_INFO("found %d strings\n", num_vectors);
00745                 SG_FREE(dummy);
00746                 blocksize=required_blocksize;
00747                 dummy=SG_MALLOC(uint8_t, blocksize);
00748                 overflow=SG_MALLOC(uint8_t, blocksize);
00749                 features=SG_MALLOC(SGString<ST>, num_vectors);
00750 
00751                 rewind(f);
00752                 sz=blocksize;
00753                 int32_t lines=0;
00754                 while (sz == blocksize)
00755                 {
00756                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00757 
00758                     size_t old_sz=0;
00759                     for (size_t i=0; i<sz; i++)
00760                     {
00761                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00762                         {
00763                             int32_t len=i-old_sz;
00764                             //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00765                             max_string_length=CMath::max(max_string_length, len+overflow_len);
00766 
00767                             features[lines].slen=len;
00768                             features[lines].string=SG_MALLOC(ST, len);
00769 
00770                             if (remap_to_bin)
00771                             {
00772                                 for (int32_t j=0; j<overflow_len; j++)
00773                                     features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00774                                 for (int32_t j=0; j<len; j++)
00775                                     features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00776                                 alpha->add_string_to_histogram(&dummy[old_sz], len);
00777                                 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00778                             }
00779                             else
00780                             {
00781                                 for (int32_t j=0; j<overflow_len; j++)
00782                                     features[lines].string[j]=overflow[j];
00783                                 for (int32_t j=0; j<len; j++)
00784                                     features[lines].string[j+overflow_len]=dummy[old_sz+j];
00785                                 alpha->add_string_to_histogram(&dummy[old_sz], len);
00786                                 alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00787                             }
00788 
00789                             // clear overflow
00790                             overflow_len=0;
00791 
00792                             //CMath::display_vector(features[lines].string, len);
00793                             old_sz=i+1;
00794                             lines++;
00795                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00796                         }
00797                     }
00798                     for (size_t i=old_sz; i<sz; i++)
00799                         overflow[i-old_sz]=dummy[i];
00800 
00801                     overflow_len=sz-old_sz;
00802                 }
00803 
00804                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00805                 {
00806                     SG_INFO("file successfully read\n");
00807                     SG_INFO("max_string_length=%d\n", max_string_length);
00808                     SG_INFO("num_strings=%d\n", num_vectors);
00809                 }
00810                 fclose(f);
00811             }
00812 
00813             SG_FREE(dummy);
00814 
00815             SG_UNREF(alphabet);
00816 
00817             if (remap_to_bin)
00818                 alphabet=alpha_bin;
00819             else
00820                 alphabet=alpha;
00821             SG_REF(alphabet);
00822             num_symbols=alphabet->get_num_symbols();
00823         }
00824 
00833         bool load_fasta_file(const char* fname, bool ignore_invalid=false)
              // Loads strings from a memory-mapped fasta file: every line
              // starting with '>' opens a new entry whose following lines
              // (up to the next '>' line or the end of the file) are joined
              // into one string with the newlines removed.
              //
              // fname          - name of the file to map
              // ignore_invalid - if true, characters the DNA alphabet does
              //                  not accept are replaced by 'A'
              // returns the result of set_features() on the loaded strings
00834         {
00835             remove_subset();
00836 
00837             int32_t i=0;
00838             uint64_t len=0;
00839             uint64_t offs=0;
00840             int32_t num=0;
00841             int32_t max_len=0;
00842 
00843             CMemoryMappedFile<char> f(fname);
00844 
              // first pass: count the header lines (those starting with '>')
00845             while (true)
00846             {
00847                 char* s=f.get_line(len, offs);
00848                 if (!s)
00849                     break;
00850 
00851                 if (len>0 && s[0]=='>')
00852                     num++;
00853             }
00854 
00855             if (num==0)
00856                 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00857 
              // NOTE(review): unlike the constructors, the new alphabet is
              // not SG_REF'ed here - confirm this is intended
00858             cleanup();
00859             SG_UNREF(alphabet);
00860             alphabet=new CAlphabet(DNA);
00861             num_symbols=alphabet->get_num_symbols();
00862 
00863             SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00864             offs=0;
00865 
              // second pass: one iteration per header line counted above
00866             for (i=0;i<num; i++)
00867             {
00868                 uint64_t id_len=0;
00869                 char* id=f.get_line(id_len, offs);
00870 
00871                 char* fasta=f.get_line(len, offs);
00872                 char* s=fasta;
00873                 int32_t fasta_len=0;
00874                 int32_t spanned_lines=0;
00875 
                  // advance line by line, accumulating the raw length
                  // (newlines included) in fasta_len, until the next header
                  // or the end of the file is reached
00876                 while (true)
00877                 {
00878                     if (!s || len==0)
00879                         SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00880 
00881                     if (s[0]=='>' || offs==f.get_size())
00882                     {
                          // step back over the line just read so that the
                          // next entry starts at its header again
                          // NOTE(review): offs has already been decreased
                          // when it is compared with the file size below -
                          // verify the end-of-file branch behaves as meant
00883                         offs-=len+1; // seek to beginning
00884                         if (offs==f.get_size())
00885                         {
00886                             SG_DEBUG("at EOF\n");
00887                             fasta_len+=len;
00888                         }
00889 
                          // one newline per spanned line is not stored
00890                         len=fasta_len-spanned_lines;
00891                         strings[i].string=SG_MALLOC(ST, len);
00892                         strings[i].slen=len;
00893 
00894                         ST* str=strings[i].string;
00895                         int32_t idx=0;
00896                         SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00897 
                          // copy the entry, skipping the newlines
00898                         for (int32_t j=0; j<fasta_len; j++)
00899                         {
00900                             if (fasta[j]=='\n')
00901                                 continue;
00902 
00903                             ST c=(ST) fasta[j];
00904 
00905                             if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00906                                 c=(ST) 'A';
00907 
00908                             if (idx>=len)
00909                                 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00910                             str[idx++]=c;
00911                         }
00912                         max_len=CMath::max(max_len, strings[i].slen);
00913 
00914 
00915                         break;
00916                     }
00917 
00918                     spanned_lines++;
00919                     fasta_len+=len+1; // including '\n'
00920                     s=f.get_line(len, offs);
00921                 }
00922             }
00923             return set_features(strings, num, max_len);
00924         }
00925 
00935         bool load_fastq_file(const char* fname,
00936                 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00937         {
00938             remove_subset();
00939 
00940             CMemoryMappedFile<char> f(fname);
00941 
00942             int32_t i=0;
00943             uint64_t len=0;
00944             uint64_t offs=0;
00945 
00946             int32_t num=f.get_num_lines();
00947             int32_t max_len=0;
00948 
00949             if (num%4)
00950                 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00951             num/=4;
00952 
00953             cleanup();
00954             SG_UNREF(alphabet);
00955             alphabet=new CAlphabet(DNA);
00956 
00957             SGString<ST>* strings;
00958 
00959             ST* str;
00960             if (bitremap_in_single_string)
00961             {
00962                 strings=SG_MALLOC(SGString<ST>, 1);
00963                 strings[0].string=SG_MALLOC(ST, num);
00964                 strings[0].slen=num;
00965                 f.get_line(len, offs);
00966                 f.get_line(len, offs);
00967                 order=len;
00968                 max_len=num;
00969                 offs=0;
00970                 original_num_symbols=alphabet->get_num_symbols();
00971                 int32_t max_val=alphabet->get_num_bits();
00972                 str=SG_MALLOC(ST, len);
00973             }
00974             else
00975                 strings=SG_MALLOC(SGString<ST>, num);
00976 
00977             for (i=0;i<num; i++)
00978             {
00979                 if (!f.get_line(len, offs))
00980                     SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00981 
00982                 char* s=f.get_line(len, offs);
00983                 if (!s || len==0)
00984                     SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00985 
00986                 if (bitremap_in_single_string)
00987                 {
00988                     if (len!=order)
00989                         SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00990                     for (int32_t j=0; j<order; j++)
00991                         str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00992 
00993                     strings[0].string[i]=embed_word(str, order);
00994                 }
00995                 else
00996                 {
00997                     strings[i].string=SG_MALLOC(ST, len);
00998                     strings[i].slen=len;
00999                     str=strings[i].string;
01000 
01001                     if (ignore_invalid)
01002                     {
01003                         for (int32_t j=0; j<len; j++)
01004                         {
01005                             if (alphabet->is_valid((uint8_t) s[j]))
01006                                 str[j]= (ST) s[j];
01007                             else
01008                                 str[j]= (ST) 'A';
01009                         }
01010                     }
01011                     else
01012                     {
01013                         for (int32_t j=0; j<len; j++)
01014                             str[j]= (ST) s[j];
01015                     }
01016                     max_len=CMath::max(max_len, (int32_t) len);
01017                 }
01018 
01019 
01020                 if (!f.get_line(len, offs))
01021                     SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
01022 
01023                 if (!f.get_line(len, offs))
01024                     SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
01025             }
01026 
01027             if (bitremap_in_single_string)
01028                 num=1;
01029 
01030             num_vectors=num;
01031             max_string_length=max_len;
01032             features=strings;
01033 
01034             return true;
01035         }
01036 
01044         bool load_from_directory(char* dirname)
01045         {
01046             remove_subset();
01047 
01048             struct dirent **namelist;
01049             int32_t n;
01050 
01051             SGIO::set_dirname(dirname);
01052 
01053             SG_DEBUG("dirname '%s'\n", dirname);
01054 
01055             n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
01056             if (n <= 0)
01057             {
01058                 SG_ERROR("error calling scandir - no files found\n");
01059                 return false;
01060             }
01061             else
01062             {
01063                 SGString<ST>* strings=NULL;
01064 
01065                 int32_t num=0;
01066                 int32_t max_len=-1;
01067 
01068                 //usually n==num_vec, but it might not in race conditions
01069                 //(file perms modified, file erased)
01070                 strings=SG_MALLOC(SGString<ST>, n);
01071 
01072                 for (int32_t i=0; i<n; i++)
01073                 {
01074                     char* fname=SGIO::concat_filename(namelist[i]->d_name);
01075 
01076                     struct stat s;
01077                     off_t filesize=0;
01078 
01079                     if (!stat(fname, &s) && s.st_size>0)
01080                     {
01081                         filesize=s.st_size/sizeof(ST);
01082 
01083                         FILE* f=fopen(fname, "ro");
01084                         if (f)
01085                         {
01086                             ST* str=SG_MALLOC(ST, filesize);
01087                             SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01088                             if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
01089                                 SG_ERROR("failed to read file\n");
01090                             strings[num].string=str;
01091                             strings[num].slen=filesize;
01092                             max_len=CMath::max(max_len, strings[num].slen);
01093 
01094                             num++;
01095                             fclose(f);
01096                         }
01097                     }
01098                     else
01099                         SG_ERROR("empty or non readable file \'%s\'\n", fname);
01100 
01101                     SG_FREE(namelist[i]);
01102                 }
01103                 SG_FREE(namelist);
01104 
01105                 if (num>0 && strings)
01106                 {
01107                     set_features(strings, num, max_len);
01108                     return true;
01109                 }
01110             }
01111             return false;
01112         }
01113 
01119         void set_features(SGStringList<ST> feats)
01120         {
01121             set_features(feats.strings, feats.num_strings, feats.max_string_length);
01122         }
01123 
01133         bool set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01134         {
01135             if (m_subset)
01136                 SG_ERROR("Cannot call set_features() with subset.\n");
01137 
01138             if (p_features)
01139             {
01140                 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01141 
01142                 //compute histogram for char/byte
01143                 for (int32_t i=0; i<p_num_vectors; i++)
01144                     alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
01145 
01146                 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01147                 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01148 
01149                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01150                 {
01151                     cleanup();
01152                     SG_UNREF(alphabet);
01153 
01154                     alphabet=alpha;
01155                     SG_REF(alphabet);
01156 
01157                     features=p_features;
01158                     num_vectors=p_num_vectors;
01159                     max_string_length=p_max_string_length;
01160 
01161                     return true;
01162                 }
01163                 else
01164                     SG_UNREF(alpha);
01165             }
01166 
01167             return false;
01168         }
01169 
01178         bool append_features(CStringFeatures<ST>* sf)
01179         {
01180             ASSERT(sf);
01181 
01182             if (m_subset)
01183                 SG_ERROR("Cannot call set_features() with subset.\n");
01184 
01185             SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
01186 
01187             index_t sf_num_str=sf->get_num_vectors();
01188             for (int32_t i=0; i<sf_num_str; i++)
01189             {
01190                 int32_t real_i = sf->subset_idx_conversion(i);
01191                 int32_t length=sf->features[real_i].slen;
01192                 new_features[i].string=SG_MALLOC(ST, length);
01193                 memcpy(new_features[i].string, sf->features[real_i].string, length);
01194                 new_features[i].slen=length;
01195             }
01196             return append_features(new_features, sf_num_str,
01197                     sf->max_string_length);
01198         }
01199 
01212         bool append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01213         {
01214             if (m_subset)
01215                 SG_ERROR("Cannot call set_features() with subset.\n");
01216 
01217             if (!features)
01218                 return set_features(p_features, p_num_vectors, p_max_string_length);
01219 
01220             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01221 
01222             //compute histogram for char/byte
01223             for (int32_t i=0; i<p_num_vectors; i++)
01224                 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
01225 
01226             SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01227             SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01228 
01229             if (alpha->check_alphabet_size() && alpha->check_alphabet())
01230             {
01231                 SG_UNREF(alpha);
01232                 for (int32_t i=0; i<p_num_vectors; i++)
01233                     alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
01234 
01235                 int32_t old_num_vectors=num_vectors;
01236                 num_vectors=old_num_vectors+p_num_vectors;
01237                 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
01238 
01239                 for (int32_t i=0; i<num_vectors; i++)
01240                 {
01241                     if (i<old_num_vectors)
01242                     {
01243                         new_features[i].string=features[i].string;
01244                         new_features[i].slen=features[i].slen;
01245                     }
01246                     else
01247                     {
01248                         new_features[i].string=p_features[i-old_num_vectors].string;
01249                         new_features[i].slen=p_features[i-old_num_vectors].slen;
01250                     }
01251                 }
01252                 SG_FREE(features);
01253                 SG_FREE(p_features); // free now obsolete features
01254 
01255                 this->features=new_features;
01256                 max_string_length=CMath::max(max_string_length, p_max_string_length);
01257 
01258                 return true;
01259             }
01260             SG_UNREF(alpha);
01261 
01262             return false;
01263         }
01264 
01268         SGStringList<ST> get_features()
01269         {
01270             SGStringList<ST> sl;
01271 
01272             sl.strings=get_features(sl.num_strings, sl.max_string_length);
01273             return sl;
01274         }
01275 
01284         virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01285         {
01286             if (m_subset)
01287                 SG_ERROR("get features() is not possible on subset");
01288 
01289             num_str=num_vectors;
01290             max_str_len=max_string_length;
01291             return features;
01292         }
01293 
01302         virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01303         {
01304             ASSERT(num_vectors>0);
01305 
01306             num_str=get_num_vectors();
01307             max_str_len=max_string_length;
01308             SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
01309 
01310             for (int32_t i=0; i<num_str; i++)
01311             {
01312                 int32_t len;
01313                 bool free_vec;
01314                 ST* vec=get_feature_vector(i, len, free_vec);
01315                 new_feat[i].string=SG_MALLOC(ST, len);
01316                 new_feat[i].slen=len;
01317                 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01318                 free_feature_vector(vec, i, free_vec);
01319             }
01320 
01321             return new_feat;
01322         }
01323 
01331         virtual void get_features(SGString<ST>** dst, int32_t* num_str)
01332         {
01333             int32_t num_vec;
01334             int32_t max_str_len;
01335             *dst=copy_features(num_vec, max_str_len);
01336             *num_str=num_vec;
01337         }
01338 
/** save features (declaration only; the definition lies outside this part of the listing)
 *
 * @param writer file object passed to the definition; presumably the
 *        destination the features are written to - TODO confirm against
 *        the definition
 */
01345         virtual inline void save(CFile* writer);
01346 
01355         virtual bool load_compressed(char* src, bool decompress)
01356         {
01357             remove_subset();
01358 
01359             FILE* file=NULL;
01360 
01361             if (!(file=fopen(src, "r")))
01362                 return false;
01363             cleanup();
01364 
01365             // header shogun v0
01366             char id[4];
01367             if (fread(&id[0], sizeof(char), 1, file)!=1)
01368                 SG_ERROR("failed to read header");
01369             ASSERT(id[0]=='S');
01370             if (fread(&id[1], sizeof(char), 1, file)!=1)
01371                 SG_ERROR("failed to read header");
01372             ASSERT(id[1]=='G');
01373             if (fread(&id[2], sizeof(char), 1, file)!=1)
01374                 SG_ERROR("failed to read header");
01375             ASSERT(id[2]=='V');
01376             if (fread(&id[3], sizeof(char), 1, file)!=1)
01377                 SG_ERROR("failed to read header");
01378             ASSERT(id[3]=='0');
01379 
01380             //compression type
01381             uint8_t c;
01382             if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01383                 SG_ERROR("failed to read compression type");
01384             CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01385             //alphabet
01386             uint8_t a;
01387             delete alphabet;
01388             if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01389                 SG_ERROR("failed to read compression alphabet");
01390             alphabet=new CAlphabet((EAlphabet) a);
01391             // number of vectors
01392             if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01393                 SG_ERROR("failed to read compression number of vectors");
01394             ASSERT(num_vectors>0);
01395             // maximum string length
01396             if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01397                 SG_ERROR("failed to read maximum string length");
01398             ASSERT(max_string_length>0);
01399 
01400             features=SG_MALLOC(SGString<ST>, num_vectors);
01401 
01402             // vectors
01403             for (int32_t i=0; i<num_vectors; i++)
01404             {
01405                 // vector len compressed
01406                 int32_t len_compressed;
01407                 if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01408                     SG_ERROR("failed to read vector length compressed");
01409                 // vector len uncompressed
01410                 int32_t len_uncompressed;
01411                 if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01412                     SG_ERROR("failed to read vector length uncompressed");
01413 
01414                 // vector raw data
01415                 if (decompress)
01416                 {
01417                     features[i].string=SG_MALLOC(ST, len_uncompressed);
01418                     features[i].slen=len_uncompressed;
01419                     uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01420                     if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01421                         SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
01422                     uint64_t uncompressed_size=len_uncompressed;
01423                     uncompressed_size*=sizeof(ST);
01424                     compressor->decompress(compressed, len_compressed,
01425                             (uint8_t*) features[i].string, uncompressed_size);
01426                     SG_FREE(compressed);
01427                     ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01428                 }
01429                 else
01430                 {
01431                     int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01432                     features[i].string=SG_MALLOC(ST, len_compressed+offs);
01433                     features[i].slen=len_compressed+offs;
01434                     int32_t* feat32ptr=((int32_t*) (features[i].string));
01435                     memset(features[i].string, 0, offs*sizeof(ST));
01436                     feat32ptr[0]=(int32_t) len_compressed;
01437                     feat32ptr[1]=(int32_t) len_uncompressed;
01438                     uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01439                     if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01440                         SG_ERROR("failed to read uncompressed data");
01441                 }
01442             }
01443 
01444             delete compressor;
01445             fclose(file);
01446 
01447             return false;
01448         }
01449 
01459         virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01460         {
01461             if (m_subset)
01462                 SG_ERROR("save_compressed() is not possible on subset");
01463 
01464             FILE* file=NULL;
01465 
01466             if (!(file=fopen(dest, "wb")))
01467                 return false;
01468 
01469             CCompressor* compressor= new CCompressor(compression);
01470 
01471             // header shogun v0
01472             const char* id="SGV0";
01473             fwrite(&id[0], sizeof(char), 1, file);
01474             fwrite(&id[1], sizeof(char), 1, file);
01475             fwrite(&id[2], sizeof(char), 1, file);
01476             fwrite(&id[3], sizeof(char), 1, file);
01477 
01478             //compression type
01479             uint8_t c=(uint8_t) compression;
01480             fwrite(&c, sizeof(uint8_t), 1, file);
01481             //alphabet
01482             uint8_t a=(uint8_t) alphabet->get_alphabet();
01483             fwrite(&a, sizeof(uint8_t), 1, file);
01484             // number of vectors
01485             fwrite(&num_vectors, sizeof(int32_t), 1, file);
01486             // maximum string length
01487             fwrite(&max_string_length, sizeof(int32_t), 1, file);
01488 
01489             // vectors
01490             for (int32_t i=0; i<num_vectors; i++)
01491             {
01492                 int32_t len=-1;
01493                 bool vfree;
01494                 ST* vec=get_feature_vector(i, len, vfree);
01495 
01496                 uint8_t* compressed=NULL;
01497                 uint64_t compressed_size=0;
01498 
01499                 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01500                         compressed, compressed_size, level);
01501 
01502                 int32_t len_compressed=(int32_t) compressed_size;
01503                 // vector len compressed in bytes
01504                 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01505                 // vector len uncompressed in number of elements of type ST
01506                 fwrite(&len, sizeof(int32_t), 1, file);
01507                 // vector raw data
01508                 fwrite(compressed, compressed_size, 1, file);
01509                 SG_FREE(compressed);
01510 
01511                 free_feature_vector(vec, i, vfree);
01512             }
01513 
01514             delete compressor;
01515             fclose(file);
01516             return true;
01517         }
01518 
01519 
01524         virtual int32_t get_size() { return sizeof(ST); }
01525 
01531         virtual bool apply_preprocessor(bool force_preprocessing=false)
01532         {
01533             SG_DEBUG( "force: %d\n", force_preprocessing);
01534 
01535             for (int32_t i=0; i<get_num_preprocessors(); i++)
01536             {
01537                 if ( (!is_preprocessed(i) || force_preprocessing) )
01538                 {
01539                     set_preprocessed(i);
01540                     CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01541                     SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01542 
01543                     if (!p->apply_to_string_features(this))
01544                     {
01545                         SG_UNREF(p);
01546                         return false;
01547                     }
01548                     else
01549                         SG_UNREF(p);
01550                 }
01551             }
01552             return true;
01553         }
01554 
01567         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01568         {
01569             if (m_subset)
01570                 SG_NOTIMPLEMENTED;
01571 
01572             ASSERT(step_size>0);
01573             ASSERT(window_size>0);
01574             ASSERT(num_vectors==1 || single_string);
01575             ASSERT(max_string_length>=window_size ||
01576                     (single_string && length_of_single_string>=window_size));
01577 
01578             //in case we are dealing with a single remapped string
01579             //allow remapping
01580             if (single_string)
01581                 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01582             else if (num_vectors==1)
01583             {
01584                 num_vectors= (max_string_length-window_size)/step_size + 1;
01585                 length_of_single_string=max_string_length;
01586             }
01587 
01588             SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01589             int32_t offs=0;
01590             for (int32_t i=0; i<num_vectors; i++)
01591             {
01592                 f[i].string=&features[0].string[offs+skip];
01593                 f[i].slen=window_size-skip;
01594                 offs+=step_size;
01595             }
01596             single_string=features[0].string;
01597             SG_FREE(features);
01598             features=f;
01599             max_string_length=window_size-skip;
01600 
01601             return num_vectors;
01602         }
01603 
01614         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01615         {
01616             if (m_subset)
01617                 SG_NOTIMPLEMENTED;
01618 
01619             ASSERT(positions);
01620             ASSERT(window_size>0);
01621             ASSERT(num_vectors==1 || single_string);
01622             ASSERT(max_string_length>=window_size ||
01623                     (single_string && length_of_single_string>=window_size));
01624 
01625             num_vectors= positions->get_num_elements();
01626             ASSERT(num_vectors>0);
01627 
01628             int32_t len;
01629 
01630             //in case we are dealing with a single remapped string
01631             //allow remapping
01632             if (single_string)
01633                 len=length_of_single_string;
01634             else
01635             {
01636                 single_string=features[0].string;
01637                 len=max_string_length;
01638                 length_of_single_string=max_string_length;
01639             }
01640 
01641             SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01642             for (int32_t i=0; i<num_vectors; i++)
01643             {
01644                 int32_t p=positions->get_element(i);
01645 
01646                 if (p>=0 && p<=len-window_size)
01647                 {
01648                     f[i].string=&features[0].string[p+skip];
01649                     f[i].slen=window_size-skip;
01650                 }
01651                 else
01652                 {
01653                     num_vectors=1;
01654                     max_string_length=len;
01655                     features[0].slen=len;
01656                     single_string=NULL;
01657                     SG_FREE(f);
01658                     SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01659                             window_size, i, p, len);
01660                     return -1;
01661                 }
01662             }
01663 
01664             SG_FREE(features);
01665             features=f;
01666             max_string_length=window_size-skip;
01667 
01668             return num_vectors;
01669         }
01670 
01684         inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01685         {
01686             return obtain_from_char_features(sf, start, p_order, gap, rev);
01687         }
01688 
01700         template <class CT>
01701             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01702             {
01703                 remove_subset();
01704                 ASSERT(sf);
01705 
01706                 CAlphabet* alpha=sf->get_alphabet();
01707                 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01708 
01709                 this->order=p_order;
01710                 cleanup();
01711 
01712                 num_vectors=sf->get_num_vectors();
01713                 ASSERT(num_vectors>0);
01714                 max_string_length=sf->get_max_vector_length()-start;
01715                 features=SG_MALLOC(SGString<ST>, num_vectors);
01716 
01717                 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01718                         alpha->get_num_symbols_in_histogram());
01719 
01720                 for (int32_t i=0; i<num_vectors; i++)
01721                 {
01722                     int32_t len=-1;
01723                     bool vfree;
01724                     CT* c=sf->get_feature_vector(i, len, vfree);
01725                     ASSERT(!vfree); // won't work when preprocessors are attached
01726 
01727                     features[i].string=SG_MALLOC(ST, len);
01728                     features[i].slen=len;
01729 
01730                     ST* str=features[i].string;
01731                     for (int32_t j=0; j<len; j++)
01732                         str[j]=(ST) alpha->remap_to_bin(c[j]);
01733                 }
01734 
01735                 original_num_symbols=alpha->get_num_symbols();
01736                 int32_t max_val=alpha->get_num_bits();
01737 
01738                 SG_UNREF(alpha);
01739 
01740                 if (p_order>1)
01741                     num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01742                 else
01743                     num_symbols=original_num_symbols;
01744                 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01745 
01746                 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01747                 {
01748                     SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01749                     return false;
01750                 }
01751 
01752                 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01753                 for (int32_t line=0; line<num_vectors; line++)
01754                 {
01755                     int32_t len=0;
01756                     bool vfree;
01757                     ST* fv=get_feature_vector(line, len, vfree);
01758                     ASSERT(!vfree); // won't work when preprocessors are attached
01759 
01760                     if (rev)
01761                         CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01762                     else
01763                         CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01764 
01765                     /* fix the length of the string -- hacky */
01766                     features[line].slen-=start+gap ;
01767                     if (features[line].slen<0)
01768                         features[line].slen=0 ;
01769                 }
01770 
01771                 compute_symbol_mask_table(max_val);
01772 
01773                 return true;
01774             }
01775 
01785         bool have_same_length(int32_t len=-1)
01786         {
01787             if (len!=-1)
01788             {
01789                 if (len!=max_string_length)
01790                     return false;
01791             }
01792             len=max_string_length;
01793 
01794             index_t num_str=get_num_vectors();
01795             for (int32_t i=0; i<num_str; i++)
01796             {
01797                 if (get_vector_length(i)!=len)
01798                     return false;
01799             }
01800 
01801             return true;
01802         }
01803 
01809         inline void embed_features(int32_t p_order)
01810         {
01811             if (m_subset)
01812                 SG_NOTIMPLEMENTED;
01813 
01814             ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01815 
01816             order=p_order;
01817             original_num_symbols=alphabet->get_num_symbols();
01818             int32_t max_val=alphabet->get_num_bits();
01819 
01820             if (p_order>1)
01821                 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01822             else
01823                 num_symbols=original_num_symbols;
01824 
01825             SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01826 
01827             if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01828                 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01829 
01830             ST mask=0;
01831             for (int32_t i=0; i<p_order*max_val; i++)
01832                 mask= (mask<<1) | ((ST) 1);
01833 
01834             for (int32_t i=0; i<num_vectors; i++)
01835             {
01836                 int32_t len=features[i].slen;
01837 
01838                 if (len < p_order)
01839                     SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01840 
01841                 ST* str=features[i].string;
01842 
01843                 // convert first word
01844                 for (int32_t j=0; j<p_order; j++)
01845                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01846                 str[0]=embed_word(&str[0], p_order);
01847 
01848                 // convert the rest
01849                 int32_t idx=0;
01850                 for (int32_t j=p_order; j<len; j++)
01851                 {
01852                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01853                     str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01854                     idx++;
01855                 }
01856 
01857                 features[i].slen=len-p_order+1;
01858             }
01859 
01860             compute_symbol_mask_table(max_val);
01861         }
01862 
01869         inline void compute_symbol_mask_table(int64_t max_val)
01870         {
01871             if (m_subset)
01872                 SG_NOTIMPLEMENTED;
01873 
01874             SG_FREE(symbol_mask_table);
01875             symbol_mask_table=SG_MALLOC(ST, 256);
01876 
01877             uint64_t mask=0;
01878             for (int32_t i=0; i< (int64_t) max_val; i++)
01879                 mask=(mask<<1) | 1;
01880 
01881             for (int32_t i=0; i<256; i++)
01882             {
01883                 uint8_t bits=(uint8_t) i;
01884                 symbol_mask_table[i]=0;
01885 
01886                 for (int32_t j=0; j<8; j++)
01887                 {
01888                     if (bits & 1)
01889                         symbol_mask_table[i]|=mask<<(max_val*j);
01890 
01891                     bits>>=1;
01892                 }
01893             }
01894         }
01895 
01902         inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01903         {
01904             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01905 
01906             ST mask=0;
01907             for (int32_t i=0; i<nbits; i++)
01908                 mask=(mask<<1) | (ST) 1;
01909 
01910             for (int32_t i=0; i<len; i++)
01911             {
01912                 ST w=(word & mask);
01913                 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01914                 word>>=nbits;
01915             }
01916         }
01917 
01923         inline ST embed_word(ST* seq, int32_t len)
01924         {
01925             ST value=(ST) 0;
01926             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01927             for (int32_t i=0; i<len; i++)
01928             {
01929                 value<<=nbits;
01930                 value|=seq[i];
01931             }
01932 
01933             return value;
01934         }
01935 
01940         void determine_maximum_string_length()
01941         {
01942             max_string_length=0;
01943             index_t num_str=get_num_vectors();
01944             for (int32_t i=0; i<num_str; i++)
01945             {
01946                 max_string_length=CMath::max(max_string_length,
01947                     features[subset_idx_conversion(i)].slen);
01948             }
01949         }
01950 
01958         static ST* get_zero_terminated_string_copy(SGString<ST> str)
01959         {
01960             int32_t l=str.slen;
01961             ST* s=SG_MALLOC(ST, l+1);
01962             memcpy(s, str.string, sizeof(ST)*l);
01963             s[l]='\0';
01964             return s;
01965         }
01966 
01975         virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01976         {
01977             ASSERT(features);
01978             ASSERT(num<get_num_vectors());
01979 
01980             int32_t real_num=subset_idx_conversion(num);
01981 
01982 
01983             features[real_num].slen=len ;
01984             features[real_num].string=string ;
01985 
01986             max_string_length=CMath::max(len, max_string_length);
01987         }
01988 
01989 
01994         virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true)
01995         {
01996             int32_t nsym=get_num_symbols();
01997             int32_t slen=get_max_vector_length();
01998             int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01999             float64_t* h= SG_MALLOC(float64_t, sz);
02000             memset(h, 0, sz);
02001 
02002             float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
02003             memset(h_normalizer, 0, slen*sizeof(float64_t));
02004             int32_t num_str=get_num_vectors();
02005             for (int32_t i=0; i<num_str; i++)
02006             {
02007                 int32_t len;
02008                 bool free_vec;
02009                 ST* vec=get_feature_vector(i, len, free_vec);
02010                 for (int32_t j=0; j<len; j++)
02011                 {
02012                     h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
02013                     h_normalizer[j]++;
02014                 }
02015                 free_feature_vector(vec, i, free_vec);
02016             }
02017 
02018             if (normalize)
02019             {
02020                 for (int32_t i=0; i<slen; i++)
02021                 {
02022                     for (int32_t j=0; j<nsym; j++)
02023                     {
02024                         if (h_normalizer && h_normalizer[i])
02025                             h[int64_t(i)*nsym+j]/=h_normalizer[i];
02026                     }
02027                 }
02028             }
02029             SG_FREE(h_normalizer);
02030 
02031             *hist=h;
02032             *rows=nsym;
02033             *cols=slen;
02034         }
02035 
02040         virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
02041         {
02042             ASSERT(rows == get_num_symbols());
02043             cleanup();
02044             float64_t* randoms=SG_MALLOC(float64_t, cols);
02045             SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
02046 
02047             for (int32_t i=0; i<num_vec; i++)
02048             {
02049                 sf[i].string=SG_MALLOC(ST, cols);
02050                 sf[i].slen=cols;
02051 
02052                 CMath::random_vector(randoms, cols, 0.0, 1.0);
02053 
02054                 for (int32_t j=0; j<cols; j++)
02055                 {
02056                     float64_t lik=hist[int64_t(j)*rows+0];
02057 
02058                     int32_t c;
02059                     for (c=0; c<rows-1; c++)
02060                     {
02061                         if (randoms[j]<=lik)
02062                             break;
02063                         lik+=hist[int64_t(j)*rows+c+1];
02064                     }
02065                     sf[i].string[j]=alphabet->remap_to_char(c);
02066                 }
02067             }
02068             SG_FREE(randoms);
02069             set_features(sf, num_vec, cols);
02070         }
02071 
02072         /*
02073         CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
02074         {
02075             int *s;
02076             int32_t nStr=get_num_vectors();
02077 
02078             int32_t nfeat=0;
02079             for (int32_t i=0; i < nStr; ++i)
02080                 nfeat += get_vector_length[i] - d1 -d2;
02081             SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
02082             int32_t c=0;
02083             for (int32_t i=0; i < nStr; ++i)
02084             {
02085             int32_t len;
02086             bool free_vec;
02087             ST* S=get_feature_vector(vec_num, len, free_vec);
02088             free_feature_vector(vec, vec_num, free_vec);
02089                 int32_t n=len - d1 - d2;
02090                 s=S[i];
02091                 for (int32_t j=0; j < n; ++j)
02092                 {
02093                     F[c].feature1=s[j];
02094                     F[c].feature2=s[j+d1];
02095                     F[c].feature3=s[j+d1+d2];
02096                     F[c].group=i;
02097                     c++;
02098                 }
02099             }
02100             ASSERT(nfeat==c);
02101             return F;
02102         }
02103 
02104         CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
02105         {
02106             int i, j;
02107             int n, nfeat;
02108             int *group;
02109             int *features;
02110             int *s;
02111             int c;
02112             SSKFeatures *F;
02113 
02114             nfeat=0;
02115             for (i=0; i < nStr; ++i)
02116                 nfeat += len[i] - d1;
02117             group=(int *)SG_MALLOC(nfeat*sizeof(int));
02118             features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
02119             c=0;
02120             for (i=0; i < nStr; ++i)
02121             {
02122                 n=len[i] - d1;
02123                 s=S[i];
02124                 for (j=0; j < n; ++j)
02125                 {
02126                     features[c]=s[j];
02127                     features[c+nfeat]=s[j+d1];
02128                     group[c]=i;
02129                     c++;
02130                 }
02131             }
02132             if (nfeat!=c)
02133                 printf("Something is wrong...\n");
02134             F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
02135             (*F).features=features;
02136             (*F).group=group;
02137             (*F).n=nfeat;
02138             return F;
02139         }
02140     */
02141 
02150         virtual CFeatures* copy_subset(SGVector<index_t> indices)
02151         {
02152             /* string list to create new CStringFeatures from */
02153             SGStringList<ST> list_copy(indices.vlen, max_string_length);
02154 
02155             /* copy all features */
02156             for (index_t i=0; i<indices.vlen; ++i)
02157             {
02158                 /* index with respect to possible subset */
02159                 index_t real_idx=subset_idx_conversion(indices.vector[i]);
02160 
02161                 /* copy string */
02162                 SGString<ST> current_string=features[real_idx];
02163                 SGString<ST> string_copy(current_string.slen);
02164                 memcpy(string_copy.string, current_string.string,
02165                     current_string.slen*sizeof(ST));
02166                 list_copy.strings[i]=string_copy;
02167             }
02168 
02169             /* create copy instance */
02170             CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
02171 
02172             /* max string length may have changed */
02173             result->determine_maximum_string_length();
02174 
02175             return result;
02176         }
02177 
02179         inline virtual const char* get_name() const { return "StringFeatures"; }
02180 
02182         virtual void subset_changed_post()
02183         {
02184             /* max string length has to be updated */
02185             determine_maximum_string_length();
02186         }
02187     protected:
02188 
02199         virtual ST* compute_feature_vector(int32_t num, int32_t& len)
02200         {
02201             ASSERT(features && num<get_num_vectors());
02202 
02203             int32_t real_num=subset_idx_conversion(num);
02204 
02205             len=features[real_num].slen;
02206             if (len<=0)
02207                 return NULL;
02208 
02209             ST* target=SG_MALLOC(ST, len);
02210             memcpy(target, features[real_num].string, len*sizeof(ST));
02211             return target;
02212         }
02213 
02214     private:
02215         void init()
02216         {
02217             set_generic<ST>();
02218 
02219             alphabet=NULL;
02220             num_vectors=0;
02221             features=NULL;
02222             single_string=NULL;
02223             length_of_single_string=0;
02224             max_string_length=0;
02225             order=0;
02226             symbol_mask_table=0;
02227             preprocess_on_get=false;
02228             feature_cache=NULL;
02229 
02230             m_parameters->add((CSGObject**) &alphabet, "alphabet");
02231             m_parameters->add_vector(&features, &num_vectors, "features",
02232                     "This contains the array of features.");
02233             m_parameters->add_vector(&single_string,
02234                     &length_of_single_string,
02235                     "single_string",
02236                     "Created by sliding window.");
02237             m_parameters->add(&max_string_length, "max_string_length",
02238                     "Length of longest string.");
02239             m_parameters->add(&num_symbols, "num_symbols",
02240                     "Number of used symbols.");
02241             m_parameters->add(&original_num_symbols, "original_num_symbols",
02242                     "Original number of used symbols.");
02243             m_parameters->add(&order, "order",
02244                     "Order used in higher order mapping.");
02245             m_parameters->add(&preprocess_on_get, "preprocess_on_get",
02246                     "Preprocess on-the-fly?");
02247 
02248             /* TODO M_PARAMETERS->ADD?
02249              * /// order used in higher order mapping
02250              * ST* symbol_mask_table;
02251              */
02252         }
02253 
02254 
02255     protected:
02256 
              /// alphabet used to map symbols to bins and back
02258         CAlphabet* alphabet;
02259 
              /// number of entries in features (registered as its length in init())
02261         int32_t num_vectors;
02262 
              /// this contains the array of features
02264         SGString<ST>* features;
02265 
              /// single string, created by sliding window
02267         ST* single_string;
02268 
              /// length of single_string (registered as its length in init())
02270         int32_t length_of_single_string;
02271 
              /// length of longest string
02273         int32_t max_string_length;
02274 
              /// number of used symbols
02276         floatmax_t num_symbols;
02277 
              /// original number of used symbols
02279         floatmax_t original_num_symbols;
02280 
              /// order used in higher order mapping
02282         int32_t order;
02283 
              /// symbol mask table; not registered as a parameter (see TODO in
              /// init()) - presumably filled by compute_symbol_mask_table(), confirm
02285         ST* symbol_mask_table;
02286 
              /// preprocess on-the-fly?
02288         bool preprocess_on_get;
02289 
              /// feature cache - presumably caches computed feature vectors, confirm
02291         CCache<ST>* feature_cache;
02292 };
02293 
02294 #ifndef DOXYGEN_SHOULD_SKIP_THIS
02295 
02299 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
02300 {
02301     return F_BOOL;
02302 }
02303 
02308 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
02309 {
02310     return F_CHAR;
02311 }
02312 
02317 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
02318 {
02319     return F_BYTE;
02320 }
02321 
02326 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
02327 {
02328     return F_SHORT;
02329 }
02330 
02335 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
02336 {
02337     return F_WORD;
02338 }
02339 
02344 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
02345 {
02346     return F_INT;
02347 }
02348 
02353 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
02354 {
02355     return F_UINT;
02356 }
02357 
02362 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
02363 {
02364     return F_LONG;
02365 }
02366 
02371 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
02372 {
02373     return F_ULONG;
02374 }
02375 
02380 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
02381 {
02382     return F_SHORTREAL;
02383 }
02384 
02389 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
02390 {
02391     return F_DREAL;
02392 }
02393 
02398 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
02399 {
02400     return F_LONGREAL;
02401 }
02402 
02403 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
02404 {
02405     return symbol;
02406 }
02407 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
02408 {
02409     return symbol;
02410 }
02411 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
02412 {
02413     return symbol;
02414 }
02415 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
02416 {
02417     return symbol;
02418 }
02419 
02420 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02421 {
02422     return false;
02423 }
02424 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02425 {
02426     return 0;
02427 }
02428 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02429 {
02430     return 0;
02431 }
02432 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02433 {
02434     return 0;
02435 }
02436 
02437 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02438 {
02439     return symbol;
02440 }
02441 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02442 {
02443     return symbol;
02444 }
02445 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02446 {
02447     return symbol;
02448 }
02449 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02450 {
02451     return symbol;
02452 }
02453 
02454 #ifndef SUNOS
02455 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02456 {
02457     return false;
02458 }
02459 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02460 {
02461     return false;
02462 }
02463 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02464 {
02465     return false;
02466 }
02467 #endif
02468 
02469 template<>  inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02470 {
02471 }
02472 template<>  inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02473 {
02474 }
02475 template<>  inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02476 {
02477 }
02478 
02479 template<>  inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02480 {
02481 }
02482 template<>  inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02483 {
02484 }
02485 template<>  inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02486 {
02487 }
02488 
02489 template<>  inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02490 {
02491     return 0;
02492 }
02493 template<>  inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02494 {
02495     return 0;
02496 }
02497 template<>  inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02498 {
02499     return 0;
02500 }
02501 
02502 template<>  inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02503 {
02504 }
02505 template<>  inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02506 {
02507 }
02508 template<>  inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02509 {
02510 }
02511 #define LOAD(f_load, sg_type)                                               \
02512 template<> inline void CStringFeatures<sg_type>::load(CFile* loader)        \
02513 {                                                                           \
02514     SG_INFO( "loading...\n");                                               \
02515                                                                             \
02516     SG_SET_LOCALE_C;                                                    \
02517     SGString<sg_type>* strs;                                                \
02518     int32_t num_str;                                                        \
02519     int32_t max_len;                                                        \
02520     loader->f_load(strs, num_str, max_len);                                 \
02521     set_features(strs, num_str, max_len);                                   \
02522     SG_RESET_LOCALE;                                                    \
02523 }
02524 
02525 LOAD(get_string_list, bool)
02526 LOAD(get_string_list, char)
02527 LOAD(get_int8_string_list, int8_t)
02528 LOAD(get_string_list, uint8_t)
02529 LOAD(get_string_list, int16_t)
02530 LOAD(get_string_list, uint16_t)
02531 LOAD(get_string_list, int32_t)
02532 LOAD(get_uint_string_list, uint32_t)
02533 LOAD(get_long_string_list, int64_t)
02534 LOAD(get_ulong_string_list, uint64_t)
02535 LOAD(get_string_list, float32_t)
02536 LOAD(get_string_list, float64_t)
02537 LOAD(get_longreal_string_list, floatmax_t)
02538 #undef LOAD
02539 
02540 #define SAVE(f_write, sg_type)                                              \
02541 template<> inline void CStringFeatures<sg_type>::save(CFile* writer)        \
02542 {                                                                           \
02543     if (m_subset)                                                           \
02544         SG_ERROR("save() is not possible on subset");                       \
02545     SG_SET_LOCALE_C;                                                    \
02546     ASSERT(writer);                                                         \
02547     writer->f_write(features, num_vectors);                                 \
02548     SG_RESET_LOCALE;                                                    \
02549 }
02550 
02551 SAVE(set_string_list, bool)
02552 SAVE(set_string_list, char)
02553 SAVE(set_int8_string_list, int8_t)
02554 SAVE(set_string_list, uint8_t)
02555 SAVE(set_string_list, int16_t)
02556 SAVE(set_string_list, uint16_t)
02557 SAVE(set_string_list, int32_t)
02558 SAVE(set_uint_string_list, uint32_t)
02559 SAVE(set_long_string_list, int64_t)
02560 SAVE(set_ulong_string_list, uint64_t)
02561 SAVE(set_string_list, float32_t)
02562 SAVE(set_string_list, float64_t)
02563 SAVE(set_longreal_string_list, floatmax_t)
02564 #undef SAVE
02565 #endif // DOXYGEN_SHOULD_SKIP_THIS
02566 }
02567 #endif // _CSTRINGFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation