StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014 
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Cache.h"
00018 #include "lib/DynamicArray.h"
00019 #include "lib/File.h"
00020 #include "lib/MemoryMappedFile.h"
00021 #include "lib/Mathematics.h"
00022 #include "lib/Compressor.h"
00023 #include "base/Parameter.h"
00024 
00025 #include "preproc/PreProc.h"
00026 #include "preproc/StringPreProc.h"
00027 #include "features/Features.h"
00028 #include "features/Alphabet.h"
00029 
00030 #include <sys/types.h>
00031 #include <sys/stat.h>
00032 #include <dirent.h>
00033 #include <stdio.h>
00034 #include <stdlib.h>
00035 #include <unistd.h>
00036 
00037 namespace shogun
00038 {
00039 class CCompressor;
00040 enum E_COMPRESSION_TYPE;
00041 class CAlphabet;
00042 enum EAlphabet;
00043 template <class T> class CDynamicArray;
00044 class CFile;
00045 template <class T> class CMemoryMappedFile;
00046 class CMath;
00047 template <class ST> class CStringPreProc;
00048 template <class T> class TString;
00049 
/** Plain record holding two feature indices plus a group id.
 *
 * NOTE(review): nothing in this header uses the struct; the exact meaning of
 * the fields (presumably indices into a feature set) should be confirmed
 * against the code that consumes it.
 */
struct SSKDoubleFeature
{
    /** first feature index */
    int feature1;
    /** second feature index */
    int feature2;
    /** group identifier */
    int group;
};
00056 
/** Plain record holding three feature indices plus a group id.
 *
 * NOTE(review): nothing in this header uses the struct; the exact meaning of
 * the fields (presumably indices into a feature set) should be confirmed
 * against the code that consumes it.
 */
struct SSKTripleFeature
{
    /** first feature index */
    int feature1;
    /** second feature index */
    int feature2;
    /** third feature index */
    int feature3;
    /** group identifier */
    int group;
};
00064 
00083 template <class ST> class CStringFeatures : public CFeatures
00084 {
00085     public:
00089         CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0),
00090         features(NULL), single_string(NULL),length_of_single_string(0),
00091         max_string_length(0), order(0), symbol_mask_table(NULL),
00092         preprocess_on_get(false), feature_cache(NULL)
00093         {
00094             init();
00095             alphabet=new CAlphabet();
00096         }
00097 
00102         CStringFeatures(EAlphabet alpha)
00103         : CFeatures(0), num_vectors(0), features(NULL),
00104             single_string(NULL),length_of_single_string(0),
00105             max_string_length(0), order(0), symbol_mask_table(NULL),
00106             preprocess_on_get(false), feature_cache(NULL)
00107         {
00108             init();
00109 
00110             alphabet=new CAlphabet(alpha);
00111             SG_REF(alphabet);
00112             num_symbols=alphabet->get_num_symbols();
00113             original_num_symbols=num_symbols;
00114         }
00115 
00123         CStringFeatures(TString<ST>* p_features, int32_t p_num_vectors,
00124                 int32_t p_max_string_length, EAlphabet alpha)
00125         : CFeatures(0), num_vectors(0), features(NULL),
00126             single_string(NULL),length_of_single_string(0),
00127             max_string_length(0), order(0), symbol_mask_table(NULL),
00128             preprocess_on_get(false), feature_cache(NULL)
00129         {
00130             init();
00131 
00132             alphabet=new CAlphabet(alpha);
00133             SG_REF(alphabet);
00134             num_symbols=alphabet->get_num_symbols();
00135             original_num_symbols=num_symbols;
00136             set_features(p_features, p_num_vectors, p_max_string_length);
00137         }
00138 
00146         CStringFeatures(TString<ST>* p_features, int32_t p_num_vectors,
00147                 int32_t p_max_string_length, CAlphabet* alpha)
00148         : CFeatures(0), num_vectors(0), features(NULL),
00149             single_string(NULL),length_of_single_string(0),
00150             max_string_length(0), order(0), symbol_mask_table(NULL),
00151             preprocess_on_get(false), feature_cache(NULL)
00152         {
00153             init();
00154 
00155             alphabet=new CAlphabet(alpha);
00156             SG_REF(alphabet);
00157             num_symbols=alphabet->get_num_symbols();
00158             original_num_symbols=num_symbols;
00159             set_features(p_features, p_num_vectors, p_max_string_length);
00160         }
00161 
00166         CStringFeatures(CAlphabet* alpha)
00167         : CFeatures(0), num_vectors(0), features(NULL),
00168             single_string(NULL),length_of_single_string(0),
00169             max_string_length(0), order(0), symbol_mask_table(NULL),
00170             preprocess_on_get(false), feature_cache(NULL)
00171         {
00172             init();
00173 
00174             ASSERT(alpha);
00175             SG_REF(alpha);
00176             alphabet=alpha;
00177             num_symbols=alphabet->get_num_symbols();
00178             original_num_symbols=num_symbols;
00179         }
00180 
00182         CStringFeatures(const CStringFeatures & orig)
00183         : CFeatures(orig), num_vectors(orig.num_vectors),
00184             single_string(orig.single_string),
00185             length_of_single_string(orig.length_of_single_string),
00186             max_string_length(orig.max_string_length),
00187             num_symbols(orig.num_symbols),
00188             original_num_symbols(orig.original_num_symbols),
00189             order(orig.order), preprocess_on_get(false),
00190             feature_cache(NULL)
00191         {
00192             init();
00193 
00194             ASSERT(orig.single_string == NULL); //not implemented
00195 
00196             alphabet=orig.alphabet;
00197             SG_REF(alphabet);
00198 
00199             if (orig.features)
00200             {
00201                 features=new TString<ST>[orig.num_vectors];
00202 
00203                 for (int32_t i=0; i<num_vectors; i++)
00204                 {
00205                     features[i].string=new ST[orig.features[i].length];
00206                     features[i].length=orig.features[i].length;
00207                     memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00208                 }
00209             }
00210 
00211             if (orig.symbol_mask_table)
00212             {
00213                 symbol_mask_table=new ST[256];
00214                 for (int32_t i=0; i<256; i++)
00215                     symbol_mask_table[i]=orig.symbol_mask_table[i];
00216             }
00217         }
00218 
00224         CStringFeatures(CFile* loader, EAlphabet alpha=DNA)
00225         : CFeatures(loader), num_vectors(0), features(NULL), single_string(NULL),
00226             length_of_single_string(0), max_string_length(0), order(0),
00227             symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00228         {
00229             init();
00230 
00231             alphabet=new CAlphabet(alpha);
00232             SG_REF(alphabet);
00233             num_symbols=alphabet->get_num_symbols();
00234             original_num_symbols=num_symbols;
00235             load(loader);
00236         }
00237 
00238         virtual ~CStringFeatures()
00239         {
00240             cleanup();
00241 
00242             SG_UNREF(alphabet);
00243         }
00244 
00246         virtual void cleanup()
00247         {
00248             if (single_string)
00249             {
00250                 delete[] single_string;
00251                 single_string=NULL;
00252             }
00253             else
00254             {
00255                 for (int32_t i=0; i<num_vectors; i++)
00256                     cleanup_feature_vector(i);
00257             }
00258 
00259             num_vectors=0;
00260             delete[] features;
00261             delete[] symbol_mask_table;
00262             features=NULL;
00263             symbol_mask_table=NULL;
00264 
00265             /* start with a fresh alphabet, but instead of emptying the histogram
00266              * create a new object (to leave the alphabet object alone if it is used
00267              * by others)
00268              */
00269             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00270             SG_UNREF(alphabet);
00271             alphabet=alpha;
00272             SG_REF(alphabet);
00273         }
00274 
00276         virtual void cleanup_feature_vector(int32_t num)
00277         {
00278             ASSERT(num<num_vectors);
00279             if (features)
00280             {
00281                 delete[] features[num].string;
00282                 features[num].string=NULL;
00283                 features[num].length=0;
00284             }
00285         }
00286 
00291         inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00292 
00297         inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00298 
00303         inline CAlphabet* get_alphabet()
00304         {
00305             SG_REF(alphabet);
00306             return alphabet;
00307         }
00308 
00313         virtual CFeatures* duplicate() const
00314         {
00315             return new CStringFeatures<ST>(*this);
00316         }
00317 
00324         void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00325         {
00326             ASSERT(features);
00327             if (num>=num_vectors)
00328             {
00329                 SG_ERROR("Index out of bounds (number of strings %d, you "
00330                         "requested %d)\n", num_vectors, num);
00331             }
00332 
00333             int32_t l;
00334             bool free_vec;
00335             ST* vec=get_feature_vector(num, l, free_vec);
00336             *len=l;
00337             *dst=(ST*) malloc(*len * sizeof(ST));
00338             ASSERT(*dst);
00339             memcpy(*dst, vec, *len * sizeof(ST));
00340             free_feature_vector(vec, num, free_vec);
00341         }
00342 
00349         void set_feature_vector(ST* src, int32_t len, int32_t num)
00350         {
00351             ASSERT(features);
00352             if (num>=num_vectors)
00353             {
00354                 SG_ERROR("Index out of bounds (number of strings %d, you "
00355                         "requested %d)\n", num_vectors, num);
00356             }
00357 
00358             if (len<=0)
00359                 SG_ERROR("String has zero or negative length\n");
00360 
00361 
00362             cleanup_feature_vector(num);
00363             features[num].length=len;
00364             features[num].string=new ST[len];
00365             memcpy(features[num].string, src, len*sizeof(ST));
00366 
00367             determine_maximum_string_length();
00368         }
00369 
00372         void enable_on_the_fly_preprocessing()
00373         {
00374             preprocess_on_get=true;
00375         }
00376 
00380         void disable_on_the_fly_preprocessing()
00381         {
00382             preprocess_on_get=false;
00383         }
00384 
00393         ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00394         {
00395             ASSERT(features);
00396             ASSERT(num<num_vectors);
00397 
00398             if (!preprocess_on_get)
00399             {
00400                 dofree=false;
00401                 len=features[num].length;
00402                 return features[num].string;
00403             }
00404             else
00405             {
00406                 SG_DEBUG( "computing feature vector!\n") ;
00407                 ST* feat=compute_feature_vector(num, len);
00408                 dofree=true;
00409 
00410                 if (get_num_preproc())
00411                 {
00412                     ST* tmp_feat_before = feat;
00413 
00414                     for (int32_t i=0; i<get_num_preproc(); i++)
00415                     {
00416                         CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00417                         feat=p->apply_to_string(tmp_feat_before, len);
00418                         SG_UNREF(p);
00419                         delete[] tmp_feat_before;
00420                         tmp_feat_before=feat;
00421                     }
00422                 }
00423                 // TODO: implement caching
00424                 return feat;
00425             }
00426         }
00427 
00432         CStringFeatures<ST>* get_transposed()
00433         {
00434             int32_t num_feat;
00435             int32_t num_vec;
00436             TString<ST>* s=get_transposed(num_feat, num_vec);
00437 
00438             return new CStringFeatures<ST>(s, num_vec, num_feat, alphabet);
00439         }
00440 
00452         TString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00453         {
00454             num_feat=num_vectors;
00455             num_vec=get_max_vector_length();
00456             ASSERT(have_same_length());
00457 
00458             SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00459                     int64_t(num_feat)*num_vec);
00460 
00461             TString<ST>* sf=new TString<ST>[num_vec];
00462 
00463             for (int32_t i=0; i<num_vec; i++)
00464             {
00465                 sf[i].string=new ST[num_feat];
00466                 sf[i].length=num_feat;
00467             }
00468 
00469             for (int32_t i=0; i<num_feat; i++)
00470             {
00471                 int32_t len=0;
00472                 bool free_vec=false;
00473                 ST* vec=get_feature_vector(i, len, free_vec);
00474 
00475                 for (int32_t j=0; j<num_vec; j++)
00476                     sf[j].string[i]=vec[j];
00477 
00478                 free_feature_vector(vec, i, free_vec);
00479             }
00480             return sf;
00481         }
00482 
00489         void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00490         {
00491             if (feature_cache)
00492                 feature_cache->unlock_entry(num);
00493 
00494             if (dofree)
00495                 delete[] feat_vec ;
00496         }
00497 
00504         virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00505         {
00506             int32_t len;
00507             bool free_vec;
00508             ST* vec=get_feature_vector(vec_num, len, free_vec);
00509             ASSERT(feat_num<len);
00510             ST result=vec[feat_num];
00511             free_feature_vector(vec, vec_num, free_vec);
00512 
00513             return result;
00514         }
00515 
00521         virtual inline int32_t get_vector_length(int32_t vec_num)
00522         {
00523             int32_t len;
00524             bool free_vec;
00525             ST* vec=get_feature_vector(vec_num, len, free_vec);
00526             free_feature_vector(vec, vec_num, free_vec);
00527             return len;
00528         }
00529 
00534         virtual inline int32_t get_max_vector_length()
00535         {
00536             return max_string_length;
00537         }
00538 
00543         virtual inline int32_t get_num_vectors() { return num_vectors; }
00544 
00551         inline floatmax_t get_num_symbols() { return num_symbols; }
00552 
00560         inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00561 
00562         // these functions are necessary to find out about a former conversion process
00563 
00568         inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00569 
00574         inline int32_t get_order() { return order; }
00575 
00583         inline ST get_masked_symbols(ST symbol, uint8_t mask)
00584         {
00585             ASSERT(symbol_mask_table);
00586             return symbol_mask_table[mask] & symbol;
00587         }
00588 
00595         inline ST shift_offset(ST offset, int32_t amount)
00596         {
00597             ASSERT(alphabet);
00598             return (offset << (amount*alphabet->get_num_bits()));
00599         }
00600 
00607         inline ST shift_symbol(ST symbol, int32_t amount)
00608         {
00609             ASSERT(alphabet);
00610             return (symbol >> (amount*alphabet->get_num_bits()));
00611         }
00612 
00617         virtual inline void load(CFile* loader);
00618 
00627         void load_ascii_file(char* fname, bool remap_to_bin=true,
00628                 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00629         {
00630             size_t blocksize=1024*1024;
00631             size_t required_blocksize=0;
00632             uint8_t* dummy=new uint8_t[blocksize];
00633             uint8_t* overflow=NULL;
00634             int32_t overflow_len=0;
00635 
00636             cleanup();
00637 
00638             CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00639             CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00640 
00641             FILE* f=fopen(fname, "ro");
00642 
00643             if (f)
00644             {
00645                 num_vectors=0;
00646                 max_string_length=0;
00647 
00648                 SG_INFO("counting line numbers in file %s\n", fname);
00649                 size_t block_offs=0;
00650                 size_t old_block_offs=0;
00651                 fseek(f, 0, SEEK_END);
00652                 size_t fsize=ftell(f);
00653                 rewind(f);
00654 
00655                 if (blocksize>fsize)
00656                     blocksize=fsize;
00657 
00658                 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00659 
00660                 size_t sz=blocksize;
00661                 while (sz == blocksize)
00662                 {
00663                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00664                     bool contains_cr=false;
00665                     for (size_t i=0; i<sz; i++)
00666                     {
00667                         block_offs++;
00668                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00669                         {
00670                             num_vectors++;
00671                             contains_cr=true;
00672                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00673                             old_block_offs=block_offs;
00674                         }
00675                     }
00676                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00677                 }
00678 
00679                 SG_INFO("found %d strings\n", num_vectors);
00680                 delete[] dummy;
00681                 blocksize=required_blocksize;
00682                 dummy = new uint8_t[blocksize];
00683                 overflow = new uint8_t[blocksize];
00684                 features=new TString<ST>[num_vectors];
00685 
00686                 rewind(f);
00687                 sz=blocksize;
00688                 int32_t lines=0;
00689                 while (sz == blocksize)
00690                 {
00691                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00692 
00693                     size_t old_sz=0;
00694                     for (size_t i=0; i<sz; i++)
00695                     {
00696                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00697                         {
00698                             int32_t len=i-old_sz;
00699                             //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00700                             max_string_length=CMath::max(max_string_length, len+overflow_len);
00701 
00702                             features[lines].length=len;
00703                             features[lines].string=new ST[len];
00704 
00705                             if (remap_to_bin)
00706                             {
00707                                 for (int32_t j=0; j<overflow_len; j++)
00708                                     features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00709                                 for (int32_t j=0; j<len; j++)
00710                                     features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00711                                 alpha->add_string_to_histogram(&dummy[old_sz], len);
00712                                 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00713                             }
00714                             else
00715                             {
00716                                 for (int32_t j=0; j<overflow_len; j++)
00717                                     features[lines].string[j]=overflow[j];
00718                                 for (int32_t j=0; j<len; j++)
00719                                     features[lines].string[j+overflow_len]=dummy[old_sz+j];
00720                                 alpha->add_string_to_histogram(&dummy[old_sz], len);
00721                                 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00722                             }
00723 
00724                             // clear overflow
00725                             overflow_len=0;
00726 
00727                             //CMath::display_vector(features[lines].string, len);
00728                             old_sz=i+1;
00729                             lines++;
00730                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00731                         }
00732                     }
00733                     for (size_t i=old_sz; i<sz; i++)
00734                         overflow[i-old_sz]=dummy[i];
00735 
00736                     overflow_len=sz-old_sz;
00737                 }
00738 
00739                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00740                 {
00741                     SG_INFO("file successfully read\n");
00742                     SG_INFO("max_string_length=%d\n", max_string_length);
00743                     SG_INFO("num_strings=%d\n", num_vectors);
00744                 }
00745                 fclose(f);
00746             }
00747 
00748             delete[] dummy;
00749 
00750             SG_UNREF(alphabet);
00751 
00752             if (remap_to_bin)
00753                 alphabet = alpha_bin;
00754             else
00755                 alphabet = alpha;
00756             SG_REF(alphabet);
00757             num_symbols=alphabet->get_num_symbols();
00758         }
00759 
00766         bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00767         {
00768             int32_t i=0;
00769             uint64_t len=0;
00770             uint64_t offs=0;
00771             int32_t num=0;
00772             int32_t max_len=0;
00773 
00774             CMemoryMappedFile<char> f(fname);
00775 
00776             while (true)
00777             {
00778                 char* s=f.get_line(len, offs);
00779                 if (!s)
00780                     break;
00781 
00782                 if (len>0 && s[0]=='>')
00783                     num++;
00784             }
00785 
00786             if (num==0)
00787                 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00788 
00789             cleanup();
00790             SG_UNREF(alphabet);
00791             alphabet=new CAlphabet(DNA);
00792             num_symbols=alphabet->get_num_symbols();
00793 
00794             TString<ST>* strings=new TString<ST>[num];
00795             offs=0;
00796 
00797             for (i=0;i<num; i++)
00798             {
00799                 uint64_t id_len=0;
00800                 char* id=f.get_line(id_len, offs);
00801 
00802                 char* fasta=f.get_line(len, offs);
00803                 char* s=fasta;
00804                 int32_t fasta_len=0;
00805                 int32_t spanned_lines=0;
00806 
00807                 while (true)
00808                 {
00809                     if (!s || len==0)
00810                         SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00811 
00812                     if (s[0]=='>' || offs==f.get_size())
00813                     {
00814                         offs-=len+1; // seek to beginning
00815                         if (offs==f.get_size())
00816                         {
00817                             SG_DEBUG("at EOF\n");
00818                             fasta_len+=len;
00819                         }
00820 
00821                         len = fasta_len-spanned_lines;
00822                         strings[i].string=new ST[len];
00823                         strings[i].length=len;
00824 
00825                         ST* str=strings[i].string;
00826                         int32_t idx=0;
00827                         SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00828 
00829                         for (int32_t j=0; j<fasta_len; j++)
00830                         {
00831                             if (fasta[j]=='\n')
00832                                 continue;
00833 
00834                             ST c = (ST) fasta[j];
00835 
00836                             if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00837                                 c = (ST) 'A';
00838 
00839                             if (idx>=len)
00840                                 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00841                             str[idx++]=c;
00842                         }
00843                         max_len=CMath::max(max_len, strings[i].length);
00844 
00845 
00846                         break;
00847                     }
00848 
00849                     spanned_lines++;
00850                     fasta_len+=len+1; // including '\n'
00851                     s=f.get_line(len, offs);
00852                 }
00853             }
00854 
00855             return set_features(strings, num, max_len);
00856         }
00857 
00865         bool load_fastq_file(const char* fname,
00866                 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00867         {
00868             CMemoryMappedFile<char> f(fname);
00869 
00870             int32_t i=0;
00871             uint64_t len=0;
00872             uint64_t offs=0;
00873 
00874             int32_t num=f.get_num_lines();
00875             int32_t max_len=0;
00876 
00877             if (num%4)
00878                 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00879             num/=4;
00880 
00881             cleanup();
00882             SG_UNREF(alphabet);
00883             alphabet=new CAlphabet(DNA);
00884 
00885             TString<ST>* strings;
00886 
00887             ST* str;
00888             if (bitremap_in_single_string)
00889             {
00890                 strings=new TString<ST>[1];
00891                 strings[0].string=new ST[num];
00892                 strings[0].length=num;
00893                 f.get_line(len, offs);
00894                 f.get_line(len, offs);
00895                 order=len;
00896                 max_len=num;
00897                 offs=0;
00898                 original_num_symbols=alphabet->get_num_symbols();
00899                 int32_t max_val=alphabet->get_num_bits();
00900                 str=new ST[len];
00901             }
00902             else
00903                 strings=new TString<ST>[num];
00904 
00905             for (i=0;i<num; i++)
00906             {
00907                 if (!f.get_line(len, offs))
00908                     SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00909 
00910                 char* s=f.get_line(len, offs);
00911                 if (!s || len==0)
00912                     SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00913 
00914                 if (bitremap_in_single_string)
00915                 {
00916                     if (len!=order)
00917                         SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00918                     for (int32_t j=0; j<order; j++)
00919                         str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00920 
00921                     strings[0].string[i]=embed_word(str, order);
00922                 }
00923                 else
00924                 {
00925                     strings[i].string=new ST[len];
00926                     strings[i].length=len;
00927                     str=strings[i].string;
00928 
00929                     if (ignore_invalid)
00930                     {
00931                         for (int32_t j=0; j<len; j++)
00932                         {
00933                             if (alphabet->is_valid((uint8_t) s[j]))
00934                                 str[j]= (ST) s[j];
00935                             else
00936                                 str[j]= (ST) 'A';
00937                         }
00938                     }
00939                     else
00940                     {
00941                         for (int32_t j=0; j<len; j++)
00942                             str[j]= (ST) s[j];
00943                     }
00944                     max_len=CMath::max(max_len, (int32_t) len);
00945                 }
00946 
00947 
00948                 if (!f.get_line(len, offs))
00949                     SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00950 
00951                 if (!f.get_line(len, offs))
00952                     SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00953             }
00954 
00955             if (bitremap_in_single_string)
00956                 num=1;
00957 
00958             num_vectors=num;
00959             max_string_length=max_len;
00960             features=strings;
00961 
00962             return true;
00963         }
00964 
00970         bool load_from_directory(char* dirname)
00971         {
00972             struct dirent **namelist;
00973             int32_t n;
00974 
00975             IO::set_dirname(dirname);
00976 
00977             SG_DEBUG("dirname '%s'\n", dirname);
00978 
00979             n = scandir(dirname, &namelist, &IO::filter, alphasort);
00980             if (n <= 0)
00981             {
00982                 SG_ERROR("error calling scandir - no files found\n");
00983                 return false;
00984             }
00985             else
00986             {
00987                 TString<ST>* strings=NULL;
00988 
00989                 int32_t num=0;
00990                 int32_t max_len=-1;
00991 
00992                 //usually n==num_vec, but it might not in race conditions
00993                 //(file perms modified, file erased)
00994                 strings=new TString<ST>[n];
00995 
00996                 for (int32_t i=0; i<n; i++)
00997                 {
00998                     char* fname=IO::concat_filename(namelist[i]->d_name);
00999 
01000                     struct stat s;
01001                     off_t filesize=0;
01002 
01003                     if (!stat(fname, &s) && s.st_size>0)
01004                     {
01005                         filesize=s.st_size/sizeof(ST);
01006 
01007                         FILE* f=fopen(fname, "ro");
01008                         if (f)
01009                         {
01010                             ST* str=new ST[filesize];
01011                             SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01012                             fread(str, sizeof(ST), filesize, f);
01013                             strings[num].string=str;
01014                             strings[num].length=filesize;
01015                             max_len=CMath::max(max_len, strings[num].length);
01016 
01017                             num++;
01018                             fclose(f);
01019                         }
01020                     }
01021                     else
01022                         SG_ERROR("empty or non readable file \'%s\'\n", fname);
01023 
01024                     free(namelist[i]);
01025                 }
01026                 free(namelist);
01027 
01028                 if (num>0 && strings)
01029                 {
01030                     set_features(strings, num, max_len);
01031                     return true;
01032                 }
01033             }
01034             return false;
01035         }
01036 
01044         bool set_features(TString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01045         {
01046             if (p_features)
01047             {
01048                 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01049 
01050                 //compute histogram for char/byte
01051                 for (int32_t i=0; i<p_num_vectors; i++)
01052                     alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01053 
01054                 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01055                 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01056 
01057                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01058                 {
01059                     cleanup();
01060                     SG_UNREF(alphabet);
01061 
01062                     alphabet=alpha;
01063                     SG_REF(alphabet);
01064 
01065                     this->features=p_features;
01066                     this->num_vectors=p_num_vectors;
01067                     this->max_string_length=p_max_string_length;
01068 
01069                     return true;
01070                 }
01071                 else
01072                     SG_UNREF(alpha);
01073             }
01074 
01075             return false;
01076         }
01077 
01083         bool append_features(CStringFeatures<ST>* sf)
01084         {
01085             ASSERT(sf);
01086             TString<ST>* new_features = new TString<ST>[sf->num_vectors];
01087 
01088             for (int32_t i=0; i<sf->num_vectors; i++)
01089             {
01090                 int32_t length=sf->features[i].length;
01091                 new_features[i].string=new ST[length];
01092                 memcpy(new_features[i].string, sf->features[i].string, length);
01093                 new_features[i].length=length;
01094             }
01095             return append_features(new_features, sf->num_vectors,
01096                     sf->max_string_length);
01097         }
01098 
01109         bool append_features(TString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01110         {
01111             if (!features)
01112                 return set_features(p_features, p_num_vectors, p_max_string_length);
01113 
01114             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01115 
01116             //compute histogram for char/byte
01117             for (int32_t i=0; i<p_num_vectors; i++)
01118                 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01119 
01120             SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01121             SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01122 
01123             if (alpha->check_alphabet_size() && alpha->check_alphabet())
01124             {
01125                 SG_UNREF(alpha);
01126                 for (int32_t i=0; i<p_num_vectors; i++)
01127                     alphabet->add_string_to_histogram( p_features[i].string, p_features[i].length);
01128 
01129                 int32_t old_num_vectors=num_vectors;
01130                 num_vectors=old_num_vectors+p_num_vectors;
01131                 TString<ST>* new_features = new TString<ST>[num_vectors];
01132 
01133                 for (int32_t i=0; i<num_vectors; i++)
01134                 {
01135                     if (i<old_num_vectors)
01136                     {
01137                         new_features[i].string=features[i].string;
01138                         new_features[i].length=features[i].length;
01139                     }
01140                     else
01141                     {
01142                         new_features[i].string=p_features[i-old_num_vectors].string;
01143                         new_features[i].length=p_features[i-old_num_vectors].length;
01144                     }
01145                 }
01146                 delete[] features;
01147                 delete[] p_features; // free now obsolete features
01148 
01149                 this->features=new_features;
01150                 this->max_string_length=CMath::max(max_string_length, p_max_string_length);
01151 
01152                 return true;
01153             }
01154             SG_UNREF(alpha);
01155 
01156             return false;
01157         }
01158 
01165         virtual TString<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01166         {
01167             num_str=num_vectors;
01168             max_str_len=max_string_length;
01169             return features;
01170         }
01171 
01178         virtual TString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01179         {
01180             ASSERT(num_vectors>0);
01181 
01182             num_str=num_vectors;
01183             max_str_len=max_string_length;
01184             TString<ST>* new_feat=new TString<ST>[num_str];
01185 
01186             for (int32_t i=0; i<num_str; i++)
01187             {
01188                 int32_t len;
01189                 bool free_vec;
01190                 ST* vec=get_feature_vector(i, len, free_vec);
01191                 new_feat[i].string=new ST[len];
01192                 new_feat[i].length=len;
01193                 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01194                 free_feature_vector(vec, i, free_vec);
01195             }
01196 
01197             return new_feat;
01198         }
01199 
01205         virtual void get_features(TString<ST>** dst, int32_t* num_str)
01206         {
01207             int32_t num_vec;
01208             int32_t max_str_len;
01209             *dst=copy_features(num_vec, max_str_len);
01210             *num_str=num_vec;
01211         }
01212 
01217         virtual inline void save(CFile* writer); // declaration only, the body is not part of the class body; presumably stores the features through 'writer' (inferred from the names -- confirm against the definition)
01218 
01225         virtual bool load_compressed(char* src, bool decompress)
01226         {
01227             FILE* file=NULL;
01228 
01229             if (!(file=fopen(src, "r")))
01230                 return false;
01231             cleanup();
01232 
01233             // header shogun v0
01234             char id[4];
01235             fread(&id[0], sizeof(char), 1, file);
01236             ASSERT(id[0]=='S');
01237             fread(&id[1], sizeof(char), 1, file);
01238             ASSERT(id[1]=='G');
01239             fread(&id[2], sizeof(char), 1, file);
01240             ASSERT(id[2]=='V');
01241             fread(&id[3], sizeof(char), 1, file);
01242             ASSERT(id[3]=='0');
01243 
01244             //compression type
01245             uint8_t c;
01246             fread(&c, sizeof(uint8_t), 1, file);
01247             CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01248             //alphabet
01249             uint8_t a;
01250             delete alphabet;
01251             fread(&a, sizeof(uint8_t), 1, file);
01252             alphabet=new CAlphabet((EAlphabet) a);
01253             // number of vectors
01254             fread(&num_vectors, sizeof(int32_t), 1, file);
01255             ASSERT(num_vectors>0);
01256             // maximum string length
01257             fread(&max_string_length, sizeof(int32_t), 1, file);
01258             ASSERT(max_string_length>0);
01259 
01260             features=new TString<ST>[num_vectors];
01261 
01262             // vectors
01263             for (int32_t i=0; i<num_vectors; i++)
01264             {
01265                 // vector len compressed
01266                 int32_t len_compressed;
01267                 fread(&len_compressed, sizeof(int32_t), 1, file);
01268                 // vector len uncompressed
01269                 int32_t len_uncompressed;
01270                 fread(&len_uncompressed, sizeof(int32_t), 1, file);
01271 
01272                 // vector raw data
01273                 if (decompress)
01274                 {
01275                     features[i].string=new ST[len_uncompressed];
01276                     features[i].length=len_uncompressed;
01277                     uint8_t* compressed=new uint8_t[len_compressed];
01278                     fread(compressed, len_compressed, 1, file);
01279                     uint64_t uncompressed_size=len_uncompressed;
01280                     uncompressed_size*=sizeof(ST);
01281                     compressor->decompress(compressed, len_compressed,
01282                             (uint8_t*) features[i].string, uncompressed_size);
01283                     delete[] compressed;
01284                     ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01285                 }
01286                 else
01287                 {
01288                     int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01289                     features[i].string=new ST[len_compressed+offs];
01290                     features[i].length=len_compressed+offs;
01291                     int32_t* feat32ptr=((int32_t*) (features[i].string));
01292                     memset(features[i].string, 0, offs*sizeof(ST));
01293                     feat32ptr[0]=(int32_t) len_compressed;
01294                     feat32ptr[1]=(int32_t) len_uncompressed;
01295                     uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01296                     fread(compressed, len_compressed, 1, file);
01297                 }
01298             }
01299 
01300             delete compressor;
01301             fclose(file);
01302             return false;
01303         }
01304 
01312         virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01313         {
01314             FILE* file=NULL;
01315 
01316             if (!(file=fopen(dest, "wb")))
01317                 return false;
01318 
01319             CCompressor* compressor= new CCompressor(compression);
01320 
01321             // header shogun v0
01322             const char* id="SGV0";
01323             fwrite(&id[0], sizeof(char), 1, file);
01324             fwrite(&id[1], sizeof(char), 1, file);
01325             fwrite(&id[2], sizeof(char), 1, file);
01326             fwrite(&id[3], sizeof(char), 1, file);
01327 
01328             //compression type
01329             uint8_t c=(uint8_t) compression;
01330             fwrite(&c, sizeof(uint8_t), 1, file);
01331             //alphabet
01332             uint8_t a=(uint8_t) alphabet->get_alphabet();
01333             fwrite(&a, sizeof(uint8_t), 1, file);
01334             // number of vectors
01335             fwrite(&num_vectors, sizeof(int32_t), 1, file);
01336             // maximum string length
01337             fwrite(&max_string_length, sizeof(int32_t), 1, file);
01338 
01339             // vectors
01340             for (int32_t i=0; i<num_vectors; i++)
01341             {
01342                 int32_t len=-1;
01343                 bool vfree;
01344                 ST* vec=get_feature_vector(i, len, vfree);
01345 
01346                 uint8_t* compressed=NULL;
01347                 uint64_t compressed_size=0;
01348 
01349                 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01350                         compressed, compressed_size, level);
01351 
01352                 int32_t len_compressed = (int32_t) compressed_size;
01353                 // vector len compressed in bytes
01354                 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01355                 // vector len uncompressed in number of elements of type ST
01356                 fwrite(&len, sizeof(int32_t), 1, file);
01357                 // vector raw data
01358                 fwrite(compressed, compressed_size, 1, file);
01359                 delete[] compressed;
01360 
01361                 free_feature_vector(vec, i, vfree);
01362             }
01363 
01364             delete compressor;
01365             fclose(file);
01366             return true;
01367         }
01368 
01369 
01374         virtual int32_t get_size() { return sizeof(ST); }
01375 
01381         virtual bool apply_preproc(bool force_preprocessing=false)
01382         {
01383             SG_DEBUG( "force: %d\n", force_preprocessing);
01384 
01385             for (int32_t i=0; i<get_num_preproc(); i++)
01386             {
01387                 if ( (!is_preprocessed(i) || force_preprocessing) )
01388                 {
01389                     set_preprocessed(i);
01390                     CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
01391                     SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01392 
01393                     if (!p->apply_to_string_features(this))
01394                     {
01395                         SG_UNREF(p);
01396                         return false;
01397                     }
01398                     else
01399                         SG_UNREF(p);
01400                 }
01401             }
01402             return true;
01403         }
01404 
01414         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01415         {
01416             ASSERT(step_size>0);
01417             ASSERT(window_size>0);
01418             ASSERT(num_vectors==1 || single_string);
01419             ASSERT(max_string_length>=window_size ||
01420                     (single_string && length_of_single_string>=window_size));
01421 
01422             //in case we are dealing with a single remapped string
01423             //allow remapping
01424             if (single_string)
01425                 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01426             else if (num_vectors==1)
01427             {
01428                 num_vectors= (max_string_length-window_size)/step_size + 1;
01429                 length_of_single_string=max_string_length;
01430             }
01431 
01432             TString<ST>* f=new TString<ST>[num_vectors];
01433             int32_t offs=0;
01434             for (int32_t i=0; i<num_vectors; i++)
01435             {
01436                 f[i].string=&features[0].string[offs+skip];
01437                 f[i].length=window_size-skip;
01438                 offs+=step_size;
01439             }
01440             single_string=features[0].string;
01441             delete[] features;
01442             features=f;
01443             max_string_length=window_size-skip;
01444 
01445             return num_vectors;
01446         }
01447 
01456         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01457         {
01458             ASSERT(positions);
01459             ASSERT(window_size>0);
01460             ASSERT(num_vectors==1 || single_string);
01461             ASSERT(max_string_length>=window_size ||
01462                     (single_string && length_of_single_string>=window_size));
01463 
01464             num_vectors= positions->get_num_elements();
01465             ASSERT(num_vectors>0);
01466 
01467             int32_t len;
01468 
01469             //in case we are dealing with a single remapped string
01470             //allow remapping
01471             if (single_string)
01472                 len=length_of_single_string;
01473             else
01474             {
01475                 single_string=features[0].string;
01476                 len=max_string_length;
01477                 length_of_single_string=max_string_length;
01478             }
01479 
01480             TString<ST>* f=new TString<ST>[num_vectors];
01481             for (int32_t i=0; i<num_vectors; i++)
01482             {
01483                 int32_t p=positions->get_element(i);
01484 
01485                 if (p>=0 && p<=len-window_size)
01486                 {
01487                     f[i].string=&features[0].string[p+skip];
01488                     f[i].length=window_size-skip;
01489                 }
01490                 else
01491                 {
01492                     num_vectors=1;
01493                     max_string_length=len;
01494                     features[0].length=len;
01495                     single_string=NULL;
01496                     delete[] f;
01497                     SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01498                             window_size, i, p, len);
01499                     return -1;
01500                 }
01501             }
01502 
01503             delete[] features;
01504             features=f;
01505             max_string_length=window_size-skip;
01506 
01507             return num_vectors;
01508         }
01509 
01521         inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01522         {
01523             return obtain_from_char_features(sf, start, p_order, gap, rev);
01524         }
01525 
01535         template <class CT>
01536             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01537             {
01538                 ASSERT(sf);
01539 
01540                 CAlphabet* alpha=sf->get_alphabet();
01541                 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01542 
01543                 this->order=p_order;
01544                 cleanup();
01545 
01546                 num_vectors=sf->get_num_vectors();
01547                 ASSERT(num_vectors>0);
01548                 max_string_length=sf->get_max_vector_length()-start;
01549                 features=new TString<ST>[num_vectors];
01550 
01551                 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01552                         alpha->get_num_symbols_in_histogram());
01553 
01554                 for (int32_t i=0; i<num_vectors; i++)
01555                 {
01556                     int32_t len=-1;
01557                     bool vfree;
01558                     CT* c=sf->get_feature_vector(i, len, vfree);
01559                     ASSERT(!vfree); // won't work when preprocessors are attached
01560 
01561                     features[i].string=new ST[len];
01562                     features[i].length=len;
01563 
01564                     ST* str=features[i].string;
01565                     for (int32_t j=0; j<len; j++)
01566                         str[j]=(ST) alpha->remap_to_bin(c[j]);
01567                 }
01568 
01569                 original_num_symbols=alpha->get_num_symbols();
01570                 int32_t max_val=alpha->get_num_bits();
01571 
01572                 SG_UNREF(alpha);
01573 
01574                 if (p_order>1)
01575                     num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01576                 else
01577                     num_symbols=original_num_symbols;
01578                 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01579 
01580                 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01581                 {
01582                     SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01583                     return false;
01584                 }
01585 
01586                 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01587                 for (int32_t line=0; line<num_vectors; line++)
01588                 {
01589                     int32_t len=0;
01590                     bool vfree;
01591                     ST* fv=get_feature_vector(line, len, vfree);
01592                     ASSERT(!vfree); // won't work when preprocessors are attached
01593 
01594                     if (rev)
01595                         CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01596                     else
01597                         CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01598 
01599                     /* fix the length of the string -- hacky */
01600                     features[line].length-=start+gap ;
01601                     if (features[line].length<0)
01602                         features[line].length=0 ;
01603                 }
01604 
01605                 compute_symbol_mask_table(max_val);
01606 
01607                 return true;
01608             }
01609 
01617         bool have_same_length(int32_t len=-1)
01618         {
01619             if (len!=-1)
01620             {
01621                 if (len!=get_max_vector_length())
01622                     return false;
01623             }
01624             len = get_max_vector_length();
01625 
01626             for (int32_t i=0; i<num_vectors; i++)
01627             {
01628                 if (get_vector_length(i)!=len)
01629                     return false;
01630             }
01631 
01632             return true;
01633         }
01634 
01639         inline void embed_features(int32_t p_order)
01640         {
01641             ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01642 
01643             order=p_order;
01644             original_num_symbols=alphabet->get_num_symbols();
01645             int32_t max_val=alphabet->get_num_bits();
01646 
01647             if (p_order>1)
01648                 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01649             else
01650                 num_symbols=original_num_symbols;
01651 
01652             SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01653 
01654             if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01655                 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01656 
01657             ST mask=0;
01658             for (int32_t i=0; i<p_order*max_val; i++)
01659                 mask= (mask<<1) | ((ST) 1);
01660 
01661             for (int32_t i=0; i<num_vectors; i++)
01662             {
01663                 int32_t len=features[i].length;
01664 
01665                 if (len < p_order)
01666                     SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01667 
01668                 ST* str = features[i].string;
01669 
01670                 // convert first word
01671                 for (int32_t j=0; j<p_order; j++)
01672                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01673                 str[0]=embed_word(&str[0], p_order);
01674 
01675                 // convert the rest
01676                 int32_t idx=0;
01677                 for (int32_t j=p_order; j<len; j++)
01678                 {
01679                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01680                     str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01681                     idx++;
01682                 }
01683 
01684                 features[i].length=len-p_order+1;
01685             }
01686 
01687             compute_symbol_mask_table(max_val);
01688         }
01689 
01694         inline void compute_symbol_mask_table(int64_t max_val)
01695         {
01696             delete[] symbol_mask_table;
01697             symbol_mask_table=new ST[256];
01698 
01699             uint64_t mask=0;
01700             for (int32_t i=0; i< (int64_t) max_val; i++)
01701                 mask=(mask<<1) | 1;
01702 
01703             for (int32_t i=0; i<256; i++)
01704             {
01705                 uint8_t bits=(uint8_t) i;
01706                 symbol_mask_table[i]=0;
01707 
01708                 for (int32_t j=0; j<8; j++)
01709                 {
01710                     if (bits & 1)
01711                         symbol_mask_table[i]|=mask<<(max_val*j);
01712 
01713                     bits>>=1;
01714                 }
01715             }
01716         }
01717 
01724         inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01725         {
01726             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01727 
01728             ST mask=0;
01729             for (int32_t i=0; i<nbits; i++)
01730                 mask=(mask<<1) | (ST) 1;
01731 
01732             for (int32_t i=0; i<len; i++)
01733             {
01734                 ST w=(word & mask);
01735                 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01736                 word>>=nbits;
01737             }
01738         }
01739 
01745         inline ST embed_word(ST* seq, int32_t len)
01746         {
01747             ST value=(ST) 0;
01748             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01749             for (int32_t i=0; i<len; i++)
01750             {
01751                 value<<=nbits;
01752                 value|=seq[i];
01753             }
01754 
01755             return value;
01756         }
01757 
01760         void determine_maximum_string_length()
01761         {
01762             max_string_length=0;
01763 
01764             for (int32_t i=0; i<num_vectors; i++)
01765                 max_string_length=CMath::max(max_string_length, features[i].length);
01766         }
01767 
01775         static ST* get_zero_terminated_string_copy(TString<ST> str)
01776         {
01777             int32_t l=str.length;
01778             ST* s=new ST[l+1];
01779             memcpy(s, str.string, sizeof(ST)*l);
01780             s[l]='\0';
01781             return s;
01782         }
01783 
01790         virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01791         {
01792             ASSERT(features);
01793             ASSERT(num<num_vectors);
01794 
01795             features[num].length=len ;
01796             features[num].string=string ;
01797 
01798             max_string_length=CMath::max(len, max_string_length);
01799         }
01800 
01801 
01804         virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true)
01805         {
01806             int32_t nsym=get_num_symbols();
01807             int32_t slen=get_max_vector_length();
01808             int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01809             float64_t* h= (float64_t*) malloc(sz);
01810             ASSERT(h);
01811             memset(h, 0, sz);
01812 
01813             float64_t* h_normalizer=new float64_t[slen];
01814             memset(h_normalizer, 0, slen*sizeof(float64_t));
01815             int32_t num_str=get_num_vectors();
01816             for (int32_t i=0; i<num_str; i++)
01817             {
01818                 int32_t len;
01819                 bool free_vec;
01820                 ST* vec=get_feature_vector(i, len, free_vec);
01821                 for (int32_t j=0; j<len; j++)
01822                 {
01823                     h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01824                     h_normalizer[j]++;
01825                 }
01826                 free_feature_vector(vec, i, free_vec);
01827             }
01828 
01829             if (normalize)
01830             {
01831                 for (int32_t i=0; i<slen; i++)
01832                 {
01833                     for (int32_t j=0; j<nsym; j++)
01834                     {
01835                         if (h_normalizer && h_normalizer[i])
01836                             h[int64_t(i)*nsym+j]/=h_normalizer[i];
01837                     }
01838                 }
01839             }
01840             delete[] h_normalizer;
01841 
01842             *hist=h;
01843             *rows=nsym;
01844             *cols=slen;
01845         }
01846 
01849         virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01850         {
01851             ASSERT(rows == get_num_symbols());
01852             cleanup();
01853             float64_t* randoms=new float64_t[cols];
01854             TString<ST>* sf=new TString<ST>[num_vec];
01855 
01856             for (int32_t i=0; i<num_vec; i++)
01857             {
01858                 sf[i].string=new ST[cols];
01859                 sf[i].length=cols;
01860 
01861                 CMath::random_vector(randoms, cols, 0.0, 1.0);
01862 
01863                 for (int32_t j=0; j<cols; j++)
01864                 {
01865                     float64_t lik=hist[int64_t(j)*rows+0];
01866 
01867                     int32_t c;
01868                     for (c=0; c<rows-1; c++)
01869                     {
01870                         if (randoms[j]<=lik)
01871                             break;
01872                         lik+=hist[int64_t(j)*rows+c+1];
01873                     }
01874                     sf[i].string[j]=alphabet->remap_to_char(c);
01875                 }
01876             }
01877             delete[] randoms;
01878             set_features(sf, num_vec, cols);
01879         }
01880 
01881         /*
01882         CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
01883         {
01884             int *s;
01885             int32_t nStr=get_num_vectors();
01886 
01887             int32_t nfeat = 0;
01888             for (int32_t i = 0; i < nStr; ++i)
01889                 nfeat += get_vector_length[i] - d1 -d2;
01890             TString<SSKFeature>* F= new TString<SSKFeature>[nfeat];
01891             int32_t c = 0;
01892             for (int32_t i = 0; i < nStr; ++i)
01893             {
01894             int32_t len;
01895             bool free_vec;
01896             ST* S=get_feature_vector(vec_num, len, free_vec);
01897             free_feature_vector(vec, vec_num, free_vec);
01898                 int32_t n = len - d1 - d2;
01899                 s = S[i];
01900                 for (int32_t j = 0; j < n; ++j)
01901                 {
01902                     F[c].feature1 = s[j];
01903                     F[c].feature2 = s[j+d1];
01904                     F[c].feature3 = s[j+d1+d2];
01905                     F[c].group = i;
01906                     c++;
01907                 }
01908             }
01909             ASSERT(nfeat==c);
01910             return F;
01911         }
01912 
01913         CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
01914         {
01915             int i, j;
01916             int n, nfeat;
01917             int *group;
01918             int *features;
01919             int *s;
01920             int c;
01921             SSKFeatures *F;
01922 
01923             nfeat = 0;
01924             for (i = 0; i < nStr; ++i)
01925                 nfeat += len[i] - d1;
01926             group = (int *)malloc(nfeat*sizeof(int));
01927             features = (int *)malloc(nfeat*2*sizeof(int *));
01928             c = 0;
01929             for (i = 0; i < nStr; ++i)
01930             {
01931                 n = len[i] - d1;
01932                 s = S[i];
01933                 for (j = 0; j < n; ++j)
01934                 {
01935                     features[c] = s[j];
01936                     features[c+nfeat] = s[j+d1];
01937                     group[c] = i;
01938                     c++;
01939                 }
01940             }
01941             if (nfeat!=c)
01942                 printf("Something is wrong...\n");
01943             F = (SSKFeatures *)malloc(sizeof(SSKFeatures));
01944             (*F).features = features;
01945             (*F).group = group;
01946             (*F).n = nfeat;
01947             return F;
01948         }
01949     */
01950 
01951 
01952 
01954         inline virtual const char* get_name() const { return "StringFeatures"; }
01955 
01956     protected:
01957 
/**
 * Copy the string stored for one vector into a newly allocated buffer.
 *
 * The index is validated with ASSERT before it is used. Negative indices are
 * rejected together with indices >= num_vectors, because either would index
 * outside the features array.
 *
 * @param num index of the vector to copy, expected in [0, num_vectors)
 * @param len output: set to features[num].length, even when that value is
 *            not positive
 * @return buffer of len elements allocated with new[] that holds a copy of
 *         features[num].string, or NULL when the stored length is <= 0
 *         (presumably the caller releases the buffer with delete[] - confirm
 *         against the callers)
 */
01968         virtual ST* compute_feature_vector(int32_t num, int32_t& len)
01969         {
01970             ASSERT(features && num>=0 && num<num_vectors);
01971 
01972             len=features[num].length;
01973             if (len<=0) // nothing to copy; len still reports the stored length
01974                 return NULL;
01975 
01976             ST* target=new ST[len];
01977             memcpy(target, features[num].string, len*sizeof(ST));
01978             return target;
01979         }
01980 
01981     private:
/**
 * Record the element type and register the members of this object.
 *
 * set_generic<ST>() is called first; afterwards each listed member is handed
 * to m_parameters together with a name and, for most of them, a description.
 * symbol_mask_table and feature_cache are not registered here (see the TODO
 * at the end of the function for symbol_mask_table).
 *
 * NOTE(review): what m_parameters does with the registered members is not
 * visible in this file - presumably serialization; confirm in
 * base/Parameter.h.
 */
01982         void init(void)
01983         {
01984             set_generic<ST>();
01985 
01986             m_parameters->add((CSGObject**) &alphabet, "alphabet"); // CAlphabet* passed through a CSGObject** cast
01987             m_parameters->add_vector(&features, &num_vectors, "features", // num_vectors is the vector length
01988                     "This contains the array of features.");
01989             m_parameters->add_vector(&single_string,
01990                     &length_of_single_string,
01991                     "single_string",
01992                     "Created by sliding window.");
01993             m_parameters->add(&max_string_length, "max_string_length",
01994                     "Length of longest string.");
01995             m_parameters->add(&num_symbols, "num_symbols",
01996                     "Number of used symbols.");
01997             m_parameters->add(&original_num_symbols, "original_num_symbols",
01998                     "Original number of used symbols.");
01999             m_parameters->add(&order, "order",
02000                     "Order used in higher order mapping.");
02001             m_parameters->add(&preprocess_on_get, "preprocess_on_get",
02002                     "Preprocess on-the-fly?");
02003 
02004             /* TODO M_PARAMETERS->ADD?
02005              * /// order used in higher order mapping
02006              * ST* symbol_mask_table;
02007              */
02008         }
02009 
02010 
02011     protected:
02012 
02014         CAlphabet* alphabet; ///< alphabet object; registered as "alphabet" in init()
02015 
02017         int32_t num_vectors; ///< number of entries in features (registered as its length in init())
02018 
02020         TString<ST>* features; ///< the array of features, one TString<ST> per vector
02021 
02023         ST* single_string; ///< single string created by sliding window (per its description in init())
02024 
02026         int32_t length_of_single_string; ///< length of single_string (registered as its length in init())
02027 
02029         int32_t max_string_length; ///< length of longest string
02030 
02032         floatmax_t num_symbols; ///< number of used symbols
02033 
02035         floatmax_t original_num_symbols; ///< original number of used symbols
02036 
02038         int32_t order; ///< order used in higher order mapping
02039 
02041         ST* symbol_mask_table; ///< symbol mask table; not registered in init() - presumably filled by compute_symbol_mask_table(), confirm
02042 
02044         bool preprocess_on_get; ///< preprocess on-the-fly?
02045 
02047         CCache<ST>* feature_cache; ///< cache object; not registered in init() - presumably caches computed feature vectors, confirm
02048 };
02049 
02050 #ifndef DOXYGEN_SHOULD_SKIP_THIS
02051 
/**
 * Explicit specializations of get_feature_type(): each one returns the
 * EFeatureType constant that corresponds to the element type ST.
 *
 *   bool      -> F_BOOL        char      -> F_CHAR
 *   uint8_t   -> F_BYTE        int16_t   -> F_SHORT
 *   uint16_t  -> F_WORD        int32_t   -> F_INT
 *   uint32_t  -> F_UINT        int64_t   -> F_LONG
 *   uint64_t  -> F_ULONG       float32_t -> F_SHORTREAL
 *   float64_t -> F_DREAL       floatmax_t -> F_LONGREAL
 *
 * NOTE(review): this group has no specialization for int8_t, although
 * load()/save() are generated for int8_t by the LOAD/SAVE macros in this
 * file - confirm whether that is intentional.
 */
02055 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
02056 {
02057     return F_BOOL;
02058 }
02059 
02064 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
02065 {
02066     return F_CHAR;
02067 }
02068 
02073 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
02074 {
02075     return F_BYTE;
02076 }
02077 
02082 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
02083 {
02084     return F_SHORT;
02085 }
02086 
02091 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
02092 {
02093     return F_WORD;
02094 }
02095 
02100 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
02101 {
02102     return F_INT;
02103 }
02104 
02109 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
02110 {
02111     return F_UINT;
02112 }
02113 
02118 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
02119 {
02120     return F_LONG;
02121 }
02122 
02127 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
02128 {
02129     return F_ULONG;
02130 }
02131 
02136 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
02137 {
02138     return F_SHORTREAL;
02139 }
02140 
02145 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
02146 {
02147     return F_DREAL;
02148 }
02149 
02154 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
02155 {
02156     return F_LONGREAL;
02157 }
02158 
/**
 * Specializations of get_masked_symbols() for bool and the floating point
 * element types (float32_t, float64_t, floatmax_t).
 *
 * No masking is applied for these types: symbol is returned unchanged and
 * mask is ignored.
 */
02159 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
02160 {
02161     return symbol;
02162 }
02163 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
02164 {
02165     return symbol;
02166 }
02167 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
02168 {
02169     return symbol;
02170 }
02171 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
02172 {
02173     return symbol;
02174 }
02175 
/**
 * Specializations of shift_offset() for bool and the floating point element
 * types (float32_t, float64_t, floatmax_t).
 *
 * Both arguments are ignored; the bool version returns false and the
 * floating point versions return 0.
 */
02176 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02177 {
02178     return false;
02179 }
02180 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02181 {
02182     return 0;
02183 }
02184 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02185 {
02186     return 0;
02187 }
02188 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02189 {
02190     return 0;
02191 }
02192 
/**
 * Specializations of shift_symbol() for bool and the floating point element
 * types (float32_t, float64_t, floatmax_t).
 *
 * No shift is performed for these types: symbol is returned unchanged and
 * amount is ignored.
 */
02193 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02194 {
02195     return symbol;
02196 }
02197 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02198 {
02199     return symbol;
02200 }
02201 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02202 {
02203     return symbol;
02204 }
02205 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02206 {
02207     return symbol;
02208 }
02209 
/**
 * Specializations of the member template obtain_from_char_features() for the
 * floating point element types (float32_t, float64_t, floatmax_t), compiled
 * only when SUNOS is not defined.
 *
 * None of the arguments is used and the object is left untouched; the
 * functions simply return false (presumably to signal that the conversion is
 * not available for these element types - confirm against the generic
 * implementation).
 */
02210 #ifndef SUNOS
02211 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02212 {
02213     return false;
02214 }
02215 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02216 {
02217     return false;
02218 }
02219 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02220 {
02221     return false;
02222 }
02223 #endif
02224 
/**
 * Specializations of embed_features() for the floating point element types
 * (float32_t, float64_t, floatmax_t).
 *
 * The bodies are empty: p_order is ignored and nothing is modified.
 */
02225 template<>  inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02226 {
02227 }
02228 template<>  inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02229 {
02230 }
02231 template<>  inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02232 {
02233 }
02234 
/**
 * Specializations of compute_symbol_mask_table() for the floating point
 * element types (float32_t, float64_t, floatmax_t).
 *
 * The bodies are empty: max_val is ignored and no table is computed.
 */
02235 template<>  inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02236 {
02237 }
02238 template<>  inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02239 {
02240 }
02241 template<>  inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02242 {
02243 }
02244 
/**
 * Specializations of embed_word() for the floating point element types
 * (float32_t, float64_t, floatmax_t).
 *
 * seq and len are ignored; the functions always return 0.
 */
02245 template<>  inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02246 {
02247     return 0;
02248 }
02249 template<>  inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02250 {
02251     return 0;
02252 }
02253 template<>  inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02254 {
02255     return 0;
02256 }
02257 
/**
 * Specializations of unembed_word() for the floating point element types
 * (float32_t, float64_t, floatmax_t).
 *
 * The bodies are empty: word and len are ignored and seq is not written to.
 */
02258 template<>  inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02259 {
02260 }
02261 template<>  inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02262 {
02263 }
02264 template<>  inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02265 {
02266 }
/**
 * LOAD(f_load, sg_type) generates the explicit specialization
 * CStringFeatures<sg_type>::load(CFile* loader).
 *
 * The generated function logs "loading...", lets loader->f_load() fill in the
 * string list, the number of strings and the maximum string length, and
 * passes these three values on to set_features(). The read is bracketed by
 * SG_SET_LOCALE_C / SG_RESET_LOCALE.
 *
 * loader is checked with ASSERT before the locale is switched (save() applies
 * the same check to its writer), and the three receiving locals start from
 * NULL / 0 so that they never hold indeterminate values.
 *
 * The macro is instantiated once per supported element type below and
 * undefined again right after the list.
 *
 * @param f_load  name of the CFile member function that reads a string list
 *                with elements of type sg_type
 * @param sg_type element type the specialization is generated for
 */
02267 #define LOAD(f_load, sg_type)                                               \
02268 template<> inline void CStringFeatures<sg_type>::load(CFile* loader)        \
02269 {                                                                           \
02270     SG_INFO( "loading...\n");                                               \
02271     ASSERT(loader);                                                         \
02272     SG_SET_LOCALE_C;                                                    \
02273     TString<sg_type>* strs=NULL;                                        \
02274     int32_t num_str=0;                                                      \
02275     int32_t max_len=0;                                                      \
02276     loader->f_load(strs, num_str, max_len);                                 \
02277     set_features(strs, num_str, max_len);                                   \
02278     SG_RESET_LOCALE;                                                    \
02279 }
02280 
02281 LOAD(get_bool_string_list, bool)
02282 LOAD(get_char_string_list, char)
02283 LOAD(get_int8_string_list, int8_t)
02284 LOAD(get_byte_string_list, uint8_t)
02285 LOAD(get_short_string_list, int16_t)
02286 LOAD(get_word_string_list, uint16_t)
02287 LOAD(get_int_string_list, int32_t)
02288 LOAD(get_uint_string_list, uint32_t)
02289 LOAD(get_long_string_list, int64_t)
02290 LOAD(get_ulong_string_list, uint64_t)
02291 LOAD(get_shortreal_string_list, float32_t)
02292 LOAD(get_real_string_list, float64_t)
02293 LOAD(get_longreal_string_list, floatmax_t)
02294 #undef LOAD
02295 
/**
 * SAVE(f_write, sg_type) generates the explicit specialization
 * CStringFeatures<sg_type>::save(CFile* writer).
 *
 * The generated function hands features and num_vectors to
 * writer->f_write(); the write is bracketed by SG_SET_LOCALE_C /
 * SG_RESET_LOCALE. writer is validated with ASSERT before the locale is
 * switched, so a failing assertion can never skip the matching
 * SG_RESET_LOCALE.
 *
 * The macro is instantiated once per supported element type below and
 * undefined again right after the list.
 *
 * @param f_write name of the CFile member function that writes a string list
 *                with elements of type sg_type
 * @param sg_type element type the specialization is generated for
 */
02296 #define SAVE(f_write, sg_type)                                              \
02297 template<> inline void CStringFeatures<sg_type>::save(CFile* writer)        \
02298 {                                                                           \
02299     ASSERT(writer);                                                         \
02300     SG_SET_LOCALE_C;                                                    \
02301     writer->f_write(features, num_vectors);                                 \
02302     SG_RESET_LOCALE;                                                    \
02303 }
02304 
02305 SAVE(set_bool_string_list, bool)
02306 SAVE(set_char_string_list, char)
02307 SAVE(set_int8_string_list, int8_t)
02308 SAVE(set_byte_string_list, uint8_t)
02309 SAVE(set_short_string_list, int16_t)
02310 SAVE(set_word_string_list, uint16_t)
02311 SAVE(set_int_string_list, int32_t)
02312 SAVE(set_uint_string_list, uint32_t)
02313 SAVE(set_long_string_list, int64_t)
02314 SAVE(set_ulong_string_list, uint64_t)
02315 SAVE(set_shortreal_string_list, float32_t)
02316 SAVE(set_real_string_list, float64_t)
02317 SAVE(set_longreal_string_list, floatmax_t)
02318 #undef SAVE
02319 #endif // DOXYGEN_SHOULD_SKIP_THIS
02320 }
02321 #endif // _CSTRINGFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation