BitString.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max Planck Society
00009  */
00010 #ifndef __BITSTRING_H__
00011 #define __BITSTRING_H__
00012 
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/lib/common.h>
00015 #include <shogun/lib/io.h>
00016 #include <shogun/lib/MemoryMappedFile.h>
00017 #include <shogun/lib/Mathematics.h>
00018 
00019 namespace shogun
00020 {
00028 class CBitString : public CSGObject
00029 {
00030     public:
00032         CBitString(void) {
00033             SG_UNSTABLE("CBitString::CBitString(void)", "\n");
00034 
00035             alphabet = NULL;
00036             string = NULL;
00037             length = 0;
00038             word_len = 0;
00039             mask = 0;
00040             single_mask = 0;
00041         }
00042 
00050         CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
00051         {
00052             alphabet=new CAlphabet(alpha);
00053             int32_t nbits=alphabet->get_num_bits();
00054             word_len = width*nbits;
00055 
00056             mask=0;
00057             for (int32_t j=0; j<word_len; j++)
00058                 mask=(mask<<1) | (uint64_t) 1;
00059             mask<<=sizeof(uint64_t)*8-word_len;
00060 
00061             single_mask=0;
00062             for (int32_t j=0; j<nbits; j++)
00063                 mask=(mask<<1) | (uint64_t) 1;
00064         }
00065 
00067         ~CBitString()
00068         {
00069             cleanup();
00070             SG_UNREF(alphabet);
00071         }
00072 
00074         void cleanup()
00075         {
00076             delete[] string;
00077             string=NULL;
00078             length=0;
00079         }
00080 
00086         void obtain_from_char(char* str, uint64_t len)
00087         {
00088             cleanup();
00089             uint64_t stream_len=len/sizeof(uint64_t)+1;
00090             string=new uint64_t[stream_len];
00091             length=len;
00092 
00093             uint64_t w=0;
00094             int32_t nbits=alphabet->get_num_bits();
00095             uint64_t j=0;
00096             uint64_t nfit=8*sizeof(w)/nbits;
00097             for (uint64_t i=0; i<len; i++)
00098             {
00099                 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
00100 
00101                 if (i % nfit == nfit-1)
00102                 {
00103                     string[j]=w;
00104                     j++;
00105                 }
00106             }
00107 
00108             if (j<stream_len)
00109             {
00110                 string[j]=w;
00111                 j++;
00112             }
00113 
00114             ASSERT(j==stream_len);
00115         }
00116 
00123         void load_fasta_file(const char* fname, bool ignore_invalid=false)
00124         {
00125             cleanup();
00126 
00127             uint64_t len=0;
00128             uint64_t offs=0;
00129 
00130             CMemoryMappedFile<char> f(fname);
00131 
00132             uint64_t id_len=0;
00133             char* id=f.get_line(id_len, offs);
00134 
00135             if (!id_len || id[0]!='>')
00136                 SG_SERROR("No fasta hunks (lines starting with '>') found\n");
00137 
00138             if (offs==f.get_size())
00139                 SG_SERROR("Empty file?\n");
00140 
00141             char* fasta=NULL;
00142             char* s=NULL;
00143             int32_t spanned_lines=0;
00144             int32_t fasta_len=0;
00145 
00146             while (true)
00147             {
00148                 s=f.get_line(len, offs);
00149 
00150                 if (!fasta)
00151                     fasta=s;
00152 
00153                 if (!s || len==0)
00154                     SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len);
00155 
00156                 if (s[0]=='>')
00157                     SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n");
00158 
00159                 if (offs==f.get_size())
00160                 {
00161                     offs-=len+1; // seek to beginning
00162                     fasta_len+=len;
00163 
00164                     uint64_t w=0;
00165                     int32_t nbits=alphabet->get_num_bits();
00166                     uint64_t nfit=8*sizeof(w)/nbits;
00167 
00168                     len = fasta_len-spanned_lines;
00169                     uint64_t stream_len=len/(nfit)+1;
00170                     string=new uint64_t[stream_len];
00171                     length=len;
00172 
00173                     uint64_t idx=0;
00174                     int32_t k=0;
00175 
00176                     for (int32_t j=0; j<fasta_len; j++, k++)
00177                     {
00178                         if (fasta[j]=='\n')
00179                         {
00180                             k--;
00181                             continue;
00182                         }
00183 
00184                         w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
00185 
00186                         if (k % nfit == nfit-1)
00187                         {
00188                             string[idx]=w;
00189                             w=0;
00190                             idx++;
00191                         }
00192                     }
00193 
00194                     if (idx<stream_len)
00195                         string[idx]=w<<(nbits*(nfit - k%nfit));
00196 
00197                     break;
00198                 }
00199 
00200                 spanned_lines++;
00201                 fasta_len+=len+1; // including '\n'
00202             }
00203         }
00204 
00210         void set_string(uint64_t* str, uint64_t len)
00211         {
00212             cleanup();
00213             string=str;
00214             length=len;
00215         }
00216 
00221         void create(uint64_t len)
00222         {
00223             cleanup();
00224             uint64_t stream_len=len/sizeof(uint64_t)+1;
00225             string=new uint64_t[stream_len];
00226             CMath::fill_vector(string, (int32_t) stream_len, (uint64_t) 0);
00227             length=len;
00228         }
00229 
00230         /*
00231         inline uint64_t condense(uint64_t bits, uint64_t mask) const
00232             {
00233                 uint64_t res = 0 ;
00234                 uint64_t m=mask ;
00235                 uint64_t b=bits ;
00236                 char cnt=0 ;
00237                 
00238                 char ar[256][256] ;
00239 
00240                 for (int i=0; i<8; i++)
00241                 {
00242                     //fprintf(stdout, "%i %lx %lx %lx %i\n", i, res, m, b, (int)cnt) ;
00243                     if (m&1)
00244                         res = res>>8 | ar[b&255][m&255] ;
00245                     //else
00246                     //  cnt++ ;
00247                     m=m>>8 ;
00248                     b=b>>8 ;
00249                 }
00250                 res=res>>cnt ;
00251                 //fprintf(stdout, "%lx %lx %lx\n", res, m, b) ;
00252                 
00253                 //res = res & bits & mask ;
00254                 
00255                 return res ;
00256             }
00257         */
00258 
00263         inline uint64_t operator[](uint64_t index) const
00264         {
00265             ASSERT(index<length);
00266 
00267             uint64_t bitindex=alphabet->get_num_bits()*index;
00268             int32_t ws=8*sizeof(uint64_t);
00269             uint64_t i=bitindex/ws;
00270             int32_t j=bitindex % ws;
00271             int32_t missing=word_len-(ws-j);
00272 
00273             //SG_SPRINT("i=%lld j=%d ws=%d word_len=%d missing=%d left=%llx shift=%d\n", i, j, ws, word_len, missing, ( string[i] << j ) & mask, ws-word_len);
00274             uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
00275 
00276             if (missing>0)
00277                 res|= ( string[i+1] >> (ws-missing) );
00278 
00279             //res = condense(res, 1<<31|1<<24|1<<10|1<<8|1<<4|1<<2|1<<1|1) ;
00280 
00281             return res;
00282         }
00283 
00289         inline void set_binary_word(uint16_t word, uint64_t index)
00290         {
00291             ASSERT(index<length);
00292 
00293             uint64_t bitindex=alphabet->get_num_bits()*index;
00294             int32_t ws=8*sizeof(uint64_t);
00295             uint64_t i=bitindex/ws;
00296             int32_t j=bitindex % ws;
00297             int32_t missing=word_len-(ws-j);
00298 
00299             uint64_t sl = j-word_len;
00300             uint64_t ml;
00301             uint64_t wl;
00302             uint64_t mr;
00303             uint64_t wr;
00304 
00305             if (sl>0)
00306             {
00307                 ml=mask<<sl;
00308                 wl=word<<sl ;
00309             }
00310             else
00311             {
00312                 sl=-sl;
00313                 ml=mask>>sl;
00314                 wl=word>>sl;
00315             }
00316 
00317             string[i] = ( string[i] & (~ml) ) | ( wl & ml);
00318 
00319             if (missing>0)
00320             {
00321                 mr = mask<<missing;
00322                 wr = word<<missing;
00323                 string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr);
00324             }
00325 
00326         }
00327 
00329         inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
00330 
00332         inline virtual const char* get_name() const { return "BitString"; }
00333 
00334     private:
00336         CAlphabet* alphabet;
00338         uint64_t* string;
00340         uint64_t length;
00342         int32_t word_len;
00344         uint64_t mask;
00346         uint64_t single_mask;
00347 };
00348 }
00349 #endif //__BITSTRING_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation