Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef __BITSTRING_H__
00011 #define __BITSTRING_H__
00012
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/lib/common.h>
00015 #include <shogun/lib/io.h>
00016 #include <shogun/lib/MemoryMappedFile.h>
00017 #include <shogun/lib/Mathematics.h>
00018
00019 namespace shogun
00020 {
00028 class CBitString : public CSGObject
00029 {
00030 public:
00032 CBitString(void) {
00033 SG_UNSTABLE("CBitString::CBitString(void)", "\n");
00034
00035 alphabet = NULL;
00036 string = NULL;
00037 length = 0;
00038 word_len = 0;
00039 mask = 0;
00040 single_mask = 0;
00041 }
00042
00050 CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
00051 {
00052 alphabet=new CAlphabet(alpha);
00053 int32_t nbits=alphabet->get_num_bits();
00054 word_len = width*nbits;
00055
00056 mask=0;
00057 for (int32_t j=0; j<word_len; j++)
00058 mask=(mask<<1) | (uint64_t) 1;
00059 mask<<=sizeof(uint64_t)*8-word_len;
00060
00061 single_mask=0;
00062 for (int32_t j=0; j<nbits; j++)
00063 mask=(mask<<1) | (uint64_t) 1;
00064 }
00065
00067 ~CBitString()
00068 {
00069 cleanup();
00070 SG_UNREF(alphabet);
00071 }
00072
00074 void cleanup()
00075 {
00076 delete[] string;
00077 string=NULL;
00078 length=0;
00079 }
00080
00086 void obtain_from_char(char* str, uint64_t len)
00087 {
00088 cleanup();
00089 uint64_t stream_len=len/sizeof(uint64_t)+1;
00090 string=new uint64_t[stream_len];
00091 length=len;
00092
00093 uint64_t w=0;
00094 int32_t nbits=alphabet->get_num_bits();
00095 uint64_t j=0;
00096 uint64_t nfit=8*sizeof(w)/nbits;
00097 for (uint64_t i=0; i<len; i++)
00098 {
00099 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
00100
00101 if (i % nfit == nfit-1)
00102 {
00103 string[j]=w;
00104 j++;
00105 }
00106 }
00107
00108 if (j<stream_len)
00109 {
00110 string[j]=w;
00111 j++;
00112 }
00113
00114 ASSERT(j==stream_len);
00115 }
00116
00123 void load_fasta_file(const char* fname, bool ignore_invalid=false)
00124 {
00125 cleanup();
00126
00127 uint64_t len=0;
00128 uint64_t offs=0;
00129
00130 CMemoryMappedFile<char> f(fname);
00131
00132 uint64_t id_len=0;
00133 char* id=f.get_line(id_len, offs);
00134
00135 if (!id_len || id[0]!='>')
00136 SG_SERROR("No fasta hunks (lines starting with '>') found\n");
00137
00138 if (offs==f.get_size())
00139 SG_SERROR("Empty file?\n");
00140
00141 char* fasta=NULL;
00142 char* s=NULL;
00143 int32_t spanned_lines=0;
00144 int32_t fasta_len=0;
00145
00146 while (true)
00147 {
00148 s=f.get_line(len, offs);
00149
00150 if (!fasta)
00151 fasta=s;
00152
00153 if (!s || len==0)
00154 SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len);
00155
00156 if (s[0]=='>')
00157 SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n");
00158
00159 if (offs==f.get_size())
00160 {
00161 offs-=len+1;
00162 fasta_len+=len;
00163
00164 uint64_t w=0;
00165 int32_t nbits=alphabet->get_num_bits();
00166 uint64_t nfit=8*sizeof(w)/nbits;
00167
00168 len = fasta_len-spanned_lines;
00169 uint64_t stream_len=len/(nfit)+1;
00170 string=new uint64_t[stream_len];
00171 length=len;
00172
00173 uint64_t idx=0;
00174 int32_t k=0;
00175
00176 for (int32_t j=0; j<fasta_len; j++, k++)
00177 {
00178 if (fasta[j]=='\n')
00179 {
00180 k--;
00181 continue;
00182 }
00183
00184 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
00185
00186 if (k % nfit == nfit-1)
00187 {
00188 string[idx]=w;
00189 w=0;
00190 idx++;
00191 }
00192 }
00193
00194 if (idx<stream_len)
00195 string[idx]=w<<(nbits*(nfit - k%nfit));
00196
00197 break;
00198 }
00199
00200 spanned_lines++;
00201 fasta_len+=len+1;
00202 }
00203 }
00204
00210 void set_string(uint64_t* str, uint64_t len)
00211 {
00212 cleanup();
00213 string=str;
00214 length=len;
00215 }
00216
00221 void create(uint64_t len)
00222 {
00223 cleanup();
00224 uint64_t stream_len=len/sizeof(uint64_t)+1;
00225 string=new uint64_t[stream_len];
00226 CMath::fill_vector(string, (int32_t) stream_len, (uint64_t) 0);
00227 length=len;
00228 }
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00263 inline uint64_t operator[](uint64_t index) const
00264 {
00265 ASSERT(index<length);
00266
00267 uint64_t bitindex=alphabet->get_num_bits()*index;
00268 int32_t ws=8*sizeof(uint64_t);
00269 uint64_t i=bitindex/ws;
00270 int32_t j=bitindex % ws;
00271 int32_t missing=word_len-(ws-j);
00272
00273
00274 uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
00275
00276 if (missing>0)
00277 res|= ( string[i+1] >> (ws-missing) );
00278
00279
00280
00281 return res;
00282 }
00283
00289 inline void set_binary_word(uint16_t word, uint64_t index)
00290 {
00291 ASSERT(index<length);
00292
00293 uint64_t bitindex=alphabet->get_num_bits()*index;
00294 int32_t ws=8*sizeof(uint64_t);
00295 uint64_t i=bitindex/ws;
00296 int32_t j=bitindex % ws;
00297 int32_t missing=word_len-(ws-j);
00298
00299 uint64_t sl = j-word_len;
00300 uint64_t ml;
00301 uint64_t wl;
00302 uint64_t mr;
00303 uint64_t wr;
00304
00305 if (sl>0)
00306 {
00307 ml=mask<<sl;
00308 wl=word<<sl ;
00309 }
00310 else
00311 {
00312 sl=-sl;
00313 ml=mask>>sl;
00314 wl=word>>sl;
00315 }
00316
00317 string[i] = ( string[i] & (~ml) ) | ( wl & ml);
00318
00319 if (missing>0)
00320 {
00321 mr = mask<<missing;
00322 wr = word<<missing;
00323 string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr);
00324 }
00325
00326 }
00327
00329 inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
00330
00332 inline virtual const char* get_name() const { return "BitString"; }
00333
00334 private:
00336 CAlphabet* alphabet;
00338 uint64_t* string;
00340 uint64_t length;
00342 int32_t word_len;
00344 uint64_t mask;
00346 uint64_t single_mask;
00347 };
00348 }
00349 #endif //__BITSTRING_H__