SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
BitString.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Soeren Sonnenburg
8  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max Planck Society
9  */
10 #ifndef __BITSTRING_H__
11 #define __BITSTRING_H__
12 
14 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
18 
19 namespace shogun
20 {
28 class CBitString : public CSGObject
29 {
30  public:
33  SG_UNSTABLE("CBitString::CBitString()", "\n")
34 
35  alphabet = NULL;
36  string = NULL;
37  length = 0;
38  word_len = 0;
39  mask = 0;
40  single_mask = 0;
41  }
42 
50  CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
51  {
52  alphabet=new CAlphabet(alpha);
53  int32_t nbits=alphabet->get_num_bits();
54  word_len = width*nbits;
55 
56  mask=0;
57  for (int32_t j=0; j<word_len; j++)
58  mask=(mask<<1) | (uint64_t) 1;
59  mask<<=sizeof(uint64_t)*8-word_len;
60 
61  single_mask=0;
62  for (int32_t j=0; j<nbits; j++)
63  mask=(mask<<1) | (uint64_t) 1;
64  }
65 
68  {
69  cleanup();
70  SG_UNREF(alphabet);
71  }
72 
74  void cleanup()
75  {
76  SG_FREE(string);
77  string=NULL;
78  length=0;
79  }
80 
86  void obtain_from_char(char* str, uint64_t len)
87  {
88  cleanup();
89  uint64_t stream_len=len/sizeof(uint64_t)+1;
90  string=SG_MALLOC(uint64_t, stream_len);
91  length=len;
92 
93  uint64_t w=0;
94  int32_t nbits=alphabet->get_num_bits();
95  uint64_t j=0;
96  uint64_t nfit=8*sizeof(w)/nbits;
97  for (uint64_t i=0; i<len; i++)
98  {
99  w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
100 
101  if (i % nfit == nfit-1)
102  {
103  string[j]=w;
104  j++;
105  }
106  }
107 
108  if (j<stream_len)
109  {
110  string[j]=w;
111  j++;
112  }
113 
114  ASSERT(j==stream_len)
115  }
116 
123  void load_fasta_file(const char* fname, bool ignore_invalid=false)
124  {
125  cleanup();
126 
127  uint64_t len=0;
128  uint64_t offs=0;
129 
130  CMemoryMappedFile<char> f(fname);
131 
132  uint64_t id_len=0;
133  char* id=f.get_line(id_len, offs);
134 
135  if (!id_len || id[0]!='>')
136  SG_SERROR("No fasta hunks (lines starting with '>') found\n")
137 
138  if (offs==f.get_size())
139  SG_SERROR("Empty file?\n")
140 
141  char* fasta=NULL;
142  char* s=NULL;
143  int32_t spanned_lines=0;
144  int32_t fasta_len=0;
145 
146  while (true)
147  {
148  s=f.get_line(len, offs);
149 
150  if (!fasta)
151  fasta=s;
152 
153  if (!s || len==0)
154  SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len)
155 
156  if (s[0]=='>')
157  SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n")
158 
159  if (offs==f.get_size())
160  {
161  offs-=len+1; // seek to beginning
162  fasta_len+=len;
163 
164  uint64_t w=0;
165  int32_t nbits=alphabet->get_num_bits();
166  uint64_t nfit=8*sizeof(w)/nbits;
167 
168  len = fasta_len-spanned_lines;
169  uint64_t stream_len=len/(nfit)+1;
170  string=SG_MALLOC(uint64_t, stream_len);
171  length=len;
172 
173  uint64_t idx=0;
174  int32_t k=0;
175 
176  for (int32_t j=0; j<fasta_len; j++, k++)
177  {
178  if (fasta[j]=='\n')
179  {
180  k--;
181  continue;
182  }
183 
184  w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
185 
186  if (k % nfit == nfit-1)
187  {
188  string[idx]=w;
189  w=0;
190  idx++;
191  }
192  }
193 
194  if (idx<stream_len)
195  string[idx]=w<<(nbits*(nfit - k%nfit));
196 
197  break;
198  }
199 
200  spanned_lines++;
201  fasta_len+=len+1; // including '\n'
202  }
203  }
204 
210  void set_string(uint64_t* str, uint64_t len)
211  {
212  cleanup();
213  string=str;
214  length=len;
215  }
216 
221  void create(uint64_t len)
222  {
223  cleanup();
224  uint64_t stream_len=len/sizeof(uint64_t)+1;
225  string=SG_MALLOC(uint64_t, stream_len);
226  SGVector<uint64_t>::fill_vector(string, (int32_t) stream_len, (uint64_t) 0);
227  length=len;
228  }
229 
230  /*
231  inline uint64_t condense(uint64_t bits, uint64_t mask) const
232  {
233  uint64_t res = 0 ;
234  uint64_t m=mask ;
235  uint64_t b=bits ;
236  char cnt=0 ;
237 
238  char ar[256][256] ;
239 
240  for (int i=0; i<8; i++)
241  {
242  //fprintf(stdout, "%i %lx %lx %lx %i\n", i, res, m, b, (int)cnt) ;
243  if (m&1)
244  res = res>>8 | ar[b&255][m&255] ;
245  //else
246  // cnt++ ;
247  m=m>>8 ;
248  b=b>>8 ;
249  }
250  res=res>>cnt ;
251  //fprintf(stdout, "%lx %lx %lx\n", res, m, b) ;
252 
253  //res = res & bits & mask ;
254 
255  return res ;
256  }
257  */
258 
263  inline uint64_t operator[](uint64_t index) const
264  {
265  ASSERT(index<length)
266 
267  uint64_t bitindex=alphabet->get_num_bits()*index;
268  int32_t ws=8*sizeof(uint64_t);
269  uint64_t i=bitindex/ws;
270  int32_t j=bitindex % ws;
271  int32_t missing=word_len-(ws-j);
272 
273  //SG_SPRINT("i=%lld j=%d ws=%d word_len=%d missing=%d left=%llx shift=%d\n", i, j, ws, word_len, missing, ( string[i] << j ) & mask, ws-word_len)
274  uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
275 
276  if (missing>0)
277  res|= ( string[i+1] >> (ws-missing) );
278 
279  //res = condense(res, 1<<31|1<<24|1<<10|1<<8|1<<4|1<<2|1<<1|1) ;
280 
281  return res;
282  }
283 
289  inline void set_binary_word(uint16_t word, uint64_t index)
290  {
291  ASSERT(index<length)
292 
293  uint64_t bitindex=alphabet->get_num_bits()*index;
294  int32_t ws=8*sizeof(uint64_t);
295  uint64_t i=bitindex/ws;
296  int32_t j=bitindex % ws;
297  int32_t missing=word_len-(ws-j);
298 
299  uint64_t sl = j-word_len;
300  uint64_t ml;
301  uint64_t wl;
302  uint64_t mr;
303  uint64_t wr;
304 
305  if (sl>0)
306  {
307  ml=mask<<sl;
308  wl=word<<sl ;
309  }
310  else
311  {
312  sl=-sl;
313  ml=mask>>sl;
314  wl=word>>sl;
315  }
316 
317  string[i] = ( string[i] & (~ml) ) | ( wl & ml);
318 
319  if (missing>0)
320  {
321  mr = mask<<missing;
322  wr = word<<missing;
323  string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr);
324  }
325 
326  }
327 
329  inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
330 
332  virtual const char* get_name() const { return "BitString"; }
333 
334  private:
336  CAlphabet* alphabet;
338  uint64_t* string;
340  uint64_t length;
342  int32_t word_len;
344  uint64_t mask;
346  uint64_t single_mask;
347 };
348 }
349 #endif //__BITSTRING_H__

SHOGUN Machine Learning Toolbox - Documentation