SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
BitString.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Soeren Sonnenburg
8  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max Planck Society
9  */
10 #ifndef __BITSTRING_H__
11 #define __BITSTRING_H__
12 
13 #include <shogun/lib/config.h>
14 
16 #include <shogun/lib/common.h>
17 #include <shogun/io/SGIO.h>
20 
21 namespace shogun
22 {
30 class CBitString : public CSGObject
31 {
32  public:
35  SG_UNSTABLE("CBitString::CBitString()", "\n")
36 
37  alphabet = NULL;
38  string = NULL;
39  length = 0;
40  word_len = 0;
41  mask = 0;
42  single_mask = 0;
43  }
44 
52  CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
53  {
54  alphabet=new CAlphabet(alpha);
55  int32_t nbits=alphabet->get_num_bits();
56  word_len = width*nbits;
57 
58  mask=0;
59  for (int32_t j=0; j<word_len; j++)
60  mask=(mask<<1) | (uint64_t) 1;
61  mask<<=sizeof(uint64_t)*8-word_len;
62 
63  single_mask=0;
64  for (int32_t j=0; j<nbits; j++)
65  mask=(mask<<1) | (uint64_t) 1;
66  }
67 
70  {
71  cleanup();
72  SG_UNREF(alphabet);
73  }
74 
76  void cleanup()
77  {
78  SG_FREE(string);
79  string=NULL;
80  length=0;
81  }
82 
88  void obtain_from_char(char* str, uint64_t len)
89  {
90  cleanup();
91  uint64_t stream_len=len/sizeof(uint64_t)+1;
92  string=SG_MALLOC(uint64_t, stream_len);
93  length=len;
94 
95  uint64_t w=0;
96  int32_t nbits=alphabet->get_num_bits();
97  uint64_t j=0;
98  uint64_t nfit=8*sizeof(w)/nbits;
99  for (uint64_t i=0; i<len; i++)
100  {
101  w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
102 
103  if (i % nfit == nfit-1)
104  {
105  string[j]=w;
106  j++;
107  }
108  }
109 
110  if (j<stream_len)
111  {
112  string[j]=w;
113  j++;
114  }
115 
116  ASSERT(j==stream_len)
117  }
118 
125  void load_fasta_file(const char* fname, bool ignore_invalid=false)
126  {
127  cleanup();
128 
129  uint64_t len=0;
130  uint64_t offs=0;
131 
132  CMemoryMappedFile<char> f(fname);
133 
134  uint64_t id_len=0;
135  char* id=f.get_line(id_len, offs);
136 
137  if (!id_len || id[0]!='>')
138  SG_SERROR("No fasta hunks (lines starting with '>') found\n")
139 
140  if (offs==f.get_size())
141  SG_SERROR("Empty file?\n")
142 
143  char* fasta=NULL;
144  char* s=NULL;
145  int32_t spanned_lines=0;
146  int32_t fasta_len=0;
147 
148  while (true)
149  {
150  s=f.get_line(len, offs);
151 
152  if (!fasta)
153  fasta=s;
154 
155  if (!s || len==0)
156  SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len)
157 
158  if (s[0]=='>')
159  SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n")
160 
161  if (offs==f.get_size())
162  {
163  offs-=len+1; // seek to beginning
164  fasta_len+=len;
165 
166  uint64_t w=0;
167  int32_t nbits=alphabet->get_num_bits();
168  uint64_t nfit=8*sizeof(w)/nbits;
169 
170  len = fasta_len-spanned_lines;
171  uint64_t stream_len=len/(nfit)+1;
172  string=SG_MALLOC(uint64_t, stream_len);
173  length=len;
174 
175  uint64_t idx=0;
176  int32_t k=0;
177 
178  for (int32_t j=0; j<fasta_len; j++, k++)
179  {
180  if (fasta[j]=='\n')
181  {
182  k--;
183  continue;
184  }
185 
186  w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
187 
188  if (k % nfit == nfit-1)
189  {
190  string[idx]=w;
191  w=0;
192  idx++;
193  }
194  }
195 
196  if (idx<stream_len)
197  string[idx]=w<<(nbits*(nfit - k%nfit));
198 
199  break;
200  }
201 
202  spanned_lines++;
203  fasta_len+=len+1; // including '\n'
204  }
205  }
206 
212  void set_string(uint64_t* str, uint64_t len)
213  {
214  cleanup();
215  string=str;
216  length=len;
217  }
218 
223  void create(uint64_t len)
224  {
225  cleanup();
226  uint64_t stream_len=len/sizeof(uint64_t)+1;
227  string=SG_MALLOC(uint64_t, stream_len);
228  SGVector<uint64_t>::fill_vector(string, (int32_t) stream_len, (uint64_t) 0);
229  length=len;
230  }
231 
232  /*
233  inline uint64_t condense(uint64_t bits, uint64_t mask) const
234  {
235  uint64_t res = 0 ;
236  uint64_t m=mask ;
237  uint64_t b=bits ;
238  char cnt=0 ;
239 
240  char ar[256][256] ;
241 
242  for (int i=0; i<8; i++)
243  {
244  //fprintf(stdout, "%i %lx %lx %lx %i\n", i, res, m, b, (int)cnt) ;
245  if (m&1)
246  res = res>>8 | ar[b&255][m&255] ;
247  //else
248  // cnt++ ;
249  m=m>>8 ;
250  b=b>>8 ;
251  }
252  res=res>>cnt ;
253  //fprintf(stdout, "%lx %lx %lx\n", res, m, b) ;
254 
255  //res = res & bits & mask ;
256 
257  return res ;
258  }
259  */
260 
265  inline uint64_t operator[](uint64_t index) const
266  {
267  ASSERT(index<length)
268 
269  uint64_t bitindex=alphabet->get_num_bits()*index;
270  int32_t ws=8*sizeof(uint64_t);
271  uint64_t i=bitindex/ws;
272  int32_t j=bitindex % ws;
273  int32_t missing=word_len-(ws-j);
274 
275  //SG_SPRINT("i=%lld j=%d ws=%d word_len=%d missing=%d left=%llx shift=%d\n", i, j, ws, word_len, missing, ( string[i] << j ) & mask, ws-word_len)
276  uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
277 
278  if (missing>0)
279  res|= ( string[i+1] >> (ws-missing) );
280 
281  //res = condense(res, 1<<31|1<<24|1<<10|1<<8|1<<4|1<<2|1<<1|1) ;
282 
283  return res;
284  }
285 
291  inline void set_binary_word(uint16_t word, uint64_t index)
292  {
293  ASSERT(index<length)
294 
295  uint64_t bitindex=alphabet->get_num_bits()*index;
296  int32_t ws=8*sizeof(uint64_t);
297  uint64_t i=bitindex/ws;
298  int32_t j=bitindex % ws;
299  int32_t missing=word_len-(ws-j);
300 
301  uint64_t sl = j-word_len;
302  uint64_t ml;
303  uint64_t wl;
304  uint64_t mr;
305  uint64_t wr;
306 
307  if (sl>0)
308  {
309  ml=mask<<sl;
310  wl=word<<sl ;
311  }
312  else
313  {
314  sl=-sl;
315  ml=mask>>sl;
316  wl=word>>sl;
317  }
318 
319  string[i] = ( string[i] & (~ml) ) | ( wl & ml);
320 
321  if (missing>0)
322  {
323  mr = mask<<missing;
324  wr = word<<missing;
325  string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr);
326  }
327 
328  }
329 
331  inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
332 
334  virtual const char* get_name() const { return "BitString"; }
335 
336  private:
338  CAlphabet* alphabet;
340  uint64_t* string;
342  uint64_t length;
344  int32_t word_len;
346  uint64_t mask;
348  uint64_t single_mask;
349 };
350 }
351 #endif //__BITSTRING_H__
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:221
a string class embedding a string in a compact bit representation
Definition: BitString.h:30
EAlphabet
Alphabet of charfeatures/observations.
Definition: Alphabet.h:23
uint64_t get_length() const
Definition: BitString.h:331
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
uint8_t remap_to_bin(uint8_t c)
Definition: Alphabet.h:159
void set_binary_word(uint16_t word, uint64_t index)
Definition: BitString.h:291
virtual const char * get_name() const
Definition: BitString.h:334
char * get_line(uint64_t &len, uint64_t &offs)
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
void load_fasta_file(const char *fname, bool ignore_invalid=false)
Definition: BitString.h:125
int32_t get_num_bits() const
Definition: Alphabet.h:149
void set_string(uint64_t *str, uint64_t len)
Definition: BitString.h:212
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
#define SG_SERROR(...)
Definition: SGIO.h:179
void obtain_from_char(char *str, uint64_t len)
Definition: BitString.h:88
void create(uint64_t len)
Definition: BitString.h:223
#define SG_UNSTABLE(func,...)
Definition: SGIO.h:132
uint64_t operator[](uint64_t index) const
Definition: BitString.h:265
CBitString(EAlphabet alpha, int32_t width=1)
Definition: BitString.h:52

SHOGUN Machine Learning Toolbox - Documentation