SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
CircularBuffer.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
11 #include <shogun/lib/Tokenizer.h>
12 #include <shogun/io/SGIO.h>
13 
14 #include <cstdio>
15 #include <cstring>
16 
17 using namespace shogun;
18 
20 {
21  init();
22 }
23 
25 {
26  init();
27 
28  m_buffer=SGVector<char>(buffer_size);
29  m_buffer_end=m_buffer.vector+m_buffer.vlen;
30 
31  m_begin_pos=m_buffer.vector;
32  m_end_pos=m_begin_pos;
33 
34  m_bytes_available=m_buffer.vlen;
35 }
36 
38 {
39  SG_UNREF(m_tokenizer);
40 }
41 
43 {
44  SG_REF(tokenizer);
45  SG_UNREF(m_tokenizer);
46  m_tokenizer=tokenizer;
47 }
48 
50 {
51  if (source.vector==NULL || source.vlen==0)
52  {
53  SG_ERROR("CCircularBuffer::push(SGVector<char>):: Invalid parameters! Source shouldn't be NULL or zero sized\n");
54  return -1;
55  }
56 
57  int32_t bytes_to_write;
58  if (source.vlen>m_bytes_available)
59  bytes_to_write=m_bytes_available;
60  else
61  bytes_to_write=source.vlen;
62 
63  if (bytes_to_write==0)
64  return 0;
65 
66  // determine which part of the memory block is free to read
67  if (m_end_pos>=m_begin_pos)
68  {
69  int32_t bytes_to_memory_end=m_buffer.vlen-(m_end_pos-m_buffer.vector);
70  if (bytes_to_memory_end<bytes_to_write)
71  {
72  // we need write as at end of memory block and at begin
73  // because logical structure of buffer is ring
74  int32_t first_chunk_size=bytes_to_memory_end;
75  int32_t second_chunk_size=bytes_to_write-first_chunk_size;
76 
77  bytes_to_write=append_chunk(source.vector, first_chunk_size, false);
78  bytes_to_write+=append_chunk(source.vector+first_chunk_size, second_chunk_size, true);
79  }
80  else
81  {
82  bytes_to_write=append_chunk(source.vector, bytes_to_write, false);
83  }
84  }
85  else
86  {
87  bytes_to_write=append_chunk(source.vector, bytes_to_write, false);
88  }
89 
90  return bytes_to_write;
91 }
92 
93 int32_t CCircularBuffer::push(FILE* source, int32_t source_size)
94 {
95  if (source==NULL || source_size==0)
96  {
97  SG_ERROR("CCircularBuffer::push(FILE*, int32_t):: Invalid parameters! Source shouldn't be NULL or zero sized\n");
98  return -1;
99  }
100 
101  int32_t bytes_to_write;
102  if (source_size>m_bytes_available)
103  bytes_to_write=m_bytes_available;
104  else
105  bytes_to_write=source_size;
106 
107  if (bytes_to_write==0)
108  return 0;
109 
110  // determine which part of the memory block is free to read
111  if (m_end_pos>=m_begin_pos)
112  {
113  int32_t bytes_to_memory_end=m_buffer.vlen-(m_end_pos-m_buffer.vector);
114  if (bytes_to_memory_end<bytes_to_write)
115  {
116  // we need write as at end of memory block and at begin
117  // because logical structure of buffer is ring
118  int32_t first_chunk_size=bytes_to_memory_end;
119  int32_t second_chunk_size=bytes_to_write-first_chunk_size;
120 
121  bytes_to_write=append_chunk(source, first_chunk_size, false);
122  bytes_to_write+=append_chunk(source, second_chunk_size, true);
123  }
124  else
125  {
126  bytes_to_write=append_chunk(source, bytes_to_write, false);
127  }
128  }
129  else
130  {
131  bytes_to_write=append_chunk(source, bytes_to_write, false);
132  }
133 
134  return bytes_to_write;
135 }
136 
138 {
139  SGVector<char> result;
140 
141  int32_t bytes_to_read;
142  if (num_bytes>m_bytes_count)
143  bytes_to_read=m_bytes_count;
144  else
145  bytes_to_read=num_bytes;
146 
147  if (bytes_to_read==0)
148  return 0;
149 
150  // determine which part of the memory block will be read
151  if (m_begin_pos>=m_end_pos)
152  {
153  int32_t bytes_to_memory_end=m_buffer.vlen-(m_begin_pos-m_buffer.vector);
154  if (bytes_to_memory_end<bytes_to_read)
155  {
156  // read continious block from end of memory and from begin
157  int32_t first_chunk_size=bytes_to_memory_end;
158  int32_t second_chunk_size=bytes_to_read-first_chunk_size;
159 
160  detach_chunk(&result.vector, &result.vlen, 0, first_chunk_size, false);
161  detach_chunk(&result.vector, &result.vlen, first_chunk_size, second_chunk_size, true);
162  }
163  else
164  {
165  detach_chunk(&result.vector, &result.vlen, 0, bytes_to_read, false);
166  }
167  }
168  else
169  {
170  detach_chunk(&result.vector, &result.vlen, 0, bytes_to_read, false);
171  }
172 
173  return result;
174 }
175 
177 {
178  if (m_tokenizer==NULL)
179  {
180  SG_ERROR("CCircularBuffer::has_next():: Tokenizer is not initialized\n");
181  return false;
182  }
183 
184  if (m_bytes_count==0)
185  return false;
186 
187  int32_t head_length=m_buffer_end-m_begin_pos;
188 
189  // determine position of finder pointer in memory block
190  if (m_last_idx<head_length)
191  {
192  if (m_end_pos>=m_begin_pos && m_bytes_available!=0)
193  {
194  return has_next_locally(m_begin_pos+m_last_idx, m_end_pos);
195  }
196  else
197  {
198  bool temp=false;
199  temp=has_next_locally(m_begin_pos+m_last_idx, m_buffer_end);
200 
201  if (temp)
202  return temp;
203 
204  return has_next_locally(m_buffer.vector+m_last_idx-head_length, m_end_pos);
205  }
206  }
207  else
208  {
209  return has_next_locally(m_buffer.vector+m_last_idx-head_length, m_end_pos);
210  }
211 
212  return false;
213 }
214 
216 {
217  index_t end;
218 
219  if (m_tokenizer==NULL)
220  {
221  SG_ERROR("CCircularBuffer::next_token_idx(index_t&):: Tokenizer is not initialized\n");
222  return 0;
223  }
224 
225  if (m_bytes_count==0)
226  return m_bytes_count;
227 
228  int32_t tail_length=m_end_pos-m_buffer.vector;
229  int32_t head_length=m_buffer_end-m_begin_pos;
230 
231  // determine position of finder pointer in memory block
232  if (m_last_idx<head_length)
233  {
234  if (m_end_pos>=m_begin_pos && m_bytes_available!=0)
235  {
236  end=next_token_idx_locally(start, m_begin_pos+m_last_idx, m_end_pos);
237  if (end<=m_bytes_count)
238  return end;
239  }
240  else
241  {
242  index_t temp_start;
243 
244  // in this case we should find first at end of memory block
245  end=next_token_idx_locally(start, m_begin_pos+m_last_idx, m_buffer_end);
246 
247  if (end<head_length)
248  return end;
249 
250  // and then at begin
251  end=next_token_idx_locally(temp_start, m_buffer.vector+m_last_idx-head_length, m_end_pos);
252 
253  if (start>=head_length)
254  start=temp_start;
255 
256  return end;
257  }
258  }
259  else
260  {
261  end=next_token_idx_locally(start, m_buffer.vector+m_last_idx-head_length, m_end_pos);
262  if (end-head_length<=tail_length)
263  return end;
264  }
265 
266  start=0;
267  return start;
268 }
269 
270 void CCircularBuffer::skip_characters(int32_t num_chars)
271 {
272  move_pointer(&m_begin_pos, m_begin_pos+num_chars);
273 
274  m_last_idx-=num_chars;
275  if (m_last_idx<0)
276  m_last_idx=0;
277 
278  m_bytes_available+=num_chars;
279  m_bytes_count-=num_chars;
280 }
281 
283 {
284  m_begin_pos=m_buffer.vector;
285  m_end_pos=m_begin_pos;
286 
287  m_last_idx=0;
288  m_bytes_available=m_buffer.vlen;
289  m_bytes_count=0;
290 }
291 
292 void CCircularBuffer::init()
293 {
294  m_buffer=SGVector<char>();
295  m_buffer_end=NULL;
296  m_tokenizer=NULL;
297 
298  m_begin_pos=NULL;
299  m_end_pos=NULL;
300 
301  m_last_idx=0;
302  m_bytes_available=0;
303  m_bytes_count=0;
304 }
305 
306 int32_t CCircularBuffer::append_chunk(const char* source, int32_t source_size,
307  bool from_buffer_begin)
308 {
309  if (source==NULL || source_size==0)
310  {
311  SG_ERROR("CCircularBuffer::append_chunk(const char*, int32_t, bool):: Invalid parameters!\
312  Source shouldn't be NULL or zero sized\n");
313  return -1;
314  }
315 
316  if (from_buffer_begin)
317  m_end_pos=m_buffer.vector;
318 
319  memcpy(m_end_pos, source, source_size);
320  move_pointer(&m_end_pos, m_end_pos+source_size);
321 
322  m_bytes_available-=source_size;
323  m_bytes_count+=source_size;
324 
325  return source_size;
326 }
327 
328 int32_t CCircularBuffer::append_chunk(FILE* source, int32_t source_size,
329  bool from_buffer_begin)
330 {
331  int32_t actually_read=fread(m_end_pos, sizeof(char), source_size, source);
332 
333  if (from_buffer_begin && actually_read==source_size)
334  m_end_pos=m_buffer.vector;
335  move_pointer(&m_end_pos, m_end_pos+actually_read);
336 
337  m_bytes_available-=actually_read;
338  m_bytes_count+=actually_read;
339 
340  return actually_read;
341 }
342 
343 void CCircularBuffer::detach_chunk(char** dest, int32_t* dest_size, int32_t dest_offset, int32_t num_bytes,
344  bool from_buffer_begin)
345 {
346  if (dest==NULL || dest_size==NULL)
347  {
348  SG_ERROR("CCircularBuffer::detach_chunk(...):: Invalid parameters! Pointers are NULL\n");
349  return;
350  }
351 
352  if (*dest==NULL)
353  {
354  *dest=SG_MALLOC(char, num_bytes+dest_offset);
355  *dest_size=num_bytes+dest_offset;
356  }
357 
358  if (*dest_size<num_bytes+dest_offset)
359  {
360  *dest=SG_REALLOC(char, *dest, *dest_size, num_bytes+dest_offset);
361  *dest_size=num_bytes+dest_offset;
362  }
363 
364  if (from_buffer_begin)
365  m_begin_pos=m_buffer.vector;
366 
367  memcpy(*dest+dest_offset, m_begin_pos, num_bytes);
368  move_pointer(&m_begin_pos, m_begin_pos+num_bytes);
369 
370  m_last_idx-=num_bytes;
371  if (m_last_idx<0)
372  m_last_idx=0;
373 
374  m_bytes_available+=num_bytes;
375  m_bytes_count-=num_bytes;
376 }
377 
378 bool CCircularBuffer::has_next_locally(char* part_begin, char* part_end)
379 {
380  int32_t num_bytes_to_search=part_end-part_begin;
381 
382  SGVector<char> buffer_part(part_begin, num_bytes_to_search, false);
383  m_tokenizer->set_text(buffer_part);
384 
385  return m_tokenizer->has_next();
386 }
387 
388 index_t CCircularBuffer::next_token_idx_locally(index_t &start, char* part_begin, char* part_end)
389 {
390  index_t end=0;
391  int32_t num_bytes_to_search=part_end-part_begin;
392  if (num_bytes_to_search<=0)
393  {
394  start=0;
395  return m_last_idx;
396  }
397 
398  SGVector<char> buffer_part(part_begin, num_bytes_to_search, false);
399  m_tokenizer->set_text(buffer_part);
400 
401  end=m_tokenizer->next_token_idx(start);
402 
403  start+=m_last_idx;
404  m_last_idx+=end;
405 
406  if (end==num_bytes_to_search)
407  return m_last_idx;
408  else
409  return m_last_idx++;
410 }
411 
412 void CCircularBuffer::move_pointer(char** pointer, char* new_position)
413 {
414  *pointer=new_position;
415  if (*pointer>=m_buffer.vector+m_buffer.vlen)
416  *pointer=m_buffer.vector;
417 }
int32_t index_t
Definition: common.h:62
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17
int32_t push(SGVector< char > source)
#define SG_ERROR(...)
Definition: SGIO.h:129
void skip_characters(int32_t num_chars)
#define SG_REF(x)
Definition: SGObject.h:51
index_t next_token_idx(index_t &start)
index_t vlen
Definition: SGVector.h:494
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
SGVector< char > pop(int32_t num_chars)
virtual bool has_next()=0
#define SG_UNREF(x)
Definition: SGObject.h:52
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual index_t next_token_idx(index_t &start)=0
void set_tokenizer(CTokenizer *tokenizer)

SHOGUN Machine Learning Toolbox - Documentation