SHOGUN  6.1.3
CircularBuffer.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
11 #include <shogun/lib/Tokenizer.h>
12 #include <shogun/io/SGIO.h>
13 
14 #include <cstdio>
15 #include <cstring>
16 
17 using namespace shogun;
18 
20 {
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably the default constructor of CCircularBuffer - confirm.
// All member setup is delegated to init().
21  init();
22 }
23 
25 {
// NOTE(review): the signature line of this definition is missing from this
// listing; the body reads a `buffer_size` value, presumably a constructor
// parameter - confirm against the header.
// Reset every member first, then allocate the storage and mark it empty.
26  init();
27 
// backing storage of buffer_size chars; m_buffer_end is one past its last element
28  m_buffer=SGVector<char>(buffer_size);
29  m_buffer_end=m_buffer.vector+m_buffer.vlen;
30 
// read and write positions both start at the first byte of the storage
31  m_begin_pos=m_buffer.vector;
32  m_end_pos=m_begin_pos;
33 
// the whole storage is free for writing
34  m_bytes_available=m_buffer.vlen;
35 }
36 
38 {
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably the destructor - confirm.
// Drops this object's reference to the tokenizer.
39  SG_UNREF(m_tokenizer);
40 }
41 
43 {
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably set_tokenizer(CTokenizer* tokenizer) - confirm.
// Replaces the stored tokenizer with `tokenizer`.
// The new object is referenced before the old one is released; presumably so
// that passing the tokenizer that is already stored cannot release it - confirm.
44  SG_REF(tokenizer);
45  SG_UNREF(m_tokenizer);
46  m_tokenizer=tokenizer;
47 }
48 
50 {
// NOTE(review): the signature line of this definition is missing from this
// listing; the error message below names it push(SGVector<char>) and the body
// reads an SGVector-like `source` - confirm against the header.
// Copies as many bytes of `source` as fit into the free space of the ring and
// returns the number of bytes copied (0 when the buffer has no free space).
// Reports an error through SG_ERROR for a NULL or empty source; the -1 return
// is only reached if SG_ERROR returns control.
51  if (source.vector==NULL || source.vlen==0)
52  {
53  SG_ERROR("CCircularBuffer::push(SGVector<char>):: Invalid parameters! Source shouldn't be NULL or zero sized\n");
54  return -1;
55  }
56 
// clamp the request to the free space
57  int32_t bytes_to_write;
58  if (source.vlen>m_bytes_available)
59  bytes_to_write=m_bytes_available;
60  else
61  bytes_to_write=source.vlen;
62 
63  if (bytes_to_write==0)
64  return 0;
65 
66  // determine which part of the memory block is free to write
67  if (m_end_pos>=m_begin_pos)
68  {
// bytes between the write position and the physical end of the storage
69  auto bytes_to_memory_end=m_buffer.vlen-std::distance(m_buffer.vector, m_end_pos);
70  if (bytes_to_memory_end<bytes_to_write)
71  {
72  // we need write as at end of memory block and at begin
73  // because logical structure of buffer is ring
74  int32_t first_chunk_size=bytes_to_memory_end;
75  int32_t second_chunk_size=bytes_to_write-first_chunk_size;
76 
// first chunk fills the storage up to its end, second chunk continues from its start
77  bytes_to_write=append_chunk(source.vector, first_chunk_size, false);
78  bytes_to_write+=append_chunk(source.vector+first_chunk_size, second_chunk_size, true);
79  }
80  else
81  {
82  bytes_to_write=append_chunk(source.vector, bytes_to_write, false);
83  }
84  }
85  else
86  {
// write position is behind the read position: a single copy at the write position
87  bytes_to_write=append_chunk(source.vector, bytes_to_write, false);
88  }
89 
90  return bytes_to_write;
91 }
92 
93 int32_t CCircularBuffer::push(FILE* source, int32_t source_size)
94 {
95  if (source==NULL || source_size==0)
96  {
97  SG_ERROR("CCircularBuffer::push(FILE*, int32_t):: Invalid parameters! Source shouldn't be NULL or zero sized\n");
98  return -1;
99  }
100 
101  int32_t bytes_to_write;
102  if (source_size>m_bytes_available)
103  bytes_to_write=m_bytes_available;
104  else
105  bytes_to_write=source_size;
106 
107  if (bytes_to_write==0)
108  return 0;
109 
110  // determine which part of the memory block is free to read
111  if (m_end_pos>=m_begin_pos)
112  {
113  int32_t bytes_to_memory_end=m_buffer.vlen-std::distance(m_buffer.vector, m_end_pos);
114  if (bytes_to_memory_end<bytes_to_write)
115  {
116  // we need write as at end of memory block and at begin
117  // because logical structure of buffer is ring
118  int32_t first_chunk_size=bytes_to_memory_end;
119  int32_t second_chunk_size=bytes_to_write-first_chunk_size;
120 
121  bytes_to_write=append_chunk(source, first_chunk_size, false);
122  bytes_to_write+=append_chunk(source, second_chunk_size, true);
123  }
124  else
125  {
126  bytes_to_write=append_chunk(source, bytes_to_write, false);
127  }
128  }
129  else
130  {
131  bytes_to_write=append_chunk(source, bytes_to_write, false);
132  }
133 
134  return bytes_to_write;
135 }
136 
138 {
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably pop(...) returning SGVector<char>; the body reads a
// `num_bytes` value - confirm against the header.
// Removes up to num_bytes stored bytes from the ring and returns them in a
// vector filled by detach_chunk().
139  SGVector<char> result;
140 
// clamp the request to the number of stored bytes
141  int32_t bytes_to_read;
142  if (num_bytes>m_bytes_count)
143  bytes_to_read=m_bytes_count;
144  else
145  bytes_to_read=num_bytes;
146 
// NOTE(review): this path returns the literal 0 while the other path returns
// `result`; what 0 converts to depends on the (not visible) return type - confirm.
147  if (bytes_to_read==0)
148  return 0;
149 
150  // determine which part of the memory block will be read
151  if (m_begin_pos>=m_end_pos)
152  {
// bytes between the read position and the physical end of the storage
153  int32_t bytes_to_memory_end=m_buffer.vlen-(m_begin_pos-m_buffer.vector);
154  if (bytes_to_memory_end<bytes_to_read)
155  {
156  // read contiguous block from end of memory and from begin
157  int32_t first_chunk_size=bytes_to_memory_end;
158  int32_t second_chunk_size=bytes_to_read-first_chunk_size;
159 
// the second call writes behind the first chunk (offset first_chunk_size)
// and restarts reading from the beginning of the storage
160  detach_chunk(&result.vector, &result.vlen, 0, first_chunk_size, false);
161  detach_chunk(&result.vector, &result.vlen, first_chunk_size, second_chunk_size, true);
162  }
163  else
164  {
165  detach_chunk(&result.vector, &result.vlen, 0, bytes_to_read, false);
166  }
167  }
168  else
169  {
// read position is behind the write position: a single copy from the read position
170  detach_chunk(&result.vector, &result.vlen, 0, bytes_to_read, false);
171  }
172 
173  return result;
174 }
175 
177 {
// NOTE(review): the signature line of this definition is missing from this
// listing; the error message below names it has_next() - confirm.
// Asks the tokenizer whether another token exists in the stored bytes,
// starting at offset m_last_idx from the read position.
// Returns false when no bytes are stored; a missing tokenizer is reported
// through SG_ERROR.
178  if (m_tokenizer==NULL)
179  {
180  SG_ERROR("CCircularBuffer::has_next():: Tokenizer is not initialized\n");
181  return false;
182  }
183 
184  if (m_bytes_count==0)
185  return false;
186 
// number of bytes between the read position and the physical end of the storage
187  auto head_length=std::distance(m_begin_pos, m_buffer_end);
188 
189  // determine position of finder pointer in memory block
190  if (m_last_idx<head_length)
191  {
// search offset lies before the physical end of the storage
192  if (m_end_pos>=m_begin_pos && m_bytes_available!=0)
193  {
// write position not before the read position and buffer not full:
// search the single range up to the write position
194  return has_next_locally(m_begin_pos+m_last_idx, m_end_pos);
195  }
196  else
197  {
// otherwise search up to the physical end first and, if nothing is found
// there, the range from the start of the storage up to the write position
198  bool temp=false;
199  temp=has_next_locally(m_begin_pos+m_last_idx, m_buffer_end);
200  return (temp > 0)
201  ? temp
202  : has_next_locally(m_buffer.vector+m_last_idx-head_length, m_end_pos);
203  }
204  }
205  else
206  {
// search offset is already past the physical end: continue from the start of the storage
207  return has_next_locally(m_buffer.vector+m_last_idx-head_length, m_end_pos);
208  }
209 
// not reached: every branch above returns
210  return false;
211 }
212 
214 {
// NOTE(review): the signature line of this definition is missing from this
// listing; the error message below names it next_token_idx(index_t&) and the
// body assigns to `start`, presumably that reference parameter - confirm.
// Locates the next token in the stored bytes via next_token_idx_locally(),
// which also advances m_last_idx. Returns an end index and stores the
// matching start index in `start`; falls back to start=0 / return 0.
215  index_t end;
216 
217  if (m_tokenizer==NULL)
218  {
219  SG_ERROR("CCircularBuffer::next_token_idx(index_t&):: Tokenizer is not initialized\n");
220  return 0;
221  }
222 
// nothing stored: m_bytes_count is 0 here, so this returns 0
223  if (m_bytes_count==0)
224  return m_bytes_count;
225 
// tail_length: bytes from the start of the storage to the write position
// head_length: bytes from the read position to the physical end of the storage
226  auto tail_length=std::distance(m_buffer.vector, m_end_pos);
227  auto head_length=std::distance(m_begin_pos, m_buffer_end);
228 
229  // determine position of finder pointer in memory block
230  if (m_last_idx<head_length)
231  {
232  if (m_end_pos>=m_begin_pos && m_bytes_available!=0)
233  {
// write position not before the read position and buffer not full:
// search the single range up to the write position
234  end=next_token_idx_locally(start, m_begin_pos+m_last_idx, m_end_pos);
235  if (end<=m_bytes_count)
236  return end;
237  }
238  else
239  {
240  index_t temp_start;
241 
242  // in this case we should find first at end of memory block
243  end=next_token_idx_locally(start, m_begin_pos+m_last_idx, m_buffer_end);
244 
// a result before the physical end is returned directly
245  if (end<head_length)
246  return end;
247 
248  // and then at begin
249  end=next_token_idx_locally(temp_start, m_buffer.vector+m_last_idx-head_length, m_end_pos);
250 
// `start` from the first search is kept unless it is at or past head_length
251  if (start>=head_length)
252  start=temp_start;
253 
254  return end;
255  }
256  }
257  else
258  {
// search offset is already past the physical end: continue from the start of the storage
259  end=next_token_idx_locally(start, m_buffer.vector+m_last_idx-head_length, m_end_pos);
260  if (end-head_length<=tail_length)
261  return end;
262  }
263 
// fallback when a range check above failed
264  start=0;
265  return start;
266 }
267 
268 void CCircularBuffer::skip_characters(int32_t num_chars)
269 {
270  auto head_length = std::distance(m_begin_pos, m_buffer_end);
271  if (head_length >= num_chars)
272  move_pointer(&m_begin_pos, m_begin_pos+num_chars);
273  else
274  move_pointer(&m_begin_pos, m_buffer.vector+num_chars-head_length);
275 
276  m_last_idx-=num_chars;
277  if (m_last_idx<0)
278  m_last_idx=0;
279 
280  m_bytes_available+=num_chars;
281  m_bytes_count-=num_chars;
282 }
283 
285 {
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably clear() - confirm.
// Marks the buffer as empty without touching the stored bytes or the
// allocation: both positions return to the start of the storage.
286  m_begin_pos=m_buffer.vector;
287  m_end_pos=m_begin_pos;
288 
// search offset restarts, the whole storage is free again
289  m_last_idx=0;
290  m_bytes_available=m_buffer.vlen;
291  m_bytes_count=0;
292 }
293 
294 void CCircularBuffer::init()
295 {
296  m_buffer=SGVector<char>();
297  m_buffer_end=NULL;
298  m_tokenizer=NULL;
299 
300  m_begin_pos=NULL;
301  m_end_pos=NULL;
302 
303  m_last_idx=0;
304  m_bytes_available=0;
305  m_bytes_count=0;
306 }
307 
308 int32_t CCircularBuffer::append_chunk(const char* source, int32_t source_size,
309  bool from_buffer_begin)
310 {
311  if (source==NULL || source_size==0)
312  {
313  SG_ERROR("CCircularBuffer::append_chunk(const char*, int32_t, bool):: Invalid parameters!\
314  Source shouldn't be NULL or zero sized\n");
315  return -1;
316  }
317 
318  if (from_buffer_begin)
319  m_end_pos=m_buffer.vector;
320 
321  sg_memcpy(m_end_pos, source, source_size);
322  move_pointer(&m_end_pos, m_end_pos+source_size);
323 
324  m_bytes_available-=source_size;
325  m_bytes_count+=source_size;
326 
327  return source_size;
328 }
329 
330 int32_t CCircularBuffer::append_chunk(FILE* source, int32_t source_size,
331  bool from_buffer_begin)
332 {
333  int32_t actually_read=fread(m_end_pos, sizeof(char), source_size, source);
334 
335  if (from_buffer_begin && actually_read==source_size)
336  m_end_pos=m_buffer.vector;
337  move_pointer(&m_end_pos, m_end_pos+actually_read);
338 
339  m_bytes_available-=actually_read;
340  m_bytes_count+=actually_read;
341 
342  return actually_read;
343 }
344 
345 void CCircularBuffer::detach_chunk(char** dest, int32_t* dest_size, int32_t dest_offset, int32_t num_bytes,
346  bool from_buffer_begin)
347 {
348  if (dest==NULL || dest_size==NULL)
349  {
350  SG_ERROR("CCircularBuffer::detach_chunk(...):: Invalid parameters! Pointers are NULL\n");
351  return;
352  }
353 
354  if (*dest==NULL)
355  {
356  *dest=SG_MALLOC(char, num_bytes+dest_offset);
357  *dest_size=num_bytes+dest_offset;
358  }
359 
360  if (*dest_size<num_bytes+dest_offset)
361  {
362  *dest=SG_REALLOC(char, *dest, *dest_size, num_bytes+dest_offset);
363  *dest_size=num_bytes+dest_offset;
364  }
365 
366  if (from_buffer_begin)
367  m_begin_pos=m_buffer.vector;
368 
369  sg_memcpy(*dest+dest_offset, m_begin_pos, num_bytes);
370  move_pointer(&m_begin_pos, m_begin_pos+num_bytes);
371 
372  m_last_idx-=num_bytes;
373  if (m_last_idx<0)
374  m_last_idx=0;
375 
376  m_bytes_available+=num_bytes;
377  m_bytes_count-=num_bytes;
378 }
379 
380 bool CCircularBuffer::has_next_locally(char* part_begin, char* part_end)
381 {
382  auto num_bytes_to_search=std::distance(part_begin, part_end);
383 
384  SGVector<char> buffer_part(part_begin, num_bytes_to_search, false);
385  m_tokenizer->set_text(buffer_part);
386 
387  return m_tokenizer->has_next();
388 }
389 
390 index_t CCircularBuffer::next_token_idx_locally(index_t &start, char* part_begin, char* part_end)
391 {
392  index_t end=0;
393  auto num_bytes_to_search=std::distance(part_begin, part_end);
394  if (num_bytes_to_search<=0)
395  {
396  start=0;
397  return m_last_idx;
398  }
399 
400  SGVector<char> buffer_part(part_begin, num_bytes_to_search, false);
401  m_tokenizer->set_text(buffer_part);
402 
403  end=m_tokenizer->next_token_idx(start);
404 
405  start+=m_last_idx;
406  m_last_idx+=end;
407 
408  if (end==num_bytes_to_search)
409  return m_last_idx;
410  else
411  return m_last_idx++;
412 }
413 
414 void CCircularBuffer::move_pointer(char** pointer, char* new_position)
415 {
416  *pointer = (new_position >= m_buffer_end)
417  ? m_buffer.vector
418  : new_position;
419 }
float distance(CJLCoverTreePoint p1, CJLCoverTreePoint p2, float64_t upper_bound)
int32_t index_t
Definition: common.h:72
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17
int32_t push(SGVector< char > source)
#define SG_ERROR(...)
Definition: SGIO.h:128
void skip_characters(int32_t num_chars)
#define SG_REF(x)
Definition: SGObject.h:52
index_t next_token_idx(index_t &start)
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
SGVector< char > pop(int32_t num_chars)
virtual bool has_next()=0
#define SG_UNREF(x)
Definition: SGObject.h:53
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual index_t next_token_idx(index_t &start)=0
void set_tokenizer(CTokenizer *tokenizer)
index_t vlen
Definition: SGVector.h:571

SHOGUN Machine Learning Toolbox - Documentation