SHOGUN  4.1.0
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
LineReader.cpp
浏览该文件的文档.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
10 #include <shogun/io/LineReader.h>
12 #include <shogun/lib/Tokenizer.h>
13 #include <shogun/io/SGIO.h>
14 #include <cstdio>
15 
16 using namespace shogun;
17 
19 {
20  init();
21 
22  m_buffer=new CCircularBuffer();
23 }
24 
25 CLineReader::CLineReader(FILE* stream, CTokenizer* tokenizer)
26 {
27  init();
28 
29  m_stream=stream;
30  m_max_token_length=10*1024*1024;
31 
32  SG_REF(tokenizer);
33  m_tokenizer=tokenizer;
34 
35  m_buffer=new CCircularBuffer(m_max_token_length);
36  m_buffer->set_tokenizer(m_tokenizer);
37 }
38 
39 CLineReader::CLineReader(int32_t max_token_length, FILE* stream, CTokenizer* tokenizer)
40 {
41  init();
42 
43  m_stream=stream;
44  m_max_token_length=max_token_length;
45 
46  SG_REF(tokenizer);
47  m_tokenizer=tokenizer;
48 
49  m_buffer=new CCircularBuffer(m_max_token_length);
50  m_buffer->set_tokenizer(m_tokenizer);
51 }
52 
54 {
55  SG_UNREF(m_tokenizer);
56  SG_UNREF(m_buffer);
57 }
58 
60 {
61  if (m_stream==NULL || m_max_token_length==0 || m_tokenizer==NULL)
62  {
63  SG_ERROR("CLineReader::has_next():: Class is not initialized\n");
64  return false;
65  }
66 
67  if (ferror(m_stream))
68  {
69  SG_ERROR("CLineReader::has_next():: Error reading file\n");
70  return false;
71  }
72 
73  if (feof(m_stream) && (m_buffer->num_bytes_contained()<=0 || !m_buffer->has_next()))
74  return false; // nothing to read
75 
76  return true;
77 }
78 
80 {
81  int32_t bytes_to_skip=0;
82  m_next_token_length=read(bytes_to_skip);
83  if (m_next_token_length==-1)
84  return;
85  else
86  m_buffer->skip_characters(bytes_to_skip);
87 }
88 
90 {
91  SGVector<char> line;
92 
93  int32_t bytes_to_skip=0;
94  m_next_token_length=read(bytes_to_skip);
95  if (m_next_token_length==-1)
96  line=SGVector<char>();
97  else
98  {
99  m_buffer->skip_characters(bytes_to_skip);
100  line=read_token(m_next_token_length-bytes_to_skip);
101  }
102 
103  return line;
104 }
105 
107 {
108  rewind(m_stream);
109  m_buffer->clear();
110 }
111 
113 {
114  SG_REF(tokenizer);
115  SG_UNREF(m_tokenizer);
116  m_tokenizer=tokenizer;
117 
118  m_buffer->set_tokenizer(tokenizer);
119 }
120 
121 void CLineReader::init()
122 {
123  m_buffer=NULL;
124  m_tokenizer=NULL;
125  m_stream=NULL;
126 
127  m_max_token_length=0;
128  m_next_token_length=-1;
129 }
130 
131 int32_t CLineReader::read(int32_t& bytes_to_skip)
132 {
133  int32_t line_end=0;
134  int32_t bytes_to_read=0;
135  int32_t temp_bytes_to_skip=0;
136 
137  while (1)
138  {
139  if (bytes_to_skip==line_end)
140  line_end=m_buffer->next_token_idx(bytes_to_skip);
141  else
142  line_end=m_buffer->next_token_idx(temp_bytes_to_skip);
143 
144  if (m_buffer->num_bytes_contained()!=0 && line_end<m_buffer->num_bytes_contained())
145  return line_end;
146  else if (m_buffer->available()==0)
147  return -1; // we need some limit in case file does not contain delimiter
148 
149  // if there is no delimiter in buffer
150  // try get more data from stream
151  // and write it into buffer
152  if (m_buffer->available() < m_max_token_length)
153  bytes_to_read=m_buffer->available();
154  else
155  bytes_to_read=m_max_token_length;
156 
157  if (feof(m_stream))
158  return line_end;
159  else
160  m_buffer->push(m_stream, bytes_to_read);
161 
162  if (ferror(m_stream))
163  {
164  SG_ERROR("CLineReader::read(int32_t&):: Error reading file\n");
165  return -1;
166  }
167  }
168 }
169 
170 SGVector<char> CLineReader::read_token(int32_t line_len)
171 {
172  SGVector<char> line;
173 
174  if (line_len==0)
175  line=SGVector<char>();
176  else
177  line=m_buffer->pop(line_len);
178 
179  return line;
180 }
virtual bool has_next()
Definition: LineReader.cpp:59
Implementation of a circular buffer. This buffer has a logical structure like a queue (FIFO)...
int32_t push(SGVector< char > source)
int32_t num_bytes_contained() const
#define SG_ERROR(...)
Definition: SGIO.h:129
void skip_characters(int32_t num_chars)
#define SG_REF(x)
Definition: SGObject.h:51
index_t next_token_idx(index_t &start)
virtual SGVector< char > read_line()
Definition: LineReader.cpp:89
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
virtual void skip_line()
Definition: LineReader.cpp:79
SGVector< char > pop(int32_t num_chars)
virtual ~CLineReader()
Definition: LineReader.cpp:53
#define SG_UNREF(x)
Definition: SGObject.h:52
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
void set_tokenizer(CTokenizer *tokenizer)
Definition: LineReader.cpp:112
void set_tokenizer(CTokenizer *tokenizer)
int32_t available() const

SHOGUN 机器学习工具包 - 项目文档