SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
LineReader.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
10 #include <shogun/io/LineReader.h>
11 #include <cstdio>
12 
13 using namespace shogun;
14 
16 {
17  init();
18 
19  m_buffer=new CCircularBuffer();
20 }
21 
22 CLineReader::CLineReader(FILE* stream, CTokenizer* tokenizer)
23 {
24  init();
25 
26  m_stream=stream;
27  m_max_token_length=10*1024*1024;
28 
29  SG_REF(tokenizer);
30  m_tokenizer=tokenizer;
31 
32  m_buffer=new CCircularBuffer(m_max_token_length);
33  m_buffer->set_tokenizer(m_tokenizer);
34 }
35 
36 CLineReader::CLineReader(int32_t max_token_length, FILE* stream, CTokenizer* tokenizer)
37 {
38  init();
39 
40  m_stream=stream;
41  m_max_token_length=max_token_length;
42 
43  SG_REF(tokenizer);
44  m_tokenizer=tokenizer;
45 
46  m_buffer=new CCircularBuffer(m_max_token_length);
47  m_buffer->set_tokenizer(m_tokenizer);
48 }
49 
51 {
52  SG_UNREF(m_tokenizer);
53  SG_UNREF(m_buffer);
54 }
55 
57 {
58  if (m_stream==NULL || m_max_token_length==0 || m_tokenizer==NULL)
59  {
60  SG_ERROR("CLineReader::has_next():: Class is not initialized\n");
61  return false;
62  }
63 
64  if (ferror(m_stream))
65  {
66  SG_ERROR("CLineReader::has_next():: Error reading file\n");
67  return false;
68  }
69 
70  if (feof(m_stream) && (m_buffer->num_bytes_contained()<=0 || !m_buffer->has_next()))
71  return false; // nothing to read
72 
73  return true;
74 }
75 
77 {
78  int32_t bytes_to_skip=0;
79  m_next_token_length=read(bytes_to_skip);
80  if (m_next_token_length==-1)
81  return;
82  else
83  m_buffer->skip_characters(bytes_to_skip);
84 }
85 
87 {
88  SGVector<char> line;
89 
90  int32_t bytes_to_skip=0;
91  m_next_token_length=read(bytes_to_skip);
92  if (m_next_token_length==-1)
93  line=SGVector<char>();
94  else
95  {
96  m_buffer->skip_characters(bytes_to_skip);
97  line=read_token(m_next_token_length-bytes_to_skip);
98  }
99 
100  return line;
101 }
102 
104 {
105  rewind(m_stream);
106  m_buffer->clear();
107 }
108 
110 {
111  SG_REF(tokenizer);
112  SG_UNREF(m_tokenizer);
113  m_tokenizer=tokenizer;
114 
115  m_buffer->set_tokenizer(tokenizer);
116 }
117 
118 void CLineReader::init()
119 {
120  m_buffer=NULL;
121  m_tokenizer=NULL;
122  m_stream=NULL;
123 
124  m_max_token_length=0;
125  m_next_token_length=-1;
126 }
127 
128 int32_t CLineReader::read(int32_t& bytes_to_skip)
129 {
130  int32_t line_end=0;
131  int32_t bytes_to_read=0;
132  int32_t temp_bytes_to_skip=0;
133 
134  while (1)
135  {
136  if (bytes_to_skip==line_end)
137  line_end=m_buffer->next_token_idx(bytes_to_skip);
138  else
139  line_end=m_buffer->next_token_idx(temp_bytes_to_skip);
140 
141  if (m_buffer->num_bytes_contained()!=0 && line_end<m_buffer->num_bytes_contained())
142  return line_end;
143  else if (m_buffer->available()==0)
144  return -1; // we need some limit in case file does not contain delimiter
145 
146  // if there is no delimiter in buffer
147  // try get more data from stream
148  // and write it into buffer
149  if (m_buffer->available() < m_max_token_length)
150  bytes_to_read=m_buffer->available();
151  else
152  bytes_to_read=m_max_token_length;
153 
154  if (feof(m_stream))
155  return line_end;
156  else
157  m_buffer->push(m_stream, bytes_to_read);
158 
159  if (ferror(m_stream))
160  {
161  SG_ERROR("CLineReader::read(int32_t&):: Error reading file\n");
162  return -1;
163  }
164  }
165 }
166 
167 SGVector<char> CLineReader::read_token(int32_t line_len)
168 {
169  SGVector<char> line;
170 
171  if (line_len==0)
172  line=SGVector<char>();
173  else
174  line=m_buffer->pop(line_len);
175 
176  return line;
177 }

SHOGUN Machine Learning Toolbox - Documentation