SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
LibSVMFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Jiaolong Xu
8  * Written (W) 2013 Evgeniy Andreev (gsomix)
9  */
10 
11 #include <shogun/io/LibSVMFile.h>
12 
13 #include <shogun/lib/SGVector.h>
15 #include <shogun/base/DynArray.h>
16 #include <shogun/io/LineReader.h>
17 #include <shogun/io/Parser.h>
19 
20 using namespace shogun;
21 
23 {
 // Only null-initializes the members through init(); init_with_defaults()
 // is not called on this path, so the tokenizer, parser and line reader
 // pointers all remain NULL afterwards.
24  init();
25 }
26 
27 CLibSVMFile::CLibSVMFile(FILE* f, const char* name) :
28  CFile(f, name)
29 {
30  init();
31  init_with_defaults();
32 }
33 
34 CLibSVMFile::CLibSVMFile(const char* fname, char rw, const char* name) :
35  CFile(fname, rw, name)
36 {
37  init();
38  init_with_defaults();
39 }
40 
42 {
 // Drop this object's reference on every helper it holds: the four
 // tokenizers, the parser and the line reader.
 // NOTE(review): after init() alone these pointers are NULL; presumably
 // SG_UNREF tolerates NULL — confirm against SGObject.h.
43  SG_UNREF(m_whitespace_tokenizer);
44  SG_UNREF(m_delimiter_feat_tokenizer);
45  SG_UNREF(m_delimiter_label_tokenizer);
46  SG_UNREF(m_line_tokenizer);
47  SG_UNREF(m_parser);
48  SG_UNREF(m_line_reader);
49 }
50 
51 void CLibSVMFile::init()
52 {
53  m_delimiter_feat=0;
54 
55  m_whitespace_tokenizer=NULL;
56  m_delimiter_feat_tokenizer=NULL;
57  m_delimiter_label_tokenizer=NULL;
58  m_line_tokenizer=NULL;
59  m_parser=NULL;
60  m_line_reader=NULL;
61 }
62 
63 void CLibSVMFile::init_with_defaults()
64 {
65  m_delimiter_feat=':';
66  m_delimiter_label=',';
67 
68  m_whitespace_tokenizer=new CDelimiterTokenizer(true);
69  m_whitespace_tokenizer->delimiters[' ']=1;
70  SG_REF(m_whitespace_tokenizer);
71 
72  m_delimiter_feat_tokenizer=new CDelimiterTokenizer(true);
73  m_delimiter_feat_tokenizer->delimiters[m_delimiter_feat]=1;
74  SG_REF(m_delimiter_feat_tokenizer);
75 
76  m_delimiter_label_tokenizer=new CDelimiterTokenizer(true);
77  m_delimiter_label_tokenizer->delimiters[m_delimiter_label]=1;
78  SG_REF(m_delimiter_label_tokenizer);
79 
80  m_line_tokenizer=new CDelimiterTokenizer(true);
81  m_line_tokenizer->delimiters['\n']=1;
82  SG_REF(m_line_tokenizer);
83 
84  m_parser=new CParser();
85  m_line_reader=new CLineReader(file, m_line_tokenizer);
86 }
87 
88 #define GET_SPARSE_MATRIX(read_func, sg_type) \
89 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec) \
90 { \
91  SGVector<float64_t>* multilabel; \
92  int32_t num_classes; \
93  get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, false); \
94 }
95 
96 GET_SPARSE_MATRIX(read_bool, bool)
97 GET_SPARSE_MATRIX(read_char, int8_t)
98 GET_SPARSE_MATRIX(read_byte, uint8_t)
99 GET_SPARSE_MATRIX(read_char, char)
100 GET_SPARSE_MATRIX(read_int, int32_t)
101 GET_SPARSE_MATRIX(read_uint, uint32_t)
102 GET_SPARSE_MATRIX(read_short_real, float32_t)
103 GET_SPARSE_MATRIX(read_real, float64_t)
104 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
105 GET_SPARSE_MATRIX(read_short, int16_t)
106 GET_SPARSE_MATRIX(read_word, uint16_t)
107 GET_SPARSE_MATRIX(read_long, int64_t)
108 GET_SPARSE_MATRIX(read_ulong, uint64_t)
109 #undef GET_SPARSE_MATRIX
110 
111 #define GET_LABELED_SPARSE_MATRIX(read_func, sg_type) \
112 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \
113  float64_t*& labels, bool load_labels) \
114 { \
115  SGVector<float64_t>* multilabel; \
116  int32_t num_classes; \
117  get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, load_labels); \
118  \
119  for (int32_t i=0; i<num_vec; i++) \
120  { \
121  REQUIRE(multilabel[i].size()==1, \
122  "%s a multilabel file. You are trying to read it with a single-label reader.", filename); \
123  } \
124  labels=SG_MALLOC(float64_t, num_vec); \
125  \
126  for (int32_t i=0; i<num_vec; i++) \
127  labels[i]=multilabel[i][0]; \
128  SG_FREE(multilabel); \
129 } \
130 
131 GET_LABELED_SPARSE_MATRIX(read_bool, bool)
132 GET_LABELED_SPARSE_MATRIX(read_char, int8_t)
133 GET_LABELED_SPARSE_MATRIX(read_byte, uint8_t)
134 GET_LABELED_SPARSE_MATRIX(read_char, char)
135 GET_LABELED_SPARSE_MATRIX(read_int, int32_t)
136 GET_LABELED_SPARSE_MATRIX(read_uint, uint32_t)
137 GET_LABELED_SPARSE_MATRIX(read_short_real, float32_t)
139 GET_LABELED_SPARSE_MATRIX(read_long_real, floatmax_t)
140 GET_LABELED_SPARSE_MATRIX(read_short, int16_t)
141 GET_LABELED_SPARSE_MATRIX(read_word, uint16_t)
142 GET_LABELED_SPARSE_MATRIX(read_long, int64_t)
143 GET_LABELED_SPARSE_MATRIX(read_ulong, uint64_t)
144 #undef GET_LABELED_SPARSE_MATRIX
145 
/* Generates, for one element type, the get_sparse_matrix() overload that does
 * the actual parsing; the feature-only and single-label overloads delegate
 * to it.
 *
 * read_func  CParser method used to read one feature value
 * sg_type    element type of the sparse vectors
 *
 * Outputs of the generated function:
 *  mat_feat     array of num_vec sparse vectors, allocated with SG_MALLOC
 *  num_feat     largest feature index found in the file
 *  num_vec      number of lines as reported by get_num_lines()
 *  multilabel   array of num_vec label vectors, allocated with SG_MALLOC even
 *               when load_labels is false (the entries are then never
 *               assigned); the caller has to release it
 *  num_classes  number of distinct label values seen
 *
 * Per line:
 *  1. The line is split with the whitespace tokenizer. When load_labels is
 *     set, the first token is taken as the label entry unless
 *     is_feat_entry() recognizes it as a feature entry, in which case it is
 *     queued as a feature and the label entry is left empty.
 *  2. Every remaining token is a feature entry. Each is split with the
 *     feature-delimiter tokenizer into an index (read_int) and a value
 *     (read_func). The index is stored minus one, i.e. indices in the file
 *     are one-based. A token without an index keeps feat_index at 0 and is
 *     therefore stored with index -1.
 *  3. When load_labels is set, the label entry is split with the
 *     label-delimiter tokenizer and each value is read with read_real();
 *     values not seen before are appended to the list of classes.
 *
 * Parsing runs between SG_SET_LOCALE_C and SG_RESET_LOCALE.
 * NOTE(review): presumably this makes number parsing independent of the
 * user's locale — confirm against the macro definitions.
 */
146 #define GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type) \
147 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \
148  SGVector<float64_t>*& multilabel, int32_t& num_classes, bool load_labels) \
149 { \
150  num_feat=0; \
151  \
152  SG_INFO("counting line numbers in file %s\n", filename) \
153  num_vec=get_num_lines(); \
154  \
155  int32_t current_line_ind=0; \
156  SGVector<char> line; \
157  \
158  int32_t num_feat_entries=0; \
159  DynArray<SGVector<char> > entries_feat; \
160  DynArray<float64_t > entries_label; \
161  DynArray<float64_t> classes; \
162  \
163  mat_feat=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
164  multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \
165  \
166  num_classes=0; \
167  SG_SET_LOCALE_C; \
168  \
169  while (m_line_reader->has_next()) \
170  { \
171  num_feat_entries=0; \
172  entries_feat.reset(SGVector<char>(false)); \
173  line=m_line_reader->read_line(); \
174  \
175  m_parser->set_tokenizer(m_whitespace_tokenizer); \
176  m_parser->set_text(line); \
177  \
178  SGVector<char> entry_label; \
179  if (load_labels && m_parser->has_next()) \
180  { \
181  entry_label=m_parser->read_string(); \
182  if (is_feat_entry(entry_label)) \
183  { \
184  entries_feat.push_back(entry_label); \
185  num_feat_entries++; \
186  entry_label=SGVector<char>(0); \
187  } \
188  } \
189  \
190  while (m_parser->has_next()) \
191  { \
192  entries_feat.push_back(m_parser->read_string()); \
193  num_feat_entries++; \
194  } \
195  \
196  mat_feat[current_line_ind]=SGSparseVector<sg_type>(num_feat_entries); \
197  for (int32_t i=0; i<num_feat_entries; i++) \
198  { \
199  m_parser->set_tokenizer(m_delimiter_feat_tokenizer); \
200  m_parser->set_text(entries_feat[i]); \
201  \
202  int32_t feat_index=0; \
203  \
204  if (m_parser->has_next()) \
205  feat_index=m_parser->read_int(); \
206  \
207  sg_type entry=0; \
208  \
209  if (m_parser->has_next()) \
210  entry=m_parser->read_func(); \
211  \
212  if (feat_index>num_feat) \
213  num_feat=feat_index; \
214  \
215  mat_feat[current_line_ind].features[i].feat_index=feat_index-1; \
216  mat_feat[current_line_ind].features[i].entry=entry; \
217  } \
218  \
219  if (load_labels) \
220  { \
221  m_parser->set_tokenizer(m_delimiter_label_tokenizer); \
222  m_parser->set_text(entry_label); \
223  \
224  int32_t num_label_entries=0; \
225  entries_label.reset(0); \
226  \
227  while (m_parser->has_next()) \
228  { \
229  num_label_entries++; \
230  float64_t label_val=m_parser->read_real(); \
231  \
232  if (classes.find_element(label_val)==-1) \
233  classes.push_back(label_val); \
234  \
235  entries_label.push_back(label_val); \
236  } \
237  multilabel[current_line_ind]=SGVector<float64_t>(num_label_entries); \
238  \
239  for (int32_t j=0; j < num_label_entries; j++) \
240  multilabel[current_line_ind][j]=entries_label[j]; \
241  \
242  } \
243  \
244  current_line_ind++; \
245  SG_PROGRESS(current_line_ind, 0, num_vec, 1, "LOADING:\t") \
246  } \
247  num_classes=classes.get_num_elements(); \
248  \
249  SG_RESET_LOCALE; \
250  \
251  SG_INFO("file successfully read\n") \
252 }
253 
254 GET_MULTI_LABELED_SPARSE_MATRIX(read_bool, bool)
255 GET_MULTI_LABELED_SPARSE_MATRIX(read_char, int8_t)
256 GET_MULTI_LABELED_SPARSE_MATRIX(read_byte, uint8_t)
257 GET_MULTI_LABELED_SPARSE_MATRIX(read_char, char)
258 GET_MULTI_LABELED_SPARSE_MATRIX(read_int, int32_t)
259 GET_MULTI_LABELED_SPARSE_MATRIX(read_uint, uint32_t)
263 GET_MULTI_LABELED_SPARSE_MATRIX(read_short, int16_t)
264 GET_MULTI_LABELED_SPARSE_MATRIX(read_word, uint16_t)
265 GET_MULTI_LABELED_SPARSE_MATRIX(read_long, int64_t)
266 GET_MULTI_LABELED_SPARSE_MATRIX(read_ulong, uint64_t)
267 #undef GET_MULTI_LABELED_SPARSE_MATRIX
268 
269 #define SET_SPARSE_MATRIX(format, sg_type) \
270 void CLibSVMFile::set_sparse_matrix( \
271  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
272 { \
273  SGVector <float64_t>* labels = NULL; \
274  set_sparse_matrix(matrix, num_feat, num_vec, labels); \
275 }
276 
277 SET_SPARSE_MATRIX(SCNi32, bool)
278 SET_SPARSE_MATRIX(SCNi8, int8_t)
279 SET_SPARSE_MATRIX(SCNu8, uint8_t)
280 SET_SPARSE_MATRIX(SCNu8, char)
281 SET_SPARSE_MATRIX(SCNi32, int32_t)
282 SET_SPARSE_MATRIX(SCNu32, uint32_t)
283 SET_SPARSE_MATRIX(SCNi64, int64_t)
284 SET_SPARSE_MATRIX(SCNu64, uint64_t)
288 SET_SPARSE_MATRIX(SCNi16, int16_t)
289 SET_SPARSE_MATRIX(SCNu16, uint16_t)
290 #undef SET_SPARSE_MATRIX
291 
292 #define SET_LABELED_SPARSE_MATRIX(format, sg_type) \
293 void CLibSVMFile::set_sparse_matrix( \
294  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
295  const float64_t* labels) \
296 { \
297  SGVector<float64_t>* multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \
298  \
299  for (int32_t i=0; i<num_vec; i++) \
300  { \
301  multilabel[i]=SGVector<float64_t>(1); \
302  multilabel[i][0]=labels[i]; \
303  } \
304  \
305  set_sparse_matrix(matrix, num_feat, num_vec, multilabel); \
306  SG_FREE(multilabel); \
307 }
308 
309 SET_LABELED_SPARSE_MATRIX(SCNi32, bool)
310 SET_LABELED_SPARSE_MATRIX(SCNi8, int8_t)
311 SET_LABELED_SPARSE_MATRIX(SCNu8, uint8_t)
312 SET_LABELED_SPARSE_MATRIX(SCNu8, char)
313 SET_LABELED_SPARSE_MATRIX(SCNi32, int32_t)
314 SET_LABELED_SPARSE_MATRIX(SCNu32, uint32_t)
315 SET_LABELED_SPARSE_MATRIX(SCNi64, int64_t)
316 SET_LABELED_SPARSE_MATRIX(SCNu64, uint64_t)
320 SET_LABELED_SPARSE_MATRIX(SCNi16, int16_t)
321 SET_LABELED_SPARSE_MATRIX(SCNu16, uint16_t)
322 #undef SET_LABELED_SPARSE_MATRIX
323 
324 #define SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type) \
325 void CLibSVMFile::set_sparse_matrix( \
326  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
327  const SGVector<float64_t>* multilabel) \
328 { \
329  SG_SET_LOCALE_C; \
330  \
331  for (int32_t i=0; i<num_vec; i++) \
332  { \
333  if (multilabel!=NULL) \
334  { \
335  if (multilabel[i].size()==0) \
336  fprintf(file, " "); \
337  \
338  for (int32_t j=0; j <multilabel[i].size(); j++) \
339  { \
340  fprintf(file, "%lg", multilabel[i][j]); \
341  \
342  if (j==multilabel[i].size()-1) \
343  fprintf(file, " "); \
344  else \
345  fprintf(file, ","); \
346  } \
347  } \
348  \
349  for (int32_t j=0; j<matrix[i].num_feat_entries; j++) \
350  { \
351  fprintf(file, "%d%c%" format " ", \
352  matrix[i].features[j].feat_index+1, \
353  m_delimiter_feat, \
354  matrix[i].features[j].entry); \
355  } \
356  fprintf(file, "\n"); \
357  } \
358  \
359  SG_RESET_LOCALE; \
360 }
361 
363 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi8, int8_t)
364 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu8, uint8_t)
366 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi32, int32_t)
367 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu32, uint32_t)
368 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi64, int64_t)
369 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu64, uint64_t)
373 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi16, int16_t)
374 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu16, uint16_t)
375 #undef SET_MULTI_LABELED_SPARSE_MATRIX
376 
377 int32_t CLibSVMFile::get_num_lines()
378 {
379  int32_t num_lines=0;
380  while (m_line_reader->has_next())
381  {
382  m_line_reader->skip_line();
383  num_lines++;
384  }
385  m_line_reader->reset();
386 
387  return num_lines;
388 }
389 
390 bool CLibSVMFile::is_feat_entry(const SGVector<char> entry)
391 {
392  CParser* parser = new CParser();
393  parser->set_tokenizer(m_delimiter_feat_tokenizer);
394  parser->set_text(entry);
395  bool isfeat = false;
396 
397  if (parser->has_next())
398  {
399  parser->read_real();
400 
401  if (parser->has_next())
402  isfeat = true;
403 
404  }
405 
406  SG_UNREF(parser);
407 
408  return isfeat;
409 }
void set_text(SGVector< char > text)
Definition: Parser.cpp:138
#define SET_LABELED_SPARSE_MATRIX(format, sg_type)
Definition: LibSVMFile.cpp:292
virtual bool has_next()
Definition: LineReader.cpp:59
#define GET_SPARSE_MATRIX(read_func, sg_type)
Definition: LibSVMFile.cpp:88
#define SET_SPARSE_MATRIX(format, sg_type)
Definition: LibSVMFile.cpp:269
virtual ~CLibSVMFile()
Definition: LibSVMFile.cpp:41
virtual float64_t read_real()
FILE * file
Definition: File.h:505
Class for buffered reading from an ASCII file.
Definition: LineReader.h:24
#define SG_REF(x)
Definition: SGObject.h:54
#define GET_LABELED_SPARSE_MATRIX(read_func, sg_type)
Definition: LibSVMFile.cpp:111
virtual bool has_next()
Definition: Parser.cpp:39
Class for reading from a string.
Definition: Parser.h:23
double float64_t
Definition: common.h:50
long double floatmax_t
Definition: common.h:51
A File access base class.
Definition: File.h:34
virtual void skip_line()
Definition: LineReader.cpp:79
void set_tokenizer(CTokenizer *tokenizer)
Definition: Parser.cpp:146
float float32_t
Definition: common.h:49
#define SG_UNREF(x)
Definition: SGObject.h:55
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
#define GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type)
Definition: LibSVMFile.cpp:146
#define SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type)
Definition: LibSVMFile.cpp:324

SHOGUN Machine Learning Toolbox - Documentation