35 CFile(fname, rw, name)
44 SG_UNREF(m_delimiter_feat_tokenizer);
45 SG_UNREF(m_delimiter_label_tokenizer);
// Puts the tokenizer members into a known empty state: the whitespace,
// feature-delimiter, label-delimiter and line tokenizer pointers are all set to NULL.
// NOTE(review): original lines 52-54 and 59-61 are missing from this listing, so any
// further members initialised here cannot be confirmed from this view.
51 void CLibSVMFile::init()
55 m_whitespace_tokenizer=NULL;
56 m_delimiter_feat_tokenizer=NULL;
57 m_delimiter_label_tokenizer=NULL;
58 m_line_tokenizer=NULL;
// Installs the default parsing configuration.
//  - The label delimiter character is set to ','.
//  - In the feature and label delimiter tokenizers, the slot of their `delimiters` table
//    indexed by the corresponding delimiter character is set to 1.
//  - A reference is taken (SG_REF) on the whitespace, feature-delimiter and
//    label-delimiter tokenizers.
// NOTE(review): the statements that create the tokenizers and that assign
// m_delimiter_feat (original lines 64-65, 67-69, 71-72, 75-76) are not visible in this
// listing; presumably they precede the uses below - confirm against the full file.
63 void CLibSVMFile::init_with_defaults()
66 m_delimiter_label=
',';
70 SG_REF(m_whitespace_tokenizer);
73 m_delimiter_feat_tokenizer->
delimiters[m_delimiter_feat]=1;
74 SG_REF(m_delimiter_feat_tokenizer);
77 m_delimiter_label_tokenizer->
delimiters[m_delimiter_label]=1;
78 SG_REF(m_delimiter_label_tokenizer);
// GET_SPARSE_MATRIX(read_func, sg_type)
// Generates the unlabeled overload
//   get_sparse_matrix(mat_feat, num_feat, num_vec)
// for element type `sg_type`. The generated function forwards to the six-argument
// (multi-label) overload with load_labels=false, passing local `multilabel` and
// `num_classes` variables that are not handed back to the caller.
// `read_func` is not used in the lines visible here.
// NOTE(review): original line 94 onwards (end of the body, any clean-up of `multilabel`,
// and the macro instantiations) is missing from this listing - confirm the local
// label array is released there.
88 #define GET_SPARSE_MATRIX(read_func, sg_type) \
89 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec) \
91 SGVector<float64_t>* multilabel; \
92 int32_t num_classes; \
93 get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, false); \
109 #undef GET_SPARSE_MATRIX
// GET_LABELED_SPARSE_MATRIX(read_func, sg_type)
// Generates the single-label overload
//   get_sparse_matrix(mat_feat, num_feat, num_vec, labels, load_labels)
// for element type `sg_type`.
//
// The generated function:
//  1. delegates the parsing to the multi-label overload, forwarding `load_labels`;
//  2. checks with REQUIRE that every vector carries exactly one label
//     (multilabel[i].size()==1), reporting `filename` in the message otherwise;
//  3. allocates `labels` with SG_MALLOC (num_vec entries) and copies the first label of
//     each vector into it;
//  4. releases the temporary `multilabel` array with SG_FREE.
// `read_func` is not used in the lines visible here.
//
// NOTE(review): in the visible lines the size()==1 check and the multilabel[i][0] read
// are not guarded by `load_labels`; confirm what the multi-label reader stores per
// vector when load_labels is false, otherwise this check may reject such calls.
// NOTE(review): the REQUIRE message reads "%s a multilabel file" - a verb appears to be
// missing, but it is a runtime string and is left untouched here.
111 #define GET_LABELED_SPARSE_MATRIX(read_func, sg_type) \
112 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \
113 float64_t*& labels, bool load_labels) \
115 SGVector<float64_t>* multilabel; \
116 int32_t num_classes; \
117 get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, load_labels); \
119 for (int32_t i=0; i<num_vec; i++) \
121 REQUIRE(multilabel[i].size()==1, \
122 "%s a multilabel file. You are trying to read it with a single-label reader.", filename); \
124 labels=SG_MALLOC(float64_t, num_vec); \
126 for (int32_t i=0; i<num_vec; i++) \
127 labels[i]=multilabel[i][0]; \
128 SG_FREE(multilabel); \
144 #undef GET_LABELED_SPARSE_MATRIX
// GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type)
// Generates the multi-label overload
//   get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, load_labels)
// for element type `sg_type`; `read_func` names the m_parser method used to read one
// feature value.
//
// Outputs written by the generated function:
//   mat_feat    - SG_MALLOC'ed array of num_vec sparse vectors, one per line read
//   num_feat    - raised to the largest feature index seen in the file
//   num_vec     - result of get_num_lines()
//   multilabel  - SG_MALLOC'ed array of num_vec label vectors
//   num_classes - number of distinct label values collected in `classes`
//
// Per line, as far as the visible statements show:
//  1. The line is split with the whitespace tokenizer.
//  2. If load_labels is set and a token exists, the first token is read as the label
//     entry. When is_feat_entry() reports that it is a feature entry instead, it is
//     queued as a feature and the label entry is replaced by an empty vector.
//  3. All remaining tokens are queued as feature entries.
//  4. Each queued feature entry is re-tokenized with the feature-delimiter tokenizer:
//     first an integer index (read_int), then the value (read_func). The index is
//     stored as feat_index-1, while num_feat is compared against the unshifted index.
//  5. The label entry is re-tokenized with the label-delimiter tokenizer; every token
//     is read with read_real(), appended to this line's labels, and added to `classes`
//     if find_element() did not find it (-1).
//  6. current_line_ind is advanced and progress is reported via SG_PROGRESS.
//
// NOTE(review): several original lines are missing from this listing (e.g. 149-151,
// 165-168, 206-208, 217-220), including the declaration of `entry`, any initial value
// given to num_feat, and whatever condition encloses the label-parsing step - confirm
// against the full file before relying on the details above.
// NOTE(review): no visible statement checks current_line_ind against num_vec before
// indexing mat_feat/multilabel; presumably get_num_lines() and the reader agree on the
// line count - verify.
146 #define GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type) \
147 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \
148 SGVector<float64_t>*& multilabel, int32_t& num_classes, bool load_labels) \
152 SG_INFO("counting line numbers in file %s\n", filename) \
153 num_vec=get_num_lines(); \
155 int32_t current_line_ind=0; \
156 SGVector<char> line; \
158 int32_t num_feat_entries=0; \
159 DynArray<SGVector<char> > entries_feat; \
160 DynArray<float64_t > entries_label; \
161 DynArray<float64_t> classes; \
163 mat_feat=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
164 multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \
169 while (m_line_reader->has_next()) \
171 num_feat_entries=0; \
172 entries_feat.reset(SGVector<char>(false)); \
173 line=m_line_reader->read_line(); \
175 m_parser->set_tokenizer(m_whitespace_tokenizer); \
176 m_parser->set_text(line); \
178 SGVector<char> entry_label; \
179 if (load_labels && m_parser->has_next()) \
181 entry_label=m_parser->read_string(); \
182 if (is_feat_entry(entry_label)) \
184 entries_feat.push_back(entry_label); \
185 num_feat_entries++; \
186 entry_label=SGVector<char>(0); \
190 while (m_parser->has_next()) \
192 entries_feat.push_back(m_parser->read_string()); \
193 num_feat_entries++; \
196 mat_feat[current_line_ind]=SGSparseVector<sg_type>(num_feat_entries); \
197 for (int32_t i=0; i<num_feat_entries; i++) \
199 m_parser->set_tokenizer(m_delimiter_feat_tokenizer); \
200 m_parser->set_text(entries_feat[i]); \
202 int32_t feat_index=0; \
204 if (m_parser->has_next()) \
205 feat_index=m_parser->read_int(); \
209 if (m_parser->has_next()) \
210 entry=m_parser->read_func(); \
212 if (feat_index>num_feat) \
213 num_feat=feat_index; \
215 mat_feat[current_line_ind].features[i].feat_index=feat_index-1; \
216 mat_feat[current_line_ind].features[i].entry=entry; \
221 m_parser->set_tokenizer(m_delimiter_label_tokenizer); \
222 m_parser->set_text(entry_label); \
224 int32_t num_label_entries=0; \
225 entries_label.reset(0); \
227 while (m_parser->has_next()) \
229 num_label_entries++; \
230 float64_t label_val=m_parser->read_real(); \
232 if (classes.find_element(label_val)==-1) \
233 classes.push_back(label_val); \
235 entries_label.push_back(label_val); \
237 multilabel[current_line_ind]=SGVector<float64_t>(num_label_entries); \
239 for (int32_t j=0; j < num_label_entries; j++) \
240 multilabel[current_line_ind][j]=entries_label[j]; \
244 current_line_ind++; \
245 SG_PROGRESS(current_line_ind, 0, num_vec, 1, "LOADING:\t") \
247 num_classes=classes.get_num_elements(); \
251 SG_INFO("file successfully read\n") \
267 #undef GET_MULTI_LABELED_SPARSE_MATRIX
// SET_SPARSE_MATRIX(format, sg_type)
// Generates the unlabeled overload
//   set_sparse_matrix(matrix, num_feat, num_vec)
// for element type `sg_type`. It forwards to a four-argument overload with a NULL
// label pointer; because that pointer is typed SGVector<float64_t>*, the call resolves
// to the multi-label writer rather than the float64_t* one.
// `format` is not used in the lines visible here.
// NOTE(review): original lines 275-289 (end of the body and the macro instantiations)
// are missing from this listing.
269 #define SET_SPARSE_MATRIX(format, sg_type) \
270 void CLibSVMFile::set_sparse_matrix( \
271 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
273 SGVector <float64_t>* labels = NULL; \
274 set_sparse_matrix(matrix, num_feat, num_vec, labels); \
290 #undef SET_SPARSE_MATRIX
// SET_LABELED_SPARSE_MATRIX(format, sg_type)
// Generates the single-label overload
//   set_sparse_matrix(matrix, num_feat, num_vec, labels)
// for element type `sg_type`. Each labels[i] is wrapped in a one-element
// SGVector<float64_t>, the resulting temporary array is passed to the multi-label
// writer, and the array is released with SG_FREE afterwards.
// `format` is not used in the lines visible here.
// NOTE(review): `labels` is dereferenced for every i<num_vec without a visible NULL
// check - presumably callers always supply num_vec labels; verify.
292 #define SET_LABELED_SPARSE_MATRIX(format, sg_type) \
293 void CLibSVMFile::set_sparse_matrix( \
294 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
295 const float64_t* labels) \
297 SGVector<float64_t>* multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \
299 for (int32_t i=0; i<num_vec; i++) \
301 multilabel[i]=SGVector<float64_t>(1); \
302 multilabel[i][0]=labels[i]; \
305 set_sparse_matrix(matrix, num_feat, num_vec, multilabel); \
306 SG_FREE(multilabel); \
322 #undef SET_LABELED_SPARSE_MATRIX
// SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type)
// Generates the multi-label writer
//   set_sparse_matrix(matrix, num_feat, num_vec, multilabel)
// for element type `sg_type`; `format` is the printf conversion (without the leading
// '%') used for one feature value.
//
// One text line is written to `file` per vector i:
//  - when `multilabel` is non-NULL: an empty label vector produces a single space;
//    otherwise each label is printed with "%lg", with a space written after the last
//    label and "," written between labels;
//  - every feature entry is printed as <feat_index+1><char><value> followed by a
//    space, i.e. the stored index is shifted up by one on output;
//  - the line is terminated with "\n".
// `num_feat` is not used in the lines visible here.
//
// NOTE(review): original line 344 (presumably the `else` pairing the "," with the
// last-label test) and line 353 (the argument supplied for "%c") are missing from this
// listing - confirm against the full file.
324 #define SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type) \
325 void CLibSVMFile::set_sparse_matrix( \
326 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
327 const SGVector<float64_t>* multilabel) \
331 for (int32_t i=0; i<num_vec; i++) \
333 if (multilabel!=NULL) \
335 if (multilabel[i].size()==0) \
336 fprintf(file, " "); \
338 for (int32_t j=0; j <multilabel[i].size(); j++) \
340 fprintf(file, "%lg", multilabel[i][j]); \
342 if (j==multilabel[i].size()-1) \
343 fprintf(file, " "); \
345 fprintf(file, ","); \
349 for (int32_t j=0; j<matrix[i].num_feat_entries; j++) \
351 fprintf(file, "%d%c%" format " ", \
352 matrix[i].features[j].feat_index+1, \
354 matrix[i].features[j].entry); \
356 fprintf(file, "\n"); \
375 #undef SET_MULTI_LABELED_SPARSE_MATRIX
// Returns an int32_t line count for the file (used by the readers to size their
// output arrays). The only statement visible here resets m_line_reader.
// NOTE(review): the counting loop and the return statement (original lines 378-384 and
// 386 onwards) are missing from this listing; presumably the reset rewinds the reader
// so the file can be parsed after counting - confirm against the full file.
377 int32_t CLibSVMFile::get_num_lines()
385 m_line_reader->
reset();
void set_text(SGVector< char > text)
#define SET_LABELED_SPARSE_MATRIX(format, sg_type)
#define GET_SPARSE_MATRIX(read_func, sg_type)
#define SET_SPARSE_MATRIX(format, sg_type)
virtual float64_t read_real()
Class for buffered reading from an ASCII file.
#define GET_LABELED_SPARSE_MATRIX(read_func, sg_type)
Class for reading from a string.
A File access base class.
void set_tokenizer(CTokenizer *tokenizer)
All classes and functions are contained in the shogun namespace.
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
SGVector< bool > delimiters
#define GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type)
#define SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type)