SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
VwParser.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights
3  * embodied in the content of this file are licensed under the BSD
4  * (revised) open source license.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Written (W) 2011 Shashwat Lal Das
12  * Adaptation of Vowpal Wabbit v5.1.
13  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
14  */
15 
18 
19 using namespace shogun;
20 
/* Constructor body: creates a new CVwEnvironment for this object and starts
 * with caching switched off (write_cache false, no cache_writer).
 * NOTE(review): the signature line (listing line 21) and listing line 25 are
 * absent from this extract - presumably the default constructor; confirm
 * against the original file. */
22  : CSGObject()
23 {
24  env = new CVwEnvironment();
26  write_cache = false;
27  cache_writer = NULL;
28 }
29 
/* Constructor body: adopts the caller-supplied environment env_to_use instead
 * of allocating one, and starts with caching switched off.
 * NOTE(review): the signature line (listing line 30) and listing line 36 are
 * absent from this extract - confirm against the original file. */
31  : CSGObject()
32 {
    /* A null environment is rejected up front */
33  ASSERT(env_to_use)
34 
35  env = env_to_use;
37  write_cache = false;
38  cache_writer = NULL;
    /* The environment was passed in from outside, so a reference is taken on it here */
39  SG_REF(env);
40 }
41 
/* Releases this object's reference on env.
 * NOTE(review): the signature line (listing line 42) and listing line 45 are
 * absent from this extract - presumably the destructor; whatever line 45 did
 * (possibly releasing cache_writer) cannot be seen here, confirm against the
 * original file. */
43 {
44  SG_UNREF(env);
46 }
47 
/* Reads one line from buf and parses it as a '|'-separated example into ae:
 * an optional leading label section followed by one section per namespace,
 * each holding space-separated features. Every feature is hashed, masked with
 * env->mask and pushed into ae->atomics[index]; ae->sum_feat_sq[index]
 * accumulates the squared values. Returns the number of characters read
 * (0 when read_line yields nothing).
 * NOTE(review): the signature line (listing line 48) is absent from this
 * extract; buf and ae are presumably its parameters - confirm against the
 * original file. */
49 {
50  char *line=NULL;
51  int32_t num_chars = buf->read_line(line);
    /* Nothing was read: report 0 without touching ae */
52  if (num_chars == 0)
53  return num_chars;
54 
55  /* Mark begin and end of example in the buffer */
56  substring example_string = {line, line + num_chars};
57 
58  /* Channels containing separate namespaces/label information*/
59  channels.erase();
60 
61  /* Split at '|' character */
62  tokenize('|', example_string, channels);
63 
64  /* If first char is not '|', then the first channel contains label data */
    /* NOTE(review): &channels[1] is formed before it is known that two channels exist
     * (e.g. a line without any '|'); whether that is safe depends on the container - confirm */
65  substring* feature_start = &channels[1];
66 
67  if (*line == '|')
68  feature_start = &channels[0]; /* Unlabelled data */
69  else
70  {
71  /* First channel has label info */
72  substring label_space = channels[0];
    /* If safe_index returns a position other than label_space.end, the label text
     * is taken to begin just after that position (i.e. after the tab) */
73  char* tab_location = safe_index(label_space.start, '\t', label_space.end);
74  if (tab_location != label_space.end)
75  label_space.start = tab_location+1;
76 
77  /* Split the label space on spaces */
78  tokenize(' ',label_space,words);
79  if (words.index() > 0 && words.last().end == label_space.end) //The last field is a tag, so record and strip it off
80  {
81  substring tag = words.pop();
82  ae->tag.push_many(tag.start, tag.end - tag.start);
83  }
84 
    /* The remaining words are handed to the label object; the resulting label
     * value is then passed to set_minmax */
85  ae->ld->label_from_substring(words);
86  set_minmax(ae->ld->label);
87  }
88 
89  vw_size_t mask = env->mask;
90 
91  /* Now parse the individual channels, i.e., namespaces */
92  for (substring* i = feature_start; i != channels.end; i++)
93  {
94  substring channel = *i;
95 
96  tokenize(' ',channel, words);
    /* A channel with no tokens contributes nothing */
97  if (words.begin == words.end)
98  continue;
99 
100  /* Set default scale value for channel */
101  float32_t channel_v = 1.;
102  vw_size_t channel_hash;
103 
104  /* Index by which to refer to the namespace */
105  vw_size_t index = 0;
106  bool new_index = false;
    /* Number of leading words to skip in the feature loop (1 when words[0] is the namespace token) */
107  vw_size_t feature_offset = 0;
108 
    /* A channel that does not start with a space begins with its namespace token */
109  if (channel.start[0] != ' ')
110  {
111  /* Nonanonymous namespace specified */
112  feature_offset++;
    /* words[0] is the namespace token; its value part (if any) becomes the channel scale */
113  feature_value(words[0], name, channel_v);
114 
115  if (name.index() > 0)
116  {
    /* The namespace index is the first character of the namespace name */
117  index = (unsigned char)(*name[0].start);
    /* First time this namespace is seen in the example: reset its accumulator */
118  if (ae->atomics[index].begin == ae->atomics[index].end)
119  {
120  ae->sum_feat_sq[index] = 0;
121  new_index = true;
122  }
123  }
    /* NOTE(review): name[0] is read here even when name.index() == 0 above - confirm this cannot happen */
124  channel_hash = hasher(name[0], hash_base);
125  }
126  else
127  {
128  /* Use default namespace with index below */
129  index = (unsigned char)' ';
130  if (ae->atomics[index].begin == ae->atomics[index].end)
131  {
132  ae->sum_feat_sq[index] = 0;
133  new_index = true;
134  }
135  channel_hash = 0;
136  }
137 
138  for (substring* j = words.begin+feature_offset; j != words.end; j++)
139  {
140  /* Get individual features and multiply by scale value */
141  float32_t v = 0.0;
142  feature_value(*j, name, v);
143  v *= channel_v;
144 
145  /* Hash feature */
    /* The feature name is hashed with the channel hash as seed, then reduced with mask */
146  vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
147  VwFeature f = {v,word_hash};
148  ae->sum_feat_sq[index] += v*v;
149  ae->atomics[index].push(f);
150  }
151 
152  /* Add index to list of indices if required */
    /* Only namespaces that were empty before this channel and received at least one feature are listed */
153  if (new_index && ae->atomics[index].begin != ae->atomics[index].end)
154  ae->indices.push(index);
155 
156  }
157 
    /* NOTE(review): the statement controlled by this if (listing line 159) is absent from this
     * extract; as shown, the if would govern the return below - confirm against the original file */
158  if (write_cache)
160 
161  return num_chars;
162 }
163 
/* Reads one line from buf and parses it as space-separated tokens: the first
 * token is converted to the label, every following token is passed through
 * feature_value, hashed (seed 0), masked with env->mask and stored under the
 * single namespace index ' '. Weight is fixed to 1 and initial to 0.
 * Returns the number of characters read (0 when read_line yields nothing).
 * NOTE(review): the signature line (listing line 164) is absent from this
 * extract; buf and ae are presumably its parameters - confirm against the
 * original file. */
165 {
166  char *line=NULL;
167  int32_t num_chars = buf->read_line(line);
168  if (num_chars == 0)
169  return num_chars;
170 
171  /* Mark begin and end of example in the buffer */
172  substring example_string = {line, line + num_chars};
173 
174  vw_size_t mask = env->mask;
175  tokenize(' ', example_string, words);
176 
    /* NOTE(review): words[0] is read without checking that the line produced any token
     * (e.g. a line of spaces only) - confirm the container tolerates this */
177  ae->ld->label = SGIO::float_of_substring(words[0]);
178  ae->ld->weight = 1.;
179  ae->ld->initial = 0.;
180  set_minmax(ae->ld->label);
181 
    /* Features start at the second token */
182  substring* feature_start = &words[1];
183 
184  vw_size_t index = (unsigned char)' '; // Any default namespace is ok
185  vw_size_t channel_hash = 0;
    /* The namespace accumulator is reset and the index is listed unconditionally,
     * even if no feature token follows the label */
186  ae->sum_feat_sq[index] = 0;
187  ae->indices.push(index);
188  /* Now parse the individual features */
189  for (substring* i = feature_start; i != words.end; i++)
190  {
191  float32_t v;
192  feature_value(*i, name, v);
193 
194  vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
195  VwFeature f = {v,word_hash};
196  ae->sum_feat_sq[index] += v*v;
197  ae->atomics[index].push(f);
198  }
199 
    /* NOTE(review): the statement controlled by this if (listing line 201) is absent from this
     * extract; as shown, the if would govern the return below - confirm against the original file */
200  if (write_cache)
202 
203  return num_chars;
204 }
205 
/* Reads one line from buf and parses it as space-separated tokens: the first
 * token is converted to the label, and each following token becomes a feature
 * whose hash is its position (0, 1, 2, ...) masked with env->mask, stored
 * under the single namespace index ' '. Weight is fixed to 1 and initial to 0.
 * Returns the number of characters read (0 when read_line yields nothing).
 * NOTE(review): the signature line (listing line 206) is absent from this
 * extract; buf and ae are presumably its parameters - confirm against the
 * original file. */
207 {
208  char *line=NULL;
209  int32_t num_chars = buf->read_line(line);
210  if (num_chars == 0)
211  return num_chars;
212 
213  // Mark begin and end of example in the buffer
214  substring example_string = {line, line + num_chars};
215 
216  vw_size_t mask = env->mask;
217  tokenize(' ', example_string, words);
218 
    // NOTE(review): words[0] is read without checking that the line produced any token - confirm
219  ae->ld->label = SGIO::float_of_substring(words[0]);
220  ae->ld->weight = 1.;
221  ae->ld->initial = 0.;
222  set_minmax(ae->ld->label);
223 
    // Features start at the second token
224  substring* feature_start = &words[1];
225 
226  vw_size_t index = (unsigned char)' ';
227 
228  ae->sum_feat_sq[index] = 0;
229  ae->indices.push(index);
230  // Now parse individual features
    // j counts the feature tokens seen so far and serves as the (pre-mask) hash
231  int32_t j=0;
232  for (substring* i = feature_start; i != words.end; i++)
233  {
    // NOTE(review): the declaration of v (listing line 234) is absent from this extract;
    // presumably the token *i converted to a float - confirm against the original file
235  vw_size_t word_hash = j & mask;
236  VwFeature f = {v,word_hash};
237  ae->sum_feat_sq[index] += v*v;
238  ae->atomics[index].push(f);
239  j++;
240  }
241 
    // NOTE(review): the statement controlled by this if (listing line 243) is absent from this
    // extract; as shown, the if would govern the return below - confirm against the original file
242  if (write_cache)
244 
245  return num_chars;
246 }
247 
248 void CVwParser::init_cache(char * fname, EVwCacheType type)
249 {
250  char* file_name = fname;
251  char default_cache_name[] = "vw_cache.dat.cache";
252 
253  if (!fname)
254  file_name = default_cache_name;
255 
256  write_cache = true;
257  cache_type = type;
258 
259  switch (type)
260  {
261  case C_NATIVE:
262  cache_writer = new CVwNativeCacheWriter(file_name, env);
263  return;
264  case C_PROTOBUF:
265  SG_ERROR("Protocol buffers cache support is not implemented yet.\n")
266  }
267 
268  SG_ERROR("Unexpected cache type specified!\n")
269 }
270 
/* Splits the substring s at ':' into feat_name and derives the numeric value v:
 * with zero or one part v is 1.0, with two parts v is the second part parsed as
 * a float (a NaN result is reported through SG_SERROR), and more than two parts
 * is reported through SG_SERROR.
 * NOTE(review): the signature line (listing line 271) is absent from this
 * extract; s, feat_name and v are presumably its parameters, with feat_name
 * and v acting as outputs - confirm against the original file. */
272 {
273  // Get the value of the feature in the substring
274  tokenize(':', s, feat_name);
275 
    // The number of ':'-separated parts decides how v is obtained
276  switch (feat_name.index())
277  {
278  // If feature value is not specified, assume 1.0
279  case 0:
280  case 1:
281  v = 1.;
282  break;
283  case 2:
284  v = SGIO::float_of_substring(feat_name[1]);
285  if (CMath::is_nan(v))
286  SG_SERROR("error NaN value for feature %s! Terminating!\n",
287  SGIO::c_string_of_substring(feat_name[0]));
288  break;
289  default:
    // NOTE(review): the argument line completing this call (listing line 291) is absent
    // from this extract - confirm against the original file
290  SG_SERROR("Examples with a weird name, i.e., '%s'\n",
292  }
293 }
294 
/* Splits the range [s.start, s.end) at every occurrence of delim and pushes
 * each non-empty piece onto ret as a substring {begin, end}; ret is emptied
 * first. Consecutive, leading and trailing delimiters produce no tokens.
 * NOTE(review): the signature line (listing line 295) is absent from this
 * extract; delim, s and ret are presumably its parameters. s.start is advanced
 * in place, so whether the caller sees that depends on s being passed by
 * value - confirm against the original file. */
296 {
297  ret.erase();
    /* last marks the beginning of the token currently being scanned */
298  char *last = s.start;
299  for (; s.start != s.end; s.start++)
300  {
301  if (*s.start == delim)
302  {
    /* Emit the token only if it is non-empty */
303  if (s.start != last)
304  {
305  substring temp = {last,s.start};
306  ret.push(temp);
307  }
    /* The next token starts right after the delimiter */
308  last = s.start+1;
309  }
310  }
    /* Emit the trailing token, if any, that was not closed by a delimiter */
311  if (s.start != last)
312  {
313  substring final = {last, s.start};
314  ret.push(final);
315  }
316 }

SHOGUN Machine Learning Toolbox - Documentation