SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
VwParser.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights
3  * embodied in the content of this file are licensed under the BSD
4  * (revised) open source license.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Written (W) 2011 Shashwat Lal Das
12  * Adaptation of Vowpal Wabbit v5.1.
13  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
14  */
15 
18 
19 using namespace shogun;
20 
/* Default constructor (the signature on listing line 21 is not present in
 * this listing). Creates a new CVwEnvironment, takes a reference on it, and
 * starts with cache writing disabled and no cache writer. */
22  : CSGObject()
23 {
24  env = new CVwEnvironment();
25  SG_REF(env);
/* NOTE(review): listing line 26 is absent here, so one initialisation
 * statement of this constructor is not visible. */
27  write_cache = false;
28  cache_writer = NULL;
29 }
30 
/* Constructor that adopts an existing environment (the signature on listing
 * line 31 is not present in this listing; the body uses a parameter named
 * env_to_use). Asserts that env_to_use is non-NULL, takes a reference on it,
 * and starts with cache writing disabled and no cache writer. */
32  : CSGObject()
33 {
34  ASSERT(env_to_use)
35 
/* NOTE(review): no assignment to `env` is visible in this constructor before
 * this comparison and the SG_UNREF(env) below - confirm `env` is initialised
 * elsewhere (e.g. in the class definition); otherwise both read an
 * indeterminate pointer. */
36  if(env_to_use!=env)
37  {
38  SG_REF(env_to_use);
39  SG_UNREF(env);
40  env = env_to_use;
41  }
42 
/* NOTE(review): listing line 43 is absent here, so one initialisation
 * statement of this constructor is not visible. */
44  write_cache = false;
45  cache_writer = NULL;
46 }
47 
/* Destructor (the signature on listing line 48 is not present in this
 * listing). Drops the parser's reference on the environment. */
49 {
50  SG_UNREF(env);
/* NOTE(review): listing line 51 is absent here, so any further cleanup done
 * by the destructor is not visible. */
52 }
53 
/* Body of read_features (the signature on listing line 54 is not present in
 * this listing; the body uses `buf` and `ae`).
 *
 * Reads one line from buf and parses it as a '|'-separated example:
 * an optional leading label section followed by one channel per namespace.
 * Parsed features are pushed into ae->atomics[index], their squared values
 * are accumulated in ae->sum_feat_sq[index], and namespace indices that
 * received their first features here are pushed onto ae->indices.
 *
 * Returns the number of characters reported by read_line; when that is 0
 * nothing is parsed. */
55 {
56  char *line=NULL;
57  int32_t num_chars = buf->read_line(line);
58  if (num_chars == 0)
59  return num_chars;
60 
61  /* Mark begin and end of example in the buffer */
62  substring example_string = {line, line + num_chars};
63 
64  /* Channels containing separate namespaces/label information */
65  channels.erase();
66 
67  /* Split at '|' character */
68  tokenize('|', example_string, channels);
69 
70  /* If first char is not '|', then the first channel contains label data */
71  substring* feature_start = &channels[1];
72 
73  if (*line == '|')
74  feature_start = &channels[0]; /* Unlabelled data */
75  else
76  {
77  /* First channel has label info */
78  substring label_space = channels[0];
/* If a tab is found inside the label section, only the text after it is
 * parsed. Presumably safe_index returns its `max` argument when the
 * character is not found - confirm against VwParser.h. */
79  char* tab_location = safe_index(label_space.start, '\t', label_space.end);
80  if (tab_location != label_space.end)
81  label_space.start = tab_location+1;
82 
83  /* Split the label space on spaces */
84  tokenize(' ',label_space,words);
85  if (words.index() > 0 && words.last().end == label_space.end) //The last field is a tag, so record and strip it off
86  {
87  substring tag = words.pop();
88  ae->tag.push_many(tag.start, tag.end - tag.start);
89  }
90 
91  ae->ld->label_from_substring(words);
92  set_minmax(ae->ld->label);
93  }
94 
95  vw_size_t mask = env->mask;
96 
97  /* Now parse the individual channels, i.e., namespaces */
98  for (substring* i = feature_start; i != channels.end; i++)
99  {
100  substring channel = *i;
101 
102  tokenize(' ',channel, words);
/* A channel with no words contributes nothing. */
103  if (words.begin == words.end)
104  continue;
105 
106  /* Set default scale value for channel */
107  float32_t channel_v = 1.;
108  vw_size_t channel_hash;
109 
110  /* Index by which to refer to the namespace */
111  vw_size_t index = 0;
112  bool new_index = false;
/* Number of leading words to skip when reading features: becomes 1 when
 * the first word is the namespace name. */
113  vw_size_t feature_offset = 0;
114 
115  if (channel.start[0] != ' ')
116  {
117  /* Nonanonymous namespace specified */
118  feature_offset++;
/* The first word is the namespace name, optionally with ":scale";
 * the scale ends up in channel_v. */
119  feature_value(words[0], name, channel_v);
120 
121  if (name.index() > 0)
122  {
/* The namespace index is the first character of its name. */
123  index = (unsigned char)(*name[0].start);
124  if (ae->atomics[index].begin == ae->atomics[index].end)
125  {
126  ae->sum_feat_sq[index] = 0;
127  new_index = true;
128  }
129  }
/* NOTE(review): name[0] is read here even when name.index() == 0
 * (the check above guards only the index computation) - confirm that
 * case cannot occur. */
130  channel_hash = hasher(name[0], hash_base);
131  }
132  else
133  {
134  /* Use default namespace with index below */
135  index = (unsigned char)' ';
136  if (ae->atomics[index].begin == ae->atomics[index].end)
137  {
138  ae->sum_feat_sq[index] = 0;
139  new_index = true;
140  }
141  channel_hash = 0;
142  }
143 
144  for (substring* j = words.begin+feature_offset; j != words.end; j++)
145  {
146  /* Get individual features and multiply by scale value */
147  float32_t v = 0.0;
148  feature_value(*j, name, v);
149  v *= channel_v;
150 
151  /* Hash feature */
/* NOTE(review): name[0] is used without checking that feature_value
 * produced at least one token - confirm. */
152  vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
153  VwFeature f = {v,word_hash};
154  ae->sum_feat_sq[index] += v*v;
155  ae->atomics[index].push(f);
156  }
157 
158  /* Add index to list of indices if required */
159  if (new_index && ae->atomics[index].begin != ae->atomics[index].end)
160  ae->indices.push(index);
161 
162  }
163 
/* NOTE(review): listing line 165 is absent here, so the statement guarded by
 * this `if` is not visible. */
164  if (write_cache)
166 
167  return num_chars;
168 }
169 
/* Body of read_svmlight_features (the signature on listing line 170 is not
 * present in this listing; the body uses `buf` and `ae`).
 *
 * Reads one line from buf and splits it on spaces. The first word is parsed
 * as the label; every following word is parsed with feature_value, hashed
 * and pushed into the namespace indexed by ' '. The label weight is set to 1
 * and the initial value to 0.
 *
 * Returns the number of characters reported by read_line; when that is 0
 * nothing is parsed. */
171 {
172  char *line=NULL;
173  int32_t num_chars = buf->read_line(line);
174  if (num_chars == 0)
175  return num_chars;
176 
177  /* Mark begin and end of example in the buffer */
178  substring example_string = {line, line + num_chars};
179 
180  vw_size_t mask = env->mask;
181  tokenize(' ', example_string, words);
182 
/* NOTE(review): words[0] and &words[1] below are used without checking that
 * tokenize produced any words (e.g. a line consisting only of spaces) -
 * confirm this cannot occur. */
183  ae->ld->label = SGIO::float_of_substring(words[0]);
184  ae->ld->weight = 1.;
185  ae->ld->initial = 0.;
186  set_minmax(ae->ld->label);
187 
188  substring* feature_start = &words[1];
189 
190  vw_size_t index = (unsigned char)' '; // Any default namespace is ok
191  vw_size_t channel_hash = 0;
/* The single namespace is reset and registered unconditionally, even when
 * no features follow the label. */
192  ae->sum_feat_sq[index] = 0;
193  ae->indices.push(index);
194  /* Now parse the individual features */
195  for (substring* i = feature_start; i != words.end; i++)
196  {
197  float32_t v;
198  feature_value(*i, name, v);
199 
200  vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
201  VwFeature f = {v,word_hash};
202  ae->sum_feat_sq[index] += v*v;
203  ae->atomics[index].push(f);
204  }
205 
/* NOTE(review): listing line 207 is absent here, so the statement guarded by
 * this `if` is not visible. */
206  if (write_cache)
208 
209  return num_chars;
210 }
211 
/* Body of read_dense_features (the signature on listing line 212 is not
 * present in this listing; the body uses `buf` and `ae`).
 *
 * Reads one line from buf and splits it on spaces. The first word is parsed
 * as the label; each following word becomes one feature in the namespace
 * indexed by ' '. Features are not hashed by name: the feature's position
 * (0, 1, 2, ...) masked with env->mask is used as its hash.
 *
 * Returns the number of characters reported by read_line; when that is 0
 * nothing is parsed. */
213 {
214  char *line=NULL;
215  int32_t num_chars = buf->read_line(line);
216  if (num_chars == 0)
217  return num_chars;
218 
219  // Mark begin and end of example in the buffer
220  substring example_string = {line, line + num_chars};
221 
222  vw_size_t mask = env->mask;
223  tokenize(' ', example_string, words);
224 
/* NOTE(review): words[0] and &words[1] below are used without checking that
 * tokenize produced any words - confirm this cannot occur. */
225  ae->ld->label = SGIO::float_of_substring(words[0]);
226  ae->ld->weight = 1.;
227  ae->ld->initial = 0.;
228  set_minmax(ae->ld->label);
229 
230  substring* feature_start = &words[1];
231 
232  vw_size_t index = (unsigned char)' ';
233 
234  ae->sum_feat_sq[index] = 0;
235  ae->indices.push(index);
236  // Now parse individual features
/* j counts the features seen so far on this line. */
237  int32_t j=0;
238  for (substring* i = feature_start; i != words.end; i++)
239  {
/* NOTE(review): listing line 240 is absent here; the definition of `v` used
 * below is not visible. */
241  vw_size_t word_hash = j & mask;
242  VwFeature f = {v,word_hash};
243  ae->sum_feat_sq[index] += v*v;
244  ae->atomics[index].push(f);
245  j++;
246  }
247 
/* NOTE(review): listing line 249 is absent here, so the statement guarded by
 * this `if` is not visible. */
248  if (write_cache)
250 
251  return num_chars;
252 }
253 
254 void CVwParser::init_cache(char * fname, EVwCacheType type)
255 {
256  char* file_name = fname;
257  char default_cache_name[] = "vw_cache.dat.cache";
258 
259  if (!fname)
260  file_name = default_cache_name;
261 
262  write_cache = true;
263  cache_type = type;
264 
265  switch (type)
266  {
267  case C_NATIVE:
268  cache_writer = new CVwNativeCacheWriter(file_name, env);
269  return;
270  case C_PROTOBUF:
271  SG_ERROR("Protocol buffers cache support is not implemented yet.\n")
272  }
273 
274  SG_ERROR("Unexpected cache type specified!\n")
275 }
276 
/* Body of feature_value (the signature on listing line 277 is not present in
 * this listing; the body uses `s`, `feat_name` and `v`).
 *
 * Splits s on ':' into feat_name and sets v from the result:
 *  - zero or one piece: no explicit value, v is set to 1.0;
 *  - two pieces: v is parsed from the second piece, and a NaN result is
 *    reported through SG_SERROR;
 *  - more than two pieces: reported through SG_SERROR. */
278 {
279  // Get the value of the feature in the substring
280  tokenize(':', s, feat_name);
281 
282  switch (feat_name.index())
283  {
284  // If feature value is not specified, assume 1.0
285  case 0:
286  case 1:
287  v = 1.;
288  break;
289  case 2:
290  v = SGIO::float_of_substring(feat_name[1]);
291  if (CMath::is_nan(v))
292  SG_SERROR("error NaN value for feature %s! Terminating!\n",
293  SGIO::c_string_of_substring(feat_name[0]));
294  break;
295  default:
/* NOTE(review): listing line 297 is absent here, so the argument and closing
 * of this SG_SERROR call are not visible. */
296  SG_SERROR("Examples with a weird name, i.e., '%s'\n",
298  }
299 }
300 
/* Body of tokenize (the signature on listing line 301 is not present in this
 * listing; the body uses `delim`, `s` and `ret`).
 *
 * Clears ret, then scans s from start to end and pushes one substring per
 * maximal run of characters that contains no delim. Empty pieces (leading,
 * trailing or consecutive delimiters) are never pushed. The pushed
 * substrings point into the scanned buffer; nothing is copied. */
302 {
303  ret.erase();
/* `last` marks where the piece currently being scanned begins. */
304  char *last = s.start;
305  for (; s.start != s.end; s.start++)
306  {
307  if (*s.start == delim)
308  {
/* Only emit a piece if at least one character precedes the delimiter. */
309  if (s.start != last)
310  {
311  substring temp = {last,s.start};
312  ret.push(temp);
313  }
314  last = s.start+1;
315  }
316  }
/* Emit the trailing piece after the final delimiter, if it is non-empty. */
317  if (s.start != last)
318  {
319  substring final = {last, s.start};
320  ret.push(final);
321  }
322 }
An I/O buffer class.
Definition: IOBuffer.h:41
uint32_t vw_size_t
vw_size_t typedef to work across platforms
Definition: vw_constants.h:26
ssize_t read_line(char *&pointer)
Definition: IOBuffer.h:153
void feature_value(substring &s, v_array< substring > &name, float32_t &v)
Definition: VwParser.cpp:277
char * safe_index(char *start, char v, char *max)
Definition: VwParser.h:246
const uint32_t hash_base
Seed for hash.
Definition: vw_constants.h:35
virtual ~CVwParser()
Definition: VwParser.cpp:48
void push_many(const T *new_elem, size_t num)
Definition: v_array.h:183
char * end
Definition: SGIO.h:234
Class CVwEnvironment is the environment used by VW.
Definition: VwEnvironment.h:41
int32_t read_features(CIOBuffer *buf, VwExample *&ex)
Definition: VwParser.cpp:54
CVwEnvironment * env
Environment of VW - used by parser.
Definition: VwParser.h:259
Class v_array taken directly from JL's implementation.
void set_minmax(float64_t label)
Definition: VwParser.h:165
CVwCacheWriter * cache_writer
Object which will be used for writing cache.
Definition: VwParser.h:261
#define SG_ERROR(...)
Definition: SGIO.h:129
char * start
Definition: SGIO.h:232
int32_t read_dense_features(CIOBuffer *buf, VwExample *&ae)
Definition: VwParser.cpp:212
float64_t sum_feat_sq[256]
Sum of square of features.
Definition: vw_example.h:104
struct Substring, specified by start position and end position.
Definition: SGIO.h:229
#define SG_REF(x)
Definition: SGObject.h:54
void tokenize(char delim, substring s, v_array< substring > &ret)
Definition: VwParser.cpp:301
void push(const T &new_elem)
Definition: v_array.h:168
bool write_cache
Whether to write cache or not.
Definition: VwParser.h:265
float32_t label
Label value.
Definition: vw_label.h:92
v_array< vw_size_t > indices
Array of namespaces.
Definition: vw_example.h:84
float32_t weight
Weight of example.
Definition: vw_label.h:94
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
void erase()
Definition: v_array.h:113
static char * c_string_of_substring(substring s)
Definition: SGIO.cpp:321
int32_t read_svmlight_features(CIOBuffer *buf, VwExample *&ae)
Definition: VwParser.cpp:170
void label_from_substring(v_array< substring > &words)
Definition: vw_label.cpp:19
static float32_t float_of_substring(substring s)
Definition: SGIO.cpp:336
v_array< char > tag
Tag.
Definition: vw_example.h:82
Example class for VW.
Definition: vw_example.h:58
vw_size_t mask
Mask used for hashing.
EVwCacheType cache_type
Type of cache.
Definition: VwParser.h:263
float float32_t
Definition: common.h:49
float32_t initial
Initial approximation.
Definition: vw_label.h:96
One feature in VW.
Definition: vw_example.h:34
virtual void cache_example(VwExample *&ex)=0
#define SG_UNREF(x)
Definition: SGObject.h:55
static uint32_t MurmurHashString(substring s, uint32_t h)
Definition: Hash.cpp:381
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
static int is_nan(double f)
checks whether a float is nan
Definition: Math.cpp:234
VwLabel * ld
Label object.
Definition: vw_example.h:79
#define SG_SERROR(...)
Definition: SGIO.h:179
void init_cache(char *fname, EVwCacheType type=C_NATIVE)
Definition: VwParser.cpp:254
hash_func_t hasher
Hash function to use, of type hash_func_t.
Definition: VwParser.h:255
Class CVwNativeCacheWriter writes a cache exactly as that which would be produced by VW's default cac...
v_array< VwFeature > atomics[256]
Array of features.
Definition: vw_example.h:86

SHOGUN Machine Learning Toolbox - Documentation