00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #include <shogun/classifier/vw/VwParser.h>
00017 #include <shogun/classifier/vw/cache/VwNativeCacheWriter.h>
00018
00019 using namespace shogun;
00020
00021 CVwParser::CVwParser()
00022 : CSGObject()
00023 {
00024 env = new CVwEnvironment();
00025 hasher = CHash::MurmurHashString;
00026 write_cache = false;
00027 cache_writer = NULL;
00028 }
00029
00030 CVwParser::CVwParser(CVwEnvironment* env_to_use)
00031 : CSGObject()
00032 {
00033 ASSERT(env_to_use);
00034
00035 env = env_to_use;
00036 hasher = CHash::MurmurHashString;
00037 write_cache = false;
00038 cache_writer = NULL;
00039 SG_REF(env);
00040 }
00041
00042 CVwParser::~CVwParser()
00043 {
00044 SG_FREE(channels.begin);
00045 channels.begin = channels.end = channels.end_array = NULL;
00046 SG_FREE(words.begin);
00047 words.begin = words.end = words.end_array = NULL;
00048 SG_FREE(name.begin);
00049 name.begin = name.end = name.end_array = NULL;
00050
00051 SG_UNREF(env);
00052 SG_UNREF(cache_writer);
00053 }
00054
00055 int32_t CVwParser::read_features(CIOBuffer* buf, VwExample*& ae)
00056 {
00057 char *line=NULL;
00058 int32_t num_chars = buf->read_line(line);
00059 if (num_chars == 0)
00060 return num_chars;
00061
00062
00063 substring example_string = {line, line + num_chars};
00064
00065
00066 channels.erase();
00067
00068
00069 tokenize('|', example_string, channels);
00070
00071
00072 substring* feature_start = &channels[1];
00073
00074 if (*line == '|')
00075 feature_start = &channels[0];
00076 else
00077 {
00078
00079 substring label_space = channels[0];
00080 char* tab_location = safe_index(label_space.start, '\t', label_space.end);
00081 if (tab_location != label_space.end)
00082 label_space.start = tab_location+1;
00083
00084
00085 tokenize(' ',label_space,words);
00086 if (words.index() > 0 && words.last().end == label_space.end)
00087 {
00088 substring tag = words.pop();
00089 ae->tag.push_many(tag.start, tag.end - tag.start);
00090 }
00091
00092 ae->ld->parse_label(words);
00093 set_minmax(ae->ld->label);
00094 }
00095
00096 vw_size_t mask = env->mask;
00097
00098
00099 for (substring* i = feature_start; i != channels.end; i++)
00100 {
00101 substring channel = *i;
00102
00103 tokenize(' ',channel, words);
00104 if (words.begin == words.end)
00105 continue;
00106
00107
00108 float32_t channel_v = 1.;
00109 vw_size_t channel_hash;
00110
00111
00112 vw_size_t index = 0;
00113 bool new_index = false;
00114 vw_size_t feature_offset = 0;
00115
00116 if (channel.start[0] != ' ')
00117 {
00118
00119 feature_offset++;
00120 feature_value(words[0], name, channel_v);
00121
00122 if (name.index() > 0)
00123 {
00124 index = (unsigned char)(*name[0].start);
00125 if (ae->atomics[index].begin == ae->atomics[index].end)
00126 {
00127 ae->sum_feat_sq[index] = 0;
00128 new_index = true;
00129 }
00130 }
00131 channel_hash = hasher(name[0], hash_base);
00132 }
00133 else
00134 {
00135
00136 index = (unsigned char)' ';
00137 if (ae->atomics[index].begin == ae->atomics[index].end)
00138 {
00139 ae->sum_feat_sq[index] = 0;
00140 new_index = true;
00141 }
00142 channel_hash = 0;
00143 }
00144
00145 for (substring* j = words.begin+feature_offset; j != words.end; j++)
00146 {
00147
00148 float32_t v;
00149 feature_value(*j, name, v);
00150 v *= channel_v;
00151
00152
00153 vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
00154 VwFeature f = {v,word_hash};
00155 ae->sum_feat_sq[index] += v*v;
00156 ae->atomics[index].push(f);
00157 }
00158
00159
00160 if (new_index && ae->atomics[index].begin != ae->atomics[index].end)
00161 ae->indices.push(index);
00162
00163 }
00164
00165 if (write_cache)
00166 cache_writer->cache_example(ae);
00167
00168 return num_chars;
00169 }
00170
00171 int32_t CVwParser::read_svmlight_features(CIOBuffer* buf, VwExample*& ae)
00172 {
00173 char *line=NULL;
00174 int32_t num_chars = buf->read_line(line);
00175 if (num_chars == 0)
00176 return num_chars;
00177
00178
00179 substring example_string = {line, line + num_chars};
00180
00181 vw_size_t mask = env->mask;
00182 tokenize(' ', example_string, words);
00183
00184 ae->ld->label = float_of_substring(words[0]);
00185 ae->ld->weight = 1.;
00186 ae->ld->initial = 0.;
00187 set_minmax(ae->ld->label);
00188
00189 substring* feature_start = &words[1];
00190
00191 vw_size_t index = (unsigned char)' ';
00192 vw_size_t channel_hash = 0;
00193 ae->sum_feat_sq[index] = 0;
00194 ae->indices.push(index);
00195
00196 for (substring* i = feature_start; i != words.end; i++)
00197 {
00198 float32_t v;
00199 feature_value(*i, name, v);
00200
00201 vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
00202 VwFeature f = {v,word_hash};
00203 ae->sum_feat_sq[index] += v*v;
00204 ae->atomics[index].push(f);
00205 }
00206
00207 if (write_cache)
00208 cache_writer->cache_example(ae);
00209
00210 return num_chars;
00211 }
00212
00213 int32_t CVwParser::read_dense_features(CIOBuffer* buf, VwExample*& ae)
00214 {
00215 char *line=NULL;
00216 int32_t num_chars = buf->read_line(line);
00217 if (num_chars == 0)
00218 return num_chars;
00219
00220
00221 substring example_string = {line, line + num_chars};
00222
00223 vw_size_t mask = env->mask;
00224 tokenize(' ', example_string, words);
00225
00226 ae->ld->label = float_of_substring(words[0]);
00227 ae->ld->weight = 1.;
00228 ae->ld->initial = 0.;
00229 set_minmax(ae->ld->label);
00230
00231 substring* feature_start = &words[1];
00232
00233 vw_size_t index = (unsigned char)' ';
00234
00235 ae->sum_feat_sq[index] = 0;
00236 ae->indices.push(index);
00237
00238 int32_t j=0;
00239 for (substring* i = feature_start; i != words.end; i++)
00240 {
00241 float32_t v = float_of_substring(*i);
00242 vw_size_t word_hash = j & mask;
00243 VwFeature f = {v,word_hash};
00244 ae->sum_feat_sq[index] += v*v;
00245 ae->atomics[index].push(f);
00246 j++;
00247 }
00248
00249 if (write_cache)
00250 cache_writer->cache_example(ae);
00251
00252 return num_chars;
00253 }
00254
00255 void CVwParser::init_cache(char * fname, EVwCacheType type)
00256 {
00257 char* file_name = fname;
00258 char default_cache_name[] = "vw_cache.dat.cache";
00259
00260 if (!fname)
00261 file_name = default_cache_name;
00262
00263 write_cache = true;
00264 cache_type = type;
00265
00266 switch (type)
00267 {
00268 case C_NATIVE:
00269 cache_writer = new CVwNativeCacheWriter(file_name, env);
00270 return;
00271 case C_PROTOBUF:
00272 SG_ERROR("Protocol buffers cache support is not implemented yet.\n");
00273 }
00274
00275 SG_ERROR("Unexpected cache type specified!\n");
00276 }
00277
00278 void CVwParser::feature_value(substring &s, v_array<substring>& feat_name, float32_t &v)
00279 {
00280
00281 tokenize(':', s, feat_name);
00282
00283 switch (feat_name.index())
00284 {
00285
00286 case 0:
00287 case 1:
00288 v = 1.;
00289 break;
00290 case 2:
00291 v = float_of_substring(feat_name[1]);
00292 if (isnan(v))
00293 SG_SERROR("error NaN value for feature %s! Terminating!\n",
00294 c_string_of_substring(feat_name[0]));
00295 break;
00296 default:
00297 SG_SERROR("Examples with a weird name, i.e., '%s'\n",
00298 c_string_of_substring(s));
00299 }
00300 }
00301
00302 void CVwParser::tokenize(char delim, substring s, v_array<substring>& ret)
00303 {
00304 ret.erase();
00305 char *last = s.start;
00306 for (; s.start != s.end; s.start++)
00307 {
00308 if (*s.start == delim)
00309 {
00310 if (s.start != last)
00311 {
00312 substring temp = {last,s.start};
00313 ret.push(temp);
00314 }
00315 last = s.start+1;
00316 }
00317 }
00318 if (s.start != last)
00319 {
00320 substring final = {last, s.start};
00321 ret.push(final);
00322 }
00323 }