00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #include <shogun/classifier/vw/VwParser.h>
00017 #include <shogun/classifier/vw/cache/VwNativeCacheWriter.h>
00018
00019 using namespace shogun;
00020
00021 CVwParser::CVwParser()
00022 : CSGObject()
00023 {
00024 env = new CVwEnvironment();
00025 hasher = CHash::MurmurHashString;
00026 write_cache = false;
00027 cache_writer = NULL;
00028 }
00029
00030 CVwParser::CVwParser(CVwEnvironment* env_to_use)
00031 : CSGObject()
00032 {
00033 ASSERT(env_to_use);
00034
00035 env = env_to_use;
00036 hasher = CHash::MurmurHashString;
00037 write_cache = false;
00038 cache_writer = NULL;
00039 SG_REF(env);
00040 }
00041
00042 CVwParser::~CVwParser()
00043 {
00044 SG_UNREF(env);
00045 SG_UNREF(cache_writer);
00046 }
00047
00048 int32_t CVwParser::read_features(CIOBuffer* buf, VwExample*& ae)
00049 {
00050 char *line=NULL;
00051 int32_t num_chars = buf->read_line(line);
00052 if (num_chars == 0)
00053 return num_chars;
00054
00055
00056 substring example_string = {line, line + num_chars};
00057
00058
00059 channels.erase();
00060
00061
00062 tokenize('|', example_string, channels);
00063
00064
00065 substring* feature_start = &channels[1];
00066
00067 if (*line == '|')
00068 feature_start = &channels[0];
00069 else
00070 {
00071
00072 substring label_space = channels[0];
00073 char* tab_location = safe_index(label_space.start, '\t', label_space.end);
00074 if (tab_location != label_space.end)
00075 label_space.start = tab_location+1;
00076
00077
00078 tokenize(' ',label_space,words);
00079 if (words.index() > 0 && words.last().end == label_space.end)
00080 {
00081 substring tag = words.pop();
00082 ae->tag.push_many(tag.start, tag.end - tag.start);
00083 }
00084
00085 ae->ld->label_from_substring(words);
00086 set_minmax(ae->ld->label);
00087 }
00088
00089 vw_size_t mask = env->mask;
00090
00091
00092 for (substring* i = feature_start; i != channels.end; i++)
00093 {
00094 substring channel = *i;
00095
00096 tokenize(' ',channel, words);
00097 if (words.begin == words.end)
00098 continue;
00099
00100
00101 float32_t channel_v = 1.;
00102 vw_size_t channel_hash;
00103
00104
00105 vw_size_t index = 0;
00106 bool new_index = false;
00107 vw_size_t feature_offset = 0;
00108
00109 if (channel.start[0] != ' ')
00110 {
00111
00112 feature_offset++;
00113 feature_value(words[0], name, channel_v);
00114
00115 if (name.index() > 0)
00116 {
00117 index = (unsigned char)(*name[0].start);
00118 if (ae->atomics[index].begin == ae->atomics[index].end)
00119 {
00120 ae->sum_feat_sq[index] = 0;
00121 new_index = true;
00122 }
00123 }
00124 channel_hash = hasher(name[0], hash_base);
00125 }
00126 else
00127 {
00128
00129 index = (unsigned char)' ';
00130 if (ae->atomics[index].begin == ae->atomics[index].end)
00131 {
00132 ae->sum_feat_sq[index] = 0;
00133 new_index = true;
00134 }
00135 channel_hash = 0;
00136 }
00137
00138 for (substring* j = words.begin+feature_offset; j != words.end; j++)
00139 {
00140
00141 float32_t v = 0.0;
00142 feature_value(*j, name, v);
00143 v *= channel_v;
00144
00145
00146 vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
00147 VwFeature f = {v,word_hash};
00148 ae->sum_feat_sq[index] += v*v;
00149 ae->atomics[index].push(f);
00150 }
00151
00152
00153 if (new_index && ae->atomics[index].begin != ae->atomics[index].end)
00154 ae->indices.push(index);
00155
00156 }
00157
00158 if (write_cache)
00159 cache_writer->cache_example(ae);
00160
00161 return num_chars;
00162 }
00163
00164 int32_t CVwParser::read_svmlight_features(CIOBuffer* buf, VwExample*& ae)
00165 {
00166 char *line=NULL;
00167 int32_t num_chars = buf->read_line(line);
00168 if (num_chars == 0)
00169 return num_chars;
00170
00171
00172 substring example_string = {line, line + num_chars};
00173
00174 vw_size_t mask = env->mask;
00175 tokenize(' ', example_string, words);
00176
00177 ae->ld->label = float_of_substring(words[0]);
00178 ae->ld->weight = 1.;
00179 ae->ld->initial = 0.;
00180 set_minmax(ae->ld->label);
00181
00182 substring* feature_start = &words[1];
00183
00184 vw_size_t index = (unsigned char)' ';
00185 vw_size_t channel_hash = 0;
00186 ae->sum_feat_sq[index] = 0;
00187 ae->indices.push(index);
00188
00189 for (substring* i = feature_start; i != words.end; i++)
00190 {
00191 float32_t v;
00192 feature_value(*i, name, v);
00193
00194 vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
00195 VwFeature f = {v,word_hash};
00196 ae->sum_feat_sq[index] += v*v;
00197 ae->atomics[index].push(f);
00198 }
00199
00200 if (write_cache)
00201 cache_writer->cache_example(ae);
00202
00203 return num_chars;
00204 }
00205
00206 int32_t CVwParser::read_dense_features(CIOBuffer* buf, VwExample*& ae)
00207 {
00208 char *line=NULL;
00209 int32_t num_chars = buf->read_line(line);
00210 if (num_chars == 0)
00211 return num_chars;
00212
00213
00214 substring example_string = {line, line + num_chars};
00215
00216 vw_size_t mask = env->mask;
00217 tokenize(' ', example_string, words);
00218
00219 ae->ld->label = float_of_substring(words[0]);
00220 ae->ld->weight = 1.;
00221 ae->ld->initial = 0.;
00222 set_minmax(ae->ld->label);
00223
00224 substring* feature_start = &words[1];
00225
00226 vw_size_t index = (unsigned char)' ';
00227
00228 ae->sum_feat_sq[index] = 0;
00229 ae->indices.push(index);
00230
00231 int32_t j=0;
00232 for (substring* i = feature_start; i != words.end; i++)
00233 {
00234 float32_t v = float_of_substring(*i);
00235 vw_size_t word_hash = j & mask;
00236 VwFeature f = {v,word_hash};
00237 ae->sum_feat_sq[index] += v*v;
00238 ae->atomics[index].push(f);
00239 j++;
00240 }
00241
00242 if (write_cache)
00243 cache_writer->cache_example(ae);
00244
00245 return num_chars;
00246 }
00247
00248 void CVwParser::init_cache(char * fname, EVwCacheType type)
00249 {
00250 char* file_name = fname;
00251 char default_cache_name[] = "vw_cache.dat.cache";
00252
00253 if (!fname)
00254 file_name = default_cache_name;
00255
00256 write_cache = true;
00257 cache_type = type;
00258
00259 switch (type)
00260 {
00261 case C_NATIVE:
00262 cache_writer = new CVwNativeCacheWriter(file_name, env);
00263 return;
00264 case C_PROTOBUF:
00265 SG_ERROR("Protocol buffers cache support is not implemented yet.\n");
00266 }
00267
00268 SG_ERROR("Unexpected cache type specified!\n");
00269 }
00270
00271 void CVwParser::feature_value(substring &s, v_array<substring>& feat_name, float32_t &v)
00272 {
00273
00274 tokenize(':', s, feat_name);
00275
00276 switch (feat_name.index())
00277 {
00278
00279 case 0:
00280 case 1:
00281 v = 1.;
00282 break;
00283 case 2:
00284 v = float_of_substring(feat_name[1]);
00285 if (isnan(v))
00286 SG_SERROR("error NaN value for feature %s! Terminating!\n",
00287 c_string_of_substring(feat_name[0]));
00288 break;
00289 default:
00290 SG_SERROR("Examples with a weird name, i.e., '%s'\n",
00291 c_string_of_substring(s));
00292 }
00293 }
00294
00295 void CVwParser::tokenize(char delim, substring s, v_array<substring>& ret)
00296 {
00297 ret.erase();
00298 char *last = s.start;
00299 for (; s.start != s.end; s.start++)
00300 {
00301 if (*s.start == delim)
00302 {
00303 if (s.start != last)
00304 {
00305 substring temp = {last,s.start};
00306 ret.push(temp);
00307 }
00308 last = s.start+1;
00309 }
00310 }
00311 if (s.start != last)
00312 {
00313 substring final = {last, s.start};
00314 ret.push(final);
00315 }
00316 }