SHOGUN: VwParser.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2009 Yahoo! Inc.  All rights reserved.  The copyrights
00003  * embodied in the content of this file are licensed under the BSD
00004  * (revised) open source license.
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 3 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * Written (W) 2011 Shashwat Lal Das
00012  * Adaptation of Vowpal Wabbit v5.1.
00013  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
00014  */
00015 
00016 #include <shogun/classifier/vw/VwParser.h>
00017 #include <shogun/classifier/vw/cache/VwNativeCacheWriter.h>
00018 
00019 using namespace shogun;
00020 
00021 CVwParser::CVwParser()
00022     : CSGObject()
00023 {
00024     env = new CVwEnvironment();
00025     hasher = CHash::MurmurHashString;
00026     write_cache = false;
00027     cache_writer = NULL;
00028 }
00029 
00030 CVwParser::CVwParser(CVwEnvironment* env_to_use)
00031     : CSGObject()
00032 {
00033     ASSERT(env_to_use);
00034 
00035     env = env_to_use;
00036     hasher = CHash::MurmurHashString;
00037     write_cache = false;
00038     cache_writer = NULL;
00039     SG_REF(env);
00040 }
00041 
00042 CVwParser::~CVwParser()
00043 {
00044     SG_FREE(channels.begin);
00045     channels.begin = channels.end = channels.end_array = NULL;
00046     SG_FREE(words.begin);
00047     words.begin = words.end = words.end_array = NULL;
00048     SG_FREE(name.begin);
00049     name.begin = name.end = name.end_array = NULL;
00050 
00051     SG_UNREF(env);
00052     SG_UNREF(cache_writer);
00053 }
00054 
00055 int32_t CVwParser::read_features(CIOBuffer* buf, VwExample*& ae)
00056 {
00057     char *line=NULL;
00058     int32_t num_chars = buf->read_line(line);
00059     if (num_chars == 0)
00060         return num_chars;
00061 
00062     /* Mark begin and end of example in the buffer */
00063     substring example_string = {line, line + num_chars};
00064 
00065     /* Channels containing separate namespaces/label information*/
00066     channels.erase();
00067 
00068     /* Split at '|' character */
00069     tokenize('|', example_string, channels);
00070 
00071     /* If first char is not '|', then the first channel contains label data */
00072     substring* feature_start = &channels[1];
00073 
00074     if (*line == '|')
00075         feature_start = &channels[0]; /* Unlabelled data */
00076     else
00077     {
00078         /* First channel has label info */
00079         substring label_space = channels[0];
00080         char* tab_location = safe_index(label_space.start, '\t', label_space.end);
00081         if (tab_location != label_space.end)
00082             label_space.start = tab_location+1;
00083 
00084         /* Split the label space on spaces */
00085         tokenize(' ',label_space,words);
00086         if (words.index() > 0 && words.last().end == label_space.end) //The last field is a tag, so record and strip it off
00087         {
00088             substring tag = words.pop();
00089             ae->tag.push_many(tag.start, tag.end - tag.start);
00090         }
00091 
00092         ae->ld->parse_label(words);
00093         set_minmax(ae->ld->label);
00094     }
00095 
00096     vw_size_t mask = env->mask;
00097 
00098     /* Now parse the individual channels, i.e., namespaces */
00099     for (substring* i = feature_start; i != channels.end; i++)
00100     {
00101         substring channel = *i;
00102 
00103         tokenize(' ',channel, words);
00104         if (words.begin == words.end)
00105             continue;
00106 
00107         /* Set default scale value for channel */
00108         float32_t channel_v = 1.;
00109         vw_size_t channel_hash;
00110 
00111         /* Index by which to refer to the namespace */
00112         vw_size_t index = 0;
00113         bool new_index = false;
00114         vw_size_t feature_offset = 0;
00115 
00116         if (channel.start[0] != ' ')
00117         {
00118             /* Nonanonymous namespace specified */
00119             feature_offset++;
00120             feature_value(words[0], name, channel_v);
00121 
00122             if (name.index() > 0)
00123             {
00124                 index = (unsigned char)(*name[0].start);
00125                 if (ae->atomics[index].begin == ae->atomics[index].end)
00126                 {
00127                     ae->sum_feat_sq[index] = 0;
00128                     new_index = true;
00129                 }
00130             }
00131             channel_hash = hasher(name[0], hash_base);
00132         }
00133         else
00134         {
00135             /* Use default namespace with index below */
00136             index = (unsigned char)' ';
00137             if (ae->atomics[index].begin == ae->atomics[index].end)
00138             {
00139                 ae->sum_feat_sq[index] = 0;
00140                 new_index = true;
00141             }
00142             channel_hash = 0;
00143         }
00144 
00145         for (substring* j = words.begin+feature_offset; j != words.end; j++)
00146         {
00147             /* Get individual features and multiply by scale value */
00148             float32_t v;
00149             feature_value(*j, name, v);
00150             v *= channel_v;
00151 
00152             /* Hash feature */
00153             vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
00154             VwFeature f = {v,word_hash};
00155             ae->sum_feat_sq[index] += v*v;
00156             ae->atomics[index].push(f);
00157         }
00158 
00159         /* Add index to list of indices if required */
00160         if (new_index && ae->atomics[index].begin != ae->atomics[index].end)
00161             ae->indices.push(index);
00162 
00163     }
00164 
00165     if (write_cache)
00166         cache_writer->cache_example(ae);
00167 
00168     return num_chars;
00169 }
00170 
00171 int32_t CVwParser::read_svmlight_features(CIOBuffer* buf, VwExample*& ae)
00172 {
00173     char *line=NULL;
00174     int32_t num_chars = buf->read_line(line);
00175     if (num_chars == 0)
00176         return num_chars;
00177 
00178     /* Mark begin and end of example in the buffer */
00179     substring example_string = {line, line + num_chars};
00180 
00181     vw_size_t mask = env->mask;
00182     tokenize(' ', example_string, words);
00183 
00184     ae->ld->label = float_of_substring(words[0]);
00185     ae->ld->weight = 1.;
00186     ae->ld->initial = 0.;
00187     set_minmax(ae->ld->label);
00188 
00189     substring* feature_start = &words[1];
00190 
00191     vw_size_t index = (unsigned char)' ';   // Any default namespace is ok
00192     vw_size_t channel_hash = 0;
00193     ae->sum_feat_sq[index] = 0;
00194     ae->indices.push(index);
00195     /* Now parse the individual features */
00196     for (substring* i = feature_start; i != words.end; i++)
00197     {
00198         float32_t v;
00199         feature_value(*i, name, v);
00200 
00201         vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
00202         VwFeature f = {v,word_hash};
00203         ae->sum_feat_sq[index] += v*v;
00204         ae->atomics[index].push(f);
00205     }
00206 
00207     if (write_cache)
00208         cache_writer->cache_example(ae);
00209 
00210     return num_chars;
00211 }
00212 
00213 int32_t CVwParser::read_dense_features(CIOBuffer* buf, VwExample*& ae)
00214 {
00215     char *line=NULL;
00216     int32_t num_chars = buf->read_line(line);
00217     if (num_chars == 0)
00218         return num_chars;
00219 
00220     // Mark begin and end of example in the buffer
00221     substring example_string = {line, line + num_chars};
00222 
00223     vw_size_t mask = env->mask;
00224     tokenize(' ', example_string, words);
00225 
00226     ae->ld->label = float_of_substring(words[0]);
00227     ae->ld->weight = 1.;
00228     ae->ld->initial = 0.;
00229     set_minmax(ae->ld->label);
00230 
00231     substring* feature_start = &words[1];
00232 
00233     vw_size_t index = (unsigned char)' ';
00234 
00235     ae->sum_feat_sq[index] = 0;
00236     ae->indices.push(index);
00237     // Now parse individual features
00238     int32_t j=0;
00239     for (substring* i = feature_start; i != words.end; i++)
00240     {
00241         float32_t v = float_of_substring(*i);
00242         vw_size_t word_hash = j & mask;
00243         VwFeature f = {v,word_hash};
00244         ae->sum_feat_sq[index] += v*v;
00245         ae->atomics[index].push(f);
00246         j++;
00247     }
00248 
00249     if (write_cache)
00250         cache_writer->cache_example(ae);
00251 
00252     return num_chars;
00253 }
00254 
00255 void CVwParser::init_cache(char * fname, EVwCacheType type)
00256 {
00257     char* file_name = fname;
00258     char default_cache_name[] = "vw_cache.dat.cache";
00259 
00260     if (!fname)
00261         file_name = default_cache_name;
00262 
00263     write_cache = true;
00264     cache_type = type;
00265 
00266     switch (type)
00267     {
00268     case C_NATIVE:
00269         cache_writer = new CVwNativeCacheWriter(file_name, env);
00270         return;
00271     case C_PROTOBUF:
00272         SG_ERROR("Protocol buffers cache support is not implemented yet.\n");
00273     }
00274 
00275     SG_ERROR("Unexpected cache type specified!\n");
00276 }
00277 
00278 void CVwParser::feature_value(substring &s, v_array<substring>& feat_name, float32_t &v)
00279 {
00280     // Get the value of the feature in the substring
00281     tokenize(':', s, feat_name);
00282 
00283     switch (feat_name.index())
00284     {
00285     // If feature value is not specified, assume 1.0
00286     case 0:
00287     case 1:
00288         v = 1.;
00289         break;
00290     case 2:
00291         v = float_of_substring(feat_name[1]);
00292         if (isnan(v))
00293             SG_SERROR("error NaN value for feature %s! Terminating!\n",
00294                   c_string_of_substring(feat_name[0]));
00295         break;
00296     default:
00297         SG_SERROR("Examples with a weird name, i.e., '%s'\n",
00298               c_string_of_substring(s));
00299     }
00300 }
00301 
00302 void CVwParser::tokenize(char delim, substring s, v_array<substring>& ret)
00303 {
00304     ret.erase();
00305     char *last = s.start;
00306     for (; s.start != s.end; s.start++)
00307     {
00308         if (*s.start == delim)
00309         {
00310             if (s.start != last)
00311             {
00312                 substring temp = {last,s.start};
00313                 ret.push(temp);
00314             }
00315             last = s.start+1;
00316         }
00317     }
00318     if (s.start != last)
00319     {
00320         substring final = {last, s.start};
00321         ret.push(final);
00322     }
00323 }