VwParser.h

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2009 Yahoo! Inc.  All rights reserved.  The copyrights
00003  * embodied in the content of this file are licensed under the BSD
00004  * (revised) open source license.
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 3 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * Written (W) 2011 Shashwat Lal Das
00012  * Adaptation of Vowpal Wabbit v5.1.
00013  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
00014  */
00015 
00016 #ifndef _VW_PARSER_H__
00017 #define _VW_PARSER_H__
00018 
00019 #include <shogun/base/SGObject.h>
00020 #include <shogun/io/SGIO.h>
00021 #include <shogun/lib/Hash.h>
00022 #include <shogun/classifier/vw/vw_common.h>
00023 #include <shogun/classifier/vw/cache/VwCacheWriter.h>
00024 
00025 namespace shogun
00026 {
/**
 * Identifiers for the kinds of input a VW parser can be asked to read.
 *
 * The numeric values are spelled out explicitly so that they stay stable
 * even if enumerators are later reordered or new ones are inserted.
 */
enum E_VW_PARSER_TYPE
{
    /** Presumably Vowpal Wabbit's own text format - confirm against users of this enum. */
    T_VW = 1,

    /** Presumably SVMLight formatted input - confirm against users of this enum. */
    T_SVMLIGHT = 2,

    /** Presumably dense (plain list of values) input - confirm against users of this enum. */
    T_DENSE = 3
};
00034 
00046 class CVwParser: public CSGObject
00047 {
00048 public:
00052     CVwParser();
00053 
00059     CVwParser(CVwEnvironment* env_to_use);
00060 
00064     virtual ~CVwParser();
00065 
00071     CVwEnvironment* get_env()
00072     {
00073         SG_REF(env);
00074         return env;
00075     }
00076 
00082     void set_env(CVwEnvironment* env_to_use)
00083     {
00084         env = env_to_use;
00085         SG_REF(env);
00086     }
00087 
00094     void set_cache_parameters(char * fname, EVwCacheType type = C_NATIVE)
00095     {
00096         init_cache(fname, type);
00097     }
00098 
00104     EVwCacheType get_cache_type()
00105     {
00106         return cache_type;
00107     }
00108 
00114     void set_write_cache(bool wr_cache)
00115     {
00116         write_cache = wr_cache;
00117         if (wr_cache)
00118             init_cache(NULL);
00119         else
00120             if (cache_writer)
00121                 SG_UNREF(cache_writer);
00122     }
00123 
00129     bool get_write_cache()
00130     {
00131         return write_cache;
00132     }
00133 
00139     void set_mm(float64_t label)
00140     {
00141         env->min_label = CMath::min(env->min_label, label);
00142         if (label != FLT_MAX)
00143             env->max_label = CMath::max(env->max_label, label);
00144     }
00145 
00152     void noop_mm(float64_t label) { }
00153 
00160     void set_minmax(float64_t label)
00161     {
00162         set_mm(label);
00163     }
00164 
00173     int32_t read_features(CIOBuffer* buf, VwExample*& ex);
00174 
00183     int32_t read_svmlight_features(CIOBuffer* buf, VwExample*& ae);
00184 
00193     int32_t read_dense_features(CIOBuffer* buf, VwExample*& ae);
00194 
00200     virtual const char* get_name() const { return "VwParser"; }
00201 
00202 protected:
00209     void init_cache(char * fname, EVwCacheType type = C_NATIVE);
00210 
00219     void feature_value(substring &s, v_array<substring>& name, float32_t &v);
00220 
00229     void tokenize(char delim, substring s, v_array<substring> &ret);
00230 
00241     inline char* safe_index(char *start, char v, char *max)
00242     {
00243         while (start != max && *start != v)
00244             start++;
00245         return start;
00246     }
00247 
00248 public:
00250     hash_func_t hasher;
00251 
00252 protected:
00254     CVwEnvironment* env;
00256     CVwCacheWriter* cache_writer;
00258     EVwCacheType cache_type;
00260     bool write_cache;
00261 
00262 private:
00264     v_array<substring> channels;
00265     v_array<substring> words;
00266     v_array<substring> name;
00267 };
00268 
00269 }
00270 #endif // _VW_PARSER_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation