// NOTE: this text was extracted from a Doxygen HTML source listing. The
// original file header comment (listing lines 1-15) was not preserved.
00016 #ifndef _VW_PARSER_H__
00017 #define _VW_PARSER_H__
00018
00019 #include <shogun/base/SGObject.h>
00020 #include <shogun/io/SGIO.h>
00021 #include <shogun/lib/Hash.h>
00022 #include <shogun/classifier/vw/vw_common.h>
00023 #include <shogun/classifier/vw/cache/VwCacheWriter.h>
00024
00025 namespace shogun
00026 {
/** Input layouts the VW parser declares a reader for. */
enum E_VW_PARSER_TYPE
{
	/** Vowpal Wabbit layout. */
	T_VW = 1,
	/** SVMLight layout. */
	T_SVMLIGHT = 2,
	/** Dense layout. */
	T_DENSE = 3
};
00034
00046 class CVwParser: public CSGObject
00047 {
00048 public:
00052 CVwParser();
00053
00059 CVwParser(CVwEnvironment* env_to_use);
00060
00064 virtual ~CVwParser();
00065
00071 CVwEnvironment* get_env()
00072 {
00073 SG_REF(env);
00074 return env;
00075 }
00076
00082 void set_env(CVwEnvironment* env_to_use)
00083 {
00084 env = env_to_use;
00085 SG_REF(env);
00086 }
00087
00094 void set_cache_parameters(char * fname, EVwCacheType type = C_NATIVE)
00095 {
00096 init_cache(fname, type);
00097 }
00098
00104 EVwCacheType get_cache_type()
00105 {
00106 return cache_type;
00107 }
00108
00114 void set_write_cache(bool wr_cache)
00115 {
00116 write_cache = wr_cache;
00117 if (wr_cache)
00118 init_cache(NULL);
00119 else
00120 if (cache_writer)
00121 SG_UNREF(cache_writer);
00122 }
00123
00129 bool get_write_cache()
00130 {
00131 return write_cache;
00132 }
00133
00139 void set_mm(float64_t label)
00140 {
00141 env->min_label = CMath::min(env->min_label, label);
00142 if (label != FLT_MAX)
00143 env->max_label = CMath::max(env->max_label, label);
00144 }
00145
00152 void noop_mm(float64_t label) { }
00153
00160 void set_minmax(float64_t label)
00161 {
00162 set_mm(label);
00163 }
00164
00173 int32_t read_features(CIOBuffer* buf, VwExample*& ex);
00174
00183 int32_t read_svmlight_features(CIOBuffer* buf, VwExample*& ae);
00184
00193 int32_t read_dense_features(CIOBuffer* buf, VwExample*& ae);
00194
00200 virtual const char* get_name() const { return "VwParser"; }
00201
00202 protected:
00209 void init_cache(char * fname, EVwCacheType type = C_NATIVE);
00210
00219 void feature_value(substring &s, v_array<substring>& name, float32_t &v);
00220
00229 void tokenize(char delim, substring s, v_array<substring> &ret);
00230
00241 inline char* safe_index(char *start, char v, char *max)
00242 {
00243 while (start != max && *start != v)
00244 start++;
00245 return start;
00246 }
00247
00248 public:
00250 hash_func_t hasher;
00251
00252 protected:
00254 CVwEnvironment* env;
00256 CVwCacheWriter* cache_writer;
00258 EVwCacheType cache_type;
00260 bool write_cache;
00261
00262 private:
00264 v_array<substring> channels;
00265 v_array<substring> words;
00266 v_array<substring> name;
00267 };
00268
00269 }
00270 #endif // _VW_PARSER_H__