StreamingVwFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2009 Yahoo! Inc.  All rights reserved.  The copyrights
00003  * embodied in the content of this file are licensed under the BSD
00004  * (revised) open source license.
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 3 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * Written (W) 2011 Shashwat Lal Das
00012  * Adaptation of Vowpal Wabbit v5.1.
00013  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
00014  */
00015 
00016 #include <shogun/features/StreamingVwFeatures.h>
00017 
00018 using namespace shogun;
00019 
00020 CStreamingVwFeatures::CStreamingVwFeatures() : CStreamingDotFeatures()
00021 {
00022     init();
00023     set_read_functions();
00024 }
00025 
00026 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwFile* file,
00027         bool is_labelled, int32_t size)
00028 : CStreamingDotFeatures()
00029 {
00030     init(file, is_labelled, size);
00031     set_read_functions();
00032 }
00033 
00034 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwCacheFile* file,
00035         bool is_labelled, int32_t size)
00036 : CStreamingDotFeatures()
00037 {
00038     init(file, is_labelled, size);
00039     set_read_functions();
00040 }
00041 
00042 CStreamingVwFeatures::~CStreamingVwFeatures()
00043 {
00044     parser.end_parser();
00045     SG_UNREF(env);
00046 }
00047 
00048 CFeatures* CStreamingVwFeatures::duplicate() const
00049 {
00050     return new CStreamingVwFeatures(*this);
00051 }
00052 
00053 void CStreamingVwFeatures::set_vector_reader()
00054 {
00055     parser.set_read_vector(&CStreamingFile::get_vector);
00056 }
00057 
00058 void CStreamingVwFeatures::set_vector_and_label_reader()
00059 {
00060     parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label);
00061 }
00062 
00063 void CStreamingVwFeatures::reset_stream()
00064 {
00065     if (working_file->is_seekable())
00066     {
00067         working_file->reset_stream();
00068         parser.exit_parser();
00069         parser.init(working_file, has_labels, parser.get_ring_size());
00070         parser.set_free_vector_after_release(false);
00071         parser.start_parser();
00072     }
00073     else
00074         SG_ERROR("The input cannot be reset! Please use 1 pass.\n");
00075 }
00076 
00077 CVwEnvironment* CStreamingVwFeatures::get_env()
00078 {
00079     SG_REF(env);
00080     return env;
00081 }
00082 
00083 void CStreamingVwFeatures::set_env(CVwEnvironment* vw_env)
00084 {
00085     env = vw_env;
00086     SG_REF(env);
00087 }
00088 
00089 void CStreamingVwFeatures::expand_if_required(float32_t*& vec, int32_t& len)
00090 {
00091     int32_t dim = 1 << env->num_bits;
00092     if (dim > len)
00093     {
00094         vec = SG_REALLOC(float32_t, vec, dim);
00095         memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00096         len = dim;
00097     }
00098 }
00099 
00100 void CStreamingVwFeatures::expand_if_required(float64_t*& vec, int32_t& len)
00101 {
00102     int32_t dim = 1 << env->num_bits;
00103     if (dim > len)
00104     {
00105         vec = SG_REALLOC(float64_t, vec, dim);
00106         memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00107         len = dim;
00108     }
00109 }
00110 
00111 float32_t CStreamingVwFeatures::real_weight(float32_t w, float32_t gravity)
00112 {
00113     float32_t wprime = 0;
00114     if (gravity < fabsf(w))
00115         wprime = CMath::sign(w)*(fabsf(w) - gravity);
00116     return wprime;
00117 }
00118 
00119 int32_t CStreamingVwFeatures::get_nnz_features_for_vector()
00120 {
00121     return current_length;
00122 }
00123 
00124 int32_t CStreamingVwFeatures::get_num_vectors() const
00125 {
00126     if (current_example)
00127         return 1;
00128     else
00129         return 0;
00130 }
00131 
00132 int32_t CStreamingVwFeatures::get_size()
00133 {
00134     return sizeof(VwExample);
00135 }
00136 
00137 EFeatureType CStreamingVwFeatures::get_feature_type()
00138 {
00139     return F_DREAL;
00140 }
00141 
00142 void CStreamingVwFeatures::init()
00143 {
00144     working_file=NULL;
00145     seekable=false;
00146     current_length=-1;
00147     current_example=NULL;
00148 
00149     example_count = 0;
00150 }
00151 
00152 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size)
00153 {
00154     init();
00155     has_labels = is_labelled;
00156     working_file = file;
00157     parser.init(file, is_labelled, size);
00158     parser.set_free_vector_after_release(false);
00159     seekable=false;
00160 
00161     // Get environment from the StreamingVwFile
00162     env = ((CStreamingVwFile*) file)->get_env();
00163     SG_REF(env);
00164 }
00165 
00166 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size)
00167 {
00168     init();
00169     has_labels = is_labelled;
00170     working_file = file;
00171     parser.init(file, is_labelled, size);
00172     parser.set_free_vector_after_release(false);
00173     seekable=true;
00174 
00175     // Get environment from the StreamingVwFile
00176     env = ((CStreamingVwCacheFile*) file)->get_env();
00177     SG_REF(env);
00178 }
00179 
00180 void CStreamingVwFeatures::setup_example(VwExample* ae)
00181 {
00182     ae->pass = env->passes_complete;
00183     ae->num_features = 0;
00184     ae->total_sum_feat_sq = 1;
00185     ae->example_counter = ++example_count;
00186     ae->global_weight = ae->ld->weight;
00187     env->t += ae->global_weight;
00188     ae->example_t = env->t;
00189 
00190     // If some namespaces should be ignored, remove them
00191     if (env->ignore_some)
00192     {
00193         for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00194             if (env->ignore[*i])
00195             {
00196                 ae->atomics[*i].erase();
00197                 memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t));
00198                 ae->indices.end--;
00199                 i--;
00200             }
00201     }
00202 
00203     // Add constant feature
00204     vw_size_t constant_namespace = 128;
00205     VwFeature temp = {1,constant_hash & env->mask};
00206     ae->indices.push(constant_namespace);
00207     ae->atomics[constant_namespace].push(temp);
00208     ae->sum_feat_sq[constant_namespace] = 0;
00209 
00210     if(env->stride != 1)
00211     {
00212         // Make room for per-feature information.
00213         vw_size_t stride = env->stride;
00214         for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00215             for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
00216                 j->weight_index = j->weight_index*stride;
00217     }
00218 
00219     for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00220     {
00221         ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin;
00222         ae->total_sum_feat_sq += ae->sum_feat_sq[*i];
00223     }
00224 
00225     // For quadratic features
00226     for (int32_t k = 0; k < env->pairs.get_num_elements(); k++)
00227     {
00228         char* i = env->pairs.get_element(k);
00229 
00230         ae->num_features
00231             += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin)
00232             *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin);
00233 
00234         ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])];
00235     }
00236 }
00237 
00238 void CStreamingVwFeatures::start_parser()
00239 {
00240     if (!parser.is_running())
00241         parser.start_parser();
00242 }
00243 
00244 void CStreamingVwFeatures::end_parser()
00245 {
00246     parser.end_parser();
00247 }
00248 
00249 bool CStreamingVwFeatures::get_next_example()
00250 {
00251     bool ret_value;
00252     ret_value = (bool) parser.get_next_example(current_example,
00253                            current_length,
00254                            current_label);
00255     if (current_length < 1)
00256         return false;
00257 
00258     if (ret_value)
00259         setup_example(current_example);
00260     else
00261         return false;
00262 
00263     current_label = current_example->ld->label;
00264     current_length = current_example->num_features;
00265 
00266     return ret_value;
00267 }
00268 
00269 VwExample* CStreamingVwFeatures::get_example()
00270 {
00271     return current_example;
00272 }
00273 
00274 float64_t CStreamingVwFeatures::get_label()
00275 {
00276     ASSERT(has_labels);
00277 
00278     return current_label;
00279 }
00280 
00281 void CStreamingVwFeatures::release_example()
00282 {
00283     env->example_number++;
00284     env->weighted_examples += current_example->ld->weight;
00285 
00286     if (current_example->ld->label == FLT_MAX)
00287         env->weighted_labels += 0;
00288     else
00289         env->weighted_labels += current_example->ld->label * current_example->ld->weight;
00290 
00291     env->total_features += current_example->num_features;
00292     env->sum_loss += current_example->loss;
00293 
00294     current_example->reset_members();
00295     parser.finalize_example();
00296 }
00297 
00298 int32_t CStreamingVwFeatures::get_dim_feature_space() const
00299 {
00300     return current_length;
00301 }
00302 
00303 float32_t CStreamingVwFeatures::dot(CStreamingDotFeatures* df)
00304 {
00305     SG_NOTIMPLEMENTED;
00306     return CMath::INFTY;
00307 }
00308 
00309 float32_t CStreamingVwFeatures::dense_dot(VwExample* &ex, const float32_t* vec2)
00310 {
00311     float32_t ret = 0.;
00312     for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++)
00313     {
00314         for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00315             ret += vec2[f->weight_index & env->thread_mask] * f->x;
00316     }
00317     return ret;
00318 }
00319 
00320 float32_t CStreamingVwFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len)
00321 {
00322     return dense_dot(current_example, vec2);
00323 }
00324 
00325 float32_t CStreamingVwFeatures::dense_dot(SGSparseVector<float32_t>* vec1, const float32_t* vec2)
00326 {
00327     float32_t ret = 0.;
00328     for (int32_t i = 0; i < vec1->num_feat_entries; i++)
00329         ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask];
00330 
00331     return ret;
00332 }
00333 
00334 float32_t CStreamingVwFeatures::dense_dot_truncated(const float32_t* vec2, VwExample* &ex, float32_t gravity)
00335 {
00336     float32_t ret = 0.;
00337     for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00338     {
00339         for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++)
00340         {
00341             float32_t w = vec2[f->weight_index & env->thread_mask];
00342             float32_t wprime = real_weight(w,gravity);
00343             ret += wprime*f->x;
00344         }
00345     }
00346 
00347     return ret;
00348 }
00349 
00350 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val)
00351 {
00352     if (abs_val)
00353     {
00354         for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00355         {
00356             for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00357                 vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x);
00358         }
00359     }
00360     else
00361     {
00362         for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00363         {
00364             for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00365                 vec2[f->weight_index & env->thread_mask] += alpha * f->x;
00366         }
00367     }
00368 }
00369 
00370 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00371 {
00372     add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val);
00373 }
00374 
00375 int32_t CStreamingVwFeatures::get_num_features()
00376 {
00377     return current_length;
00378 }
00379 
00380 EFeatureClass CStreamingVwFeatures::get_feature_class()
00381 {
00382     return C_STREAMING_VW;
00383 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation