StreamingVwFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2009 Yahoo! Inc.  All rights reserved.  The copyrights
00003  * embodied in the content of this file are licensed under the BSD
00004  * (revised) open source license.
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 3 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * Written (W) 2011 Shashwat Lal Das
00012  * Adaptation of Vowpal Wabbit v5.1.
00013  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
00014  */
00015 
#include <shogun/features/StreamingVwFeatures.h>

#include <cmath>
00017 
00018 using namespace shogun;
00019 
00020 void CStreamingVwFeatures::set_vector_reader()
00021 {
00022     parser.set_read_vector(&CStreamingFile::get_vector);
00023 }
00024 
00025 void CStreamingVwFeatures::set_vector_and_label_reader()
00026 {
00027     parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label);
00028 }
00029 
00030 inline EFeatureType CStreamingVwFeatures::get_feature_type()
00031 {
00032     return F_DREAL;
00033 }
00034 
00035 void CStreamingVwFeatures::init()
00036 {
00037     working_file=NULL;
00038     seekable=false;
00039     current_length=-1;
00040     current_example=NULL;
00041 
00042     example_count = 0;
00043 }
00044 
00045 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size)
00046 {
00047     init();
00048     has_labels = is_labelled;
00049     working_file = file;
00050     parser.init(file, is_labelled, size);
00051     parser.set_free_vector_after_release(false);
00052     seekable=false;
00053 
00054     // Get environment from the StreamingVwFile
00055     env = ((CStreamingVwFile*) file)->get_env();
00056     SG_REF(env);
00057 }
00058 
00059 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size)
00060 {
00061     init();
00062     has_labels = is_labelled;
00063     working_file = file;
00064     parser.init(file, is_labelled, size);
00065     parser.set_free_vector_after_release(false);
00066     seekable=true;
00067 
00068     // Get environment from the StreamingVwFile
00069     env = ((CStreamingVwCacheFile*) file)->get_env();
00070     SG_REF(env);
00071 }
00072 
00073 void CStreamingVwFeatures::setup_example(VwExample* ae)
00074 {
00075     ae->pass = env->passes_complete;
00076     ae->num_features = 0;
00077     ae->total_sum_feat_sq = 1;
00078     ae->example_counter = ++example_count;
00079     ae->global_weight = ae->ld->weight;
00080     env->t += ae->global_weight;
00081     ae->example_t = env->t;
00082 
00083     // If some namespaces should be ignored, remove them
00084     if (env->ignore_some)
00085     {
00086         for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00087             if (env->ignore[*i])
00088             {
00089                 ae->atomics[*i].erase();
00090                 memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t));
00091                 ae->indices.end--;
00092                 i--;
00093             }
00094     }
00095 
00096     // Add constant feature
00097     vw_size_t constant_namespace = 128;
00098     VwFeature temp = {1,constant_hash & env->mask};
00099     ae->indices.push(constant_namespace);
00100     ae->atomics[constant_namespace].push(temp);
00101     ae->sum_feat_sq[constant_namespace] = 0;
00102 
00103     if(env->stride != 1)
00104     {
00105         // Make room for per-feature information.
00106         vw_size_t stride = env->stride;
00107         for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00108             for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
00109                 j->weight_index = j->weight_index*stride;
00110     }
00111 
00112     for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00113     {
00114         ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin;
00115         ae->total_sum_feat_sq += ae->sum_feat_sq[*i];
00116     }
00117 
00118     // For quadratic features
00119     for (int32_t k = 0; k < env->pairs.get_num_elements(); k++)
00120     {
00121         char* i = env->pairs.get_element(k);
00122 
00123         ae->num_features
00124             += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin)
00125             *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin);
00126 
00127         ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])];
00128     }
00129 }
00130 
00131 void CStreamingVwFeatures::start_parser()
00132 {
00133     if (!parser.is_running())
00134         parser.start_parser();
00135 }
00136 
00137 void CStreamingVwFeatures::end_parser()
00138 {
00139     parser.end_parser();
00140 }
00141 
00142 bool CStreamingVwFeatures::get_next_example()
00143 {
00144     bool ret_value;
00145     ret_value = (bool) parser.get_next_example(current_example,
00146                            current_length,
00147                            current_label);
00148     if (current_length < 1)
00149         return false;
00150 
00151     if (ret_value)
00152         setup_example(current_example);
00153     else
00154         return false;
00155 
00156     current_label = current_example->ld->label;
00157     current_length = current_example->num_features;
00158 
00159     return ret_value;
00160 }
00161 
00162 VwExample* CStreamingVwFeatures::get_example()
00163 {
00164     return current_example;
00165 }
00166 
00167 float64_t CStreamingVwFeatures::get_label()
00168 {
00169     ASSERT(has_labels);
00170 
00171     return current_label;
00172 }
00173 
00174 void CStreamingVwFeatures::release_example()
00175 {
00176     env->example_number++;
00177     env->weighted_examples += current_example->ld->weight;
00178 
00179     if (current_example->ld->label == FLT_MAX)
00180         env->weighted_labels += 0;
00181     else
00182         env->weighted_labels += current_example->ld->label * current_example->ld->weight;
00183 
00184     env->total_features += current_example->num_features;
00185     env->sum_loss += current_example->loss;
00186 
00187     current_example->reset_members();
00188     parser.finalize_example();
00189 }
00190 
00191 int32_t CStreamingVwFeatures::get_dim_feature_space() const
00192 {
00193     return current_length;
00194 }
00195 
00196 float32_t CStreamingVwFeatures::dot(CStreamingDotFeatures* df)
00197 {
00198     SG_NOTIMPLEMENTED;
00199     return CMath::INFTY;
00200 }
00201 
00202 float32_t CStreamingVwFeatures::dense_dot(VwExample* &ex, const float32_t* vec2)
00203 {
00204     float32_t ret = 0.;
00205     for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++)
00206     {
00207         for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00208             ret += vec2[f->weight_index & env->thread_mask] * f->x;
00209     }
00210     return ret;
00211 }
00212 
00213 float32_t CStreamingVwFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len)
00214 {
00215     return dense_dot(current_example, vec2);
00216 }
00217 
00218 float32_t CStreamingVwFeatures::dense_dot(SGSparseVector<float32_t>* vec1, const float32_t* vec2)
00219 {
00220     float32_t ret = 0.;
00221     for (int32_t i = 0; i < vec1->num_feat_entries; i++)
00222         ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask];
00223 
00224     return ret;
00225 }
00226 
00227 float32_t CStreamingVwFeatures::dense_dot_truncated(const float32_t* vec2, VwExample* &ex, float32_t gravity)
00228 {
00229     float32_t ret = 0.;
00230     for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00231     {
00232         for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++)
00233         {
00234             float32_t w = vec2[f->weight_index & env->thread_mask];
00235             float32_t wprime = real_weight(w,gravity);
00236             ret += wprime*f->x;
00237         }
00238     }
00239 
00240     return ret;
00241 }
00242 
00243 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val)
00244 {
00245     if (abs_val)
00246     {
00247         for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00248         {
00249             for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00250                 vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x);
00251         }
00252     }
00253     else
00254     {
00255         for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00256         {
00257             for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00258                 vec2[f->weight_index & env->thread_mask] += alpha * f->x;
00259         }
00260     }
00261 }
00262 
00263 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00264 {
00265     add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val);
00266 }
00267 
00268 int32_t CStreamingVwFeatures::get_num_features()
00269 {
00270     return current_length;
00271 }
00272 
00273 EFeatureClass CStreamingVwFeatures::get_feature_class()
00274 {
00275     return C_STREAMING_VW;
00276 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation