00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #include <shogun/features/StreamingVwFeatures.h>
00017
00018 using namespace shogun;
00019
00020 CStreamingVwFeatures::CStreamingVwFeatures() : CStreamingDotFeatures()
00021 {
00022 init();
00023 set_read_functions();
00024 }
00025
00026 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwFile* file,
00027 bool is_labelled, int32_t size)
00028 : CStreamingDotFeatures()
00029 {
00030 init(file, is_labelled, size);
00031 set_read_functions();
00032 }
00033
00034 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwCacheFile* file,
00035 bool is_labelled, int32_t size)
00036 : CStreamingDotFeatures()
00037 {
00038 init(file, is_labelled, size);
00039 set_read_functions();
00040 }
00041
00042 CStreamingVwFeatures::~CStreamingVwFeatures()
00043 {
00044 parser.end_parser();
00045 SG_UNREF(env);
00046 }
00047
00048 CFeatures* CStreamingVwFeatures::duplicate() const
00049 {
00050 return new CStreamingVwFeatures(*this);
00051 }
00052
00053 void CStreamingVwFeatures::set_vector_reader()
00054 {
00055 parser.set_read_vector(&CStreamingFile::get_vector);
00056 }
00057
00058 void CStreamingVwFeatures::set_vector_and_label_reader()
00059 {
00060 parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label);
00061 }
00062
00063 void CStreamingVwFeatures::reset_stream()
00064 {
00065 if (working_file->is_seekable())
00066 {
00067 working_file->reset_stream();
00068 parser.exit_parser();
00069 parser.init(working_file, has_labels, parser.get_ring_size());
00070 parser.set_free_vector_after_release(false);
00071 parser.start_parser();
00072 }
00073 else
00074 SG_ERROR("The input cannot be reset! Please use 1 pass.\n");
00075 }
00076
00077 CVwEnvironment* CStreamingVwFeatures::get_env()
00078 {
00079 SG_REF(env);
00080 return env;
00081 }
00082
00083 void CStreamingVwFeatures::set_env(CVwEnvironment* vw_env)
00084 {
00085 env = vw_env;
00086 SG_REF(env);
00087 }
00088
00089 void CStreamingVwFeatures::expand_if_required(float32_t*& vec, int32_t& len)
00090 {
00091 int32_t dim = 1 << env->num_bits;
00092 if (dim > len)
00093 {
00094 vec = SG_REALLOC(float32_t, vec, dim);
00095 memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00096 len = dim;
00097 }
00098 }
00099
00100 void CStreamingVwFeatures::expand_if_required(float64_t*& vec, int32_t& len)
00101 {
00102 int32_t dim = 1 << env->num_bits;
00103 if (dim > len)
00104 {
00105 vec = SG_REALLOC(float64_t, vec, dim);
00106 memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00107 len = dim;
00108 }
00109 }
00110
00111 float32_t CStreamingVwFeatures::real_weight(float32_t w, float32_t gravity)
00112 {
00113 float32_t wprime = 0;
00114 if (gravity < fabsf(w))
00115 wprime = CMath::sign(w)*(fabsf(w) - gravity);
00116 return wprime;
00117 }
00118
00119 int32_t CStreamingVwFeatures::get_nnz_features_for_vector()
00120 {
00121 return current_length;
00122 }
00123
00124 int32_t CStreamingVwFeatures::get_num_vectors() const
00125 {
00126 if (current_example)
00127 return 1;
00128 else
00129 return 0;
00130 }
00131
00132 int32_t CStreamingVwFeatures::get_size()
00133 {
00134 return sizeof(VwExample);
00135 }
00136
00137 EFeatureType CStreamingVwFeatures::get_feature_type()
00138 {
00139 return F_DREAL;
00140 }
00141
00142 void CStreamingVwFeatures::init()
00143 {
00144 working_file=NULL;
00145 seekable=false;
00146 current_length=-1;
00147 current_example=NULL;
00148
00149 example_count = 0;
00150 }
00151
00152 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size)
00153 {
00154 init();
00155 has_labels = is_labelled;
00156 working_file = file;
00157 parser.init(file, is_labelled, size);
00158 parser.set_free_vector_after_release(false);
00159 seekable=false;
00160
00161
00162 env = ((CStreamingVwFile*) file)->get_env();
00163 SG_REF(env);
00164 }
00165
00166 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size)
00167 {
00168 init();
00169 has_labels = is_labelled;
00170 working_file = file;
00171 parser.init(file, is_labelled, size);
00172 parser.set_free_vector_after_release(false);
00173 seekable=true;
00174
00175
00176 env = ((CStreamingVwCacheFile*) file)->get_env();
00177 SG_REF(env);
00178 }
00179
00180 void CStreamingVwFeatures::setup_example(VwExample* ae)
00181 {
00182 ae->pass = env->passes_complete;
00183 ae->num_features = 0;
00184 ae->total_sum_feat_sq = 1;
00185 ae->example_counter = ++example_count;
00186 ae->global_weight = ae->ld->weight;
00187 env->t += ae->global_weight;
00188 ae->example_t = env->t;
00189
00190
00191 if (env->ignore_some)
00192 {
00193 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00194 if (env->ignore[*i])
00195 {
00196 ae->atomics[*i].erase();
00197 memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t));
00198 ae->indices.end--;
00199 i--;
00200 }
00201 }
00202
00203
00204 vw_size_t constant_namespace = 128;
00205 VwFeature temp = {1,constant_hash & env->mask};
00206 ae->indices.push(constant_namespace);
00207 ae->atomics[constant_namespace].push(temp);
00208 ae->sum_feat_sq[constant_namespace] = 0;
00209
00210 if(env->stride != 1)
00211 {
00212
00213 vw_size_t stride = env->stride;
00214 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00215 for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
00216 j->weight_index = j->weight_index*stride;
00217 }
00218
00219 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00220 {
00221 ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin;
00222 ae->total_sum_feat_sq += ae->sum_feat_sq[*i];
00223 }
00224
00225
00226 for (int32_t k = 0; k < env->pairs.get_num_elements(); k++)
00227 {
00228 char* i = env->pairs.get_element(k);
00229
00230 ae->num_features
00231 += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin)
00232 *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin);
00233
00234 ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])];
00235 }
00236 }
00237
00238 void CStreamingVwFeatures::start_parser()
00239 {
00240 if (!parser.is_running())
00241 parser.start_parser();
00242 }
00243
00244 void CStreamingVwFeatures::end_parser()
00245 {
00246 parser.end_parser();
00247 }
00248
00249 bool CStreamingVwFeatures::get_next_example()
00250 {
00251 bool ret_value;
00252 ret_value = (bool) parser.get_next_example(current_example,
00253 current_length,
00254 current_label);
00255 if (current_length < 1)
00256 return false;
00257
00258 if (ret_value)
00259 setup_example(current_example);
00260 else
00261 return false;
00262
00263 current_label = current_example->ld->label;
00264 current_length = current_example->num_features;
00265
00266 return ret_value;
00267 }
00268
00269 VwExample* CStreamingVwFeatures::get_example()
00270 {
00271 return current_example;
00272 }
00273
00274 float64_t CStreamingVwFeatures::get_label()
00275 {
00276 ASSERT(has_labels);
00277
00278 return current_label;
00279 }
00280
00281 void CStreamingVwFeatures::release_example()
00282 {
00283 env->example_number++;
00284 env->weighted_examples += current_example->ld->weight;
00285
00286 if (current_example->ld->label == FLT_MAX)
00287 env->weighted_labels += 0;
00288 else
00289 env->weighted_labels += current_example->ld->label * current_example->ld->weight;
00290
00291 env->total_features += current_example->num_features;
00292 env->sum_loss += current_example->loss;
00293
00294 current_example->reset_members();
00295 parser.finalize_example();
00296 }
00297
00298 int32_t CStreamingVwFeatures::get_dim_feature_space() const
00299 {
00300 return current_length;
00301 }
00302
00303 float32_t CStreamingVwFeatures::dot(CStreamingDotFeatures* df)
00304 {
00305 SG_NOTIMPLEMENTED;
00306 return CMath::INFTY;
00307 }
00308
00309 float32_t CStreamingVwFeatures::dense_dot(VwExample* &ex, const float32_t* vec2)
00310 {
00311 float32_t ret = 0.;
00312 for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++)
00313 {
00314 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00315 ret += vec2[f->weight_index & env->thread_mask] * f->x;
00316 }
00317 return ret;
00318 }
00319
00320 float32_t CStreamingVwFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len)
00321 {
00322 return dense_dot(current_example, vec2);
00323 }
00324
00325 float32_t CStreamingVwFeatures::dense_dot(SGSparseVector<float32_t>* vec1, const float32_t* vec2)
00326 {
00327 float32_t ret = 0.;
00328 for (int32_t i = 0; i < vec1->num_feat_entries; i++)
00329 ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask];
00330
00331 return ret;
00332 }
00333
00334 float32_t CStreamingVwFeatures::dense_dot_truncated(const float32_t* vec2, VwExample* &ex, float32_t gravity)
00335 {
00336 float32_t ret = 0.;
00337 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00338 {
00339 for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++)
00340 {
00341 float32_t w = vec2[f->weight_index & env->thread_mask];
00342 float32_t wprime = real_weight(w,gravity);
00343 ret += wprime*f->x;
00344 }
00345 }
00346
00347 return ret;
00348 }
00349
00350 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val)
00351 {
00352 if (abs_val)
00353 {
00354 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00355 {
00356 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00357 vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x);
00358 }
00359 }
00360 else
00361 {
00362 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00363 {
00364 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00365 vec2[f->weight_index & env->thread_mask] += alpha * f->x;
00366 }
00367 }
00368 }
00369
00370 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00371 {
00372 add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val);
00373 }
00374
00375 int32_t CStreamingVwFeatures::get_num_features()
00376 {
00377 return current_length;
00378 }
00379
00380 EFeatureClass CStreamingVwFeatures::get_feature_class()
00381 {
00382 return C_STREAMING_VW;
00383 }