00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/features/streaming/StreamingSparseFeatures.h>
00011 namespace shogun
00012 {
00013
00014 template <class T>
00015 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures()
00016 {
00017 set_read_functions();
00018 init();
00019 }
00020
00021 template <class T>
00022 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file,
00023 bool is_labelled,
00024 int32_t size)
00025 : CStreamingDotFeatures()
00026 {
00027 set_read_functions();
00028 init(file, is_labelled, size);
00029 }
00030
00031 template <class T>
00032 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures()
00033 {
00034 parser.end_parser();
00035 }
00036
00037 template <class T>
00038 T CStreamingSparseFeatures<T>::get_feature(int32_t index)
00039 {
00040 ASSERT(index>=0 && index<current_num_features);
00041
00042 T ret=0;
00043
00044 if (current_vector)
00045 {
00046 for (int32_t i=0; i<current_length; i++)
00047 if (current_vector[i].feat_index==index)
00048 ret += current_vector[i].entry;
00049 }
00050
00051 return ret;
00052 }
00053
00054 template <class T>
00055 void CStreamingSparseFeatures<T>::reset_stream()
00056 {
00057 }
00058
00059 template <class T>
00060 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num)
00061 {
00062 int32_t n=current_num_features;
00063 ASSERT(n<=num);
00064 current_num_features=num;
00065 return n;
00066 }
00067
00068 template <class T>
00069 void CStreamingSparseFeatures<T>::expand_if_required(float32_t*& vec, int32_t &len)
00070 {
00071 int32_t dim = get_dim_feature_space();
00072 if (dim > len)
00073 {
00074 vec = SG_REALLOC(float32_t, vec, dim);
00075 memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00076 len = dim;
00077 }
00078 }
00079
00080 template <class T>
00081 void CStreamingSparseFeatures<T>::expand_if_required(float64_t*& vec, int32_t &len)
00082 {
00083 int32_t dim = get_dim_feature_space();
00084 if (dim > len)
00085 {
00086 vec = SG_REALLOC(float64_t, vec, dim);
00087 memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00088 len = dim;
00089 }
00090 }
00091
00092 template <class T>
00093 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen)
00094 {
00095 T result=0;
00096
00097
00098 if (avec && bvec)
00099 {
00100 if (alen<=blen)
00101 {
00102 int32_t j=0;
00103 for (int32_t i=0; i<alen; i++)
00104 {
00105 int32_t a_feat_idx=avec[i].feat_index;
00106
00107 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00108 j++;
00109
00110 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00111 {
00112 result+= avec[i].entry * bvec[j].entry;
00113 j++;
00114 }
00115 }
00116 }
00117 else
00118 {
00119 int32_t j=0;
00120 for (int32_t i=0; i<blen; i++)
00121 {
00122 int32_t b_feat_idx=bvec[i].feat_index;
00123
00124 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00125 j++;
00126
00127 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00128 {
00129 result+= bvec[i].entry * avec[j].entry;
00130 j++;
00131 }
00132 }
00133 }
00134
00135 result*=alpha;
00136 }
00137
00138 return result;
00139 }
00140
00141 template <class T>
00142 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
00143 {
00144 ASSERT(vec);
00145 ASSERT(dim>=current_num_features);
00146 T result=b;
00147
00148 int32_t num_feat=current_length;
00149 SGSparseVectorEntry<T>* sv=current_vector;
00150
00151 if (sv)
00152 {
00153 for (int32_t i=0; i<num_feat; i++)
00154 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00155 }
00156
00157 return result;
00158 }
00159
00160 template <class T>
00161 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len)
00162 {
00163 ASSERT(vec2);
00164 if (vec2_len < current_num_features)
00165 {
00166 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00167 vec2_len, current_num_features);
00168 }
00169
00170 float64_t result=0;
00171 if (current_vector)
00172 {
00173 for (int32_t i=0; i<current_length; i++)
00174 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00175 }
00176
00177 return result;
00178 }
00179
00180 template <class T>
00181 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len)
00182 {
00183 ASSERT(vec2);
00184 if (vec2_len < current_num_features)
00185 {
00186 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00187 vec2_len, current_num_features);
00188 }
00189
00190 float32_t result=0;
00191 if (current_vector)
00192 {
00193 for (int32_t i=0; i<current_length; i++)
00194 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00195 }
00196
00197 return result;
00198 }
00199
00200 template <class T>
00201 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
00202 {
00203 ASSERT(vec2);
00204 if (vec2_len < current_num_features)
00205 {
00206 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00207 vec2_len, current_num_features);
00208 }
00209
00210 SGSparseVectorEntry<T>* sv=current_vector;
00211 int32_t num_feat=current_length;
00212
00213 if (sv)
00214 {
00215 if (abs_val)
00216 {
00217 for (int32_t i=0; i<num_feat; i++)
00218 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00219 }
00220 else
00221 {
00222 for (int32_t i=0; i<num_feat; i++)
00223 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00224 }
00225 }
00226 }
00227
00228 template <class T>
00229 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00230 {
00231 ASSERT(vec2);
00232 if (vec2_len < current_num_features)
00233 {
00234 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00235 vec2_len, current_num_features);
00236 }
00237
00238 SGSparseVectorEntry<T>* sv=current_vector;
00239 int32_t num_feat=current_length;
00240
00241 if (sv)
00242 {
00243 if (abs_val)
00244 {
00245 for (int32_t i=0; i<num_feat; i++)
00246 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00247 }
00248 else
00249 {
00250 for (int32_t i=0; i<num_feat; i++)
00251 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00252 }
00253 }
00254 }
00255
00256 template <class T>
00257 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries()
00258 {
00259 return current_length;
00260 }
00261
00262 template <class T>
00263 float32_t CStreamingSparseFeatures<T>::compute_squared()
00264 {
00265 ASSERT(current_vector);
00266
00267 float32_t sq=0;
00268
00269 for (int32_t i=0; i<current_length; i++)
00270 sq += current_vector[i].entry * current_vector[i].entry;
00271
00272 return sq;
00273 }
00274
00275 template <class T>
00276 void CStreamingSparseFeatures<T>::sort_features()
00277 {
00278 ASSERT(current_vector);
00279
00280 SGSparseVectorEntry<T>* sf_orig=current_vector;
00281 int32_t len=current_length;
00282
00283 int32_t* feat_idx=SG_MALLOC(int32_t, len);
00284 int32_t* orig_idx=SG_MALLOC(int32_t, len);
00285
00286 for (int32_t i=0; i<len; i++)
00287 {
00288 feat_idx[i]=sf_orig[i].feat_index;
00289 orig_idx[i]=i;
00290 }
00291
00292 CMath::qsort_index(feat_idx, orig_idx, len);
00293
00294 SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len);
00295
00296 for (int32_t i=0; i<len; i++)
00297 sf_new[i]=sf_orig[orig_idx[i]];
00298
00299
00300 for (int32_t i=0; i<len-1; i++)
00301 ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
00302
00303
00304 for (int32_t i=0; i<len; i++)
00305 sf_orig[i]=sf_new[i];
00306
00307 SG_FREE(orig_idx);
00308 SG_FREE(feat_idx);
00309 SG_FREE(sf_new);
00310 }
00311
00312 template <class T>
00313 CFeatures* CStreamingSparseFeatures<T>::duplicate() const
00314 {
00315 return new CStreamingSparseFeatures<T>(*this);
00316 }
00317
00318 template <class T>
00319 int32_t CStreamingSparseFeatures<T>::get_num_vectors() const
00320 {
00321 if (current_vector)
00322 return 1;
00323 return 0;
00324 }
00325
00326 template <class T>
00327 int32_t CStreamingSparseFeatures<T>::get_size() const
00328 {
00329 return sizeof(T);
00330 }
00331
00332 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader()
00333 {
00334 parser.set_read_vector(&CStreamingFile::get_sparse_vector);
00335 }
00336
00337 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader()
00338 {
00339 parser.set_read_vector_and_label
00340 (&CStreamingFile::get_sparse_vector_and_label);
00341 }
00342
00343 #define GET_FEATURE_TYPE(f_type, sg_type) \
00344 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const \
00345 { \
00346 return f_type; \
00347 }
00348
00349 GET_FEATURE_TYPE(F_BOOL, bool)
00350 GET_FEATURE_TYPE(F_CHAR, char)
00351 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00352 GET_FEATURE_TYPE(F_BYTE, int8_t)
00353 GET_FEATURE_TYPE(F_SHORT, int16_t)
00354 GET_FEATURE_TYPE(F_WORD, uint16_t)
00355 GET_FEATURE_TYPE(F_INT, int32_t)
00356 GET_FEATURE_TYPE(F_UINT, uint32_t)
00357 GET_FEATURE_TYPE(F_LONG, int64_t)
00358 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00359 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00360 GET_FEATURE_TYPE(F_DREAL, float64_t)
00361 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00362 #undef GET_FEATURE_TYPE
00363
00364
00365 template <class T>
00366 void CStreamingSparseFeatures<T>::init()
00367 {
00368 working_file=NULL;
00369 current_vector=NULL;
00370 current_length=-1;
00371 current_vec_index=0;
00372 current_num_features=-1;
00373 }
00374
00375 template <class T>
00376 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
00377 bool is_labelled,
00378 int32_t size)
00379 {
00380 init();
00381 has_labels = is_labelled;
00382 working_file = file;
00383 SG_REF(working_file);
00384 parser.init(file, is_labelled, size);
00385 }
00386
00387 template <class T>
00388 void CStreamingSparseFeatures<T>::start_parser()
00389 {
00390 if (!parser.is_running())
00391 parser.start_parser();
00392 }
00393
00394 template <class T>
00395 void CStreamingSparseFeatures<T>::end_parser()
00396 {
00397 parser.end_parser();
00398 }
00399
00400 template <class T>
00401 bool CStreamingSparseFeatures<T>::get_next_example()
00402 {
00403 bool ret_value;
00404 ret_value = (bool) parser.get_next_example(current_vector,
00405 current_length,
00406 current_label);
00407
00408 if (!ret_value)
00409 return false;
00410
00411
00412 for (int32_t i=0; i<current_length; i++)
00413 {
00414 if (current_vector[i].feat_index > current_num_features)
00415 current_num_features = current_vector[i].feat_index+1;
00416 }
00417 current_vec_index++;
00418
00419 return true;
00420 }
00421
00422 template <class T>
00423 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector()
00424 {
00425 current_sgvector.features=current_vector;
00426 current_sgvector.num_feat_entries=current_length;
00427
00428 return current_sgvector;
00429 }
00430
00431 template <class T>
00432 float64_t CStreamingSparseFeatures<T>::get_label()
00433 {
00434 ASSERT(has_labels);
00435
00436 return current_label;
00437 }
00438
00439 template <class T>
00440 void CStreamingSparseFeatures<T>::release_example()
00441 {
00442 parser.finalize_example();
00443 }
00444
00445 template <class T>
00446 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const
00447 {
00448 return current_num_features;
00449 }
00450
00451 template <class T>
00452 float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df)
00453 {
00454 SG_NOTIMPLEMENTED;
00455 return -1;
00456 }
00457
00458 template <class T>
00459 int32_t CStreamingSparseFeatures<T>::get_num_features()
00460 {
00461 return current_num_features;
00462 }
00463
00464 template <class T>
00465 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector()
00466 {
00467 return current_length;
00468 }
00469
00470 template <class T>
00471 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class() const
00472 {
00473 return C_STREAMING_SPARSE;
00474 }
00475
00476 template class CStreamingSparseFeatures<bool>;
00477 template class CStreamingSparseFeatures<char>;
00478 template class CStreamingSparseFeatures<int8_t>;
00479 template class CStreamingSparseFeatures<uint8_t>;
00480 template class CStreamingSparseFeatures<int16_t>;
00481 template class CStreamingSparseFeatures<uint16_t>;
00482 template class CStreamingSparseFeatures<int32_t>;
00483 template class CStreamingSparseFeatures<uint32_t>;
00484 template class CStreamingSparseFeatures<int64_t>;
00485 template class CStreamingSparseFeatures<uint64_t>;
00486 template class CStreamingSparseFeatures<float32_t>;
00487 template class CStreamingSparseFeatures<float64_t>;
00488 template class CStreamingSparseFeatures<floatmax_t>;
00489 }