00001 #include <shogun/features/StreamingSparseFeatures.h>
00002 namespace shogun
00003 {
00004
00005 template <class T>
00006 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures()
00007 {
00008 set_read_functions();
00009 init();
00010 }
00011
00012 template <class T>
00013 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file,
00014 bool is_labelled,
00015 int32_t size)
00016 : CStreamingDotFeatures()
00017 {
00018 set_read_functions();
00019 init(file, is_labelled, size);
00020 }
00021
00022 template <class T>
00023 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures()
00024 {
00025 parser.end_parser();
00026 }
00027
00028 template <class T>
00029 T CStreamingSparseFeatures<T>::get_feature(int32_t index)
00030 {
00031 ASSERT(index>=0 && index<current_num_features);
00032
00033 T ret=0;
00034
00035 if (current_vector)
00036 {
00037 for (int32_t i=0; i<current_length; i++)
00038 if (current_vector[i].feat_index==index)
00039 ret += current_vector[i].entry;
00040 }
00041
00042 return ret;
00043 }
00044
00045 template <class T>
00046 void CStreamingSparseFeatures<T>::reset_stream()
00047 {
00048 }
00049
00050 template <class T>
00051 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num)
00052 {
00053 int32_t n=current_num_features;
00054 ASSERT(n<=num);
00055 current_num_features=num;
00056 return n;
00057 }
00058
00059 template <class T>
00060 void CStreamingSparseFeatures<T>::expand_if_required(float32_t*& vec, int32_t &len)
00061 {
00062 int32_t dim = get_dim_feature_space();
00063 if (dim > len)
00064 {
00065 vec = SG_REALLOC(float32_t, vec, dim);
00066 memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00067 len = dim;
00068 }
00069 }
00070
00071 template <class T>
00072 void CStreamingSparseFeatures<T>::expand_if_required(float64_t*& vec, int32_t &len)
00073 {
00074 int32_t dim = get_dim_feature_space();
00075 if (dim > len)
00076 {
00077 vec = SG_REALLOC(float64_t, vec, dim);
00078 memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00079 len = dim;
00080 }
00081 }
00082
00083 template <class T>
00084 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen)
00085 {
00086 T result=0;
00087
00088
00089 if (avec && bvec)
00090 {
00091 if (alen<=blen)
00092 {
00093 int32_t j=0;
00094 for (int32_t i=0; i<alen; i++)
00095 {
00096 int32_t a_feat_idx=avec[i].feat_index;
00097
00098 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00099 j++;
00100
00101 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00102 {
00103 result+= avec[i].entry * bvec[j].entry;
00104 j++;
00105 }
00106 }
00107 }
00108 else
00109 {
00110 int32_t j=0;
00111 for (int32_t i=0; i<blen; i++)
00112 {
00113 int32_t b_feat_idx=bvec[i].feat_index;
00114
00115 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00116 j++;
00117
00118 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00119 {
00120 result+= bvec[i].entry * avec[j].entry;
00121 j++;
00122 }
00123 }
00124 }
00125
00126 result*=alpha;
00127 }
00128
00129 return result;
00130 }
00131
00132 template <class T>
00133 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
00134 {
00135 ASSERT(vec);
00136 ASSERT(dim>=current_num_features);
00137 T result=b;
00138
00139 int32_t num_feat=current_length;
00140 SGSparseVectorEntry<T>* sv=current_vector;
00141
00142 if (sv)
00143 {
00144 for (int32_t i=0; i<num_feat; i++)
00145 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00146 }
00147
00148 return result;
00149 }
00150
00151 template <class T>
00152 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len)
00153 {
00154 ASSERT(vec2);
00155 if (vec2_len < current_num_features)
00156 {
00157 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00158 vec2_len, current_num_features);
00159 }
00160
00161 float64_t result=0;
00162 if (current_vector)
00163 {
00164 for (int32_t i=0; i<current_length; i++)
00165 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00166 }
00167
00168 return result;
00169 }
00170
00171 template <class T>
00172 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len)
00173 {
00174 ASSERT(vec2);
00175 if (vec2_len < current_num_features)
00176 {
00177 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00178 vec2_len, current_num_features);
00179 }
00180
00181 float32_t result=0;
00182 if (current_vector)
00183 {
00184 for (int32_t i=0; i<current_length; i++)
00185 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00186 }
00187
00188 return result;
00189 }
00190
00191 template <class T>
00192 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
00193 {
00194 ASSERT(vec2);
00195 if (vec2_len < current_num_features)
00196 {
00197 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00198 vec2_len, current_num_features);
00199 }
00200
00201 SGSparseVectorEntry<T>* sv=current_vector;
00202 int32_t num_feat=current_length;
00203
00204 if (sv)
00205 {
00206 if (abs_val)
00207 {
00208 for (int32_t i=0; i<num_feat; i++)
00209 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00210 }
00211 else
00212 {
00213 for (int32_t i=0; i<num_feat; i++)
00214 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00215 }
00216 }
00217 }
00218
00219 template <class T>
00220 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00221 {
00222 ASSERT(vec2);
00223 if (vec2_len < current_num_features)
00224 {
00225 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00226 vec2_len, current_num_features);
00227 }
00228
00229 SGSparseVectorEntry<T>* sv=current_vector;
00230 int32_t num_feat=current_length;
00231
00232 if (sv)
00233 {
00234 if (abs_val)
00235 {
00236 for (int32_t i=0; i<num_feat; i++)
00237 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00238 }
00239 else
00240 {
00241 for (int32_t i=0; i<num_feat; i++)
00242 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00243 }
00244 }
00245 }
00246
00247 template <class T>
00248 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries()
00249 {
00250 return current_length;
00251 }
00252
00253 template <class T>
00254 float32_t CStreamingSparseFeatures<T>::compute_squared()
00255 {
00256 ASSERT(current_vector);
00257
00258 float32_t sq=0;
00259
00260 for (int32_t i=0; i<current_length; i++)
00261 sq += current_vector[i].entry * current_vector[i].entry;
00262
00263 return sq;
00264 }
00265
00266 template <class T>
00267 void CStreamingSparseFeatures<T>::sort_features()
00268 {
00269 ASSERT(current_vector);
00270
00271 SGSparseVectorEntry<T>* sf_orig=current_vector;
00272 int32_t len=current_length;
00273
00274 int32_t* feat_idx=SG_MALLOC(int32_t, len);
00275 int32_t* orig_idx=SG_MALLOC(int32_t, len);
00276
00277 for (int32_t i=0; i<len; i++)
00278 {
00279 feat_idx[i]=sf_orig[i].feat_index;
00280 orig_idx[i]=i;
00281 }
00282
00283 CMath::qsort_index(feat_idx, orig_idx, len);
00284
00285 SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len);
00286
00287 for (int32_t i=0; i<len; i++)
00288 sf_new[i]=sf_orig[orig_idx[i]];
00289
00290
00291 for (int32_t i=0; i<len-1; i++)
00292 ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
00293
00294
00295 for (int32_t i=0; i<len; i++)
00296 sf_orig[i]=sf_new[i];
00297
00298 SG_FREE(orig_idx);
00299 SG_FREE(feat_idx);
00300 SG_FREE(sf_new);
00301 }
00302
00303 template <class T>
00304 CFeatures* CStreamingSparseFeatures<T>::duplicate() const
00305 {
00306 return new CStreamingSparseFeatures<T>(*this);
00307 }
00308
00309 template <class T>
00310 int32_t CStreamingSparseFeatures<T>::get_num_vectors() const
00311 {
00312 if (current_vector)
00313 return 1;
00314 return 0;
00315 }
00316
00317 template <class T>
00318 int32_t CStreamingSparseFeatures<T>::get_size()
00319 {
00320 return sizeof(T);
00321 }
00322
00323 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader()
00324 {
00325 parser.set_read_vector(&CStreamingFile::get_sparse_vector);
00326 }
00327
00328 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader()
00329 {
00330 parser.set_read_vector_and_label
00331 (&CStreamingFile::get_sparse_vector_and_label);
00332 }
00333
00334 #define GET_FEATURE_TYPE(f_type, sg_type) \
00335 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() \
00336 { \
00337 return f_type; \
00338 }
00339
00340 GET_FEATURE_TYPE(F_BOOL, bool)
00341 GET_FEATURE_TYPE(F_CHAR, char)
00342 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00343 GET_FEATURE_TYPE(F_BYTE, int8_t)
00344 GET_FEATURE_TYPE(F_SHORT, int16_t)
00345 GET_FEATURE_TYPE(F_WORD, uint16_t)
00346 GET_FEATURE_TYPE(F_INT, int32_t)
00347 GET_FEATURE_TYPE(F_UINT, uint32_t)
00348 GET_FEATURE_TYPE(F_LONG, int64_t)
00349 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00350 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00351 GET_FEATURE_TYPE(F_DREAL, float64_t)
00352 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00353 #undef GET_FEATURE_TYPE
00354
00355
00356 template <class T>
00357 void CStreamingSparseFeatures<T>::init()
00358 {
00359 working_file=NULL;
00360 current_vector=NULL;
00361 current_length=-1;
00362 current_vec_index=0;
00363 current_num_features=-1;
00364 }
00365
00366 template <class T>
00367 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
00368 bool is_labelled,
00369 int32_t size)
00370 {
00371 init();
00372 has_labels = is_labelled;
00373 working_file = file;
00374 parser.init(file, is_labelled, size);
00375 }
00376
00377 template <class T>
00378 void CStreamingSparseFeatures<T>::start_parser()
00379 {
00380 if (!parser.is_running())
00381 parser.start_parser();
00382 }
00383
00384 template <class T>
00385 void CStreamingSparseFeatures<T>::end_parser()
00386 {
00387 parser.end_parser();
00388 }
00389
00390 template <class T>
00391 bool CStreamingSparseFeatures<T>::get_next_example()
00392 {
00393 bool ret_value;
00394 ret_value = (bool) parser.get_next_example(current_vector,
00395 current_length,
00396 current_label);
00397
00398 if (!ret_value)
00399 return false;
00400
00401
00402 for (int32_t i=0; i<current_length; i++)
00403 {
00404 if (current_vector[i].feat_index > current_num_features)
00405 current_num_features = current_vector[i].feat_index+1;
00406 }
00407 current_vec_index++;
00408
00409 return true;
00410 }
00411
00412 template <class T>
00413 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector()
00414 {
00415 current_sgvector.features=current_vector;
00416 current_sgvector.num_feat_entries=current_length;
00417 current_sgvector.vec_index=current_vec_index;
00418
00419 return current_sgvector;
00420 }
00421
00422 template <class T>
00423 float64_t CStreamingSparseFeatures<T>::get_label()
00424 {
00425 ASSERT(has_labels);
00426
00427 return current_label;
00428 }
00429
00430 template <class T>
00431 void CStreamingSparseFeatures<T>::release_example()
00432 {
00433 parser.finalize_example();
00434 }
00435
00436 template <class T>
00437 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const
00438 {
00439 return current_num_features;
00440 }
00441
00442 template <class T>
00443 float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df)
00444 {
00445 SG_NOTIMPLEMENTED;
00446 return -1;
00447 }
00448
00449 template <class T>
00450 int32_t CStreamingSparseFeatures<T>::get_num_features()
00451 {
00452 return current_num_features;
00453 }
00454
00455 template <class T>
00456 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector()
00457 {
00458 return current_length;
00459 }
00460
00461 template <class T>
00462 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class()
00463 {
00464 return C_STREAMING_SPARSE;
00465 }
00466
00467 template class CStreamingSparseFeatures<bool>;
00468 template class CStreamingSparseFeatures<char>;
00469 template class CStreamingSparseFeatures<int8_t>;
00470 template class CStreamingSparseFeatures<uint8_t>;
00471 template class CStreamingSparseFeatures<int16_t>;
00472 template class CStreamingSparseFeatures<uint16_t>;
00473 template class CStreamingSparseFeatures<int32_t>;
00474 template class CStreamingSparseFeatures<uint32_t>;
00475 template class CStreamingSparseFeatures<int64_t>;
00476 template class CStreamingSparseFeatures<uint64_t>;
00477 template class CStreamingSparseFeatures<float32_t>;
00478 template class CStreamingSparseFeatures<float64_t>;
00479 template class CStreamingSparseFeatures<floatmax_t>;
00480 }