00001 #include <shogun/features/SparseFeatures.h>
00002 #include <shogun/preprocessor/SparsePreprocessor.h>
00003 #include <shogun/mathematics/Math.h>
00004 #include <shogun/lib/DataType.h>
00005 #include <shogun/io/SGIO.h>
00006
00007 #include <string.h>
00008 #include <stdlib.h>
00009
00010 namespace shogun
00011 {
00012
00013 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
00014 : CDotFeatures(size), num_vectors(0), num_features(0),
00015 sparse_feature_matrix(NULL), feature_cache(NULL)
00016 {
00017 init();
00018 }
00019
00020 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseVector<ST>* src,
00021 int32_t num_feat, int32_t num_vec, bool copy)
00022 : CDotFeatures(0), num_vectors(0), num_features(0),
00023 sparse_feature_matrix(NULL), feature_cache(NULL)
00024 {
00025 init();
00026
00027 if (!copy)
00028 set_sparse_feature_matrix(SGSparseMatrix<ST>(src, num_feat, num_vec));
00029 else
00030 {
00031 sparse_feature_matrix = SG_MALLOC(SGSparseVector<ST>, num_vec);
00032 memcpy(sparse_feature_matrix, src, sizeof(SGSparseVector<ST>)*num_vec);
00033 for (int32_t i=0; i< num_vec; i++)
00034 {
00035 sparse_feature_matrix[i].features = SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00036 memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00037
00038 }
00039 }
00040 }
00041
00042 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseMatrix<ST> sparse)
00043 : CDotFeatures(0), num_vectors(0), num_features(0),
00044 sparse_feature_matrix(NULL), feature_cache(NULL)
00045 {
00046 init();
00047
00048 set_sparse_feature_matrix(sparse);
00049 }
00050
00051 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGMatrix<ST> dense)
00052 : CDotFeatures(0), num_vectors(0), num_features(0),
00053 sparse_feature_matrix(NULL), feature_cache(NULL)
00054 {
00055 init();
00056
00057 set_full_feature_matrix(dense);
00058 }
00059
00060 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
00061 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00062 num_features(orig.num_features),
00063 sparse_feature_matrix(orig.sparse_feature_matrix),
00064 feature_cache(orig.feature_cache)
00065 {
00066 init();
00067
00068 if (orig.sparse_feature_matrix)
00069 {
00070 free_sparse_feature_matrix();
00071 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00072 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(SGSparseVector<ST>)*num_vectors);
00073 for (int32_t i=0; i< num_vectors; i++)
00074 {
00075 sparse_feature_matrix[i].features=SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00076 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00077
00078 }
00079 }
00080
00081 m_subset=orig.m_subset->duplicate();
00082 }
00083 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
00084 : CDotFeatures(loader), num_vectors(0), num_features(0),
00085 sparse_feature_matrix(NULL), feature_cache(NULL)
00086 {
00087 init();
00088
00089 load(loader);
00090 }
00091
00092 template<class ST> CSparseFeatures<ST>::~CSparseFeatures()
00093 {
00094 free_sparse_features();
00095 }
00096 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_matrix()
00097 {
00098 clean_tsparse(sparse_feature_matrix, num_vectors);
00099 sparse_feature_matrix = NULL;
00100 num_vectors=0;
00101 num_features=0;
00102 remove_subset();
00103 }
00104 template<class ST> void CSparseFeatures<ST>::free_sparse_features()
00105 {
00106 free_sparse_feature_matrix();
00107 delete feature_cache;
00108 feature_cache = NULL;
00109 }
00110 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
00111 {
00112 return new CSparseFeatures<ST>(*this);
00113 }
00114
00115 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
00116 {
00117 ASSERT(index>=0 && index<num_features) ;
00118 ASSERT(num>=0 && num<get_num_vectors()) ;
00119
00120 int32_t i;
00121 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00122 ST ret = 0 ;
00123
00124 if (sv.features)
00125 {
00126 for (i=0; i<sv.num_feat_entries; i++)
00127 if (sv.features[i].feat_index==index)
00128 ret+=sv.features[i].entry ;
00129 }
00130
00131 free_sparse_feature_vector(sv, num);
00132
00133 return ret ;
00134 }
00135
00136 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len)
00137 {
00138 int32_t i;
00139 len=0;
00140 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00141 ST* fv=NULL;
00142
00143 if (sv.features)
00144 {
00145 len=num_features;
00146 fv=SG_MALLOC(ST, num_features);
00147
00148 for (i=0; i<num_features; i++)
00149 fv[i]=0;
00150
00151 for (i=0; i<sv.num_feat_entries; i++)
00152 fv[sv.features[i].feat_index]= sv.features[i].entry;
00153 }
00154
00155 free_sparse_feature_vector(sv, num);
00156
00157 return fv;
00158 }
00159
00160 template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num)
00161 {
00162 if (num>=num_vectors)
00163 {
00164 SG_ERROR("Index out of bounds (number of vectors %d, you "
00165 "requested %d)\n", num_vectors, num);
00166 }
00167
00168 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00169
00170 SGVector<ST> dense;
00171
00172 if (sv.features)
00173 {
00174 dense.do_free=true;
00175 dense.vlen=num_features;
00176 dense.vector=SG_MALLOC(ST, num_features);
00177 memset(dense.vector, 0, sizeof(ST)*num_features);
00178
00179 for (int32_t i=0; i<sv.num_feat_entries; i++)
00180 {
00181 dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
00182 }
00183 }
00184
00185 free_sparse_feature_vector(sv, num);
00186
00187 return dense;
00188 }
00189
00190 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
00191 {
00192 SGSparseVector<ST> sv = get_sparse_feature_vector(num);
00193 int32_t len=sv.num_feat_entries;
00194 free_sparse_feature_vector(sv, num);
00195 return len;
00196 }
00197
00198 template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num)
00199 {
00200 ASSERT(num<get_num_vectors());
00201
00202 index_t real_num=subset_idx_conversion(num);
00203
00204 SGSparseVector<ST> result;
00205
00206 if (sparse_feature_matrix)
00207 {
00208 result=sparse_feature_matrix[real_num];
00209 result.do_free=false;
00210 return result;
00211 }
00212 else
00213 {
00214 result.do_free=false;
00215
00216 if (feature_cache)
00217 {
00218 result.features=feature_cache->lock_entry(num);
00219
00220 if (result.features)
00221 return result;
00222 else
00223 {
00224 result.features=feature_cache->set_entry(num);
00225 }
00226 }
00227
00228 if (!result.features)
00229 result.do_free=true;
00230
00231 result.features=compute_sparse_feature_vector(num,
00232 result.num_feat_entries, result.features);
00233
00234
00235 if (get_num_preprocessors())
00236 {
00237 int32_t tmp_len=result.num_feat_entries;
00238 SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
00239 SGSparseVectorEntry<ST>* tmp_feat_after = NULL;
00240
00241 for (int32_t i=0; i<get_num_preprocessors(); i++)
00242 {
00243
00244
00245 if (i!=0)
00246 SG_FREE(tmp_feat_before);
00247 tmp_feat_before=tmp_feat_after;
00248 }
00249
00250 memcpy(result.features, tmp_feat_after,
00251 sizeof(SGSparseVectorEntry<ST>)*tmp_len);
00252
00253 SG_FREE(tmp_feat_after);
00254 result.num_feat_entries=tmp_len ;
00255 SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
00256 }
00257 result.vec_index=num;
00258 return result ;
00259 }
00260 }
00261
00262 template<class ST> ST CSparseFeatures<ST>::sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen)
00263 {
00264 ST result=0;
00265
00266
00267 if (avec && bvec)
00268 {
00269 if (alen<=blen)
00270 {
00271 int32_t j=0;
00272 for (int32_t i=0; i<alen; i++)
00273 {
00274 int32_t a_feat_idx=avec[i].feat_index;
00275
00276 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00277 j++;
00278
00279 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00280 {
00281 result+= avec[i].entry * bvec[j].entry;
00282 j++;
00283 }
00284 }
00285 }
00286 else
00287 {
00288 int32_t j=0;
00289 for (int32_t i=0; i<blen; i++)
00290 {
00291 int32_t b_feat_idx=bvec[i].feat_index;
00292
00293 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00294 j++;
00295
00296 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00297 {
00298 result+= bvec[i].entry * avec[j].entry;
00299 j++;
00300 }
00301 }
00302 }
00303
00304 result*=alpha;
00305 }
00306
00307 return result;
00308 }
00309
00310 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00311 {
00312 ASSERT(vec);
00313 ASSERT(dim==num_features);
00314 ST result=b;
00315
00316 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00317
00318 if (sv.features)
00319 {
00320 for (int32_t i=0; i<sv.num_feat_entries; i++)
00321 {
00322 result+=alpha*vec[sv.features[i].feat_index]
00323 *sv.features[i].entry;
00324 }
00325 }
00326
00327 free_sparse_feature_vector(sv, num);
00328 return result;
00329 }
00330
00331 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
00332 {
00333 ASSERT(vec);
00334 if (dim!=num_features)
00335 {
00336 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00337 dim, num_features);
00338 }
00339
00340 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00341
00342 if (sv.features)
00343 {
00344 if (abs_val)
00345 {
00346 for (int32_t i=0; i<sv.num_feat_entries; i++)
00347 {
00348 vec[sv.features[i].feat_index]+=alpha
00349 *CMath::abs(sv.features[i].entry);
00350 }
00351 }
00352 else
00353 {
00354 for (int32_t i=0; i<sv.num_feat_entries; i++)
00355 {
00356 vec[sv.features[i].feat_index]+=alpha
00357 *sv.features[i].entry;
00358 }
00359 }
00360 }
00361
00362 free_sparse_feature_vector(sv, num);
00363 }
00364
00365 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(SGSparseVector<ST> vec, int32_t num)
00366 {
00367 if (feature_cache)
00368 feature_cache->unlock_entry(subset_idx_conversion(num));
00369
00370 vec.free_vector();
00371 }
00372
00373 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00374 {
00375 if (m_subset)
00376 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00377
00378 num_feat=num_features;
00379 num_vec=num_vectors;
00380
00381 return sparse_feature_matrix;
00382 }
00383
00384 template<class ST> SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix()
00385 {
00386 if (m_subset)
00387 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00388
00389 SGSparseMatrix<ST> sm;
00390 sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
00391 return sm;
00392 }
00393
00394 template<class ST> void CSparseFeatures<ST>::clean_tsparse(SGSparseVector<ST>* sfm, int32_t num_vec)
00395 {
00396 if (sfm)
00397 {
00398 for (int32_t i=0; i<num_vec; i++)
00399 SG_FREE(sfm[i].features);
00400
00401 SG_FREE(sfm);
00402 }
00403 }
00404
00405 template<class ST> CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed()
00406 {
00407 int32_t num_feat;
00408 int32_t num_vec;
00409 SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
00410 return new CSparseFeatures<ST>(s, num_feat, num_vec);
00411 }
00412
00413 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00414 {
00415 num_feat=get_num_vectors();
00416 num_vec=num_features;
00417
00418 int32_t* hist=SG_MALLOC(int32_t, num_features);
00419 memset(hist, 0, sizeof(int32_t)*num_features);
00420
00421
00422 for (int32_t v=0; v<num_feat; v++)
00423 {
00424 SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00425
00426 for (int32_t i=0; i<sv.num_feat_entries; i++)
00427 hist[sv.features[i].feat_index]++;
00428
00429 free_sparse_feature_vector(sv, v);
00430 }
00431
00432
00433 SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec);
00434 for (int32_t v=0; v<num_vec; v++)
00435 {
00436 sfm[v].features= SG_MALLOC(SGSparseVectorEntry<ST>, hist[v]);
00437 sfm[v].num_feat_entries=hist[v];
00438 sfm[v].vec_index=v;
00439 }
00440
00441
00442 memset(hist,0,sizeof(int32_t)*num_features);
00443 for (int32_t v=0; v<num_feat; v++)
00444 {
00445 SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00446
00447 for (int32_t i=0; i<sv.num_feat_entries; i++)
00448 {
00449 int32_t vidx=sv.features[i].feat_index;
00450 int32_t fidx=v;
00451 sfm[vidx].features[hist[vidx]].feat_index=fidx;
00452 sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
00453 hist[vidx]++;
00454 }
00455
00456 free_sparse_feature_vector(sv, v);
00457 }
00458
00459 SG_FREE(hist);
00460 return sfm;
00461 }
00462
00463 template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
00464 {
00465 if (m_subset)
00466 SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");
00467
00468
00469 free_sparse_feature_matrix();
00470 sm.own_matrix();
00471
00472 sparse_feature_matrix=sm.sparse_matrix;
00473 num_features=sm.num_features;
00474 num_vectors=sm.num_vectors;
00475 }
00476
00477 template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix()
00478 {
00479 SGMatrix<ST> full;
00480
00481 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00482 full.num_rows=num_features;
00483 full.num_cols=get_num_vectors();
00484 full.do_free=true;
00485 full.matrix=SG_MALLOC(ST, int64_t(num_features)*get_num_vectors());
00486
00487 memset(full.matrix, 0, size_t(num_features)*size_t(get_num_vectors())*sizeof(ST));
00488
00489 for (int32_t v=0; v<full.num_cols; v++)
00490 {
00491 SGSparseVector<ST> current=
00492 sparse_feature_matrix[subset_idx_conversion(v)];
00493
00494 for (int32_t f=0; f<current.num_feat_entries; f++)
00495 {
00496 int64_t offs=(current.vec_index*num_features)
00497 +current.features[f].feat_index;
00498
00499 full.matrix[offs]=current.features[f].entry;
00500 }
00501 }
00502
00503 return full;
00504 }
00505
00506 template<class ST> bool CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full)
00507 {
00508 remove_subset();
00509
00510 ST* src=full.matrix;
00511 int32_t num_feat=full.num_rows;
00512 int32_t num_vec=full.num_cols;
00513
00514 free_sparse_feature_matrix();
00515 bool result=true;
00516 num_features=num_feat;
00517 num_vectors=num_vec;
00518
00519 SG_INFO("converting dense feature matrix to sparse one\n");
00520 int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
00521
00522 if (num_feat_entries)
00523 {
00524 int64_t num_total_entries=0;
00525
00526
00527 for (int32_t i=0; i< num_vec; i++)
00528 {
00529 num_feat_entries[i]=0;
00530 for (int32_t j=0; j< num_feat; j++)
00531 {
00532 if (src[i*((int64_t) num_feat) + j] != 0)
00533 num_feat_entries[i]++;
00534 }
00535 }
00536
00537 if (num_vec>0)
00538 {
00539 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
00540
00541 if (sparse_feature_matrix)
00542 {
00543 for (int32_t i=0; i< num_vec; i++)
00544 {
00545 sparse_feature_matrix[i].vec_index=i;
00546 sparse_feature_matrix[i].num_feat_entries=0;
00547 sparse_feature_matrix[i].features= NULL;
00548
00549 if (num_feat_entries[i]>0)
00550 {
00551 sparse_feature_matrix[i].features= SG_MALLOC(SGSparseVectorEntry<ST>, num_feat_entries[i]);
00552
00553 if (!sparse_feature_matrix[i].features)
00554 {
00555 SG_INFO( "allocation of features failed\n");
00556 return false;
00557 }
00558
00559 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00560 int32_t sparse_feat_idx=0;
00561
00562 for (int32_t j=0; j< num_feat; j++)
00563 {
00564 int64_t pos= i*num_feat + j;
00565
00566 if (src[pos] != 0)
00567 {
00568 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00569 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00570 sparse_feat_idx++;
00571 num_total_entries++;
00572 }
00573 }
00574 }
00575 }
00576 }
00577 else
00578 {
00579 SG_ERROR( "allocation of sparse feature matrix failed\n");
00580 result=false;
00581 }
00582
00583 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00584 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
00585 }
00586 else
00587 {
00588 SG_ERROR( "huh ? zero size matrix given ?\n");
00589 result=false;
00590 }
00591 }
00592 SG_FREE(num_feat_entries);
00593 return result;
00594 }
00595
00596 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
00597 {
00598 SG_INFO( "force: %d\n", force_preprocessing);
00599
00600 if ( sparse_feature_matrix && get_num_preprocessors() )
00601 {
00602 for (int32_t i=0; i<get_num_preprocessors(); i++)
00603 {
00604 if ( (!is_preprocessed(i) || force_preprocessing) )
00605 {
00606 set_preprocessed(i);
00607 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
00608 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
00609 return false;
00610 }
00611 return true;
00612 }
00613 return true;
00614 }
00615 else
00616 {
00617 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00618 return false;
00619 }
00620 }
00621
00622 template<class ST> int32_t CSparseFeatures<ST>::get_size()
00623 {
00624 return sizeof(ST);
00625 }
00626
00627 template<class ST> bool CSparseFeatures<ST>::obtain_from_simple(CSimpleFeatures<ST>* sf)
00628 {
00629 SGMatrix<ST> fm=sf->get_feature_matrix();
00630 ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);
00631
00632 return set_full_feature_matrix(fm);
00633 }
00634
00635 template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const
00636 {
00637 return m_subset ? m_subset->get_size() : num_vectors;
00638 }
00639
00640 template<class ST> int32_t CSparseFeatures<ST>::get_num_features()
00641 {
00642 return num_features;
00643 }
00644
00645 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
00646 {
00647 int32_t n=num_features;
00648 ASSERT(n<=num);
00649 num_features=num;
00650 return num_features;
00651 }
00652
00653 template<class ST> EFeatureClass CSparseFeatures<ST>::get_feature_class()
00654 {
00655 return C_SPARSE;
00656 }
00657
00658 template<class ST> void CSparseFeatures<ST>::free_feature_vector(SGSparseVector<ST> vec, int32_t num)
00659 {
00660 if (feature_cache)
00661 feature_cache->unlock_entry(subset_idx_conversion(num));
00662
00663 vec.free_vector();
00664 }
00665
00666 template<class ST> int64_t CSparseFeatures<ST>::get_num_nonzero_entries()
00667 {
00668 int64_t num=0;
00669 index_t num_vec=get_num_vectors();
00670 for (int32_t i=0; i<num_vec; i++)
00671 num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries;
00672
00673 return num;
00674 }
00675
00676 template<class ST> float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq)
00677 {
00678 ASSERT(sq);
00679
00680 index_t num_vec=get_num_vectors();
00681 for (int32_t i=0; i<num_vec; i++)
00682 {
00683 sq[i]=0;
00684 SGSparseVector<ST> vec=get_sparse_feature_vector(i);
00685
00686 for (int32_t j=0; j<vec.num_feat_entries; j++)
00687 sq[i]+=vec.features[j].entry*vec.features[j].entry;
00688
00689 free_feature_vector(vec, i);
00690 }
00691
00692 return sq;
00693 }
00694
00695 template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm(
00696 CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
00697 CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00698 {
00699 int32_t i,j;
00700 ASSERT(lhs);
00701 ASSERT(rhs);
00702
00703 SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a);
00704 SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b);
00705 ASSERT(avec.features);
00706 ASSERT(bvec.features);
00707
00708 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00709
00710 if (avec.num_feat_entries<=bvec.num_feat_entries)
00711 {
00712 j=0;
00713 for (i=0; i<avec.num_feat_entries; i++)
00714 {
00715 int32_t a_feat_idx=avec.features[i].feat_index;
00716
00717 while ((j<bvec.num_feat_entries)
00718 &&(bvec.features[j].feat_index<a_feat_idx))
00719 j++;
00720
00721 if ((j<bvec.num_feat_entries)
00722 &&(bvec.features[j].feat_index==a_feat_idx))
00723 {
00724 result-=2*(avec.features[i].entry*bvec.features[j].entry);
00725 j++;
00726 }
00727 }
00728 }
00729 else
00730 {
00731 j=0;
00732 for (i=0; i<bvec.num_feat_entries; i++)
00733 {
00734 int32_t b_feat_idx=bvec.features[i].feat_index;
00735
00736 while ((j<avec.num_feat_entries)
00737 &&(avec.features[j].feat_index<b_feat_idx))
00738 j++;
00739
00740 if ((j<avec.num_feat_entries)
00741 &&(avec.features[j].feat_index==b_feat_idx))
00742 {
00743 result-=2*(bvec.features[i].entry*avec.features[j].entry);
00744 j++;
00745 }
00746 }
00747 }
00748
00749 ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a);
00750 ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b);
00751
00752 return CMath::abs(result);
00753 }
00754
00755 template<class ST> CLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname,
00756 bool do_sort_features)
00757 {
00758 remove_subset();
00759
00760 CLabels* lab=NULL;
00761
00762 size_t blocksize=1024*1024;
00763 size_t required_blocksize=blocksize;
00764 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00765 FILE* f=fopen(fname, "ro");
00766
00767 if (f)
00768 {
00769 free_sparse_feature_matrix();
00770 num_vectors=0;
00771 num_features=0;
00772
00773 SG_INFO("counting line numbers in file %s\n", fname);
00774 size_t sz=blocksize;
00775 size_t block_offs=0;
00776 size_t old_block_offs=0;
00777 fseek(f, 0, SEEK_END);
00778 size_t fsize=ftell(f);
00779 rewind(f);
00780
00781 while (sz == blocksize)
00782 {
00783 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00784 for (size_t i=0; i<sz; i++)
00785 {
00786 block_offs++;
00787 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00788 {
00789 num_vectors++;
00790 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00791 old_block_offs=block_offs;
00792 }
00793 }
00794 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00795 }
00796
00797 SG_INFO("found %d feature vectors\n", num_vectors);
00798 SG_FREE(dummy);
00799 blocksize=required_blocksize;
00800 dummy = SG_MALLOC(uint8_t, blocksize+1);
00801
00802 lab=new CLabels(num_vectors);
00803 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00804
00805 rewind(f);
00806 sz=blocksize;
00807 int32_t lines=0;
00808 while (sz == blocksize)
00809 {
00810 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00811
00812 size_t old_sz=0;
00813 for (size_t i=0; i<sz; i++)
00814 {
00815 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00816 {
00817 size_t len=i-old_sz+1;
00818 uint8_t* data=&dummy[old_sz];
00819
00820 for (size_t j=0; j<len; j++)
00821 dummy[j]=data[j];
00822
00823 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
00824 i=0;
00825 old_sz=0;
00826 sz+=len;
00827 }
00828
00829 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00830 {
00831
00832 size_t len=i-old_sz;
00833 uint8_t* data=&dummy[old_sz];
00834
00835 int32_t dims=0;
00836 for (size_t j=0; j<len; j++)
00837 {
00838 if (data[j]==':')
00839 dims++;
00840 }
00841
00842 if (dims<=0)
00843 {
00844 SG_ERROR("Error in line %d - number of"
00845 " dimensions is %d line is %d characters"
00846 " long\n line_content:'%.*s'\n", lines,
00847 dims, len, len, (const char*) data);
00848 }
00849
00850 SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims);
00851 size_t j=0;
00852 for (; j<len; j++)
00853 {
00854 if (data[j]==' ')
00855 {
00856 data[j]='\0';
00857
00858 lab->set_label(lines, atof((const char*) data));
00859 break;
00860 }
00861 }
00862
00863 int32_t d=0;
00864 j++;
00865 uint8_t* start=&data[j];
00866 for (; j<len; j++)
00867 {
00868 if (data[j]==':')
00869 {
00870 data[j]='\0';
00871
00872 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00873 num_features=CMath::max(num_features, feat[d].feat_index+1);
00874
00875 j++;
00876 start=&data[j];
00877 for (; j<len; j++)
00878 {
00879 if (data[j]==' ' || data[j]=='\n')
00880 {
00881 data[j]='\0';
00882 feat[d].entry=(ST) atof((const char*) start);
00883 d++;
00884 break;
00885 }
00886 }
00887
00888 if (j==len)
00889 {
00890 data[j]='\0';
00891 feat[dims-1].entry=(ST) atof((const char*) start);
00892 }
00893
00894 j++;
00895 start=&data[j];
00896 }
00897 }
00898
00899 sparse_feature_matrix[lines].vec_index=lines;
00900 sparse_feature_matrix[lines].num_feat_entries=dims;
00901 sparse_feature_matrix[lines].features=feat;
00902
00903 old_sz=i+1;
00904 lines++;
00905 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00906 }
00907 }
00908 }
00909 SG_INFO("file successfully read\n");
00910 fclose(f);
00911 }
00912
00913 SG_FREE(dummy);
00914
00915 if (do_sort_features)
00916 sort_features();
00917
00918 return lab;
00919 }
00920
00921 template<class ST> void CSparseFeatures<ST>::sort_features()
00922 {
00923 if (m_subset)
00924 SG_ERROR("sort_features() not allowed with subset\n");
00925
00926 ASSERT(get_num_preprocessors()==0);
00927
00928 if (!sparse_feature_matrix)
00929 SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
00930
00931 for (int32_t i=0; i<num_vectors; i++)
00932 {
00933 int32_t len=sparse_feature_matrix[i].num_feat_entries;
00934
00935 if (!len)
00936 continue;
00937
00938 SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
00939 int32_t* feat_idx=SG_MALLOC(int32_t, len);
00940 int32_t* orig_idx=SG_MALLOC(int32_t, len);
00941
00942 for (int j=0; j<len; j++)
00943 {
00944 feat_idx[j]=sf_orig[j].feat_index;
00945 orig_idx[j]=j;
00946 }
00947
00948 CMath::qsort_index(feat_idx, orig_idx, len);
00949
00950 SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len);
00951 for (int j=0; j<len; j++)
00952 sf_new[j]=sf_orig[orig_idx[j]];
00953
00954 sparse_feature_matrix[i].features=sf_new;
00955
00956
00957 for (int j=0; j<len-1; j++)
00958 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
00959
00960 SG_FREE(orig_idx);
00961 SG_FREE(feat_idx);
00962 SG_FREE(sf_orig);
00963 }
00964 }
00965
00966 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname,
00967 CLabels* label)
00968 {
00969 if (m_subset)
00970 SG_ERROR("write_svmlight_file() not allowed with subset\n");
00971
00972 ASSERT(label);
00973 int32_t num=label->get_num_labels();
00974 ASSERT(num>0);
00975 ASSERT(num==num_vectors);
00976
00977 FILE* f=fopen(fname, "wb");
00978
00979 if (f)
00980 {
00981 for (int32_t i=0; i<num; i++)
00982 {
00983 fprintf(f, "%d ", (int32_t) label->get_int_label(i));
00984
00985 SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
00986 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
00987
00988 for (int32_t j=0; j<num_feat; j++)
00989 {
00990 if (j<num_feat-1)
00991 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00992 else
00993 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00994 }
00995 }
00996
00997 fclose(f);
00998 return true;
00999 }
01000 return false;
01001 }
01002
01003 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
01004 {
01005 return num_features;
01006 }
01007
01008 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
01009 CDotFeatures* df, int32_t vec_idx2)
01010 {
01011 ASSERT(df);
01012 ASSERT(df->get_feature_type() == get_feature_type());
01013 ASSERT(df->get_feature_class() == get_feature_class());
01014 CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;
01015
01016 SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
01017 SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
01018
01019 float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries,
01020 bvec.features, bvec.num_feat_entries);
01021
01022 free_sparse_feature_vector(avec, vec_idx1);
01023 sf->free_sparse_feature_vector(bvec, vec_idx2);
01024
01025 return result;
01026 }
01027 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01028 {
01029 ASSERT(vec2);
01030 if (vec2_len!=num_features)
01031 {
01032 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01033 vec2_len, num_features);
01034 }
01035 float64_t result=0;
01036
01037 SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
01038
01039 if (sv.features)
01040 {
01041 for (int32_t i=0; i<sv.num_feat_entries; i++)
01042 result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
01043 }
01044
01045 free_sparse_feature_vector(sv, vec_idx1);
01046
01047 return result;
01048 }
01049
01050 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
01051 {
01052 if (vector_index>=get_num_vectors())
01053 {
01054 SG_ERROR("Index out of bounds (number of vectors %d, you "
01055 "requested %d)\n", get_num_vectors(), vector_index);
01056 }
01057
01058 if (!sparse_feature_matrix)
01059 SG_ERROR("Requires a in-memory feature matrix\n");
01060
01061 sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
01062 it->sv=get_sparse_feature_vector(vector_index);
01063 it->index=0;
01064
01065 return it;
01066 }
01067
01068 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
01069 {
01070 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01071 if (!it || it->index>=it->sv.num_feat_entries)
01072 return false;
01073
01074 int32_t i=it->index++;
01075
01076 index=it->sv.features[i].feat_index;
01077 value=(float64_t) it->sv.features[i].entry;
01078
01079 return true;
01080 }
01081
01082 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
01083 {
01084 if (!iterator)
01085 return;
01086
01087 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01088 free_sparse_feature_vector(it->sv, it->sv.vec_index);
01089 SG_FREE(it);
01090 }
01091
01092 template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices)
01093 {
01094 SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
01095 get_dim_feature_space());
01096
01097 for (index_t i=0; i<indices.vlen; ++i)
01098 {
01099
01100 index_t index=indices.vector[i];
01101
01102
01103 SGSparseVector<ST> current=get_sparse_feature_vector(index);
01104 matrix_copy.sparse_matrix[i]=SGSparseVector<ST>(
01105 current.num_feat_entries, current.vec_index);
01106
01107
01108 memcpy(matrix_copy.sparse_matrix[i].features, current.features,
01109 sizeof(SGSparseVectorEntry<ST>)*current.num_feat_entries);
01110
01111 free_sparse_feature_vector(current, index);
01112 }
01113
01114 return new CSparseFeatures<ST>(matrix_copy);
01115 }
01116
01117 template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num,
01118 int32_t& len, SGSparseVectorEntry<ST>* target)
01119 {
01120 SG_NOTIMPLEMENTED;
01121
01122 len=0;
01123 return NULL;
01124 }
01125
01126 template<class ST> void CSparseFeatures<ST>::init()
01127 {
01128 set_generic<ST>();
01129
01130 m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
01131 "sparse_feature_matrix",
01132 "Array of sparse vectors.");
01133 m_parameters->add(&num_features, "num_features",
01134 "Total number of features.");
01135 }
01136
01137 #define GET_FEATURE_TYPE(sg_type, f_type) \
01138 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
01139 { \
01140 return f_type; \
01141 }
01142 GET_FEATURE_TYPE(bool, F_BOOL)
01143 GET_FEATURE_TYPE(char, F_CHAR)
01144 GET_FEATURE_TYPE(uint8_t, F_BYTE)
01145 GET_FEATURE_TYPE(int8_t, F_BYTE)
01146 GET_FEATURE_TYPE(int16_t, F_SHORT)
01147 GET_FEATURE_TYPE(uint16_t, F_WORD)
01148 GET_FEATURE_TYPE(int32_t, F_INT)
01149 GET_FEATURE_TYPE(uint32_t, F_UINT)
01150 GET_FEATURE_TYPE(int64_t, F_LONG)
01151 GET_FEATURE_TYPE(uint64_t, F_ULONG)
01152 GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
01153 GET_FEATURE_TYPE(float64_t, F_DREAL)
01154 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
01155 #undef GET_FEATURE_TYPE
01156
01157 #define LOAD(fname, sg_type) \
01158 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \
01159 { \
01160 remove_subset(); \
01161 SG_SET_LOCALE_C; \
01162 ASSERT(loader); \
01163 SGSparseVector<sg_type>* matrix=NULL; \
01164 int32_t num_feat=0; \
01165 int32_t num_vec=0; \
01166 loader->fname(matrix, num_feat, num_vec); \
01167 set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
01168 SG_RESET_LOCALE; \
01169 }
01170 LOAD(get_sparse_matrix, bool)
01171 LOAD(get_sparse_matrix, char)
01172 LOAD(get_sparse_matrix, uint8_t)
01173 LOAD(get_int8_sparsematrix, int8_t)
01174 LOAD(get_sparse_matrix, int16_t)
01175 LOAD(get_sparse_matrix, uint16_t)
01176 LOAD(get_sparse_matrix, int32_t)
01177 LOAD(get_uint_sparsematrix, uint32_t)
01178 LOAD(get_long_sparsematrix, int64_t)
01179 LOAD(get_ulong_sparsematrix, uint64_t)
01180 LOAD(get_sparse_matrix, float32_t)
01181 LOAD(get_sparse_matrix, float64_t)
01182 LOAD(get_longreal_sparsematrix, floatmax_t)
01183 #undef LOAD
01184
01185 #define WRITE(fname, sg_type) \
01186 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \
01187 { \
01188 if (m_subset) \
01189 SG_ERROR("save() not allowed with subset\n"); \
01190 SG_SET_LOCALE_C; \
01191 ASSERT(writer); \
01192 writer->fname(sparse_feature_matrix, num_features, num_vectors); \
01193 SG_RESET_LOCALE; \
01194 }
01195 WRITE(set_sparse_matrix, bool)
01196 WRITE(set_sparse_matrix, char)
01197 WRITE(set_sparse_matrix, uint8_t)
01198 WRITE(set_int8_sparsematrix, int8_t)
01199 WRITE(set_sparse_matrix, int16_t)
01200 WRITE(set_sparse_matrix, uint16_t)
01201 WRITE(set_sparse_matrix, int32_t)
01202 WRITE(set_uint_sparsematrix, uint32_t)
01203 WRITE(set_long_sparsematrix, int64_t)
01204 WRITE(set_ulong_sparsematrix, uint64_t)
01205 WRITE(set_sparse_matrix, float32_t)
01206 WRITE(set_sparse_matrix, float64_t)
01207 WRITE(set_longreal_sparsematrix, floatmax_t)
01208 #undef WRITE
01209
01210 template class CSparseFeatures<bool>;
01211 template class CSparseFeatures<char>;
01212 template class CSparseFeatures<int8_t>;
01213 template class CSparseFeatures<uint8_t>;
01214 template class CSparseFeatures<int16_t>;
01215 template class CSparseFeatures<uint16_t>;
01216 template class CSparseFeatures<int32_t>;
01217 template class CSparseFeatures<uint32_t>;
01218 template class CSparseFeatures<int64_t>;
01219 template class CSparseFeatures<uint64_t>;
01220 template class CSparseFeatures<float32_t>;
01221 template class CSparseFeatures<float64_t>;
01222 template class CSparseFeatures<floatmax_t>;
01223 }