00001 #include <shogun/features/SparseFeatures.h>
00002 #include <shogun/preprocessor/SparsePreprocessor.h>
00003 #include <shogun/mathematics/Math.h>
00004 #include <shogun/lib/DataType.h>
00005 #include <shogun/labels/RegressionLabels.h>
00006 #include <shogun/io/SGIO.h>
00007
00008 #include <string.h>
00009 #include <stdlib.h>
00010
00011 namespace shogun
00012 {
00013
/** Default constructor.
 *
 * Creates an empty sparse feature object; 'size' is the cache size
 * forwarded to CDotFeatures. No feature matrix is allocated yet.
 */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
: CDotFeatures(size), num_vectors(0), num_features(0),
sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();
}
00020
/** Construct from a raw array of sparse vectors.
 *
 * The pointer 'src' is adopted as-is (shallow take-over); the object
 * does not duplicate the rows here.
 *
 * NOTE(review): the 'copy' parameter is accepted but never used in
 * this body -- no deep copy happens regardless of its value. Confirm
 * callers do not rely on copy==true semantics.
 */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseVector<ST>* src,
int32_t num_feat, int32_t num_vec, bool copy)
: CDotFeatures(0), num_vectors(num_vec), num_features(num_feat),
sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	sparse_feature_matrix = src;
}
00036
/** Construct from an SGSparseMatrix; delegates to
 * set_sparse_feature_matrix(), which copies the rows via
 * SGSparseVector assignment. */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseMatrix<ST> sparse)
: CDotFeatures(0), num_vectors(0), num_features(0),
sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	set_sparse_feature_matrix(sparse);
}
00045
/** Construct from a dense matrix; delegates to
 * set_full_feature_matrix(), which sparsifies the input. */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGMatrix<ST> dense)
: CDotFeatures(0), num_vectors(0), num_features(0),
sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	set_full_feature_matrix(dense);
}
00054
00055 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
00056 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00057 num_features(orig.num_features),
00058 sparse_feature_matrix(orig.sparse_feature_matrix),
00059 feature_cache(orig.feature_cache)
00060 {
00061 init();
00062
00063 if (orig.sparse_feature_matrix)
00064 {
00065 free_sparse_feature_matrix();
00066 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00067 for (int32_t i=0; i< num_vectors; i++)
00068 {
00069 new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
00070 sparse_feature_matrix[i] = orig.sparse_feature_matrix[i];
00071 }
00072 }
00073
00074 m_subset_stack=orig.m_subset_stack;
00075 SG_REF(m_subset_stack);
00076 }
/** Construct by loading from a CFile via the type-specific load()
 * specializations defined at the bottom of this file. */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
: CDotFeatures(loader), num_vectors(0), num_features(0),
sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	load(loader);
}
00085
/** Destructor: releases the sparse matrix and the feature cache. */
template<class ST> CSparseFeatures<ST>::~CSparseFeatures()
{
	free_sparse_features();
}
00090 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_matrix()
00091 {
00092 for (int32_t i=0; i<num_vectors; i++)
00093 (&sparse_feature_matrix[i])->~SGSparseVector();
00094
00095 SG_FREE(sparse_feature_matrix);
00096 num_vectors=0;
00097 num_features=0;
00098 remove_all_subsets();
00099 }
/** Release matrix and feature cache (used by the destructor). */
template<class ST> void CSparseFeatures<ST>::free_sparse_features()
{
	free_sparse_feature_matrix();
	delete feature_cache;
	feature_cache = NULL;
}
/** Deep-copy this object via the copy constructor. */
template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
{
	return new CSparseFeatures<ST>(*this);
}
00110
00111 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
00112 {
00113 ASSERT(index>=0 && index<num_features);
00114 ASSERT(num>=0 && num<get_num_vectors());
00115
00116 int32_t i;
00117 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00118 ST ret = 0 ;
00119
00120 if (sv.features)
00121 {
00122 for (i=0; i<sv.num_feat_entries; i++)
00123 if (sv.features[i].feat_index==index)
00124 ret+=sv.features[i].entry ;
00125 }
00126
00127 free_sparse_feature_vector(num);
00128
00129 return ret ;
00130 }
00131
00132 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len)
00133 {
00134 int32_t i;
00135 len=0;
00136 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00137 ST* fv=NULL;
00138
00139 if (sv.features)
00140 {
00141 len=num_features;
00142 fv=SG_MALLOC(ST, num_features);
00143
00144 for (i=0; i<num_features; i++)
00145 fv[i]=0;
00146
00147 for (i=0; i<sv.num_feat_entries; i++)
00148 fv[sv.features[i].feat_index]= sv.features[i].entry;
00149 }
00150
00151 free_sparse_feature_vector(num);
00152
00153 return fv;
00154 }
00155
00156 template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num)
00157 {
00158 if (num>=get_num_vectors())
00159 {
00160 SG_ERROR("Index out of bounds (number of vectors %d, you "
00161 "requested %d)\n", get_num_vectors(), num);
00162 }
00163
00164 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00165
00166 SGVector<ST> dense;
00167
00168 if (sv.features)
00169 {
00170 dense=SGVector<ST>(num_features);
00171 dense.zero();
00172
00173 for (int32_t i=0; i<sv.num_feat_entries; i++)
00174 dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
00175 }
00176
00177 free_sparse_feature_vector(num);
00178
00179 return dense;
00180 }
00181
00182 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
00183 {
00184 SGSparseVector<ST> sv = get_sparse_feature_vector(num);
00185 int32_t len=sv.num_feat_entries;
00186 free_sparse_feature_vector(num);
00187 return len;
00188 }
00189
/** Fetch sparse vector 'num' (subset-aware).
 *
 * If an in-memory matrix is present, the stored row is returned
 * directly. Otherwise the vector is looked up in / inserted into the
 * feature cache and computed on the fly via
 * compute_sparse_feature_vector().
 *
 * NOTE(review): in the preprocessing branch below, tmp_feat_after is
 * never assigned inside the loop (the apply call seems to be missing
 * or was commented out upstream), so it stays NULL and the memcpy
 * would read from NULL -- verify before enabling preprocessors on
 * on-the-fly computed features.
 */
template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num)
{
	ASSERT(num<get_num_vectors());

	// map the (possibly subsetted) index to the underlying row index
	index_t real_num=m_subset_stack->subset_idx_conversion(num);

	SGSparseVector<ST> result;

	if (sparse_feature_matrix)
	{
		return sparse_feature_matrix[real_num];
	}
	else
	{
		if (feature_cache)
		{
			result.features=feature_cache->lock_entry(num);

			if (result.features)
				return result;
			else
			{
				// not cached yet: reserve a cache slot as target buffer
				result.features=feature_cache->set_entry(num);
			}
		}

		result.features=compute_sparse_feature_vector(num,
			result.num_feat_entries, result.features);

		if (get_num_preprocessors())
		{
			int32_t tmp_len=result.num_feat_entries;
			SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
			SGSparseVectorEntry<ST>* tmp_feat_after = NULL;

			for (int32_t i=0; i<get_num_preprocessors(); i++)
			{
				// NOTE(review): preprocessor application missing here;
				// tmp_feat_after remains NULL throughout this loop
				if (i!=0)
					SG_FREE(tmp_feat_before);
				tmp_feat_before=tmp_feat_after;
			}

			memcpy(result.features, tmp_feat_after,
					sizeof(SGSparseVectorEntry<ST>)*tmp_len);

			SG_FREE(tmp_feat_after);
			result.num_feat_entries=tmp_len ;
			SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
		}
		return result ;
	}
}
00248
00249 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00250 {
00251 ASSERT(vec);
00252 ASSERT(dim==num_features);
00253 ST result=b;
00254
00255 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00256
00257 if (sv.features)
00258 {
00259 for (int32_t i=0; i<sv.num_feat_entries; i++)
00260 {
00261 result+=alpha*vec[sv.features[i].feat_index]
00262 *sv.features[i].entry;
00263 }
00264 }
00265
00266 free_sparse_feature_vector(num);
00267 return result;
00268 }
00269
00270 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
00271 {
00272 ASSERT(vec);
00273 if (dim!=num_features)
00274 {
00275 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00276 dim, num_features);
00277 }
00278
00279 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00280
00281 if (sv.features)
00282 {
00283 if (abs_val)
00284 {
00285 for (int32_t i=0; i<sv.num_feat_entries; i++)
00286 {
00287 vec[sv.features[i].feat_index]+=alpha
00288 *CMath::abs(sv.features[i].entry);
00289 }
00290 }
00291 else
00292 {
00293 for (int32_t i=0; i<sv.num_feat_entries; i++)
00294 {
00295 vec[sv.features[i].feat_index]+=alpha
00296 *sv.features[i].entry;
00297 }
00298 }
00299 }
00300
00301 free_sparse_feature_vector(num);
00302 }
00303
/** Release the cache lock taken by get_sparse_feature_vector(); a
 * no-op when no feature cache is in use. */
template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(int32_t num)
{
	if (feature_cache)
		feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
}
00311
/** Return the raw row array (not a copy) plus its dimensions.
 * Not allowed while a subset is active. */
template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");

	num_feat=num_features;
	num_vec=num_vectors;

	return sparse_feature_matrix;
}
00322
/** Wrap the raw row array in a non-owning SGSparseMatrix (ref_counting
 * disabled via the 'false' flag). Not allowed with a subset. */
template<class ST> SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix()
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");

	SGSparseMatrix<ST> sm=SGSparseMatrix<ST>(NULL, 0, 0, false);
	sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
	return sm;
}
00332
/** Return a new feature object holding the transposed matrix; the
 * freshly allocated row array is adopted by the new object. */
template<class ST> CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed()
{
	int32_t num_feat;
	int32_t num_vec;
	SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);

	return new CSparseFeatures<ST>(s, num_feat, num_vec);
}
00341
/** Build the transposed sparse matrix (subset-aware).
 *
 * Two passes: first a histogram counts how many entries each output
 * row (i.e. each original feature dimension) will hold, then the rows
 * are allocated and filled. The histogram is reused as a per-row
 * write cursor in the second pass. Caller owns the returned array.
 */
template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
{
	// dimensions swap: rows of the result are the original features
	num_feat=get_num_vectors();
	num_vec=num_features;

	int32_t* hist=SG_MALLOC(int32_t, num_features);
	memset(hist, 0, sizeof(int32_t)*num_features);

	// pass 1: count entries per output row
	for (int32_t v=0; v<num_feat; v++)
	{
		SGSparseVector<ST> sv=get_sparse_feature_vector(v);

		for (int32_t i=0; i<sv.num_feat_entries; i++)
			hist[sv.features[i].feat_index]++;
	}

	// allocate output rows with exact capacities
	SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec);
	for (int32_t v=0; v<num_vec; v++)
		new (&sfm[v]) SGSparseVector<ST>(hist[v]);

	// pass 2: scatter entries; hist now serves as write cursor
	memset(hist,0,sizeof(int32_t)*num_features);
	for (int32_t v=0; v<num_feat; v++)
	{
		SGSparseVector<ST> sv=get_sparse_feature_vector(v);

		for (int32_t i=0; i<sv.num_feat_entries; i++)
		{
			int32_t vidx=sv.features[i].feat_index;
			int32_t fidx=v;
			sfm[vidx].features[hist[vidx]].feat_index=fidx;
			sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
			hist[vidx]++;
		}
	}

	SG_FREE(hist);
	return sfm;
}
00385
/** Replace the feature matrix with a copy of the rows of 'sm'
 * (row-wise SGSparseVector assignment). Not allowed with a subset.
 *
 * NOTE(review): any previously held sparse_feature_matrix is not
 * freed before being overwritten here (other setters call
 * free_sparse_feature_matrix() first) -- possible leak when called on
 * a non-empty object; confirm intended.
 */
template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");

	SGSparseVector<ST>* sparse_matrix = SG_MALLOC(SGSparseVector<ST>, sm.num_vectors);
	for (int32_t i=0; i<sm.num_vectors; i++)
	{
		// placement-construct before assignment: SG_MALLOC is raw memory
		new (&sparse_matrix[i]) SGSparseVector<ST>();
		sparse_matrix[i] = sm[i];
	}

	sparse_feature_matrix=sparse_matrix;
	num_features=sm.num_features;
	num_vectors=sm.num_vectors;
}
00402
00403 template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix()
00404 {
00405 SGMatrix<ST> full(num_features, get_num_vectors());
00406 full.zero();
00407
00408 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00409
00410 for (int32_t v=0; v<full.num_cols; v++)
00411 {
00412 int32_t idx=m_subset_stack->subset_idx_conversion(v);
00413 SGSparseVector<ST> current=sparse_feature_matrix[idx];
00414
00415 for (int32_t f=0; f<current.num_feat_entries; f++)
00416 {
00417 int64_t offs=(idx*num_features)
00418 +current.features[f].feat_index;
00419
00420 full.matrix[offs]=current.features[f].entry;
00421 }
00422 }
00423
00424 return full;
00425 }
00426
00427 template<class ST> bool CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full)
00428 {
00429 remove_all_subsets();
00430
00431 ST* src=full.matrix;
00432 int32_t num_feat=full.num_rows;
00433 int32_t num_vec=full.num_cols;
00434
00435 free_sparse_feature_matrix();
00436 bool result=true;
00437 num_features=num_feat;
00438 num_vectors=num_vec;
00439
00440 SG_INFO("converting dense feature matrix to sparse one\n");
00441 int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
00442
00443 if (num_feat_entries)
00444 {
00445 int64_t num_total_entries=0;
00446
00447
00448 for (int32_t i=0; i< num_vec; i++)
00449 {
00450 num_feat_entries[i]=0;
00451 for (int32_t j=0; j< num_feat; j++)
00452 {
00453 if (src[i*((int64_t) num_feat) + j] != 0)
00454 num_feat_entries[i]++;
00455 }
00456 }
00457
00458 if (num_vec>0)
00459 {
00460 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
00461
00462 if (sparse_feature_matrix)
00463 {
00464 for (int32_t i=0; i< num_vec; i++)
00465 {
00466 new(&sparse_feature_matrix[i]) SGSparseVector<ST>();
00467 sparse_feature_matrix[i] = SGSparseVector<ST>(num_feat_entries[i]);
00468 int32_t sparse_feat_idx=0;
00469
00470 for (int32_t j=0; j< num_feat; j++)
00471 {
00472 int64_t pos= i*num_feat + j;
00473
00474 if (src[pos] != 0)
00475 {
00476 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00477 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00478 sparse_feat_idx++;
00479 num_total_entries++;
00480 }
00481 }
00482 }
00483 }
00484 else
00485 {
00486 SG_ERROR( "allocation of sparse feature matrix failed\n");
00487 result=false;
00488 }
00489
00490 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00491 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
00492 }
00493 else
00494 {
00495 SG_ERROR( "huh ? zero size matrix given ?\n");
00496 result=false;
00497 }
00498 }
00499 SG_FREE(num_feat_entries);
00500 return result;
00501 }
00502
00503 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
00504 {
00505 SG_INFO( "force: %d\n", force_preprocessing);
00506
00507 if ( sparse_feature_matrix && get_num_preprocessors() )
00508 {
00509 for (int32_t i=0; i<get_num_preprocessors(); i++)
00510 {
00511 if ( (!is_preprocessed(i) || force_preprocessing) )
00512 {
00513 set_preprocessed(i);
00514 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
00515 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
00516 return false;
00517 }
00518 return true;
00519 }
00520 return true;
00521 }
00522 else
00523 {
00524 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00525 return false;
00526 }
00527 }
00528
/** Size of one feature element in bytes. */
template<class ST> int32_t CSparseFeatures<ST>::get_size() const
{
	return sizeof(ST);
}
00533
/** Sparsify a dense feature object into this one; requires a
 * non-empty feature matrix. */
template<class ST> bool CSparseFeatures<ST>::obtain_from_simple(CDenseFeatures<ST>* sf)
{
	SGMatrix<ST> fm=sf->get_feature_matrix();
	ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);

	return set_full_feature_matrix(fm);
}
00541
/** Number of visible vectors: the subset size when a subset is
 * active, otherwise the full vector count. */
template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const
{
	return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
}
00546
/** Dimensionality of the feature space. */
template<class ST> int32_t CSparseFeatures<ST>::get_num_features()
{
	return num_features;
}
00551
/** Grow the feature-space dimensionality to 'num'; shrinking is
 * disallowed (asserted), since stored entries could become invalid. */
template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
{
	int32_t n=num_features;
	ASSERT(n<=num);
	num_features=num;
	return num_features;
}
00559
/** Feature class identifier: sparse. */
template<class ST> EFeatureClass CSparseFeatures<ST>::get_feature_class() const
{
	return C_SPARSE;
}
00564
/** Release the cache lock for vector 'num'; no-op without a feature
 * cache (mirrors free_sparse_feature_vector). */
template<class ST> void CSparseFeatures<ST>::free_feature_vector(int32_t num)
{
	if (feature_cache)
		feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
}
00572
00573 template<class ST> int64_t CSparseFeatures<ST>::get_num_nonzero_entries()
00574 {
00575 int64_t num=0;
00576 index_t num_vec=get_num_vectors();
00577 for (int32_t i=0; i<num_vec; i++)
00578 num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries;
00579
00580 return num;
00581 }
00582
00583 template<class ST> float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq)
00584 {
00585 ASSERT(sq);
00586
00587 index_t num_vec=get_num_vectors();
00588 for (int32_t i=0; i<num_vec; i++)
00589 {
00590 sq[i]=0;
00591 SGSparseVector<ST> vec=get_sparse_feature_vector(i);
00592
00593 for (int32_t j=0; j<vec.num_feat_entries; j++)
00594 sq[i]+=vec.features[j].entry*vec.features[j].entry;
00595
00596 free_feature_vector(i);
00597 }
00598
00599 return sq;
00600 }
00601
/** Compute |x_a - x_b|^2 = |x_a|^2 + |x_b|^2 - 2<x_a, x_b> from
 * precomputed squared norms sq_lhs/sq_rhs and a sparse dot product.
 *
 * The dot product is a two-pointer merge that iterates the shorter
 * vector and scans the longer one; this assumes feat_index values are
 * sorted ascending in both vectors (see sort_features()).
 */
template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm(
		CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
		CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
{
	int32_t i,j;
	ASSERT(lhs);
	ASSERT(rhs);

	SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a);
	SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b);
	ASSERT(avec.features);
	ASSERT(bvec.features);

	float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

	// iterate the shorter vector, advance a cursor in the longer one
	if (avec.num_feat_entries<=bvec.num_feat_entries)
	{
		j=0;
		for (i=0; i<avec.num_feat_entries; i++)
		{
			int32_t a_feat_idx=avec.features[i].feat_index;

			while ((j<bvec.num_feat_entries)
					&&(bvec.features[j].feat_index<a_feat_idx))
				j++;

			if ((j<bvec.num_feat_entries)
					&&(bvec.features[j].feat_index==a_feat_idx))
			{
				result-=2*(avec.features[i].entry*bvec.features[j].entry);
				j++;
			}
		}
	}
	else
	{
		// symmetric case: bvec is shorter
		j=0;
		for (i=0; i<bvec.num_feat_entries; i++)
		{
			int32_t b_feat_idx=bvec.features[i].feat_index;

			while ((j<avec.num_feat_entries)
					&&(avec.features[j].feat_index<b_feat_idx))
				j++;

			if ((j<avec.num_feat_entries)
					&&(avec.features[j].feat_index==b_feat_idx))
			{
				result-=2*(bvec.features[i].entry*avec.features[j].entry);
				j++;
			}
		}
	}

	((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(idx_a);
	((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(idx_b);

	// clamp tiny negative values from floating point cancellation
	return CMath::abs(result);
}
00661
/** Load features and labels from an SVMlight-format file.
 *
 * Two passes over the file: the first counts lines (= vectors) and
 * determines the longest line to size the read buffer; the second
 * parses each line of the form "label idx:val idx:val ...\n"
 * (1-based indices in the file, stored 0-based). num_features grows
 * to the largest index seen. Returns the labels (caller owns them, or
 * NULL when the file could not be opened).
 *
 * NOTE(review): fopen mode "ro" is not a standard mode string (most
 * libcs ignore the trailing 'o'); plain "r" would be the portable
 * spelling -- confirm before changing.
 */
template<class ST> CRegressionLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname,
		bool do_sort_features)
{
	remove_all_subsets();

	CRegressionLabels* lab=NULL;

	size_t blocksize=1024*1024;
	size_t required_blocksize=blocksize;
	uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
	FILE* f=fopen(fname, "ro");

	if (f)
	{
		free_sparse_feature_matrix();
		num_vectors=0;
		num_features=0;

		SG_INFO("counting line numbers in file %s\n", fname);
		size_t sz=blocksize;
		size_t block_offs=0;
		size_t old_block_offs=0;
		fseek(f, 0, SEEK_END);
		size_t fsize=ftell(f);
		rewind(f);

		// pass 1: count newlines and track the longest line
		while (sz == blocksize)
		{
			sz=fread(dummy, sizeof(uint8_t), blocksize, f);
			for (size_t i=0; i<sz; i++)
			{
				block_offs++;
				if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
				{
					num_vectors++;
					required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
					old_block_offs=block_offs;
				}
			}
			SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
		}

		SG_INFO("found %d feature vectors\n", num_vectors);
		SG_FREE(dummy);
		// re-allocate the buffer so a whole line always fits
		blocksize=required_blocksize;
		dummy = SG_MALLOC(uint8_t, blocksize+1);

		lab=new CRegressionLabels(num_vectors);
		sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
		for (int32_t i=0; i<num_vectors; i++)
			new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
		rewind(f);
		sz=blocksize;
		int32_t lines=0;
		// pass 2: parse line by line
		while (sz == blocksize)
		{
			sz=fread(dummy, sizeof(uint8_t), blocksize, f);

			size_t old_sz=0;
			for (size_t i=0; i<sz; i++)
			{
				// line straddles the buffer end: move the partial line
				// to the front and refill the remainder of the buffer
				if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
				{
					size_t len=i-old_sz+1;
					uint8_t* data=&dummy[old_sz];

					for (size_t j=0; j<len; j++)
						dummy[j]=data[j];

					sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
					i=0;
					old_sz=0;
					sz+=len;
				}

				if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
				{
					// one complete line in [old_sz, i)
					size_t len=i-old_sz;
					uint8_t* data=&dummy[old_sz];

					// count ':' separators to size the entry array
					int32_t dims=0;
					for (size_t j=0; j<len; j++)
					{
						if (data[j]==':')
							dims++;
					}

					if (dims<=0)
					{
						SG_ERROR("Error in line %d - number of"
								" dimensions is %d line is %d characters"
								" long\n line_content:'%.*s'\n", lines,
								dims, len, len, (const char*) data);
					}

					SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims);
					// parse the label up to the first space
					size_t j=0;
					for (; j<len; j++)
					{
						if (data[j]==' ')
						{
							data[j]='\0';

							lab->set_label(lines, atof((const char*) data));
							break;
						}
					}

					// parse idx:val pairs (file indices are 1-based)
					int32_t d=0;
					j++;
					uint8_t* start=&data[j];
					for (; j<len; j++)
					{
						if (data[j]==':')
						{
							data[j]='\0';

							feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
							num_features=CMath::max(num_features, feat[d].feat_index+1);

							j++;
							start=&data[j];
							for (; j<len; j++)
							{
								if (data[j]==' ' || data[j]=='\n')
								{
									data[j]='\0';
									feat[d].entry=(ST) atof((const char*) start);
									d++;
									break;
								}
							}

							// last value on the line has no trailing separator
							if (j==len)
							{
								data[j]='\0';
								feat[dims-1].entry=(ST) atof((const char*) start);
							}

							j++;
							start=&data[j];
						}
					}

					sparse_feature_matrix[lines].num_feat_entries=dims;
					sparse_feature_matrix[lines].features=feat;

					old_sz=i+1;
					lines++;
					SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
				}
			}
		}
		SG_INFO("file successfully read\n");
		fclose(f);
	}

	SG_FREE(dummy);

	if (do_sort_features)
		sort_features();

	return lab;
}
00827
/** Sort the entries of every vector by ascending feat_index.
 *
 * Required by code that merges sorted sparse vectors (e.g.
 * compute_squared_norm / sparse_dot). Builds a permutation with
 * qsort_index, copies the entries into a freshly allocated array in
 * sorted order, and frees the old entry array. Not allowed with a
 * subset or with preprocessors attached.
 *
 * NOTE(review): the post-sort ASSERT requires strictly increasing
 * indices, so a vector with duplicate feat_index values will abort.
 */
template<class ST> void CSparseFeatures<ST>::sort_features()
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("sort_features() not allowed with subset\n");

	ASSERT(get_num_preprocessors()==0);

	if (!sparse_feature_matrix)
		SG_ERROR("Requires sparse feature matrix to be available in-memory\n");

	for (int32_t i=0; i<num_vectors; i++)
	{
		int32_t len=sparse_feature_matrix[i].num_feat_entries;

		if (!len)
			continue;

		SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
		int32_t* feat_idx=SG_MALLOC(int32_t, len);
		int32_t* orig_idx=SG_MALLOC(int32_t, len);

		for (int j=0; j<len; j++)
		{
			feat_idx[j]=sf_orig[j].feat_index;
			orig_idx[j]=j;
		}

		// sorts feat_idx and permutes orig_idx alongside it
		CMath::qsort_index(feat_idx, orig_idx, len);

		SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len);
		for (int j=0; j<len; j++)
			sf_new[j]=sf_orig[orig_idx[j]];

		// swap in the sorted array; the old one is freed below
		sparse_feature_matrix[i].features=sf_new;

		// verify strict ordering (also catches duplicate indices)
		for (int j=0; j<len-1; j++)
			ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);

		SG_FREE(orig_idx);
		SG_FREE(feat_idx);
		SG_FREE(sf_orig);
	}
}
00872
00873 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname,
00874 CRegressionLabels* label)
00875 {
00876 if (m_subset_stack->has_subsets())
00877 SG_ERROR("write_svmlight_file() not allowed with subset\n");
00878
00879 ASSERT(label);
00880 int32_t num=label->get_num_labels();
00881 ASSERT(num>0);
00882 ASSERT(num==num_vectors);
00883
00884 FILE* f=fopen(fname, "wb");
00885
00886 if (f)
00887 {
00888 for (int32_t i=0; i<num; i++)
00889 {
00890 fprintf(f, "%d ", (int32_t) label->get_int_label(i));
00891
00892 SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
00893 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
00894
00895 for (int32_t j=0; j<num_feat; j++)
00896 {
00897 if (j<num_feat-1)
00898 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00899 else
00900 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00901 }
00902 }
00903
00904 fclose(f);
00905 return true;
00906 }
00907 return false;
00908 }
00909
/** Dimensionality of the dot-product feature space. */
template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
{
	return num_features;
}
00914
00915 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
00916 CDotFeatures* df, int32_t vec_idx2)
00917 {
00918 ASSERT(df);
00919 ASSERT(df->get_feature_type() == get_feature_type());
00920 ASSERT(df->get_feature_class() == get_feature_class());
00921 CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;
00922
00923 SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
00924 SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
00925
00926 float64_t result = SGSparseVector<ST>::sparse_dot(avec, bvec);
00927 free_sparse_feature_vector(vec_idx1);
00928 sf->free_sparse_feature_vector(vec_idx2);
00929
00930 return result;
00931 }
00932 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00933 {
00934 ASSERT(vec2);
00935 if (vec2_len!=num_features)
00936 {
00937 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00938 vec2_len, num_features);
00939 }
00940 float64_t result=0;
00941
00942 SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
00943
00944 if (sv.features)
00945 {
00946 for (int32_t i=0; i<sv.num_feat_entries; i++)
00947 result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
00948 }
00949
00950 free_sparse_feature_vector(vec_idx1);
00951
00952 return result;
00953 }
00954
/** Begin iterating the non-zero entries of vector 'vector_index';
 * pair with get_next_feature()/free_feature_iterator(). Requires the
 * in-memory matrix.
 *
 * NOTE(review): the iterator struct is SG_MALLOC'd (raw memory) and
 * its 'sv' member is assigned without placement-new -- fine only if
 * sparse_feature_iterator is trivially constructible; confirm.
 */
template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
{
	if (vector_index>=get_num_vectors())
	{
		SG_ERROR("Index out of bounds (number of vectors %d, you "
				"requested %d)\n", get_num_vectors(), vector_index);
	}

	if (!sparse_feature_matrix)
		SG_ERROR("Requires a in-memory feature matrix\n");

	sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
	it->sv=get_sparse_feature_vector(vector_index);
	it->index=0;
	it->vector_index=vector_index;

	return it;
}
00973
00974 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00975 {
00976 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
00977 if (!it || it->index>=it->sv.num_feat_entries)
00978 return false;
00979
00980 int32_t i=it->index++;
00981
00982 index=it->sv.features[i].feat_index;
00983 value=(float64_t) it->sv.features[i].entry;
00984
00985 return true;
00986 }
00987
00988 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
00989 {
00990 if (!iterator)
00991 return;
00992
00993 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
00994 free_sparse_feature_vector(it->vector_index);
00995 SG_FREE(it);
00996 }
00997
/** Copy the vectors selected by 'indices' into a new feature object.
 *
 * The returned object is SG_REF'd for the caller.
 *
 * NOTE(review): 'index' is converted through the subset stack here,
 * but get_sparse_feature_vector() performs subset_idx_conversion()
 * again internally -- with an active subset this looks like a double
 * conversion (and the matching free uses the unconverted 'index');
 * verify the intended indexing before relying on subsets here.
 */
template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices)
{
	SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
			get_dim_feature_space());

	for (index_t i=0; i<indices.vlen; ++i)
	{
		index_t index=indices.vector[i];
		index_t real_index=m_subset_stack->subset_idx_conversion(index);

		SGSparseVector<ST> current=get_sparse_feature_vector(real_index);
		matrix_copy.sparse_matrix[i]=current;

		free_sparse_feature_vector(index);
	}

	CFeatures* result=new CSparseFeatures<ST>(matrix_copy);
	SG_REF(result);
	return result;
}
01020
/** Hook for subclasses that compute feature vectors on the fly; the
 * base class has no such capability, so this stub reports
 * "not implemented" and yields an empty vector. */
template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num,
		int32_t& len, SGSparseVectorEntry<ST>* target)
{
	SG_NOTIMPLEMENTED;

	len=0;
	return NULL;
}
01029
/** Shared constructor body: registers the element type and the
 * serializable parameters (matrix rows + feature count). */
template<class ST> void CSparseFeatures<ST>::init()
{
	set_generic<ST>();

	m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
			"sparse_feature_matrix",
			"Array of sparse vectors.");
	m_parameters->add(&num_features, "num_features",
			"Total number of features.");
}
01040
// Generate get_feature_type() specializations mapping each supported
// element type onto its EFeatureType enum value.
#define GET_FEATURE_TYPE(sg_type, f_type) \
template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const \
{ \
return f_type; \
}
GET_FEATURE_TYPE(bool, F_BOOL)
GET_FEATURE_TYPE(char, F_CHAR)
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
GET_FEATURE_TYPE(float64_t, F_DREAL)
GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
#undef GET_FEATURE_TYPE
01060
// Generate load() specializations: each calls the type-specific
// CFile getter and installs the result via set_sparse_feature_matrix.
#define LOAD(fname, sg_type) \
template<> void CSparseFeatures<sg_type>::load(CFile* loader) \
{ \
remove_all_subsets(); \
SG_SET_LOCALE_C; \
ASSERT(loader); \
SGSparseVector<sg_type>* matrix=NULL; \
int32_t num_feat=0; \
int32_t num_vec=0; \
loader->fname(matrix, num_feat, num_vec); \
set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
SG_RESET_LOCALE; \
}
LOAD(get_sparse_matrix, bool)
LOAD(get_sparse_matrix, char)
LOAD(get_sparse_matrix, uint8_t)
LOAD(get_int8_sparsematrix, int8_t)
LOAD(get_sparse_matrix, int16_t)
LOAD(get_sparse_matrix, uint16_t)
LOAD(get_sparse_matrix, int32_t)
LOAD(get_uint_sparsematrix, uint32_t)
LOAD(get_long_sparsematrix, int64_t)
LOAD(get_ulong_sparsematrix, uint64_t)
LOAD(get_sparse_matrix, float32_t)
LOAD(get_sparse_matrix, float64_t)
LOAD(get_longreal_sparsematrix, floatmax_t)
#undef LOAD
01088
// Generate save() specializations: each forwards the in-memory matrix
// to the type-specific CFile setter. Not allowed with a subset.
#define WRITE(fname, sg_type) \
template<> void CSparseFeatures<sg_type>::save(CFile* writer) \
{ \
if (m_subset_stack->has_subsets()) \
SG_ERROR("save() not allowed with subset\n"); \
SG_SET_LOCALE_C; \
ASSERT(writer); \
writer->fname(sparse_feature_matrix, num_features, num_vectors); \
SG_RESET_LOCALE; \
}
WRITE(set_sparse_matrix, bool)
WRITE(set_sparse_matrix, char)
WRITE(set_sparse_matrix, uint8_t)
WRITE(set_int8_sparsematrix, int8_t)
WRITE(set_sparse_matrix, int16_t)
WRITE(set_sparse_matrix, uint16_t)
WRITE(set_sparse_matrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_sparse_matrix, float32_t)
WRITE(set_sparse_matrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
#undef WRITE
01113
// Explicitly instantiate CSparseFeatures for every supported element
// type so the template definitions above are emitted here.
template class CSparseFeatures<bool>;
template class CSparseFeatures<char>;
template class CSparseFeatures<int8_t>;
template class CSparseFeatures<uint8_t>;
template class CSparseFeatures<int16_t>;
template class CSparseFeatures<uint16_t>;
template class CSparseFeatures<int32_t>;
template class CSparseFeatures<uint32_t>;
template class CSparseFeatures<int64_t>;
template class CSparseFeatures<uint64_t>;
template class CSparseFeatures<float32_t>;
template class CSparseFeatures<float64_t>;
template class CSparseFeatures<floatmax_t>;
01127 }