16 sparse_feature_matrix(NULL), feature_cache(NULL)
22 int32_t num_feat, int32_t num_vec,
bool copy)
23 :
CDotFeatures(0), num_vectors(num_vec), num_features(num_feat),
24 sparse_feature_matrix(NULL), feature_cache(NULL)
39 sparse_feature_matrix(NULL), feature_cache(NULL)
48 sparse_feature_matrix(NULL), feature_cache(NULL)
57 num_features(orig.num_features),
58 sparse_feature_matrix(orig.sparse_feature_matrix),
59 feature_cache(orig.feature_cache)
81 sparse_feature_matrix(NULL), feature_cache(NULL)
90 free_sparse_features();
94 for (int32_t i=0; i<num_vectors; i++)
95 (&sparse_feature_matrix[i])->~SGSparseVector();
100 remove_all_subsets();
104 free_sparse_feature_matrix();
105 delete feature_cache;
106 feature_cache = NULL;
115 ASSERT(index>=0 && index<num_features);
116 ASSERT(num>=0 && num<get_num_vectors());
129 free_sparse_feature_vector(num);
146 for (i=0; i<num_features; i++)
153 free_sparse_feature_vector(num);
160 if (num>=get_num_vectors())
162 SG_ERROR(
"Index out of bounds (number of vectors %d, you "
163 "requested %d)\n", get_num_vectors(), num);
179 free_sparse_feature_vector(num);
188 free_sparse_feature_vector(num);
194 ASSERT(num<get_num_vectors());
196 index_t real_num=m_subset_stack->subset_idx_conversion(num);
200 if (sparse_feature_matrix)
202 return sparse_feature_matrix[real_num];
208 result.
features=feature_cache->lock_entry(num);
214 result.
features=feature_cache->set_entry(num);
221 result.
features=compute_sparse_feature_vector(num,
225 if (get_num_preprocessors())
231 for (int32_t i=0; i<get_num_preprocessors(); i++)
237 tmp_feat_before=tmp_feat_after;
240 memcpy(result.
features, tmp_feat_after,
261 for (int32_t i=0; i<alen; i++)
265 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
268 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
278 for (int32_t i=0; i<blen; i++)
282 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
285 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
302 ASSERT(dim==num_features);
316 free_sparse_feature_vector(num);
323 if (dim!=num_features)
325 SG_ERROR(
"dimension of vec (=%d) does not match number of features (=%d)\n",
351 free_sparse_feature_vector(num);
357 feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
364 if (m_subset_stack->has_subsets())
365 SG_ERROR(
"get_sparse_feature_matrix() not allowed with subset\n");
367 num_feat=num_features;
370 return sparse_feature_matrix;
375 if (m_subset_stack->has_subsets())
376 SG_ERROR(
"get_sparse_feature_matrix() not allowed with subset\n");
394 num_feat=get_num_vectors();
395 num_vec=num_features;
398 int32_t* hist=
SG_MALLOC(int32_t, num_features);
399 memset(hist, 0,
sizeof(int32_t)*num_features);
402 for (int32_t v=0; v<num_feat; v++)
412 for (int32_t v=0; v<num_vec; v++)
416 memset(hist,0,
sizeof(int32_t)*num_features);
417 for (int32_t v=0; v<num_feat; v++)
438 if (m_subset_stack->has_subsets())
439 SG_ERROR(
"set_sparse_feature_matrix() not allowed with subset\n");
445 sparse_matrix[i] = sm[i];
448 sparse_feature_matrix=sparse_matrix;
458 SG_INFO(
"converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
460 for (int32_t v=0; v<full.
num_cols; v++)
462 int32_t idx=m_subset_stack->subset_idx_conversion(v);
467 int64_t offs=(idx*num_features)
479 remove_all_subsets();
485 free_sparse_feature_matrix();
487 num_features=num_feat;
490 SG_INFO(
"converting dense feature matrix to sparse one\n");
491 int32_t* num_feat_entries=
SG_MALLOC(
int, num_vectors);
493 if (num_feat_entries)
495 int64_t num_total_entries=0;
498 for (int32_t i=0; i< num_vec; i++)
500 num_feat_entries[i]=0;
501 for (int32_t j=0; j< num_feat; j++)
503 if (src[i*((int64_t) num_feat) + j] != 0)
504 num_feat_entries[i]++;
512 if (sparse_feature_matrix)
514 for (int32_t i=0; i< num_vec; i++)
518 int32_t sparse_feat_idx=0;
520 for (int32_t j=0; j< num_feat; j++)
522 int64_t pos= i*num_feat + j;
526 sparse_feature_matrix[i].
features[sparse_feat_idx].
entry=src[pos];
527 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
536 SG_ERROR(
"allocation of sparse feature matrix failed\n");
540 SG_INFO(
"sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
541 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
545 SG_ERROR(
"huh ? zero size matrix given ?\n");
555 SG_INFO(
"force: %d\n", force_preprocessing);
557 if ( sparse_feature_matrix && get_num_preprocessors() )
559 for (int32_t i=0; i<get_num_preprocessors(); i++)
561 if ( (!is_preprocessed(i) || force_preprocessing) )
564 SG_INFO(
"preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
574 SG_WARNING(
"no sparse feature matrix available or features already preprocessed - skipping.\n");
589 return set_full_feature_matrix(fm);
594 return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
604 int32_t n=num_features;
618 feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
626 index_t num_vec=get_num_vectors();
627 for (int32_t i=0; i<num_vec; i++)
628 num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries;
637 index_t num_vec=get_num_vectors();
638 for (int32_t i=0; i<num_vec; i++)
646 free_feature_vector(i);
665 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
713 bool do_sort_features)
715 remove_all_subsets();
719 size_t blocksize=1024*1024;
720 size_t required_blocksize=blocksize;
721 uint8_t* dummy=
SG_MALLOC(uint8_t, blocksize);
722 FILE* f=fopen(fname,
"ro");
726 free_sparse_feature_matrix();
730 SG_INFO(
"counting line numbers in file %s\n", fname);
733 size_t old_block_offs=0;
734 fseek(f, 0, SEEK_END);
735 size_t fsize=ftell(f);
738 while (sz == blocksize)
740 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
741 for (
size_t i=0; i<sz; i++)
744 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
747 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs+1);
748 old_block_offs=block_offs;
751 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t");
754 SG_INFO(
"found %d feature vectors\n", num_vectors);
756 blocksize=required_blocksize;
761 for (int32_t i=0; i<num_vectors; i++)
766 while (sz == blocksize)
768 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
771 for (
size_t i=0; i<sz; i++)
773 if (i==sz-1 && dummy[i]!=
'\n' && sz==blocksize)
775 size_t len=i-old_sz+1;
776 uint8_t* data=&dummy[old_sz];
778 for (
size_t j=0; j<len; j++)
781 sz=fread(dummy+len,
sizeof(uint8_t), blocksize-len, f);
787 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
791 uint8_t* data=&dummy[old_sz];
794 for (
size_t j=0; j<len; j++)
802 SG_ERROR(
"Error in line %d - number of"
803 " dimensions is %d line is %d characters"
804 " long\n line_content:'%.*s'\n", lines,
805 dims, len, len, (
const char*) data);
816 lab->
set_label(lines, atof((
const char*) data));
823 uint8_t* start=&data[j];
830 feat[d].
feat_index=(int32_t) atoi((
const char*) start)-1;
831 num_features=
CMath::max(num_features, feat[d].feat_index+1);
837 if (data[j]==
' ' || data[j]==
'\n')
840 feat[d].
entry=(ST) atof((
const char*) start);
849 feat[dims-1].
entry=(ST) atof((
const char*) start);
857 sparse_feature_matrix[lines].num_feat_entries=dims;
858 sparse_feature_matrix[lines].features=feat;
862 SG_PROGRESS(lines, 0, num_vectors, 1,
"LOADING:\t");
866 SG_INFO(
"file successfully read\n");
872 if (do_sort_features)
880 if (m_subset_stack->has_subsets())
881 SG_ERROR(
"sort_features() not allowed with subset\n");
883 ASSERT(get_num_preprocessors()==0);
885 if (!sparse_feature_matrix)
886 SG_ERROR(
"Requires sparse feature matrix to be available in-memory\n");
888 for (int32_t i=0; i<num_vectors; i++)
890 int32_t len=sparse_feature_matrix[i].num_feat_entries;
896 int32_t* feat_idx=
SG_MALLOC(int32_t, len);
897 int32_t* orig_idx=
SG_MALLOC(int32_t, len);
899 for (
int j=0; j<len; j++)
908 for (
int j=0; j<len; j++)
909 sf_new[j]=sf_orig[orig_idx[j]];
911 sparse_feature_matrix[i].features=sf_new;
914 for (
int j=0; j<len-1; j++)
915 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
926 if (m_subset_stack->has_subsets())
927 SG_ERROR(
"write_svmlight_file() not allowed with subset\n");
934 FILE* f=fopen(fname,
"wb");
938 for (int32_t i=0; i<num; i++)
943 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
945 for (int32_t j=0; j<num_feat; j++)
948 fprintf(f,
"%d:%f ", (int32_t) vec[j].feat_index+1, (
double) vec[j].entry);
950 fprintf(f,
"%d:%f\n", (int32_t) vec[j].feat_index+1, (
double) vec[j].entry);
979 free_sparse_feature_vector(vec_idx1);
980 sf->free_sparse_feature_vector(vec_idx2);
987 if (vec2_len!=num_features)
989 SG_ERROR(
"dimension of vec2 (=%d) does not match number of features (=%d)\n",
990 vec2_len, num_features);
1002 free_sparse_feature_vector(vec_idx1);
1009 if (vector_index>=get_num_vectors())
1011 SG_ERROR(
"Index out of bounds (number of vectors %d, you "
1012 "requested %d)\n", get_num_vectors(), vector_index);
1015 if (!sparse_feature_matrix)
1016 SG_ERROR(
"Requires a in-memory feature matrix\n");
1018 sparse_feature_iterator* it=
SG_MALLOC(sparse_feature_iterator, 1);
1019 it->sv=get_sparse_feature_vector(vector_index);
1021 it->vector_index=vector_index;
1028 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1029 if (!it || it->index>=it->sv.num_feat_entries)
1032 int32_t i=it->index++;
1034 index=it->sv.features[i].feat_index;
1035 value=(
float64_t) it->sv.features[i].entry;
1045 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1046 free_sparse_feature_vector(it->vector_index);
1053 get_dim_feature_space());
1059 index_t real_index=m_subset_stack->subset_idx_conversion(index);
1065 free_sparse_feature_vector(index);
1086 m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
1087 "sparse_feature_matrix",
1088 "Array of sparse vectors.");
1089 m_parameters->add(&num_features,
"num_features",
1090 "Total number of features.");
1093 #define GET_FEATURE_TYPE(sg_type, f_type) \
1094 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const \
1111 #undef GET_FEATURE_TYPE
1113 #define LOAD(fname, sg_type) \
1114 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \
1116 remove_all_subsets(); \
1119 SGSparseVector<sg_type>* matrix=NULL; \
1120 int32_t num_feat=0; \
1121 int32_t num_vec=0; \
1122 loader->fname(matrix, num_feat, num_vec); \
1123 set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
// Instantiate CSparseFeatures<T>::load() for every supported element type,
// binding each specialization to the matching CFile reader method through
// the LOAD macro defined above. Types without a generic get_sparse_matrix
// overload (int8_t, uint32_t, int64_t, uint64_t, floatmax_t) use their
// dedicated reader entry points.
LOAD(get_sparse_matrix, bool)
LOAD(get_sparse_matrix, char)
LOAD(get_sparse_matrix, uint8_t)
LOAD(get_int8_sparsematrix, int8_t)
LOAD(get_sparse_matrix, int16_t)
LOAD(get_sparse_matrix, uint16_t)
LOAD(get_sparse_matrix, int32_t)
LOAD(get_uint_sparsematrix, uint32_t)
LOAD(get_long_sparsematrix, int64_t)
LOAD(get_ulong_sparsematrix, uint64_t)
LOAD(get_sparse_matrix, float32_t)
LOAD(get_sparse_matrix, float64_t)
LOAD(get_longreal_sparsematrix, floatmax_t)
1141 #define WRITE(fname, sg_type) \
1142 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \
1144 if (m_subset_stack->has_subsets()) \
1145 SG_ERROR("save() not allowed with subset\n"); \
1148 writer->fname(sparse_feature_matrix, num_features, num_vectors); \
// Instantiate CSparseFeatures<T>::save() for every supported element type,
// binding each specialization to the matching CFile writer method through
// the WRITE macro defined above. Types without a generic set_sparse_matrix
// overload (int8_t, uint32_t, int64_t, uint64_t, floatmax_t) use their
// dedicated writer entry points.
WRITE(set_sparse_matrix, bool)
WRITE(set_sparse_matrix, char)
WRITE(set_sparse_matrix, uint8_t)
WRITE(set_int8_sparsematrix, int8_t)
WRITE(set_sparse_matrix, int16_t)
WRITE(set_sparse_matrix, uint16_t)
WRITE(set_sparse_matrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_sparse_matrix, float32_t)
WRITE(set_sparse_matrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
// Explicit template instantiations: emit object code for CSparseFeatures<T>
// for every element type the library exposes, so clients linking against
// this translation unit need only the class declaration.
template class CSparseFeatures<bool>;
template class CSparseFeatures<char>;
template class CSparseFeatures<int8_t>;
template class CSparseFeatures<uint8_t>;
template class CSparseFeatures<int16_t>;
template class CSparseFeatures<uint16_t>;
template class CSparseFeatures<int32_t>;
template class CSparseFeatures<uint32_t>;
template class CSparseFeatures<int64_t>;
template class CSparseFeatures<uint64_t>;
template class CSparseFeatures<float32_t>;
template class CSparseFeatures<float64_t>;
template class CSparseFeatures<floatmax_t>;