00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/features/ImplicitWeightedSpecFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013
00014 using namespace shogun;
00015
00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures()
00017 :CDotFeatures()
00018 {
00019 SG_UNSTABLE("CImplicitWeightedSpecFeatures::"
00020 "CImplicitWeightedSpecFeatures()", "\n");
00021
00022 strings = NULL;
00023 normalization_factors = NULL;
00024 num_strings = 0;
00025 alphabet_size = 0;
00026
00027 degree = 0;
00028 spec_size = 0;
00029 spec_weights = 0;
00030 }
00031
00032 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures()
00033 {
00034 ASSERT(str);
00035 strings=str;
00036 SG_REF(strings)
00037 normalization_factors=NULL;
00038 spec_weights=NULL;
00039 num_strings = str->get_num_vectors();
00040 alphabet_size = str->get_original_num_symbols();
00041 degree=str->get_order();
00042 set_wd_weights();
00043
00044 SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size,
00045 spec_size, num_strings);
00046
00047 if (normalize)
00048 compute_normalization_const();
00049 }
00050
00051 void CImplicitWeightedSpecFeatures::compute_normalization_const()
00052 {
00053 float64_t* factors=SG_MALLOC(float64_t, num_strings);
00054
00055 for (int32_t i=0; i<num_strings; i++)
00056 factors[i]=1.0/CMath::sqrt(dot(i, this, i));
00057
00058 normalization_factors=factors;
00059
00060 }
00061
00062 bool CImplicitWeightedSpecFeatures::set_wd_weights()
00063 {
00064 SG_FREE(spec_weights);
00065 spec_weights=SG_MALLOC(float64_t, degree);
00066
00067 int32_t i;
00068 float64_t sum=0;
00069 spec_size=0;
00070
00071 for (i=0; i<degree; i++)
00072 {
00073 spec_size+=CMath::pow(alphabet_size, i+1);
00074 spec_weights[i]=degree-i;
00075 sum+=spec_weights[i];
00076 }
00077 for (i=0; i<degree; i++)
00078 spec_weights[i]=CMath::sqrt(spec_weights[i]/sum);
00079
00080 return spec_weights!=NULL;
00081 }
00082
00083 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d)
00084 {
00085 ASSERT(d==degree);
00086
00087 SG_FREE(spec_weights);
00088 spec_weights=SG_MALLOC(float64_t, degree);
00089 for (int32_t i=0; i<degree; i++)
00090 spec_weights[i]=CMath::sqrt(w[i]);
00091 return true;
00092 }
00093
00094 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig),
00095 num_strings(orig.num_strings),
00096 alphabet_size(orig.alphabet_size), spec_size(orig.spec_size)
00097 {
00098 SG_NOTIMPLEMENTED;
00099 SG_REF(strings);
00100 }
00101
00102 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures()
00103 {
00104 SG_UNREF(strings);
00105 SG_FREE(spec_weights);
00106 SG_FREE(normalization_factors);
00107 }
00108
00109 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00110 {
00111 ASSERT(df);
00112 ASSERT(df->get_feature_type() == get_feature_type());
00113 ASSERT(df->get_feature_class() == get_feature_class());
00114 CImplicitWeightedSpecFeatures* sf = (CImplicitWeightedSpecFeatures*) df;
00115
00116 ASSERT(vec_idx1 < num_strings);
00117 ASSERT(vec_idx2 < sf->get_num_vectors());
00118
00119 int32_t len1=-1;
00120 int32_t len2=-1;
00121 bool free_vec1;
00122 bool free_vec2;
00123 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00124 uint16_t* vec2=sf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00125
00126 float64_t result=0;
00127 uint8_t mask=0;
00128
00129 for (int32_t d=0; d<degree; d++)
00130 {
00131 mask = mask | (1 << (degree-d-1));
00132 uint16_t masked=strings->get_masked_symbols(0xffff, mask);
00133
00134 int32_t left_idx=0;
00135 int32_t right_idx=0;
00136 float64_t weight=spec_weights[d]*spec_weights[d];
00137
00138 while (left_idx < len1 && right_idx < len2)
00139 {
00140 uint16_t lsym=vec1[left_idx] & masked;
00141 uint16_t rsym=vec2[right_idx] & masked;
00142
00143 if (lsym == rsym)
00144 {
00145 int32_t old_left_idx=left_idx;
00146 int32_t old_right_idx=right_idx;
00147
00148 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym)
00149 left_idx++;
00150
00151 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym)
00152 right_idx++;
00153
00154 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00155 }
00156 else if (lsym<rsym)
00157 left_idx++;
00158 else
00159 right_idx++;
00160 }
00161 }
00162
00163 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00164 sf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00165
00166 if (normalization_factors)
00167 return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2];
00168 else
00169 return result;
00170 }
00171
00172 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00173 {
00174 ASSERT(vec2_len == spec_size);
00175 ASSERT(vec_idx1 < num_strings);
00176
00177 float64_t result=0;
00178 int32_t len1=-1;
00179 bool free_vec1;
00180 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00181
00182 if (vec1 && len1>0)
00183 {
00184 for (int32_t j=0; j<len1; j++)
00185 {
00186 uint8_t mask=0;
00187 int32_t offs=0;
00188 uint16_t v=*vec1++;
00189
00190 for (int32_t d=0; d<degree; d++)
00191 {
00192 mask = mask | (1 << (degree-d-1));
00193 int32_t idx=strings->get_masked_symbols(v, mask);
00194 idx=strings->shift_symbol(idx, degree-d-1);
00195 result += vec2[offs + idx]*spec_weights[d];
00196 offs+=strings->shift_offset(1,d+1);
00197 }
00198 }
00199
00200 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00201
00202 if (normalization_factors)
00203 result*=normalization_factors[vec_idx1];
00204 }
00205 else
00206 SG_ERROR("huh?\n");
00207
00208 return result;
00209 }
00210
00211 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00212 {
00213 int32_t len1=-1;
00214 bool free_vec1;
00215 uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00216
00217 if (normalization_factors)
00218 alpha*=normalization_factors[vec_idx1];
00219
00220 if (vec && len1>0)
00221 {
00222 for (int32_t j=0; j<len1; j++)
00223 {
00224 uint8_t mask=0;
00225 int32_t offs=0;
00226 for (int32_t d=0; d<degree; d++)
00227 {
00228 mask = mask | (1 << (degree-d-1));
00229 int32_t idx=strings->get_masked_symbols(vec[j], mask);
00230 idx=strings->shift_symbol(idx, degree-d-1);
00231 if (abs_val)
00232 vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]);
00233 else
00234 vec2[offs + idx] += alpha*spec_weights[d];
00235 offs+=strings->shift_offset(1,d+1);
00236 }
00237 }
00238 }
00239
00240 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00241 }
00242
00243 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const
00244 {
00245 return new CImplicitWeightedSpecFeatures(*this);
00246 }
00247
00248 int32_t CImplicitWeightedSpecFeatures::get_dim_feature_space() const
00249 {
00250 return spec_size;
00251 }
00252
00253 void* CImplicitWeightedSpecFeatures::get_feature_iterator(int32_t vector_index)
00254 {
00255 if (vector_index>=num_strings)
00256 {
00257 SG_ERROR("Index out of bounds (number of strings %d, you "
00258 "requested %d)\n", num_strings, vector_index);
00259 }
00260
00261 wspec_feature_iterator* it=SG_MALLOC(wspec_feature_iterator, 1);
00262 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00263 it->vidx=vector_index;
00264
00265 it->offs=0;
00266 it->d=0;
00267 it->j=0;
00268 it->mask=0;
00269 it->alpha=normalization_factors[vector_index];
00270
00271 return it;
00272 }
00273
00274 bool CImplicitWeightedSpecFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00275 {
00276 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00277
00278 if (it->d>=degree)
00279 {
00280 if (it->j < it->vlen-1)
00281 {
00282 it->j++;
00283 it->d=0;
00284 it->mask=0;
00285 it->offs=0;
00286 }
00287 else
00288 return false;
00289 }
00290
00291 int32_t d=it->d;
00292
00293 it->mask = it->mask | (1 << (degree-d-1));
00294 int32_t idx=strings->get_masked_symbols(it->vec[it->j], it->mask);
00295 idx=strings->shift_symbol(idx, degree-d-1);
00296 value=it->alpha*spec_weights[d];
00297 index=it->offs + idx;
00298 it->offs+=strings->shift_offset(1,d+1);
00299
00300 it->d=d+1;
00301 return true;
00302 }
00303
00304 void CImplicitWeightedSpecFeatures::free_feature_iterator(void* iterator)
00305 {
00306 ASSERT(iterator);
00307 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00308 strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00309 SG_FREE(it);
00310 }
00311
00312
00313 int32_t CImplicitWeightedSpecFeatures::get_nnz_features_for_vector(int32_t num)
00314 {
00315 int32_t vlen=-1;
00316 bool free_vec;
00317 uint16_t* vec1=strings->get_feature_vector(num, vlen, free_vec);
00318 strings->free_feature_vector(vec1, num, free_vec);
00319 int32_t nnz=0;
00320 for (int32_t i=1; i<=degree; i++)
00321 nnz+=CMath::min(CMath::pow(alphabet_size,i), vlen);
00322 return nnz;
00323 }
00324
00325 EFeatureType CImplicitWeightedSpecFeatures::get_feature_type()
00326 {
00327 return F_UNKNOWN;
00328 }
00329
00330 EFeatureClass CImplicitWeightedSpecFeatures::get_feature_class()
00331 {
00332 return C_WEIGHTEDSPEC;
00333 }
00334
00335 int32_t CImplicitWeightedSpecFeatures::get_num_vectors() const
00336 {
00337 return num_strings;
00338 }
00339
00340 int32_t CImplicitWeightedSpecFeatures::get_size()
00341 {
00342 return sizeof(float64_t);
00343 }