00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/features/ImplicitWeightedSpecFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013
00014 using namespace shogun;
00015
00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(void)
00017 :CDotFeatures()
00018 {
00019 SG_UNSTABLE("CImplicitWeightedSpecFeatures::"
00020 "CImplicitWeightedSpecFeatures(void)", "\n");
00021
00022 strings = NULL;
00023 normalization_factors = NULL;
00024 num_strings = 0;
00025 alphabet_size = 0;
00026
00027 degree = 0;
00028 spec_size = 0;
00029 spec_weights = 0;
00030 }
00031
00032 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures()
00033 {
00034 ASSERT(str);
00035 strings=str;
00036 SG_REF(strings)
00037 normalization_factors=NULL;
00038 spec_weights=NULL;
00039 num_strings = str->get_num_vectors();
00040 alphabet_size = str->get_original_num_symbols();
00041 degree=str->get_order();
00042 set_wd_weights();
00043
00044 SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size,
00045 spec_size, num_strings);
00046
00047 if (normalize)
00048 compute_normalization_const();
00049 }
00050
00051 void CImplicitWeightedSpecFeatures::compute_normalization_const()
00052 {
00053 float64_t* factors=SG_MALLOC(float64_t, num_strings);
00054
00055 for (int32_t i=0; i<num_strings; i++)
00056 factors[i]=1.0/CMath::sqrt(dot(i, this, i));
00057
00058 normalization_factors=factors;
00059
00060 }
00061
00062 bool CImplicitWeightedSpecFeatures::set_wd_weights()
00063 {
00064 SG_FREE(spec_weights);
00065 spec_weights=SG_MALLOC(float64_t, degree);
00066
00067 int32_t i;
00068 float64_t sum=0;
00069 spec_size=0;
00070
00071 for (i=0; i<degree; i++)
00072 {
00073 spec_size+=CMath::pow(alphabet_size, i+1);
00074 spec_weights[i]=degree-i;
00075 sum+=spec_weights[i];
00076 }
00077 for (i=0; i<degree; i++)
00078 spec_weights[i]=CMath::sqrt(spec_weights[i]/sum);
00079
00080 return spec_weights!=NULL;
00081 }
00082
00083 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d)
00084 {
00085 ASSERT(d==degree);
00086
00087 SG_FREE(spec_weights);
00088 spec_weights=SG_MALLOC(float64_t, degree);
00089 for (int32_t i=0; i<degree; i++)
00090 spec_weights[i]=CMath::sqrt(w[i]);
00091 return true;
00092 }
00093
00094 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig),
00095 num_strings(orig.num_strings),
00096 alphabet_size(orig.alphabet_size), spec_size(orig.spec_size)
00097 {
00098 SG_NOTIMPLEMENTED;
00099 SG_REF(strings);
00100 }
00101
00102 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures()
00103 {
00104 SG_UNREF(strings);
00105 SG_FREE(spec_weights);
00106 SG_FREE(normalization_factors);
00107 }
00108
00109 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00110 {
00111 ASSERT(df);
00112 ASSERT(df->get_feature_type() == get_feature_type());
00113 ASSERT(df->get_feature_class() == get_feature_class());
00114 CImplicitWeightedSpecFeatures* sf = (CImplicitWeightedSpecFeatures*) df;
00115
00116 ASSERT(vec_idx1 < num_strings);
00117 ASSERT(vec_idx2 < sf->get_num_vectors());
00118
00119 int32_t len1=-1;
00120 int32_t len2=-1;
00121 bool free_vec1;
00122 bool free_vec2;
00123 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00124 uint16_t* vec2=sf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00125
00126 float64_t result=0;
00127 uint8_t mask=0;
00128
00129 for (int32_t d=0; d<degree; d++)
00130 {
00131 mask = mask | (1 << (degree-d-1));
00132 uint16_t masked=strings->get_masked_symbols(0xffff, mask);
00133
00134 int32_t left_idx=0;
00135 int32_t right_idx=0;
00136 float64_t weight=spec_weights[d]*spec_weights[d];
00137
00138 while (left_idx < len1 && right_idx < len2)
00139 {
00140 uint16_t lsym=vec1[left_idx] & masked;
00141 uint16_t rsym=vec2[right_idx] & masked;
00142
00143 if (lsym == rsym)
00144 {
00145 int32_t old_left_idx=left_idx;
00146 int32_t old_right_idx=right_idx;
00147
00148 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym)
00149 left_idx++;
00150
00151 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym)
00152 right_idx++;
00153
00154 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00155 }
00156 else if (lsym<rsym)
00157 left_idx++;
00158 else
00159 right_idx++;
00160 }
00161 }
00162
00163 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00164 sf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00165
00166 if (normalization_factors)
00167 return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2];
00168 else
00169 return result;
00170 }
00171
00172 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00173 {
00174 ASSERT(vec2_len == spec_size);
00175 ASSERT(vec_idx1 < num_strings);
00176
00177 float64_t result=0;
00178 int32_t len1=-1;
00179 bool free_vec1;
00180 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00181
00182 if (vec1 && len1>0)
00183 {
00184 for (int32_t j=0; j<len1; j++)
00185 {
00186 uint8_t mask=0;
00187 int32_t offs=0;
00188 uint16_t v=*vec1++;
00189
00190 for (int32_t d=0; d<degree; d++)
00191 {
00192 mask = mask | (1 << (degree-d-1));
00193 int32_t idx=strings->get_masked_symbols(v, mask);
00194 idx=strings->shift_symbol(idx, degree-d-1);
00195 result += vec2[offs + idx]*spec_weights[d];
00196 offs+=strings->shift_offset(1,d+1);
00197 }
00198 }
00199
00200 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00201
00202 if (normalization_factors)
00203 result*=normalization_factors[vec_idx1];
00204 }
00205 else
00206 SG_ERROR("huh?\n");
00207
00208 return result;
00209 }
00210
00211 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00212 {
00213 int32_t len1=-1;
00214 bool free_vec1;
00215 uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00216
00217 if (normalization_factors)
00218 alpha*=normalization_factors[vec_idx1];
00219
00220 if (vec && len1>0)
00221 {
00222 for (int32_t j=0; j<len1; j++)
00223 {
00224 uint8_t mask=0;
00225 int32_t offs=0;
00226 for (int32_t d=0; d<degree; d++)
00227 {
00228 mask = mask | (1 << (degree-d-1));
00229 int32_t idx=strings->get_masked_symbols(vec[j], mask);
00230 idx=strings->shift_symbol(idx, degree-d-1);
00231 if (abs_val)
00232 vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]);
00233 else
00234 vec2[offs + idx] += alpha*spec_weights[d];
00235 offs+=strings->shift_offset(1,d+1);
00236 }
00237 }
00238 }
00239
00240 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00241 }
00242
00243 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const
00244 {
00245 return new CImplicitWeightedSpecFeatures(*this);
00246 }
00247
00248 void* CImplicitWeightedSpecFeatures::get_feature_iterator(int32_t vector_index)
00249 {
00250 if (vector_index>=num_strings)
00251 {
00252 SG_ERROR("Index out of bounds (number of strings %d, you "
00253 "requested %d)\n", num_strings, vector_index);
00254 }
00255
00256 wspec_feature_iterator* it=SG_MALLOC(wspec_feature_iterator, 1);
00257 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00258 it->vidx=vector_index;
00259
00260 it->offs=0;
00261 it->d=0;
00262 it->j=0;
00263 it->mask=0;
00264 it->alpha=normalization_factors[vector_index];
00265
00266 return it;
00267 }
00268
00269 bool CImplicitWeightedSpecFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00270 {
00271 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00272
00273 if (it->d>=degree)
00274 {
00275 if (it->j < it->vlen-1)
00276 {
00277 it->j++;
00278 it->d=0;
00279 it->mask=0;
00280 it->offs=0;
00281 }
00282 else
00283 return false;
00284 }
00285
00286 int32_t d=it->d;
00287
00288 it->mask = it->mask | (1 << (degree-d-1));
00289 int32_t idx=strings->get_masked_symbols(it->vec[it->j], it->mask);
00290 idx=strings->shift_symbol(idx, degree-d-1);
00291 value=it->alpha*spec_weights[d];
00292 index=it->offs + idx;
00293 it->offs+=strings->shift_offset(1,d+1);
00294
00295 it->d=d+1;
00296 return true;
00297 }
00298
00299 void CImplicitWeightedSpecFeatures::free_feature_iterator(void* iterator)
00300 {
00301 ASSERT(iterator);
00302 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00303 strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00304 SG_FREE(it);
00305 }