00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/features/WDFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013
00014 using namespace shogun;
00015
00016 CWDFeatures::CWDFeatures(void) :CDotFeatures()
00017 {
00018 SG_UNSTABLE("CWDFeatures::CWDFeatures(void) :CDotFeatures()",
00019 "\n");
00020
00021 strings = NULL;
00022
00023 degree = 0;
00024 from_degree = 0;
00025 string_length = 0;
00026 num_strings = 0;
00027 alphabet_size = 0;
00028 w_dim = 0;
00029 wd_weights = NULL;
00030 normalization_const = 0.0;
00031 }
00032
00033 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00034 int32_t order, int32_t from_order) : CDotFeatures()
00035 {
00036 ASSERT(str);
00037 ASSERT(str->have_same_length());
00038 SG_REF(str);
00039
00040 strings=str;
00041 string_length=str->get_max_vector_length();
00042 num_strings=str->get_num_vectors();
00043 CAlphabet* alpha=str->get_alphabet();
00044 alphabet_size=alpha->get_num_symbols();
00045 SG_UNREF(alpha);
00046
00047 degree=order;
00048 from_degree=from_order;
00049 wd_weights=NULL;
00050 set_wd_weights();
00051 set_normalization_const();
00052
00053 }
00054
00055 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00056 : CDotFeatures(orig), strings(orig.strings),
00057 degree(orig.degree), from_degree(orig.from_degree),
00058 normalization_const(orig.normalization_const)
00059 {
00060 SG_REF(strings);
00061 string_length=strings->get_max_vector_length();
00062 num_strings=strings->get_num_vectors();
00063 CAlphabet* alpha=strings->get_alphabet();
00064 alphabet_size=alpha->get_num_symbols();
00065 SG_UNREF(alpha);
00066
00067 wd_weights=NULL;
00068 set_wd_weights();
00069 }
00070
00071 CWDFeatures::~CWDFeatures()
00072 {
00073 SG_UNREF(strings);
00074 SG_FREE(wd_weights);
00075 }
00076
00077 float64_t CWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00078 {
00079 ASSERT(df);
00080 ASSERT(df->get_feature_type() == get_feature_type());
00081 ASSERT(df->get_feature_class() == get_feature_class());
00082 CWDFeatures* wdf = (CWDFeatures*) df;
00083
00084 int32_t len1, len2;
00085 bool free_vec1, free_vec2;
00086
00087 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00088 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00089
00090 ASSERT(len1==len2);
00091
00092 float64_t sum=0.0;
00093
00094 for (int32_t i=0; i<len1; i++)
00095 {
00096 for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00097 {
00098 if (vec1[i+j]!=vec2[i+j])
00099 break ;
00100 sum += wd_weights[j]*wd_weights[j];
00101 }
00102 }
00103 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00104 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00105 return sum/CMath::sq(normalization_const);
00106 }
00107
00108 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00109 {
00110 if (vec2_len != w_dim)
00111 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00112
00113 float64_t sum=0;
00114 int32_t lim=CMath::min(degree, string_length);
00115 int32_t len;
00116 bool free_vec1;
00117 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00118 int32_t* val=SG_MALLOC(int32_t, len);
00119 CMath::fill_vector(val, len, 0);
00120
00121 int32_t asize=alphabet_size;
00122 int32_t asizem1=1;
00123 int32_t offs=0;
00124
00125 for (int32_t k=0; k<lim; k++)
00126 {
00127 float64_t wd = wd_weights[k];
00128
00129 int32_t o=offs;
00130 for (int32_t i=0; i+k < len; i++)
00131 {
00132 val[i]+=asizem1*vec[i+k];
00133 sum+=vec2[val[i]+o]*wd;
00134 o+=asize;
00135 }
00136 offs+=asize*len;
00137 asize*=alphabet_size;
00138 asizem1*=alphabet_size;
00139 }
00140 SG_FREE(val);
00141 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00142
00143 return sum/normalization_const;
00144 }
00145
00146 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00147 {
00148 if (vec2_len != w_dim)
00149 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00150
00151 int32_t lim=CMath::min(degree, string_length);
00152 int32_t len;
00153 bool free_vec1;
00154 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00155 int32_t* val=SG_MALLOC(int32_t, len);
00156 CMath::fill_vector(val, len, 0);
00157
00158 int32_t asize=alphabet_size;
00159 int32_t asizem1=1;
00160 int32_t offs=0;
00161
00162 for (int32_t k=0; k<lim; k++)
00163 {
00164 float64_t wd = alpha*wd_weights[k]/normalization_const;
00165
00166 if (abs_val)
00167 wd=CMath::abs(wd);
00168
00169 int32_t o=offs;
00170 for (int32_t i=0; i+k < len; i++)
00171 {
00172 val[i]+=asizem1*vec[i+k];
00173 vec2[val[i]+o]+=wd;
00174 o+=asize;
00175 }
00176 offs+=asize*len;
00177 asize*=alphabet_size;
00178 asizem1*=alphabet_size;
00179 }
00180 SG_FREE(val);
00181
00182 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00183 }
00184
00185 void CWDFeatures::set_wd_weights()
00186 {
00187 ASSERT(degree>0 && degree<=8);
00188 SG_FREE(wd_weights);
00189 wd_weights=SG_MALLOC(float64_t, degree);
00190 w_dim=0;
00191
00192 for (int32_t i=0; i<degree; i++)
00193 {
00194 w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00195 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00196 }
00197 SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00198 }
00199
00200
00201 void CWDFeatures::set_normalization_const(float64_t n)
00202 {
00203 if (n==0)
00204 {
00205 normalization_const=0;
00206 for (int32_t i=0; i<degree; i++)
00207 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00208
00209 normalization_const=CMath::sqrt(normalization_const);
00210 }
00211 else
00212 normalization_const=n;
00213
00214 SG_DEBUG("normalization_const:%f\n", normalization_const);
00215 }
00216
00217 void* CWDFeatures::get_feature_iterator(int32_t vector_index)
00218 {
00219 if (vector_index>=num_strings)
00220 {
00221 SG_ERROR("Index out of bounds (number of strings %d, you "
00222 "requested %d)\n", num_strings, vector_index);
00223 }
00224
00225 wd_feature_iterator* it=SG_MALLOC(wd_feature_iterator, 1);
00226
00227 it->lim=CMath::min(degree, string_length);
00228 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00229 it->vidx=vector_index;
00230
00231 it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00232 it->val=SG_MALLOC(int32_t, it->vlen);
00233 CMath::fill_vector(it->val, it->vlen, 0);
00234
00235 it->asize=alphabet_size;
00236 it->asizem1=1;
00237 it->offs=0;
00238 it->k=0;
00239 it->i=0;
00240 it->o=0;
00241
00242 return it;
00243 }
00244
00245 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00246 {
00247 wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00248
00249 if (it->i + it->k >= it->vlen)
00250 {
00251 if (it->k < it->lim-1)
00252 {
00253 it->offs+=it->asize*it->vlen;
00254 it->asize*=alphabet_size;
00255 it->asizem1*=alphabet_size;
00256 it->k++;
00257 it->i=0;
00258 it->o=it->offs;
00259 }
00260 else
00261 return false;
00262 }
00263
00264 int32_t i=it->i;
00265 int32_t k=it->k;
00266 #ifdef DEBUG_WDFEATURES
00267 SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1);
00268 #endif
00269
00270 it->val[i]+=it->asizem1*it->vec[i+k];
00271 value=wd_weights[k]/normalization_const;
00272 index=it->val[i]+it->o;
00273 #ifdef DEBUG_WDFEATURES
00274 SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen);
00275 #endif
00276
00277 it->o+=it->asize;
00278 it->i=i+1;
00279
00280 return true;
00281 }
00282
00283 void CWDFeatures::free_feature_iterator(void* iterator)
00284 {
00285 ASSERT(iterator);
00286 wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00287 strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00288 SG_FREE(it->val);
00289 SG_FREE(it);
00290 }
00291
00292 CFeatures* CWDFeatures::duplicate() const
00293 {
00294 return new CWDFeatures(*this);
00295 }