00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/features/HashedWDFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013
00014 using namespace shogun;
00015
00016 CHashedWDFeatures::CHashedWDFeatures() :CDotFeatures()
00017 {
00018 SG_UNSTABLE("CHashedWDFeatures::CHashedWDFeatures()", "\n");
00019
00020 strings = NULL;
00021
00022 degree = 0;
00023 start_degree = 0;
00024 from_degree = 0;
00025 string_length = 0;
00026 num_strings = 0;
00027 alphabet_size = 0;
00028 w_dim = 0;
00029 partial_w_dim = 0;
00030 wd_weights = NULL;
00031 mask = 0;
00032 m_hash_bits = 0;
00033
00034 normalization_const = 0.0;
00035 }
00036
00037 CHashedWDFeatures::CHashedWDFeatures(CStringFeatures<uint8_t>* str,
00038 int32_t start_order, int32_t order, int32_t from_order,
00039 int32_t hash_bits) : CDotFeatures()
00040 {
00041 ASSERT(start_order>=0);
00042 ASSERT(start_order<order);
00043 ASSERT(order<=from_order);
00044 ASSERT(hash_bits>0);
00045 ASSERT(str);
00046 ASSERT(str->have_same_length());
00047 SG_REF(str);
00048
00049 strings=str;
00050 string_length=str->get_max_vector_length();
00051 num_strings=str->get_num_vectors();
00052 CAlphabet* alpha=str->get_alphabet();
00053 alphabet_size=alpha->get_num_symbols();
00054 SG_UNREF(alpha);
00055
00056 degree=order;
00057 start_degree=start_order;
00058 from_degree=from_order;
00059 m_hash_bits=hash_bits;
00060 set_wd_weights();
00061 set_normalization_const();
00062 }
00063
00064 CHashedWDFeatures::CHashedWDFeatures(const CHashedWDFeatures& orig)
00065 : CDotFeatures(orig), strings(orig.strings),
00066 degree(orig.degree), start_degree(orig.start_degree),
00067 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00068 normalization_const(orig.normalization_const)
00069 {
00070 SG_REF(strings);
00071 string_length=strings->get_max_vector_length();
00072 num_strings=strings->get_num_vectors();
00073 CAlphabet* alpha=strings->get_alphabet();
00074 alphabet_size=alpha->get_num_symbols();
00075 SG_UNREF(alpha);
00076
00077 set_wd_weights();
00078 }
00079
00080 CHashedWDFeatures::~CHashedWDFeatures()
00081 {
00082 SG_UNREF(strings);
00083 SG_FREE(wd_weights);
00084 }
00085
00086 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00087 {
00088 ASSERT(df);
00089 ASSERT(df->get_feature_type() == get_feature_type());
00090 ASSERT(df->get_feature_class() == get_feature_class());
00091 CHashedWDFeatures* wdf = (CHashedWDFeatures*) df;
00092
00093 int32_t len1, len2;
00094 bool free_vec1, free_vec2;
00095
00096 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00097 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00098
00099 ASSERT(len1==len2);
00100
00101 float64_t sum=0.0;
00102
00103 for (int32_t i=0; i<len1; i++)
00104 {
00105 for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00106 {
00107 if (vec1[i+j]!=vec2[i+j])
00108 break;
00109 if (j>=start_degree)
00110 sum += wd_weights[j]*wd_weights[j];
00111 }
00112 }
00113 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00114 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00115 return sum/CMath::sq(normalization_const);
00116 }
00117
00118 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00119 {
00120 if (vec2_len != w_dim)
00121 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00122
00123 float64_t sum=0;
00124 int32_t lim=CMath::min(degree, string_length);
00125 int32_t len;
00126 bool free_vec1;
00127 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00128 uint32_t* val=SG_MALLOC(uint32_t, len);
00129
00130 uint32_t offs=0;
00131
00132 if (start_degree>0)
00133 {
00134
00135 for (int32_t i=0; i+start_degree < len; i++)
00136 val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
00137 }
00138 else
00139 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00140
00141 for (int32_t k=start_degree; k<lim; k++)
00142 {
00143 float64_t wd = wd_weights[k];
00144
00145 uint32_t o=offs;
00146 uint32_t carry = 0;
00147 uint32_t chunk = 0;
00148
00149 for (int32_t i=0; i+k < len; i++)
00150 {
00151 chunk++;
00152 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00153 uint32_t h =
00154 CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00155 #ifdef DEBUG_HASHEDWD
00156 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o);
00157 #endif
00158 sum+=vec2[o+(h & mask)]*wd;
00159 val[i] = h;
00160 o+=partial_w_dim;
00161 }
00162 val[len-k-1] =
00163 CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
00164 offs+=partial_w_dim*len;
00165 }
00166 SG_FREE(val);
00167 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00168
00169 return sum/normalization_const;
00170 }
00171
00172 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00173 {
00174 if (vec2_len != w_dim)
00175 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00176
00177 int32_t lim=CMath::min(degree, string_length);
00178 int32_t len;
00179 bool free_vec1;
00180 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00181 uint32_t* val=SG_MALLOC(uint32_t, len);
00182
00183 uint32_t offs=0;
00184
00185 if (start_degree>0)
00186 {
00187
00188 for (int32_t i=0; i+start_degree < len; i++)
00189 val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
00190 }
00191 else
00192 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00193
00194 for (int32_t k=start_degree; k<lim; k++)
00195 {
00196 float64_t wd = alpha*wd_weights[k]/normalization_const;
00197
00198 if (abs_val)
00199 wd=CMath::abs(wd);
00200
00201 uint32_t o=offs;
00202 uint32_t carry = 0;
00203 uint32_t chunk = 0;
00204
00205 for (int32_t i=0; i+k < len; i++)
00206 {
00207 chunk++;
00208 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00209 uint32_t h = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00210
00211 #ifdef DEBUG_HASHEDWD
00212 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
00213 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o);
00214 #endif
00215 vec2[o+(h & mask)]+=wd;
00216 val[i] = h;
00217 o+=partial_w_dim;
00218 }
00219 val[len-k-1] =
00220 CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
00221
00222 offs+=partial_w_dim*len;
00223 }
00224
00225 SG_FREE(val);
00226 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00227 }
00228
00229 void CHashedWDFeatures::set_wd_weights()
00230 {
00231 ASSERT(degree>0);
00232
00233 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00234 partial_w_dim=1<<m_hash_bits;
00235 w_dim=partial_w_dim*string_length*(degree-start_degree);
00236
00237 wd_weights=SG_MALLOC(float64_t, degree);
00238
00239 for (int32_t i=0; i<degree; i++)
00240 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00241
00242 SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, "
00243 "dim=%d partial_dim=%d num=%d, len=%d\n",
00244 degree, from_degree, alphabet_size,
00245 w_dim, partial_w_dim, num_strings, string_length);
00246 }
00247
00248
00249 void CHashedWDFeatures::set_normalization_const(float64_t n)
00250 {
00251 if (n==0)
00252 {
00253 normalization_const=0;
00254 for (int32_t i=0; i<degree; i++)
00255 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00256
00257 normalization_const=CMath::sqrt(normalization_const);
00258 }
00259 else
00260 normalization_const=n;
00261
00262 SG_DEBUG("normalization_const:%f\n", normalization_const);
00263 }
00264
00265 CFeatures* CHashedWDFeatures::duplicate() const
00266 {
00267 return new CHashedWDFeatures(*this);
00268 }
00269
00270
00271 int32_t CHashedWDFeatures::get_nnz_features_for_vector(int32_t num)
00272 {
00273 int32_t vlen=-1;
00274 bool free_vec;
00275 uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
00276 strings->free_feature_vector(vec, num, free_vec);
00277 return degree*vlen;
00278 }
00279
00280 void* CHashedWDFeatures::get_feature_iterator(int32_t vector_index)
00281 {
00282 SG_NOTIMPLEMENTED;
00283 return NULL;
00284 }
00285
00286 bool CHashedWDFeatures::get_next_feature(int32_t& index, float64_t& value,
00287 void* iterator)
00288 {
00289 SG_NOTIMPLEMENTED;
00290 return NULL;
00291 }
00292
00293 void CHashedWDFeatures::free_feature_iterator(void* iterator)
00294 {
00295 SG_NOTIMPLEMENTED;
00296 }