00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/features/SparsePolyFeatures.h>
00011 #include <shogun/lib/Hash.h>
00012
00013 using namespace shogun;
00014
00015 CSparsePolyFeatures::CSparsePolyFeatures()
00016 {
00017 SG_UNSTABLE("CSparsePolyFeatures::CSparsePolyFeatures()",
00018 "\n");
00019
00020 m_feat = NULL;
00021 m_degree = 0;
00022 m_normalize = false;
00023 m_input_dimensions = 0;
00024 m_output_dimensions = 0;
00025 m_normalization_values = NULL;
00026 mask = 0;
00027 m_hash_bits = 0;
00028 }
00029
00030 CSparsePolyFeatures::CSparsePolyFeatures(CSparseFeatures<float64_t>* feat, int32_t degree, bool normalize, int32_t hash_bits)
00031 : CDotFeatures(), m_normalization_values(NULL)
00032 {
00033 ASSERT(feat);
00034
00035 m_feat = feat;
00036 SG_REF(m_feat);
00037 m_degree=degree;
00038 m_normalize=normalize;
00039 m_hash_bits=hash_bits;
00040 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00041 m_output_dimensions=1<<m_hash_bits;
00042 m_input_dimensions=feat->get_num_features();
00043
00044 if (m_normalize)
00045 store_normalization_values();
00046 }
00047
00048 CSparsePolyFeatures::~CSparsePolyFeatures()
00049 {
00050 SG_FREE(m_normalization_values);
00051 SG_UNREF(m_feat);
00052 }
00053
00054 CSparsePolyFeatures::CSparsePolyFeatures(const CSparsePolyFeatures & orig)
00055 {
00056 SG_PRINT("CSparsePolyFeatures:\n");
00057 SG_NOTIMPLEMENTED;
00058 }
00059
00060 int32_t CSparsePolyFeatures::get_dim_feature_space() const
00061 {
00062 return m_output_dimensions;
00063 }
00064
00065 int32_t CSparsePolyFeatures::get_nnz_features_for_vector(int32_t num)
00066 {
00067 int32_t vlen;
00068 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(num);
00069 vlen=vec.num_feat_entries;
00070 m_feat->free_feature_vector(num);
00071 return vlen*(vlen+1)/2;
00072 }
00073
00074 EFeatureType CSparsePolyFeatures::get_feature_type() const
00075 {
00076 return F_UNKNOWN;
00077 }
00078
00079 EFeatureClass CSparsePolyFeatures::get_feature_class() const
00080 {
00081 return C_POLY;
00082 }
00083
00084 int32_t CSparsePolyFeatures::get_num_vectors() const
00085 {
00086 if (m_feat)
00087 return m_feat->get_num_vectors();
00088 else
00089 return 0;
00090
00091 }
00092
00093 int32_t CSparsePolyFeatures::get_size() const
00094 {
00095 return sizeof(float64_t);
00096 }
00097
00098 void* CSparsePolyFeatures::get_feature_iterator(int32_t vector_index)
00099 {
00100 SG_NOTIMPLEMENTED;
00101 return NULL;
00102 }
00103
00104 bool CSparsePolyFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00105 {
00106 SG_NOTIMPLEMENTED;
00107 return NULL;
00108 }
00109
00110 void CSparsePolyFeatures::free_feature_iterator(void* iterator)
00111 {
00112 SG_NOTIMPLEMENTED;
00113 }
00114
00115 float64_t CSparsePolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00116 {
00117 ASSERT(df);
00118 ASSERT(df->get_feature_type() == get_feature_type());
00119 ASSERT(df->get_feature_class() == get_feature_class());
00120
00121 CSparsePolyFeatures* pf=(CSparsePolyFeatures*) df;
00122
00123 SGSparseVector<float64_t> vec1=m_feat->get_sparse_feature_vector(vec_idx1);
00124 SGSparseVector<float64_t> vec2=pf->m_feat->get_sparse_feature_vector(
00125 vec_idx2);
00126
00127 float64_t result=SGSparseVector<float64_t>::sparse_dot(vec1, vec2);
00128 result=CMath::pow(result, m_degree);
00129
00130 m_feat->free_feature_vector(vec_idx1);
00131 pf->m_feat->free_feature_vector(vec_idx2);
00132
00133 return result;
00134 }
00135
00136 float64_t CSparsePolyFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00137 {
00138 if (vec2_len != m_output_dimensions)
00139 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00140
00141 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00142
00143 float64_t result=0;
00144
00145 if (vec.features)
00146 {
00147 if (m_degree==2)
00148 {
00149
00150 for (int32_t i=0; i<vec.num_feat_entries; i++)
00151 {
00152 float64_t v1=vec.features[i].entry;
00153 uint32_t seed=CHash::MurmurHash3(
00154 (uint8_t*)&(vec.features[i].feat_index),
00155 sizeof(int32_t), 0xDEADBEAF);
00156
00157 for (int32_t j=i; j<vec.num_feat_entries; j++)
00158 {
00159 float64_t v2=vec.features[j].entry;
00160 uint32_t h=CHash::MurmurHash3(
00161 (uint8_t*)&(vec.features[j].feat_index),
00162 sizeof(int32_t), seed)&mask;
00163 float64_t v;
00164
00165 if (i==j)
00166 v=v1*v1;
00167 else
00168 v=CMath::sqrt(2.0)*v1*v2;
00169
00170 result+=v*vec2[h];
00171 }
00172 }
00173 }
00174 else if (m_degree==3)
00175 SG_NOTIMPLEMENTED;
00176 }
00177
00178 if (m_normalize)
00179 result/=m_normalization_values[vec_idx1];
00180
00181 m_feat->free_feature_vector(vec_idx1);
00182 return result;
00183 }
00184
00185 void CSparsePolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00186 {
00187 if (vec2_len!=m_output_dimensions)
00188 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00189
00190 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00191
00192 float64_t norm_val=1.0;
00193 if (m_normalize)
00194 norm_val = m_normalization_values[vec_idx1];
00195 alpha/=norm_val;
00196
00197 if (m_degree==2)
00198 {
00199
00200 for (int32_t i=0; i<vec.num_feat_entries; i++)
00201 {
00202 float64_t v1=vec.features[i].entry;
00203 uint32_t seed=CHash::MurmurHash3(
00204 (uint8_t*)&(vec.features[i].feat_index), sizeof(int32_t),
00205 0xDEADBEAF);
00206
00207 for (int32_t j=i; j<vec.num_feat_entries; j++)
00208 {
00209 float64_t v2=vec.features[j].entry;
00210 uint32_t h=CHash::MurmurHash3(
00211 (uint8_t*)&(vec.features[j].feat_index),
00212 sizeof(int32_t), seed)&mask;
00213 float64_t v;
00214
00215 if (i==j)
00216 v=alpha*v1*v1;
00217 else
00218 v=alpha*CMath::sqrt(2.0)*v1*v2;
00219
00220 if (abs_val)
00221 vec2[h]+=CMath::abs(v);
00222 else
00223 vec2[h]+=v;
00224 }
00225 }
00226 }
00227 else if (m_degree==3)
00228 SG_NOTIMPLEMENTED;
00229
00230 m_feat->free_feature_vector(vec_idx1);
00231 }
00232
00233 void CSparsePolyFeatures::store_normalization_values()
00234 {
00235 SG_FREE(m_normalization_values);
00236
00237 m_normalization_values_len = this->get_num_vectors();
00238
00239 m_normalization_values=SG_MALLOC(float64_t, m_normalization_values_len);
00240 for (int i=0; i<m_normalization_values_len; i++)
00241 {
00242 float64_t val = CMath::sqrt(dot(i, this,i));
00243 if (val==0)
00244
00245 m_normalization_values[i]=1.0;
00246 else
00247 m_normalization_values[i]=val;
00248 }
00249
00250 }
00251
00252 CFeatures* CSparsePolyFeatures::duplicate() const
00253 {
00254 return new CSparsePolyFeatures(*this);
00255 }
00256
00257 void CSparsePolyFeatures::init()
00258 {
00259 m_parameters->add((CSGObject**) &m_feat, "features",
00260 "Features in original space.");
00261 m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel.");
00262 m_parameters->add(&m_normalize, "normalize", "Normalize");
00263 m_parameters->add(&m_input_dimensions, "input_dimensions",
00264 "Dimensions of the input space.");
00265 m_parameters->add(&m_output_dimensions, "output_dimensions",
00266 "Dimensions of the feature space of the polynomial kernel.");
00267 m_normalization_values_len = get_num_vectors();
00268 m_parameters->add_vector(&m_normalization_values, &m_normalization_values_len,
00269 "m_normalization_values", "Norm of each training example");
00270 m_parameters->add(&mask, "mask", "Mask.");
00271 m_parameters->add(&m_hash_bits, "m_hash_bits", "Number of bits in hash");
00272 }