00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/features/SparsePolyFeatures.h>
00011 #include <shogun/lib/Hash.h>
00012
00013 using namespace shogun;
00014
00015 CSparsePolyFeatures::CSparsePolyFeatures()
00016 {
00017 SG_UNSTABLE("CSparsePolyFeatures::CSparsePolyFeatures()",
00018 "\n");
00019
00020 m_feat = NULL;
00021 m_degree = 0;
00022 m_normalize = false;
00023 m_input_dimensions = 0;
00024 m_output_dimensions = 0;
00025 m_normalization_values = NULL;
00026 mask = 0;
00027 m_hash_bits = 0;
00028 }
00029
00030 CSparsePolyFeatures::CSparsePolyFeatures(CSparseFeatures<float64_t>* feat, int32_t degree, bool normalize, int32_t hash_bits)
00031 : CDotFeatures(), m_normalization_values(NULL)
00032 {
00033 ASSERT(feat);
00034
00035 m_feat = feat;
00036 SG_REF(m_feat);
00037 m_degree=degree;
00038 m_normalize=normalize;
00039 m_hash_bits=hash_bits;
00040 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00041 m_output_dimensions=1<<m_hash_bits;
00042 m_input_dimensions=feat->get_num_features();
00043
00044 if (m_normalize)
00045 store_normalization_values();
00046 }
00047
00048 CSparsePolyFeatures::~CSparsePolyFeatures()
00049 {
00050 SG_FREE(m_normalization_values);
00051 SG_UNREF(m_feat);
00052 }
00053
00054 CSparsePolyFeatures::CSparsePolyFeatures(const CSparsePolyFeatures & orig)
00055 {
00056 SG_PRINT("CSparsePolyFeatures:\n");
00057 SG_NOTIMPLEMENTED;
00058 }
00059
00060 int32_t CSparsePolyFeatures::get_dim_feature_space() const
00061 {
00062 return m_output_dimensions;
00063 }
00064
00065 int32_t CSparsePolyFeatures::get_nnz_features_for_vector(int32_t num)
00066 {
00067 int32_t vlen;
00068 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(num);
00069 vlen=vec.num_feat_entries;
00070 m_feat->free_feature_vector(vec, num);
00071 return vlen*(vlen+1)/2;
00072 }
00073
00074 EFeatureType CSparsePolyFeatures::get_feature_type()
00075 {
00076 return F_UNKNOWN;
00077 }
00078
00079 EFeatureClass CSparsePolyFeatures::get_feature_class()
00080 {
00081 return C_POLY;
00082 }
00083
00084 int32_t CSparsePolyFeatures::get_num_vectors() const
00085 {
00086 if (m_feat)
00087 return m_feat->get_num_vectors();
00088 else
00089 return 0;
00090
00091 }
00092
00093 int32_t CSparsePolyFeatures::get_size()
00094 {
00095 return sizeof(float64_t);
00096 }
00097
00098 void* CSparsePolyFeatures::get_feature_iterator(int32_t vector_index)
00099 {
00100 SG_NOTIMPLEMENTED;
00101 return NULL;
00102 }
00103
00104 bool CSparsePolyFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00105 {
00106 SG_NOTIMPLEMENTED;
00107 return NULL;
00108 }
00109
00110 void CSparsePolyFeatures::free_feature_iterator(void* iterator)
00111 {
00112 SG_NOTIMPLEMENTED;
00113 }
00114
00115 float64_t CSparsePolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00116 {
00117 ASSERT(df);
00118 ASSERT(df->get_feature_type() == get_feature_type());
00119 ASSERT(df->get_feature_class() == get_feature_class());
00120
00121 CSparsePolyFeatures* pf=(CSparsePolyFeatures*) df;
00122
00123 SGSparseVector<float64_t> vec1=m_feat->get_sparse_feature_vector(vec_idx1);
00124 SGSparseVector<float64_t> vec2=pf->m_feat->get_sparse_feature_vector(
00125 vec_idx2);
00126
00127 float64_t result=CSparseFeatures<float64_t>::sparse_dot(1, vec1.features,
00128 vec1.num_feat_entries, vec2.features, vec2.num_feat_entries);
00129 result=CMath::pow(result, m_degree);
00130
00131 m_feat->free_feature_vector(vec1, vec_idx1);
00132 pf->m_feat->free_feature_vector(vec2, vec_idx2);
00133
00134 return result;
00135 }
00136
00137 float64_t CSparsePolyFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00138 {
00139 if (vec2_len != m_output_dimensions)
00140 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00141
00142 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00143
00144 float64_t result=0;
00145
00146 if (vec.features)
00147 {
00148 if (m_degree==2)
00149 {
00150
00151 for (int32_t i=0; i<vec.num_feat_entries; i++)
00152 {
00153 float64_t v1=vec.features[i].entry;
00154 uint32_t seed=CHash::MurmurHash2(
00155 (uint8_t*)&(vec.features[i].feat_index),
00156 sizeof(int32_t), 0xDEADBEAF);
00157
00158 for (int32_t j=i; j<vec.num_feat_entries; j++)
00159 {
00160 float64_t v2=vec.features[j].entry;
00161 uint32_t h=CHash::MurmurHash2(
00162 (uint8_t*)&(vec.features[j].feat_index),
00163 sizeof(int32_t), seed)&mask;
00164 float64_t v;
00165
00166 if (i==j)
00167 v=v1*v1;
00168 else
00169 v=CMath::sqrt(2.0)*v1*v2;
00170
00171 result+=v*vec2[h];
00172 }
00173 }
00174 }
00175 else if (m_degree==3)
00176 SG_NOTIMPLEMENTED;
00177 }
00178
00179 if (m_normalize)
00180 result/=m_normalization_values[vec_idx1];
00181
00182 m_feat->free_feature_vector(vec, vec_idx1);
00183 return result;
00184 }
00185
00186 void CSparsePolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00187 {
00188 if (vec2_len!=m_output_dimensions)
00189 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00190
00191 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00192
00193 float64_t norm_val=1.0;
00194 if (m_normalize)
00195 norm_val = m_normalization_values[vec_idx1];
00196 alpha/=norm_val;
00197
00198 if (m_degree==2)
00199 {
00200
00201 for (int32_t i=0; i<vec.num_feat_entries; i++)
00202 {
00203 float64_t v1=vec.features[i].entry;
00204 uint32_t seed=CHash::MurmurHash2(
00205 (uint8_t*)&(vec.features[i].feat_index), sizeof(int32_t),
00206 0xDEADBEAF);
00207
00208 for (int32_t j=i; j<vec.num_feat_entries; j++)
00209 {
00210 float64_t v2=vec.features[j].entry;
00211 uint32_t h=CHash::MurmurHash2(
00212 (uint8_t*)&(vec.features[j].feat_index),
00213 sizeof(int32_t), seed)&mask;
00214 float64_t v;
00215
00216 if (i==j)
00217 v=alpha*v1*v1;
00218 else
00219 v=alpha*CMath::sqrt(2.0)*v1*v2;
00220
00221 if (abs_val)
00222 vec2[h]+=CMath::abs(v);
00223 else
00224 vec2[h]+=v;
00225 }
00226 }
00227 }
00228 else if (m_degree==3)
00229 SG_NOTIMPLEMENTED;
00230
00231 m_feat->free_feature_vector(vec, vec_idx1);
00232 }
00233
00234 void CSparsePolyFeatures::store_normalization_values()
00235 {
00236 SG_FREE(m_normalization_values);
00237
00238 m_normalization_values_len = this->get_num_vectors();
00239
00240 m_normalization_values=SG_MALLOC(float64_t, m_normalization_values_len);
00241 for (int i=0; i<m_normalization_values_len; i++)
00242 {
00243 float64_t val = CMath::sqrt(dot(i, this,i));
00244 if (val==0)
00245
00246 m_normalization_values[i]=1.0;
00247 else
00248 m_normalization_values[i]=val;
00249 }
00250
00251 }
00252
00253 CFeatures* CSparsePolyFeatures::duplicate() const
00254 {
00255 return new CSparsePolyFeatures(*this);
00256 }
00257
00258 void CSparsePolyFeatures::init()
00259 {
00260 m_parameters->add((CSGObject**) &m_feat, "features",
00261 "Features in original space.");
00262 m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel.");
00263 m_parameters->add(&m_normalize, "normalize", "Normalize");
00264 m_parameters->add(&m_input_dimensions, "input_dimensions",
00265 "Dimensions of the input space.");
00266 m_parameters->add(&m_output_dimensions, "output_dimensions",
00267 "Dimensions of the feature space of the polynomial kernel.");
00268 m_normalization_values_len = get_num_vectors();
00269 m_parameters->add_vector(&m_normalization_values, &m_normalization_values_len,
00270 "m_normalization_values", "Norm of each training example");
00271 m_parameters->add(&mask, "mask", "Mask.");
00272 m_parameters->add(&m_hash_bits, "m_hash_bits", "Number of bits in hash");
00273 }