00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/features/SparsePolyFeatures.h>
00011 #include <shogun/lib/Hash.h>
00012
00013 using namespace shogun;
00014
00015 CSparsePolyFeatures::CSparsePolyFeatures()
00016 {
00017 SG_UNSTABLE("CSparsePolyFeatures::CSparsePolyFeatures(void)",
00018 "\n");
00019
00020 m_feat = NULL;
00021 m_degree = 0;
00022 m_normalize = false;
00023 m_input_dimensions = 0;
00024 m_output_dimensions = 0;
00025 m_normalization_values = NULL;
00026 mask = 0;
00027 m_hash_bits = 0;
00028 }
00029
00030 CSparsePolyFeatures::CSparsePolyFeatures(CSparseFeatures<float64_t>* feat, int32_t degree, bool normalize, int32_t hash_bits)
00031 : CDotFeatures(), m_normalization_values(NULL)
00032 {
00033 ASSERT(feat);
00034
00035 m_feat = feat;
00036 SG_REF(m_feat);
00037 m_degree=degree;
00038 m_normalize=normalize;
00039 m_hash_bits=hash_bits;
00040 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00041 m_output_dimensions=1<<m_hash_bits;
00042 m_input_dimensions=feat->get_num_features();
00043
00044 if (m_normalize)
00045 store_normalization_values();
00046 }
00047
00048 CSparsePolyFeatures::~CSparsePolyFeatures()
00049 {
00050 SG_FREE(m_normalization_values);
00051 SG_UNREF(m_feat);
00052 }
00053
00054 float64_t CSparsePolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00055 {
00056 ASSERT(df);
00057 ASSERT(df->get_feature_type() == get_feature_type());
00058 ASSERT(df->get_feature_class() == get_feature_class());
00059
00060 CSparsePolyFeatures* pf=(CSparsePolyFeatures*) df;
00061
00062 SGSparseVector<float64_t> vec1=m_feat->get_sparse_feature_vector(vec_idx1);
00063 SGSparseVector<float64_t> vec2=pf->m_feat->get_sparse_feature_vector(
00064 vec_idx2);
00065
00066 float64_t result=CSparseFeatures<float64_t>::sparse_dot(1, vec1.features,
00067 vec1.num_feat_entries, vec2.features, vec2.num_feat_entries);
00068 result=CMath::pow(result, m_degree);
00069
00070 m_feat->free_feature_vector(vec1, vec_idx1);
00071 pf->m_feat->free_feature_vector(vec2, vec_idx2);
00072
00073 return result;
00074 }
00075
00076 float64_t CSparsePolyFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00077 {
00078 if (vec2_len != m_output_dimensions)
00079 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00080
00081 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00082
00083 float64_t result=0;
00084
00085 if (vec.features)
00086 {
00087 if (m_degree==2)
00088 {
00089
00090 for (int32_t i=0; i<vec.num_feat_entries; i++)
00091 {
00092 float64_t v1=vec.features[i].entry;
00093 uint32_t seed=CHash::MurmurHash2(
00094 (uint8_t*)&(vec.features[i].feat_index),
00095 sizeof(int32_t), 0xDEADBEAF);
00096
00097 for (int32_t j=i; j<vec.num_feat_entries; j++)
00098 {
00099 float64_t v2=vec.features[j].entry;
00100 uint32_t h=CHash::MurmurHash2(
00101 (uint8_t*)&(vec.features[j].feat_index),
00102 sizeof(int32_t), seed)&mask;
00103 float64_t v;
00104
00105 if (i==j)
00106 v=v1*v1;
00107 else
00108 v=CMath::sqrt(2.0)*v1*v2;
00109
00110 result+=v*vec2[h];
00111 }
00112 }
00113 }
00114 else if (m_degree==3)
00115 SG_NOTIMPLEMENTED;
00116 }
00117
00118 if (m_normalize)
00119 result/=m_normalization_values[vec_idx1];
00120
00121 m_feat->free_feature_vector(vec, vec_idx1);
00122 return result;
00123 }
00124
00125 void CSparsePolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00126 {
00127 if (vec2_len!=m_output_dimensions)
00128 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00129
00130 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00131
00132 float64_t norm_val=1.0;
00133 if (m_normalize)
00134 norm_val = m_normalization_values[vec_idx1];
00135 alpha/=norm_val;
00136
00137 if (m_degree==2)
00138 {
00139
00140 for (int32_t i=0; i<vec.num_feat_entries; i++)
00141 {
00142 float64_t v1=vec.features[i].entry;
00143 uint32_t seed=CHash::MurmurHash2(
00144 (uint8_t*)&(vec.features[i].feat_index), sizeof(int32_t),
00145 0xDEADBEAF);
00146
00147 for (int32_t j=i; j<vec.num_feat_entries; j++)
00148 {
00149 float64_t v2=vec.features[j].entry;
00150 uint32_t h=CHash::MurmurHash2(
00151 (uint8_t*)&(vec.features[j].feat_index),
00152 sizeof(int32_t), seed)&mask;
00153 float64_t v;
00154
00155 if (i==j)
00156 v=alpha*v1*v1;
00157 else
00158 v=alpha*CMath::sqrt(2.0)*v1*v2;
00159
00160 if (abs_val)
00161 vec2[h]+=CMath::abs(v);
00162 else
00163 vec2[h]+=v;
00164 }
00165 }
00166 }
00167 else if (m_degree==3)
00168 SG_NOTIMPLEMENTED;
00169
00170 m_feat->free_feature_vector(vec, vec_idx1);
00171 }
00172
00173 void CSparsePolyFeatures::store_normalization_values()
00174 {
00175 SG_FREE(m_normalization_values);
00176
00177 m_normalization_values_len = this->get_num_vectors();
00178
00179 m_normalization_values=SG_MALLOC(float64_t, m_normalization_values_len);
00180 for (int i=0; i<m_normalization_values_len; i++)
00181 {
00182 float64_t val = CMath::sqrt(dot(i, this,i));
00183 if (val==0)
00184
00185 m_normalization_values[i]=1.0;
00186 else
00187 m_normalization_values[i]=val;
00188 }
00189
00190 }
00191
00192 CFeatures* CSparsePolyFeatures::duplicate() const
00193 {
00194 return new CSparsePolyFeatures(*this);
00195 }
00196
00197 void CSparsePolyFeatures::init()
00198 {
00199 m_parameters->add((CSGObject**) &m_feat, "features",
00200 "Features in original space.");
00201 m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel.");
00202 m_parameters->add(&m_normalize, "normalize", "Normalize");
00203 m_parameters->add(&m_input_dimensions, "input_dimensions",
00204 "Dimensions of the input space.");
00205 m_parameters->add(&m_output_dimensions, "output_dimensions",
00206 "Dimensions of the feature space of the polynomial kernel.");
00207 m_normalization_values_len = get_num_vectors();
00208 m_parameters->add_vector(&m_normalization_values, &m_normalization_values_len,
00209 "m_normalization_values", "Norm of each training example");
00210 m_parameters->add(&mask, "mask", "Mask.");
00211 m_parameters->add(&m_hash_bits, "m_hash_bits", "Number of bits in hash");
00212 }