SparsePolyFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 #include "features/SparsePolyFeatures.h"
00011 #include "lib/Hash.h"
00012 
00013 using namespace shogun;
00014 
00015 CSparsePolyFeatures::CSparsePolyFeatures(void)
00016 {
00017     SG_UNSTABLE("CSparsePolyFeatures::CSparsePolyFeatures(void)",
00018                 "\n");
00019 
00020     m_feat = NULL;
00021     m_degree = 0;
00022     m_normalize = false;
00023     m_input_dimensions = 0;
00024     m_output_dimensions = 0;
00025     m_normalization_values = NULL;
00026     mask = 0;
00027     m_hash_bits = 0;
00028 }
00029 
00030 CSparsePolyFeatures::CSparsePolyFeatures(CSparseFeatures<float64_t>* feat, int32_t degree, bool normalize, int32_t hash_bits)
00031     : CDotFeatures(), m_normalization_values(NULL)
00032 {
00033     ASSERT(feat);
00034 
00035     m_feat = feat;
00036     SG_REF(m_feat);
00037     m_degree=degree;
00038     m_normalize=normalize;
00039     m_hash_bits=hash_bits;
00040     mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00041     m_output_dimensions=1<<m_hash_bits;
00042     m_input_dimensions=feat->get_num_features();
00043 
00044     if (m_normalize)
00045         store_normalization_values();
00046 }
00047 
00048 CSparsePolyFeatures::~CSparsePolyFeatures()
00049 {
00050     delete[] m_normalization_values;
00051     SG_UNREF(m_feat);
00052 }
00053 
00054 float64_t CSparsePolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00055 {
00056     ASSERT(df);
00057     ASSERT(df->get_feature_type() == get_feature_type());
00058     ASSERT(df->get_feature_class() == get_feature_class());
00059 
00060     CSparsePolyFeatures* pf=(CSparsePolyFeatures*) df;
00061 
00062     int32_t len1, len2;
00063     bool do_free1, do_free2;
00064     TSparseEntry<float64_t>* vec1 = m_feat->get_sparse_feature_vector(vec_idx1, len1, do_free1);
00065     TSparseEntry<float64_t>* vec2 = pf->m_feat->get_sparse_feature_vector(vec_idx2, len2, do_free2);
00066 
00067     float64_t result=CSparseFeatures<float64_t>::sparse_dot(1, vec1, len1, vec2, len2);
00068     result=CMath::pow(result, m_degree);
00069 
00070     m_feat->free_feature_vector(vec1, len1, do_free1);
00071     pf->m_feat->free_feature_vector(vec2, len2, do_free2);
00072 
00073     return result;
00074 }
00075 
00076 float64_t CSparsePolyFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00077 {
00078     if (vec2_len != m_output_dimensions)
00079         SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00080 
00081     int32_t vlen;
00082     bool do_free;
00083     TSparseEntry<float64_t>* vec = m_feat->get_sparse_feature_vector(vec_idx1, vlen, do_free);
00084 
00085     float64_t result=0;
00086 
00087     if (vec)
00088     {
00089         if (m_degree==2)
00090         {
00091             /* (a+b)^2 = a^2 + 2ab +b^2 */
00092             for (int32_t i=0; i<vlen; i++)
00093             {
00094                 float64_t v1=vec[i].entry;
00095                 uint32_t seed=CHash::MurmurHash2((uint8_t*) &(vec[i].feat_index), sizeof(int32_t), 0xDEADBEAF);
00096 
00097                 for (int32_t j=i; j<vlen; j++)
00098                 {
00099                     float64_t v2=vec[j].entry;
00100                     uint32_t h=CHash::MurmurHash2((uint8_t*) &(vec[j].feat_index), sizeof(int32_t), seed) & mask;
00101                     float64_t v;
00102 
00103                     if (i==j)
00104                         v=v1*v1;
00105                     else
00106                         v=CMath::sqrt(2.0)*v1*v2;
00107 
00108                     result+=v*vec2[h];
00109                 }
00110             }
00111         }
00112         else if (m_degree==3)
00113             SG_NOTIMPLEMENTED;
00114     }
00115     
00116     if (m_normalize)
00117         result/=m_normalization_values[vec_idx1];
00118 
00119     m_feat->free_feature_vector(vec, vlen, do_free);
00120     return result;
00121 }
00122 
00123 void CSparsePolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00124 {
00125     if (vec2_len != m_output_dimensions)
00126         SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions);
00127 
00128     int32_t vlen;
00129     bool do_free;
00130     TSparseEntry<float64_t>* vec = m_feat->get_sparse_feature_vector(vec_idx1, vlen, do_free);
00131 
00132     float64_t norm_val=1.0;
00133     if (m_normalize)
00134         norm_val = m_normalization_values[vec_idx1];
00135     alpha/=norm_val;
00136 
00137     if (m_degree==2)
00138     {
00139         /* (a+b)^2 = a^2 + 2ab +b^2 */
00140         for (int32_t i=0; i<vlen; i++)
00141         {
00142             float64_t v1=vec[i].entry;
00143             uint32_t seed=CHash::MurmurHash2((uint8_t*) &(vec[i].feat_index), sizeof(int32_t), 0xDEADBEAF);
00144 
00145             for (int32_t j=i; j<vlen; j++)
00146             {
00147                 float64_t v2=vec[j].entry;
00148                 uint32_t h=CHash::MurmurHash2((uint8_t*) &(vec[j].feat_index), sizeof(int32_t), seed) & mask;
00149                 float64_t v;
00150 
00151                 if (i==j)
00152                     v=alpha*v1*v1;
00153                 else
00154                     v=alpha*CMath::sqrt(2.0)*v1*v2;
00155 
00156                 if (abs_val)
00157                     vec2[h]+=CMath::abs(v); 
00158                 else
00159                     vec2[h]+=v; 
00160             }
00161         }
00162     }
00163     else if (m_degree==3)
00164         SG_NOTIMPLEMENTED;
00165 
00166     m_feat->free_feature_vector(vec, vlen, do_free);
00167 }
00168 
00169 void CSparsePolyFeatures::store_normalization_values()
00170 {
00171     delete[] m_normalization_values;
00172 
00173     int32_t num_vec = this->get_num_vectors();
00174 
00175     m_normalization_values=new float64_t[num_vec];
00176     for (int i=0; i<num_vec; i++)
00177     {
00178         float64_t val = CMath::sqrt(dot(i, this,i)); 
00179         if (val==0)
00180             // trap division by zero
00181             m_normalization_values[i]=1.0;
00182         else 
00183             m_normalization_values[i]=val;
00184     }
00185         
00186 }
00187 
00188 CFeatures* CSparsePolyFeatures::duplicate() const
00189 {
00190     return new CSparsePolyFeatures(*this);
00191 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation