SNPFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/SNPFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/features/Alphabet.h>
00014 
00015 using namespace shogun;
00016 
00017 CSNPFeatures::CSNPFeatures()
00018 {
00019     SG_UNSTABLE("CSNPFeatures::CSNPFeatures()", "\n");
00020 
00021     strings = NULL;
00022 
00023     string_length = 0;
00024     num_strings = 0;
00025     w_dim = 0;
00026 
00027     normalization_const = 0.0;
00028 
00029     m_str_min = NULL;
00030     m_str_maj = NULL;
00031 }
00032 
00033 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(),
00034     m_str_min(NULL), m_str_maj(NULL)
00035 {
00036     ASSERT(str);
00037     ASSERT(str->have_same_length());
00038     SG_REF(str);
00039 
00040     strings=str;
00041     string_length=str->get_max_vector_length();
00042     ASSERT((string_length & 1) == 0); // length divisible by 2
00043     w_dim=3*string_length/2;
00044     num_strings=str->get_num_vectors();
00045     CAlphabet* alpha=str->get_alphabet();
00046     ASSERT(alpha->get_alphabet()==SNP);
00047     SG_UNREF(alpha);
00048 
00049     obtain_base_strings();
00050     set_normalization_const();
00051 
00052 }
00053 
00054 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig)
00055     : CDotFeatures(orig), strings(orig.strings),
00056     normalization_const(orig.normalization_const),
00057     m_str_min(NULL), m_str_maj(NULL)
00058 {
00059     SG_REF(strings);
00060     string_length=strings->get_max_vector_length();
00061     ASSERT((string_length & 1) == 0); // length divisible by 2
00062     w_dim=3*string_length;
00063     num_strings=strings->get_num_vectors();
00064     CAlphabet* alpha=strings->get_alphabet();
00065     SG_UNREF(alpha);
00066 
00067     obtain_base_strings();
00068 }
00069 
00070 CSNPFeatures::~CSNPFeatures()
00071 {
00072     SG_UNREF(strings);
00073 }
00074 
00075 int32_t CSNPFeatures::get_dim_feature_space() const
00076 {
00077     return w_dim;
00078 }
00079 
00080 int32_t CSNPFeatures::get_nnz_features_for_vector(int32_t num)
00081 {
00082     return w_dim/3;
00083 }
00084 
00085 EFeatureType CSNPFeatures::get_feature_type()
00086 {
00087     return F_UNKNOWN;
00088 }
00089 
00090 EFeatureClass CSNPFeatures::get_feature_class()
00091 {
00092     return C_WD;
00093 }
00094 
00095 int32_t CSNPFeatures::get_num_vectors() const
00096 {
00097     return num_strings;
00098 }
00099 
00100 int32_t CSNPFeatures::get_size()
00101 {
00102     return sizeof(float64_t);
00103 }
00104 
00105 float64_t CSNPFeatures::get_normalization_const()
00106 {
00107     return normalization_const;
00108 }
00109 
00110 void CSNPFeatures::set_minor_base_string(const char* str)
00111 {
00112     m_str_min=(uint8_t*) strdup(str);
00113 }
00114 
00115 void CSNPFeatures::set_major_base_string(const char* str)
00116 {
00117     m_str_maj=(uint8_t*) strdup(str);
00118 }
00119 
00120 char* CSNPFeatures::get_minor_base_string()
00121 {
00122     return (char*) m_str_min;
00123 }
00124 
00125 char* CSNPFeatures::get_major_base_string()
00126 {
00127     return (char*) m_str_maj;
00128 }
00129 
00130 float64_t CSNPFeatures::dot(int32_t idx_a, CDotFeatures* df, int32_t idx_b)
00131 {
00132     ASSERT(df);
00133     ASSERT(df->get_feature_type() == get_feature_type());
00134     ASSERT(df->get_feature_class() == get_feature_class());
00135     CSNPFeatures* sf=(CSNPFeatures*) df;
00136 
00137     int32_t alen, blen;
00138     bool free_avec, free_bvec;
00139 
00140     uint8_t* avec = strings->get_feature_vector(idx_a, alen, free_avec);
00141     uint8_t* bvec = sf->strings->get_feature_vector(idx_b, blen, free_bvec);
00142 
00143     ASSERT(alen==blen);
00144     if (alen!=string_length)
00145         SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length);
00146     ASSERT(m_str_min);
00147     ASSERT(m_str_maj);
00148 
00149     float64_t total=0;
00150 
00151     for (int32_t i = 0; i<alen-1; i+=2)
00152     {
00153         int32_t sumaa=0;
00154         int32_t sumbb=0;
00155         int32_t sumab=0;
00156 
00157         uint8_t a1=avec[i];
00158         uint8_t a2=avec[i+1];
00159         uint8_t b1=bvec[i];
00160         uint8_t b2=bvec[i+1];
00161 
00162         if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00163             sumab++;
00164         else if (a1==a2 && b1==b2)
00165         {
00166             if (a1!=b1)
00167                 continue;
00168 
00169             if (a1==m_str_min[i])
00170                 sumaa++;
00171             else if (a1==m_str_maj[i])
00172                 sumbb++;
00173             else
00174             {
00175                 SG_ERROR("The impossible happened i=%d a1=%c "
00176                         "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]);
00177             }
00178 
00179         }
00180         total+=sumaa+sumbb+sumab;
00181     }
00182 
00183     strings->free_feature_vector(avec, idx_a, free_avec);
00184     sf->strings->free_feature_vector(bvec, idx_b, free_bvec);
00185     return total;
00186 }
00187 
00188 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00189 {
00190     if (vec2_len != w_dim)
00191         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00192 
00193     float64_t sum=0;
00194     int32_t len;
00195     bool free_vec1;
00196     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00197     int32_t offs=0;
00198 
00199     for (int32_t i=0; i<len; i+=2)
00200     {
00201         int32_t dim=0;
00202 
00203         char a1=vec[i];
00204         char a2=vec[i+1];
00205 
00206         if (a1==a2 && a1!='0' && a2!='0')
00207         {
00208             if (a1==m_str_min[i])
00209                 dim=1;
00210             else if (a1==m_str_maj[i])
00211                 dim=2;
00212             else
00213             {
00214                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00215                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00216             }
00217         }
00218 
00219         sum+=vec2[offs+dim];
00220         offs+=3;
00221     }
00222     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00223 
00224     return sum/normalization_const;
00225 }
00226 
00227 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00228 {
00229     if (vec2_len != w_dim)
00230         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00231 
00232     int32_t len;
00233     bool free_vec1;
00234     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00235     int32_t offs=0;
00236 
00237     if (abs_val)
00238         alpha=CMath::abs(alpha);
00239 
00240     for (int32_t i=0; i<len; i+=2)
00241     {
00242         int32_t dim=0;
00243 
00244         char a1=vec[i];
00245         char a2=vec[i+1];
00246 
00247         if (a1==a2 && a1!='0' && a2!='0')
00248         {
00249             if (a1==m_str_min[i])
00250                 dim=1;
00251             else if (a1==m_str_maj[i])
00252                 dim=2;
00253             else
00254             {
00255                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00256                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00257             }
00258         }
00259 
00260         vec2[offs+dim]+=alpha;
00261         offs+=3;
00262     }
00263     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00264 }
00265 
00266 void CSNPFeatures::find_minor_major_strings(uint8_t* minor, uint8_t* major)
00267 {
00268     for (int32_t i=0; i<num_strings; i++)
00269     {
00270         int32_t len;
00271         bool free_vec;
00272         uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec);
00273         ASSERT(string_length==len);
00274 
00275         for (int32_t j=0; j<len; j++)
00276         {
00277             // skip sequencing errors
00278             if (vec[j]=='0')
00279                 continue;
00280 
00281             if (minor[j]==0)
00282                 minor[j]=vec[j];
00283             else if (major[j]==0 && vec[j]!=minor[j])
00284                 major[j]=vec[j];
00285         }
00286 
00287         ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec);
00288     }
00289 }
00290 
00291 void CSNPFeatures::obtain_base_strings(CSNPFeatures* snp)
00292 {
00293     SG_FREE(m_str_min);
00294     SG_FREE(m_str_maj);
00295     size_t tlen=(string_length+1)*sizeof(uint8_t);
00296 
00297     m_str_min=SG_CALLOC(uint8_t, tlen);
00298     m_str_maj=SG_CALLOC(uint8_t, tlen);
00299 
00300     find_minor_major_strings(m_str_min, m_str_maj);
00301 
00302     if (snp)
00303         snp->find_minor_major_strings(m_str_min, m_str_maj);
00304 
00305     for (int32_t j=0; j<string_length; j++)
00306     {
00307         // if only one symbol occurs use 0
00308         if (m_str_min[j]==0)
00309             m_str_min[j]='0';
00310         if (m_str_maj[j]==0)
00311             m_str_maj[j]='0';
00312 
00313         if (m_str_min[j]>m_str_maj[j])
00314             CMath::swap(m_str_min[j], m_str_maj[j]);
00315     }
00316 }
00317 
00318 void CSNPFeatures::set_normalization_const(float64_t n)
00319 {
00320     if (n==0)
00321     {
00322         normalization_const=string_length;
00323         normalization_const=CMath::sqrt(normalization_const);
00324     }
00325     else
00326         normalization_const=n;
00327 
00328     SG_DEBUG("normalization_const:%f\n", normalization_const);
00329 }
00330 
00331 void* CSNPFeatures::get_feature_iterator(int32_t vector_index)
00332 {
00333     return NULL;
00334 }
00335 
00336 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00337 {
00338     return false;
00339 }
00340 
00341 void CSNPFeatures::free_feature_iterator(void* iterator)
00342 {
00343 }
00344 
00345 CFeatures* CSNPFeatures::duplicate() const
00346 {
00347     return new CSNPFeatures(*this);
00348 }
00349 
00350 SGMatrix<float64_t> CSNPFeatures::get_histogram(bool normalize)
00351 {
00352     int32_t nsym=3;
00353     float64_t* h= SG_CALLOC(float64_t, size_t(nsym)*string_length/2);
00354 
00355     float64_t* h_normalizer=SG_MALLOC(float64_t, string_length/2);
00356     memset(h_normalizer, 0, string_length/2*sizeof(float64_t));
00357     int32_t num_str=get_num_vectors();
00358     for (int32_t i=0; i<num_str; i++)
00359     {
00360         int32_t len;
00361         bool free_vec;
00362         uint8_t* vec = strings->get_feature_vector(i, len, free_vec);
00363 
00364         for (int32_t j=0; j<len; j+=2)
00365         {
00366             int32_t dim=0;
00367 
00368             char a1=vec[j];
00369             char a2=vec[j+1];
00370 
00371             if (a1==a2 && a1!='0' && a2!='0')
00372             {
00373                 if (a1==m_str_min[j])
00374                     dim=1;
00375                 else if (a1==m_str_maj[j])
00376                     dim=2;
00377                 else
00378                 {
00379                     SG_ERROR("The impossible happened j=%d a1=%c a2=%c min=%c maj=%c\n",
00380                             j, a1,a2, m_str_min[j], m_str_maj[j]);
00381                 }
00382             }
00383 
00384             h[int64_t(j/2)*nsym+dim]++;
00385             h_normalizer[j/2]++;
00386         }
00387 
00388         strings->free_feature_vector(vec, i, free_vec);
00389     }
00390 
00391     if (normalize)
00392     {
00393         for (int32_t i=0; i<string_length/2; i++)
00394         {
00395             for (int32_t j=0; j<nsym; j++)
00396             {
00397                 if (h_normalizer && h_normalizer[i])
00398                     h[int64_t(i)*nsym+j]/=h_normalizer[i];
00399             }
00400         }
00401     }
00402     SG_FREE(h_normalizer);
00403 
00404     return SGMatrix<float64_t>(h, nsym, string_length/2);
00405 }
00406 
00407 SGMatrix<float64_t> CSNPFeatures::get_2x3_table(CSNPFeatures* pos, CSNPFeatures* neg)
00408 {
00409 
00410     ASSERT(pos->strings->get_max_vector_length() == neg->strings->get_max_vector_length());
00411     int32_t len=pos->strings->get_max_vector_length();
00412 
00413     float64_t* table=SG_MALLOC(float64_t, 3*2*len/2);
00414 
00415     SGMatrix<float64_t> p_hist=pos->get_histogram(false);
00416     SGMatrix<float64_t> n_hist=neg->get_histogram(false);
00417 
00418 
00419     for (int32_t i=0; i<3*len/2; i++)
00420     {
00421         table[2*i]=p_hist.matrix[i];
00422         table[2*i+1]=n_hist.matrix[i];
00423     }
00424     return SGMatrix<float64_t>(table, 2,3*len/2);
00425 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation