SNPFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/SNPFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/features/Alphabet.h>
00014 
00015 using namespace shogun;
00016 
00017 CSNPFeatures::CSNPFeatures()
00018 {
00019     SG_UNSTABLE("CSNPFeatures::CSNPFeatures(void)", "\n");
00020 
00021     strings = NULL;
00022 
00023     string_length = 0;
00024     num_strings = 0;
00025     w_dim = 0;
00026 
00027     normalization_const = 0.0;
00028 
00029     m_str_min = NULL;
00030     m_str_maj = NULL;
00031 }
00032 
00033 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(),
00034     m_str_min(NULL), m_str_maj(NULL)
00035 {
00036     ASSERT(str);
00037     ASSERT(str->have_same_length());
00038     SG_REF(str);
00039 
00040     strings=str;
00041     string_length=str->get_max_vector_length();
00042     ASSERT((string_length & 1) == 0); // length divisible by 2
00043     w_dim=3*string_length/2;
00044     num_strings=str->get_num_vectors();
00045     CAlphabet* alpha=str->get_alphabet();
00046     ASSERT(alpha->get_alphabet()==SNP);
00047     SG_UNREF(alpha);
00048 
00049     obtain_base_strings();
00050     set_normalization_const();
00051 
00052 }
00053 
00054 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig)
00055     : CDotFeatures(orig), strings(orig.strings),
00056     normalization_const(orig.normalization_const),
00057     m_str_min(NULL), m_str_maj(NULL)
00058 {
00059     SG_REF(strings);
00060     string_length=strings->get_max_vector_length();
00061     ASSERT((string_length & 1) == 0); // length divisible by 2
00062     w_dim=3*string_length;
00063     num_strings=strings->get_num_vectors();
00064     CAlphabet* alpha=strings->get_alphabet();
00065     SG_UNREF(alpha);
00066 
00067     obtain_base_strings();
00068 }
00069 
00070 CSNPFeatures::~CSNPFeatures()
00071 {
00072     SG_UNREF(strings);
00073 }
00074 
00075 float64_t CSNPFeatures::dot(int32_t idx_a, CDotFeatures* df, int32_t idx_b)
00076 {
00077     ASSERT(df);
00078     ASSERT(df->get_feature_type() == get_feature_type());
00079     ASSERT(df->get_feature_class() == get_feature_class());
00080     CSNPFeatures* sf=(CSNPFeatures*) df;
00081 
00082     int32_t alen, blen;
00083     bool free_avec, free_bvec;
00084 
00085     uint8_t* avec = strings->get_feature_vector(idx_a, alen, free_avec);
00086     uint8_t* bvec = sf->strings->get_feature_vector(idx_b, blen, free_bvec);
00087 
00088     ASSERT(alen==blen);
00089     if (alen!=string_length)
00090         SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length);
00091     ASSERT(m_str_min);
00092     ASSERT(m_str_maj);
00093 
00094     float64_t total=0;
00095 
00096     for (int32_t i = 0; i<alen-1; i+=2)
00097     {
00098         int32_t sumaa=0;
00099         int32_t sumbb=0;
00100         int32_t sumab=0;
00101 
00102         uint8_t a1=avec[i];
00103         uint8_t a2=avec[i+1];
00104         uint8_t b1=bvec[i];
00105         uint8_t b2=bvec[i+1];
00106 
00107         if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00108             sumab++;
00109         else if (a1==a2 && b1==b2)
00110         {
00111             if (a1!=b1)
00112                 continue;
00113 
00114             if (a1==m_str_min[i])
00115                 sumaa++;
00116             else if (a1==m_str_maj[i])
00117                 sumbb++;
00118             else
00119             {
00120                 SG_ERROR("The impossible happened i=%d a1=%c "
00121                         "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]);
00122             }
00123 
00124         }
00125         total+=sumaa+sumbb+sumab;
00126     }
00127 
00128     strings->free_feature_vector(avec, idx_a, free_avec);
00129     sf->strings->free_feature_vector(bvec, idx_b, free_bvec);
00130     return total;
00131 }
00132 
00133 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00134 {
00135     if (vec2_len != w_dim)
00136         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00137 
00138     float64_t sum=0;
00139     int32_t len;
00140     bool free_vec1;
00141     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00142     int32_t offs=0;
00143 
00144     for (int32_t i=0; i<len; i+=2)
00145     {
00146         int32_t dim=0;
00147 
00148         char a1=vec[i];
00149         char a2=vec[i+1];
00150 
00151         if (a1==a2 && a1!='0' && a2!='0')
00152         {
00153             if (a1==m_str_min[i])
00154                 dim=1;
00155             else if (a1==m_str_maj[i])
00156                 dim=2;
00157             else
00158             {
00159                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00160                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00161             }
00162         }
00163 
00164         sum+=vec2[offs+dim];
00165         offs+=3;
00166     }
00167     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00168 
00169     return sum/normalization_const;
00170 }
00171 
00172 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00173 {
00174     if (vec2_len != w_dim)
00175         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00176 
00177     int32_t len;
00178     bool free_vec1;
00179     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00180     int32_t offs=0;
00181 
00182     if (abs_val)
00183         alpha=CMath::abs(alpha);
00184 
00185     for (int32_t i=0; i<len; i+=2)
00186     {
00187         int32_t dim=0;
00188 
00189         char a1=vec[i];
00190         char a2=vec[i+1];
00191 
00192         if (a1==a2 && a1!='0' && a2!='0')
00193         {
00194             if (a1==m_str_min[i])
00195                 dim=1;
00196             else if (a1==m_str_maj[i])
00197                 dim=2;
00198             else
00199             {
00200                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00201                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00202             }
00203         }
00204 
00205         vec2[offs+dim]+=alpha;
00206         offs+=3;
00207     }
00208     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00209 }
00210 
00211 void CSNPFeatures::find_minor_major_strings(uint8_t* minor, uint8_t* major)
00212 {
00213     for (int32_t i=0; i<num_strings; i++)
00214     {
00215         int32_t len;
00216         bool free_vec;
00217         uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec);
00218         ASSERT(string_length==len);
00219 
00220         for (int32_t j=0; j<len; j++)
00221         {
00222             // skip sequencing errors
00223             if (vec[j]=='0')
00224                 continue;
00225 
00226             if (minor[j]==0)
00227                 minor[j]=vec[j];
00228             else if (major[j]==0 && vec[j]!=minor[j])
00229                 major[j]=vec[j];
00230         }
00231 
00232         ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec);
00233     }
00234 }
00235 
00236 void CSNPFeatures::obtain_base_strings(CSNPFeatures* snp)
00237 {
00238     SG_FREE(m_str_min);
00239     SG_FREE(m_str_maj);
00240     size_t tlen=(string_length+1)*sizeof(uint8_t);
00241 
00242     m_str_min=SG_CALLOC(uint8_t, tlen);
00243     m_str_maj=SG_CALLOC(uint8_t, tlen);
00244 
00245     find_minor_major_strings(m_str_min, m_str_maj);
00246 
00247     if (snp)
00248         snp->find_minor_major_strings(m_str_min, m_str_maj);
00249 
00250     for (int32_t j=0; j<string_length; j++)
00251     {
00252         // if only one symbol occurs use 0
00253         if (m_str_min[j]==0)
00254             m_str_min[j]='0';
00255         if (m_str_maj[j]==0)
00256             m_str_maj[j]='0';
00257 
00258         if (m_str_min[j]>m_str_maj[j])
00259             CMath::swap(m_str_min[j], m_str_maj[j]);
00260     }
00261 }
00262 
00263 void CSNPFeatures::set_normalization_const(float64_t n)
00264 {
00265     if (n==0)
00266     {
00267         normalization_const=string_length;
00268         normalization_const=CMath::sqrt(normalization_const);
00269     }
00270     else
00271         normalization_const=n;
00272 
00273     SG_DEBUG("normalization_const:%f\n", normalization_const);
00274 }
00275 
00276 void* CSNPFeatures::get_feature_iterator(int32_t vector_index)
00277 {
00278     return NULL;
00279 }
00280 
00281 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00282 {
00283     return false;
00284 }
00285 
00286 void CSNPFeatures::free_feature_iterator(void* iterator)
00287 {
00288 }
00289 
00290 CFeatures* CSNPFeatures::duplicate() const
00291 {
00292     return new CSNPFeatures(*this);
00293 }
00294 
00295 SGMatrix<float64_t> CSNPFeatures::get_histogram(bool normalize)
00296 {
00297     int32_t nsym=3;
00298     float64_t* h= SG_CALLOC(float64_t, size_t(nsym)*string_length/2);
00299 
00300     float64_t* h_normalizer=SG_MALLOC(float64_t, string_length/2);
00301     memset(h_normalizer, 0, string_length/2*sizeof(float64_t));
00302     int32_t num_str=get_num_vectors();
00303     for (int32_t i=0; i<num_str; i++)
00304     {
00305         int32_t len;
00306         bool free_vec;
00307         uint8_t* vec = strings->get_feature_vector(i, len, free_vec);
00308 
00309         for (int32_t j=0; j<len; j+=2)
00310         {
00311             int32_t dim=0;
00312 
00313             char a1=vec[j];
00314             char a2=vec[j+1];
00315 
00316             if (a1==a2 && a1!='0' && a2!='0')
00317             {
00318                 if (a1==m_str_min[j])
00319                     dim=1;
00320                 else if (a1==m_str_maj[j])
00321                     dim=2;
00322                 else
00323                 {
00324                     SG_ERROR("The impossible happened j=%d a1=%c a2=%c min=%c maj=%c\n",
00325                             j, a1,a2, m_str_min[j], m_str_maj[j]);
00326                 }
00327             }
00328 
00329             h[int64_t(j/2)*nsym+dim]++;
00330             h_normalizer[j/2]++;
00331         }
00332 
00333         strings->free_feature_vector(vec, i, free_vec);
00334     }
00335 
00336     if (normalize)
00337     {
00338         for (int32_t i=0; i<string_length/2; i++)
00339         {
00340             for (int32_t j=0; j<nsym; j++)
00341             {
00342                 if (h_normalizer && h_normalizer[i])
00343                     h[int64_t(i)*nsym+j]/=h_normalizer[i];
00344             }
00345         }
00346     }
00347     SG_FREE(h_normalizer);
00348 
00349     return SGMatrix<float64_t>(h, nsym, string_length/2);
00350 }
00351 
00352 SGMatrix<float64_t> CSNPFeatures::get_2x3_table(CSNPFeatures* pos, CSNPFeatures* neg)
00353 {
00354 
00355     ASSERT(pos->strings->get_max_vector_length() == neg->strings->get_max_vector_length());
00356     int32_t len=pos->strings->get_max_vector_length();
00357 
00358     float64_t* table=SG_MALLOC(float64_t, 3*2*len/2);
00359 
00360     SGMatrix<float64_t> p_hist=pos->get_histogram(false);
00361     SGMatrix<float64_t> n_hist=neg->get_histogram(false);
00362 
00363 
00364     for (int32_t i=0; i<3*len/2; i++)
00365     {
00366         table[2*i]=p_hist.matrix[i];
00367         table[2*i+1]=n_hist.matrix[i];
00368     }
00369     return SGMatrix<float64_t>(table, 2,3*len/2);
00370 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation