SNPStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Berlin Institute of Technology
00009  */
00010 
00011 #include <shogun/lib/common.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/kernel/SNPStringKernel.h>
00014 #include <shogun/kernel/SqrtDiagKernelNormalizer.h>
00015 #include <shogun/features/Features.h>
00016 #include <shogun/features/StringFeatures.h>
00017 
00018 using namespace shogun;
00019 
00020 CSNPStringKernel::CSNPStringKernel()
00021 : CStringKernel<char>(0),
00022   m_degree(0), m_win_len(0), m_inhomogene(false)
00023 {
00024     init();
00025     set_normalizer(new CSqrtDiagKernelNormalizer());
00026     register_params();
00027 }
00028 
00029 CSNPStringKernel::CSNPStringKernel(int32_t size,
00030         int32_t degree, int32_t win_len, bool inhomogene)
00031 : CStringKernel<char>(size),
00032     m_degree(degree), m_win_len(2*win_len), m_inhomogene(inhomogene)
00033 {
00034     init();
00035     set_normalizer(new CSqrtDiagKernelNormalizer());
00036     register_params();
00037 }
00038 
00039 CSNPStringKernel::CSNPStringKernel(
00040     CStringFeatures<char>* l, CStringFeatures<char>* r,
00041     int32_t degree, int32_t win_len, bool inhomogene)
00042 : CStringKernel<char>(10), m_degree(degree), m_win_len(2*win_len),
00043     m_inhomogene(inhomogene)
00044 {
00045     init();
00046     set_normalizer(new CSqrtDiagKernelNormalizer());
00047     if (l==r)
00048         obtain_base_strings();
00049     init(l, r);
00050     register_params();
00051 }
00052 
00053 CSNPStringKernel::~CSNPStringKernel()
00054 {
00055     cleanup();
00056 }
00057 
00058 bool CSNPStringKernel::init(CFeatures* l, CFeatures* r)
00059 {
00060     CStringKernel<char>::init(l, r);
00061     return init_normalizer();
00062 }
00063 
00064 void CSNPStringKernel::cleanup()
00065 {
00066     CKernel::cleanup();
00067     SG_FREE(m_str_min);
00068     SG_FREE(m_str_maj);
00069 }
00070 
00071 void CSNPStringKernel::obtain_base_strings()
00072 {
00073     //should only be called on training data
00074     ASSERT(lhs==rhs);
00075 
00076     m_str_len=0;
00077 
00078     for (int32_t i=0; i<num_lhs; i++)
00079     {
00080         int32_t len;
00081         bool free_vec;
00082         char* vec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, len, free_vec);
00083 
00084         if (m_str_len==0)
00085         {
00086             m_str_len=len;
00087             m_str_min=SG_CALLOC(char, len+1);
00088             m_str_maj=SG_CALLOC(char, len+1);
00089         }
00090         else
00091         {
00092             ASSERT(m_str_len==len);
00093         }
00094 
00095         for (int32_t j=0; j<len; j++)
00096         {
00097             // skip sequencing errors
00098             if (vec[j]=='0')
00099                 continue;
00100 
00101             if (m_str_min[j]==0)
00102                 m_str_min[j]=vec[j];
00103             else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j])
00104                 m_str_maj[j]=vec[j];
00105         }
00106 
00107         ((CStringFeatures<char>*) lhs)->free_feature_vector(vec, i, free_vec);
00108     }
00109 
00110     for (int32_t j=0; j<m_str_len; j++)
00111     {
00112         // if only one one symbol occurs use 0
00113         if (m_str_min[j]==0)
00114             m_str_min[j]='0';
00115         if (m_str_maj[j]==0)
00116             m_str_maj[j]='0';
00117 
00118         if (m_str_min[j]>m_str_maj[j])
00119             CMath::swap(m_str_min[j], m_str_maj[j]);
00120     }
00121 }
00122 
00123 float64_t CSNPStringKernel::compute(int32_t idx_a, int32_t idx_b)
00124 {
00125     int32_t alen, blen;
00126     bool free_avec, free_bvec;
00127 
00128     char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00129     char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00130 
00131     ASSERT(alen==blen);
00132     if (alen!=m_str_len)
00133         SG_ERROR("alen (%d) !=m_str_len (%d)\n", alen, m_str_len);
00134     ASSERT(m_str_min);
00135     ASSERT(m_str_maj);
00136 
00137     float64_t total=0;
00138     int32_t inhomogene= (m_inhomogene) ? 1 : 0;
00139 
00140     for (int32_t i = 0; i<alen-1; i+=2)
00141     {
00142         int32_t sumaa=0;
00143         int32_t sumbb=0;
00144         int32_t sumab=0;
00145 
00146         for (int32_t l=0; l<m_win_len && i+l<alen-1; l+=2)
00147         {
00148             char a1=avec[i+l];
00149             char a2=avec[i+l+1];
00150             char b1=bvec[i+l];
00151             char b2=bvec[i+l+1];
00152 
00153             if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00154                 sumab++;
00155             else if (a1==a2 && b1==b2)
00156             {
00157                 if (a1!=b1)
00158                     continue;
00159 
00160                 if (a1==m_str_min[i+l])
00161                     sumaa++;
00162                 else if (a1==m_str_maj[i+l])
00163                     sumbb++;
00164                 else
00165                 {
00166                     SG_ERROR("The impossible happened i=%d l=%d a1=%c "
00167                             "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, l, a1,a2, b1,b2, m_str_min[i+l], m_str_maj[i+l]);
00168                 }
00169             }
00170 
00171         }
00172         total+=CMath::pow(float64_t(sumaa+sumbb+sumab+inhomogene),
00173                 (int32_t) m_degree);
00174     }
00175 
00176     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00177     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00178     return total;
00179 }
00180 
00181 void CSNPStringKernel::register_params()
00182 {
00183     m_parameters->add(&m_degree, "m_degree", "the order of the kernel");
00184     m_parameters->add(&m_win_len, "m_win_len", "the window length");
00185     m_parameters->add(&m_inhomogene, "m_inhomogene", "the mark of whether it's an inhomogeneous poly kernel");
00186     m_parameters->add_vector(&m_str_min, &m_str_len, "m_str_min", "allele A");
00187     m_parameters->add_vector(&m_str_maj, &m_str_len, "m_str_maj", "allele B");
00188 }
00189 
00190 void CSNPStringKernel::init()
00191 {
00192     m_str_min=NULL;
00193     m_str_maj=NULL;
00194 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation