SNPStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Berlin Institute of Technology
00009  */
00010 
00011 #include "lib/common.h"
00012 #include "lib/io.h"
00013 #include "kernel/SNPStringKernel.h"
00014 #include "kernel/SqrtDiagKernelNormalizer.h"
00015 #include "features/Features.h"
00016 #include "features/StringFeatures.h"
00017 
00018 using namespace shogun;
00019 
00020 CSNPStringKernel::CSNPStringKernel(void)
00021 : CStringKernel<char>(0),
00022   m_degree(0), m_win_len(0), m_inhomogene(false)
00023 {
00024     SG_UNSTABLE("CSNPStringKernel::CSNPStringKernel(void)", "\n");
00025 
00026     m_str_min=NULL;
00027     m_str_maj=NULL;
00028     set_normalizer(new CSqrtDiagKernelNormalizer());
00029 }
00030 
00031 CSNPStringKernel::CSNPStringKernel(int32_t size,
00032         int32_t degree, int32_t win_len, bool inhomogene)
00033 : CStringKernel<char>(size),
00034     m_degree(degree), m_win_len(2*win_len), m_inhomogene(inhomogene)
00035 {
00036     m_str_min=NULL;
00037     m_str_maj=NULL;
00038     set_normalizer(new CSqrtDiagKernelNormalizer());
00039 }
00040 
00041 CSNPStringKernel::CSNPStringKernel(
00042     CStringFeatures<char>* l, CStringFeatures<char>* r,
00043     int32_t degree, int32_t win_len, bool inhomogene)
00044 : CStringKernel<char>(10), m_degree(degree), m_win_len(2*win_len),
00045     m_inhomogene(inhomogene)
00046 {
00047     m_str_min=NULL;
00048     m_str_maj=NULL;
00049     set_normalizer(new CSqrtDiagKernelNormalizer());
00050     if (l==r)
00051         obtain_base_strings();
00052     init(l, r);
00053 }
00054 
00055 CSNPStringKernel::~CSNPStringKernel()
00056 {
00057     cleanup();
00058 }
00059 
00060 bool CSNPStringKernel::init(CFeatures* l, CFeatures* r)
00061 {
00062     CStringKernel<char>::init(l, r);
00063     return init_normalizer();
00064 }
00065 
00066 void CSNPStringKernel::cleanup()
00067 {
00068     CKernel::cleanup();
00069     free(m_str_min);
00070     free(m_str_maj);
00071 }
00072 
00073 void CSNPStringKernel::obtain_base_strings()
00074 {
00075     //should only be called on training data
00076     ASSERT(lhs==rhs);
00077 
00078     m_str_len=0;
00079 
00080     for (int32_t i=0; i<num_lhs; i++)
00081     {
00082         int32_t len;
00083         bool free_vec;
00084         char* vec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, len, free_vec);
00085 
00086         if (m_str_len==0)
00087         {
00088             m_str_len=len;
00089             size_t tlen=(len+1)*sizeof(char);
00090             m_str_min=(char*) malloc(tlen);
00091             m_str_maj=(char*) malloc(tlen);
00092             memset(m_str_min, 0, tlen);
00093             memset(m_str_maj, 0, tlen);
00094         }
00095         else
00096         {
00097             ASSERT(m_str_len==len);
00098         }
00099 
00100         for (int32_t j=0; j<len; j++)
00101         {
00102             // skip sequencing errors
00103             if (vec[j]=='0')
00104                 continue;
00105 
00106             if (m_str_min[j]==0)
00107                 m_str_min[j]=vec[j];
00108             else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j])
00109                 m_str_maj[j]=vec[j];
00110         }
00111 
00112         ((CStringFeatures<char>*) lhs)->free_feature_vector(vec, i, free_vec);
00113     }
00114 
00115     for (int32_t j=0; j<m_str_len; j++)
00116     {
00117         // if only one one symbol occurs use 0
00118         if (m_str_min[j]==0)
00119             m_str_min[j]='0';
00120         if (m_str_maj[j]==0)
00121             m_str_maj[j]='0';
00122 
00123         if (m_str_min[j]>m_str_maj[j])
00124             CMath::swap(m_str_min[j], m_str_maj[j]);
00125     }
00126 }
00127 
00128 float64_t CSNPStringKernel::compute(int32_t idx_a, int32_t idx_b)
00129 {
00130     int32_t alen, blen;
00131     bool free_avec, free_bvec;
00132 
00133     char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00134     char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00135 
00136     ASSERT(alen==blen);
00137     if (alen!=m_str_len)
00138         SG_ERROR("alen (%d) !=m_str_len (%d)\n", alen, m_str_len);
00139     ASSERT(m_str_min);
00140     ASSERT(m_str_maj);
00141 
00142     float64_t total=0;
00143     int32_t inhomogene= (m_inhomogene) ? 1 : 0;
00144 
00145     for (int32_t i = 0; i<alen-1; i+=2)
00146     {
00147         int32_t sumaa=0;
00148         int32_t sumbb=0;
00149         int32_t sumab=0;
00150 
00151         for (int32_t l=0; l<m_win_len && i+l<alen-1; l+=2)
00152         {
00153             char a1=avec[i+l];
00154             char a2=avec[i+l+1];
00155             char b1=bvec[i+l];
00156             char b2=bvec[i+l+1];
00157 
00158             if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00159                 sumab++;
00160             else if (a1==a2 && b1==b2)
00161             {
00162                 if (a1!=b1)
00163                     continue;
00164 
00165                 if (a1==m_str_min[i+l])
00166                     sumaa++;
00167                 else if (a1==m_str_maj[i+l])
00168                     sumbb++;
00169                 else
00170                 {
00171                     SG_ERROR("The impossible happened i=%d l=%d a1=%c "
00172                             "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, l, a1,a2, b1,b2, m_str_min[i+l], m_str_maj[i+l]);
00173                 }
00174             }
00175 
00176         }
00177         total+=CMath::pow(float64_t(sumaa+sumbb+sumab+inhomogene),
00178                 (int32_t) m_degree);
00179     }
00180 
00181     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00182     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00183     return total;
00184 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation