00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/lib/common.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/kernel/SNPStringKernel.h>
00014 #include <shogun/kernel/SqrtDiagKernelNormalizer.h>
00015 #include <shogun/features/Features.h>
00016 #include <shogun/features/StringFeatures.h>
00017
00018 using namespace shogun;
00019
00020 CSNPStringKernel::CSNPStringKernel()
00021 : CStringKernel<char>(0),
00022 m_degree(0), m_win_len(0), m_inhomogene(false)
00023 {
00024 init();
00025 set_normalizer(new CSqrtDiagKernelNormalizer());
00026 register_params();
00027 }
00028
00029 CSNPStringKernel::CSNPStringKernel(int32_t size,
00030 int32_t degree, int32_t win_len, bool inhomogene)
00031 : CStringKernel<char>(size),
00032 m_degree(degree), m_win_len(2*win_len), m_inhomogene(inhomogene)
00033 {
00034 init();
00035 set_normalizer(new CSqrtDiagKernelNormalizer());
00036 register_params();
00037 }
00038
00039 CSNPStringKernel::CSNPStringKernel(
00040 CStringFeatures<char>* l, CStringFeatures<char>* r,
00041 int32_t degree, int32_t win_len, bool inhomogene)
00042 : CStringKernel<char>(10), m_degree(degree), m_win_len(2*win_len),
00043 m_inhomogene(inhomogene)
00044 {
00045 init();
00046 set_normalizer(new CSqrtDiagKernelNormalizer());
00047 if (l==r)
00048 obtain_base_strings();
00049 init(l, r);
00050 register_params();
00051 }
00052
00053 CSNPStringKernel::~CSNPStringKernel()
00054 {
00055 cleanup();
00056 }
00057
00058 bool CSNPStringKernel::init(CFeatures* l, CFeatures* r)
00059 {
00060 CStringKernel<char>::init(l, r);
00061 return init_normalizer();
00062 }
00063
00064 void CSNPStringKernel::cleanup()
00065 {
00066 CKernel::cleanup();
00067 SG_FREE(m_str_min);
00068 SG_FREE(m_str_maj);
00069 }
00070
00071 void CSNPStringKernel::obtain_base_strings()
00072 {
00073
00074 ASSERT(lhs==rhs);
00075
00076 m_str_len=0;
00077
00078 for (int32_t i=0; i<num_lhs; i++)
00079 {
00080 int32_t len;
00081 bool free_vec;
00082 char* vec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, len, free_vec);
00083
00084 if (m_str_len==0)
00085 {
00086 m_str_len=len;
00087 m_str_min=SG_CALLOC(char, len+1);
00088 m_str_maj=SG_CALLOC(char, len+1);
00089 }
00090 else
00091 {
00092 ASSERT(m_str_len==len);
00093 }
00094
00095 for (int32_t j=0; j<len; j++)
00096 {
00097
00098 if (vec[j]=='0')
00099 continue;
00100
00101 if (m_str_min[j]==0)
00102 m_str_min[j]=vec[j];
00103 else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j])
00104 m_str_maj[j]=vec[j];
00105 }
00106
00107 ((CStringFeatures<char>*) lhs)->free_feature_vector(vec, i, free_vec);
00108 }
00109
00110 for (int32_t j=0; j<m_str_len; j++)
00111 {
00112
00113 if (m_str_min[j]==0)
00114 m_str_min[j]='0';
00115 if (m_str_maj[j]==0)
00116 m_str_maj[j]='0';
00117
00118 if (m_str_min[j]>m_str_maj[j])
00119 CMath::swap(m_str_min[j], m_str_maj[j]);
00120 }
00121 }
00122
00123 float64_t CSNPStringKernel::compute(int32_t idx_a, int32_t idx_b)
00124 {
00125 int32_t alen, blen;
00126 bool free_avec, free_bvec;
00127
00128 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00129 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00130
00131 ASSERT(alen==blen);
00132 if (alen!=m_str_len)
00133 SG_ERROR("alen (%d) !=m_str_len (%d)\n", alen, m_str_len);
00134 ASSERT(m_str_min);
00135 ASSERT(m_str_maj);
00136
00137 float64_t total=0;
00138 int32_t inhomogene= (m_inhomogene) ? 1 : 0;
00139
00140 for (int32_t i = 0; i<alen-1; i+=2)
00141 {
00142 int32_t sumaa=0;
00143 int32_t sumbb=0;
00144 int32_t sumab=0;
00145
00146 for (int32_t l=0; l<m_win_len && i+l<alen-1; l+=2)
00147 {
00148 char a1=avec[i+l];
00149 char a2=avec[i+l+1];
00150 char b1=bvec[i+l];
00151 char b2=bvec[i+l+1];
00152
00153 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00154 sumab++;
00155 else if (a1==a2 && b1==b2)
00156 {
00157 if (a1!=b1)
00158 continue;
00159
00160 if (a1==m_str_min[i+l])
00161 sumaa++;
00162 else if (a1==m_str_maj[i+l])
00163 sumbb++;
00164 else
00165 {
00166 SG_ERROR("The impossible happened i=%d l=%d a1=%c "
00167 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, l, a1,a2, b1,b2, m_str_min[i+l], m_str_maj[i+l]);
00168 }
00169 }
00170
00171 }
00172 total+=CMath::pow(float64_t(sumaa+sumbb+sumab+inhomogene),
00173 (int32_t) m_degree);
00174 }
00175
00176 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00177 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00178 return total;
00179 }
00180
00181 void CSNPStringKernel::register_params()
00182 {
00183 m_parameters->add(&m_degree, "m_degree", "the order of the kernel");
00184 m_parameters->add(&m_win_len, "m_win_len", "the window length");
00185 m_parameters->add(&m_inhomogene, "m_inhomogene", "the mark of whether it's an inhomogeneous poly kernel");
00186 m_parameters->add_vector(&m_str_min, &m_str_len, "m_str_min", "allele A");
00187 m_parameters->add_vector(&m_str_maj, &m_str_len, "m_str_maj", "allele B");
00188 }
00189
00190 void CSNPStringKernel::init()
00191 {
00192 m_str_min=NULL;
00193 m_str_maj=NULL;
00194 }