SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
SNPStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Soeren Sonnenburg
8  * Copyright (C) 2009 Berlin Institute of Technology
9  */
10 
11 #include <shogun/lib/common.h>
12 #include <shogun/io/SGIO.h>
17 
18 using namespace shogun;
19 
21 : CStringKernel<char>(0),
22  m_degree(0), m_win_len(0), m_inhomogene(false)
23 {
24  init();
27 }
28 
30  int32_t degree, int32_t win_len, bool inhomogene)
31 : CStringKernel<char>(size),
32  m_degree(degree), m_win_len(2*win_len), m_inhomogene(inhomogene)
33 {
34  init();
37 }
38 
41  int32_t degree, int32_t win_len, bool inhomogene)
42 : CStringKernel<char>(10), m_degree(degree), m_win_len(2*win_len),
43  m_inhomogene(inhomogene)
44 {
45  init();
47  if (l==r)
49  init(l, r);
51 }
52 
54 {
55  cleanup();
56 }
57 
/**
 * Initialise the kernel for a pair of feature objects.
 *
 * @param l feature object, presumably the left-hand side - TODO confirm
 * @param r feature object, presumably the right-hand side - TODO confirm
 * @return whatever init_normalizer() returns
 *
 * NOTE(review): neither l nor r is used by the statements visible here, and
 * the embedded listing numbers skip from 59 to 61, so a statement appears to
 * have been dropped from this listing - confirm against the upstream file
 * before relying on this body.
 */
58 bool CSNPStringKernel::init(CFeatures* l, CFeatures* r)
59 {
61  return init_normalizer();
62 }
63 
65 {
67  SG_FREE(m_str_min);
68  SG_FREE(m_str_maj);
69 }
70 
72 {
73  //should only be called on training data
74  ASSERT(lhs==rhs)
75 
76  m_str_len=0;
77 
78  for (int32_t i=0; i<num_lhs; i++)
79  {
80  int32_t len;
81  bool free_vec;
82  char* vec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, len, free_vec);
83 
84  if (m_str_len==0)
85  {
86  m_str_len=len;
87  m_str_min=SG_CALLOC(char, len+1);
88  m_str_maj=SG_CALLOC(char, len+1);
89  }
90  else
91  {
92  ASSERT(m_str_len==len)
93  }
94 
95  for (int32_t j=0; j<len; j++)
96  {
97  // skip sequencing errors
98  if (vec[j]=='0')
99  continue;
100 
101  if (m_str_min[j]==0)
102  m_str_min[j]=vec[j];
103  else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j])
104  m_str_maj[j]=vec[j];
105  }
106 
107  ((CStringFeatures<char>*) lhs)->free_feature_vector(vec, i, free_vec);
108  }
109 
110  for (int32_t j=0; j<m_str_len; j++)
111  {
112  // if only one symbol occurs use 0
113  if (m_str_min[j]==0)
114  m_str_min[j]='0';
115  if (m_str_maj[j]==0)
116  m_str_maj[j]='0';
117 
118  if (m_str_min[j]>m_str_maj[j])
120  }
121 }
122 
/**
 * Compute the kernel value between feature vector idx_a of lhs and feature
 * vector idx_b of rhs.
 *
 * Both vectors are read as consecutive two-character pairs (positions i and
 * i+1). For every pair start i, the pairs that lie inside a window of up to
 * m_win_len characters are classified and counted; that count (plus 1 when
 * m_inhomogene is set) is raised to the power m_degree and added to the
 * running total.
 *
 * @param idx_a index of the feature vector fetched from lhs
 * @param idx_b index of the feature vector fetched from rhs
 * @return sum over all pair starts of (sumaa+sumbb+sumab+inhomogene)^m_degree
 *
 * The two vectors must have equal length (ASSERT) and that length must equal
 * m_str_len (SG_ERROR otherwise).
 *
 * NOTE(review): the embedded listing numbers jump from 133 to 136 below, so
 * one or two statements of the original file may be missing from this
 * listing - confirm against the upstream file.
 */
123 float64_t CSNPStringKernel::compute(int32_t idx_a, int32_t idx_b)
124 {
125  int32_t alen, blen;
126  bool free_avec, free_bvec;
127 
// lhs and rhs are cast to CStringFeatures<char>; each vector obtained here is
// handed back through free_feature_vector() right before returning.
128  char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
129  char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
130 
// Length checks: a and b must agree, and both must match the stored m_str_len.
131  ASSERT(alen==blen)
132  if (alen!=m_str_len)
133  SG_ERROR("alen (%d) !=m_str_len (%d)\n", alen, m_str_len)
136 
137  float64_t total=0;
// 1 when m_inhomogene is set, 0 otherwise; added to every window count below.
138  int32_t inhomogene= (m_inhomogene) ? 1 : 0;
139 
// i walks over pair starts 0, 2, 4, ... as long as a full pair (i, i+1) fits.
140  for (int32_t i = 0; i<alen-1; i+=2)
141  {
// Per-window counters, reset for every pair start.
142  int32_t sumaa=0;
143  int32_t sumbb=0;
144  int32_t sumab=0;
145 
// l is the even offset inside the window; the loop ends at the window limit
// (m_win_len) or as soon as no complete pair is left in the vectors.
146  for (int32_t l=0; l<m_win_len && i+l<alen-1; l+=2)
147  {
148  char a1=avec[i+l];
149  char a2=avec[i+l+1];
150  char b1=bvec[i+l];
151  char b2=bvec[i+l+1];
152 
// Case 1: in both vectors the pair has two different characters or contains '0'.
153  if ((a1!=a2 || a1=='0' || a2=='0') && (b1!=b2 || b1=='0' || b2=='0'))
154  sumab++;
// Case 2: in both vectors the pair is one character repeated twice.
155  else if (a1==a2 && b1==b2)
156  {
// Different repeated characters in a and b: this pair adds nothing.
157  if (a1!=b1)
158  continue;
159 
// Same repeated character in both: count it under the base string it matches
// at this position (m_str_min is registered as "allele A", m_str_maj as
// "allele B").
160  if (a1==m_str_min[i+l])
161  sumaa++;
162  else if (a1==m_str_maj[i+l])
163  sumbb++;
164  else
165  {
// The shared character matches neither base string at this position.
166  SG_ERROR("The impossible happened i=%d l=%d a1=%c "
167  "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, l, a1,a2, b1,b2, m_str_min[i+l], m_str_maj[i+l]);
168  }
169  }
// Any other combination (one pair from case 1, the other from case 2) falls
// through both branches and is not counted.
170 
171  }
// Add this window's contribution: (count + inhomogene) raised to m_degree.
172  total+=CMath::pow(float64_t(sumaa+sumbb+sumab+inhomogene),
173  (int32_t) m_degree);
174  }
175 
// Return both vectors to their feature objects.
176  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
177  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
178  return total;
179 }
180 
182 {
183  SG_ADD(&m_degree, "m_degree", "the order of the kernel", MS_AVAILABLE);
184  SG_ADD(&m_win_len, "m_win_len", "the window length", MS_AVAILABLE);
185  SG_ADD(&m_inhomogene, "m_inhomogene",
186  "the mark of whether it's an inhomogeneous poly kernel", MS_NOT_AVAILABLE);
187  m_parameters->add_vector(&m_str_min, &m_str_len, "m_str_min", "allele A");
188  m_parameters->add_vector(&m_str_maj, &m_str_len, "m_str_maj", "allele B");
189 }
190 
191 void CSNPStringKernel::init()
192 {
193  m_str_min=NULL;
194  m_str_maj=NULL;
195  m_str_len=0;
196 }
virtual void cleanup()
Definition: Kernel.cpp:173
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:150
virtual float64_t compute(int32_t idx_a, int32_t idx_b)
#define SG_ERROR(...)
Definition: SGIO.h:129
Parameter * m_parameters
Definition: SGObject.h:546
virtual bool init(CFeatures *l, CFeatures *r)
#define ASSERT(x)
Definition: SGIO.h:201
double float64_t
Definition: common.h:50
int32_t num_lhs
number of feature vectors on left hand side
Definition: Kernel.h:1068
virtual bool init_normalizer()
Definition: Kernel.cpp:168
CFeatures * rhs
feature vectors to occur on right hand side
Definition: Kernel.h:1062
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
Definition: Parameter.cpp:334
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
CFeatures * lhs
feature vectors to occur on left hand side
Definition: Kernel.h:1060
The class Features is the base class of all feature objects.
Definition: Features.h:68
static void swap(T &a, T &b)
Definition: Math.h:438
#define SG_ADD(...)
Definition: SGObject.h:84
friend class CSqrtDiagKernelNormalizer
Definition: Kernel.h:162
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535

SHOGUN Machine Learning Toolbox - Documentation