77 getExpFunctionCache(max_len);
82 const std::string& sequence, uint32_t k_mer_length,
83 const std::string& allowed_characters,
84 std::vector< std::pair<int32_t, float64_t> >& values)
88 std::map<std::string::value_type, uint32_t> residue_values;
90 uint32_t number_of_residues = allowed_characters.size();
91 uint32_t sequence_length = sequence.size();
92 bool sequence_ok =
true;
95 for (uint32_t i = 0; i < sequence.size(); ++i)
97 if (allowed_characters.find(sequence.at(i)) == std::string::npos)
101 if (sequence_ok && k_mer_length <= sequence_length)
103 values.resize(sequence_length - k_mer_length + 1,
104 std::pair<int32_t, float64_t>());
105 for (uint32_t i = 0; i < number_of_residues; ++i)
107 residue_values.insert(std::make_pair(allowed_characters[i], counter));
110 for (int32_t
k = k_mer_length - 1;
k >= 0;
k--)
112 oligo_value += factor * residue_values[sequence[
k]];
113 factor *= number_of_residues;
115 factor /= number_of_residues;
117 values[counter].first = 1;
118 values[counter].second = oligo_value;
121 for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
123 oligo_value -= factor * residue_values[sequence[j - 1]];
124 oligo_value = oligo_value * number_of_residues +
125 residue_values[sequence[j + k_mer_length - 1]];
127 values[counter].first = j + 1;
128 values[counter].second = oligo_value ;
131 stable_sort(values.begin(), values.end(), cmpOligos_);
140 const std::vector<std::string>& sequences, uint32_t k_mer_length,
141 const std::string& allowed_characters,
142 std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
144 std::vector< std::pair<int32_t, float64_t> > temp_vector;
145 encoded_sequences.resize(sequences.size(),
146 std::vector< std::pair<int32_t, float64_t> >());
148 for (uint32_t i = 0; i < sequences.size(); ++i)
150 encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
151 encoded_sequences[i] = temp_vector;
155 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
160 for (uint32_t i = 1; i < sequence_length; i++)
165 const std::vector< std::pair<int32_t, float64_t> >& x,
166 const std::vector< std::pair<int32_t, float64_t> >& y,
167 int32_t max_distance)
173 uint32_t x_size = x.size();
174 uint32_t y_size = y.size();
176 while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
178 if (x[i1].second == y[i2].second)
181 || (abs(x[i1].first - y[i2].first)) <= max_distance)
183 result += gauss_table[abs((x[i1].first - y[i2].first))];
184 if (x[i1].second == x[i1 + 1].second)
189 else if (y[i2].second == y[i2 + 1].second)
203 if (x[i1].first < y[i2].first)
205 if (x[i1].second == x[i1 + 1].second)
209 else if (y[i2].second == y[i2 + 1].second)
211 while (y[i2].second == y[i2].second)
233 if (x[i1].second < y[i2].second)
244 const std::vector< std::pair<int32_t, float64_t> >& x,
245 const std::vector< std::pair<int32_t, float64_t> >& y)
251 uint32_t x_size = x.size();
252 uint32_t y_size = y.size();
254 while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
256 if (x[i1].second == y[i2].second)
260 if (((uint32_t) i1+1) < x_size && x[i1].second == x[i1 + 1].second)
265 else if (((uint32_t) i2+1) <y_size && y[i2].second == y[i2 + 1].second)
279 if (x[i1].second < y[i2].second)
296 std::vector< std::pair<int32_t, float64_t> > aenc;
297 std::vector< std::pair<int32_t, float64_t> > benc;
307 void COligoStringKernel::init()
virtual bool init(CFeatures *l, CFeatures *r)
float64_t kernelOligo(const std::vector< std::pair< int32_t, float64_t > > &x, const std::vector< std::pair< int32_t, float64_t > > &y)
returns the value of the oligo kernel for sequences 'x' and 'y'
virtual float64_t compute(int32_t x, int32_t y)
virtual bool set_normalizer(CKernelNormalizer *normalizer)
SGVector< float64_t > gauss_table
static void encodeOligo(const std::string &sequence, uint32_t k_mer_length, const std::string &allowed_characters, std::vector< std::pair< int32_t, float64_t > > &values)
encodes the signals of the sequence
virtual ~COligoStringKernel()
static void getSequences(const std::vector< std::string > &sequences, uint32_t k_mer_length, const std::string &allowed_characters, std::vector< std::vector< std::pair< int32_t, float64_t > > > &encoded_sequences)
encodes all sequences with the encodeOligo function and stores them in 'encoded_sequences' ...
virtual bool init_normalizer()
CFeatures * rhs
feature vectors to occur on right hand side
all classes and functions are contained in the shogun namespace
CFeatures * lhs
feature vectors to occur on left hand side
The class Features is the base class of all feature objects.
friend class CSqrtDiagKernelNormalizer
Template class StringKernel is the base class of all String Kernels.
float64_t kernelOligoFast(const std::vector< std::pair< int32_t, float64_t > > &x, const std::vector< std::pair< int32_t, float64_t > > &y, int32_t max_distance=-1)
returns the value of the oligo kernel for sequences 'x' and 'y'