00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include <shogun/kernel/string/OligoStringKernel.h>
00015 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h>
00016 #include <shogun/features/StringFeatures.h>
00017
00018 #include <map>
00019 #include <vector>
00020 #include <algorithm>
00021
00022 using namespace shogun;
00023
00024 COligoStringKernel::COligoStringKernel()
00025 : CStringKernel<char>()
00026 {
00027 SG_UNSTABLE("OligoStringKernel");
00028 init();
00029 }
00030
00031 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
00032 : CStringKernel<char>(cache_sz)
00033 {
00034 SG_UNSTABLE("OligoStringKernel");
00035 init();
00036
00037 k=kmer_len;
00038 width=w;
00039 }
00040
00041 COligoStringKernel::~COligoStringKernel()
00042 {
00043 cleanup();
00044 }
00045
00046 void COligoStringKernel::cleanup()
00047 {
00048 SG_FREE(gauss_table);
00049 gauss_table=NULL;
00050 gauss_table_len=0;
00051
00052 CKernel::cleanup();
00053 }
00054
00055 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
00056 {
00057 cleanup();
00058
00059 CStringKernel<char>::init(l,r);
00060 int32_t max_len=CMath::max(
00061 ((CStringFeatures<char>*) l)->get_max_vector_length(),
00062 ((CStringFeatures<char>*) r)->get_max_vector_length()
00063 );
00064
00065 getExpFunctionCache(max_len);
00066 return init_normalizer();
00067 }
00068
00069 void COligoStringKernel::encodeOligo(
00070 const std::string& sequence, uint32_t k_mer_length,
00071 const std::string& allowed_characters,
00072 std::vector< std::pair<int32_t, float64_t> >& values)
00073 {
00074 float64_t oligo_value = 0.;
00075 float64_t factor = 1.;
00076 std::map<std::string::value_type, uint32_t> residue_values;
00077 uint32_t counter = 0;
00078 uint32_t number_of_residues = allowed_characters.size();
00079 uint32_t sequence_length = sequence.size();
00080 bool sequence_ok = true;
00081
00082
00083 for (uint32_t i = 0; i < sequence.size(); ++i)
00084 {
00085 if (allowed_characters.find(sequence.at(i)) == std::string::npos)
00086 sequence_ok = false;
00087 }
00088
00089 if (sequence_ok && k_mer_length <= sequence_length)
00090 {
00091 values.resize(sequence_length - k_mer_length + 1,
00092 std::pair<int32_t, float64_t>());
00093 for (uint32_t i = 0; i < number_of_residues; ++i)
00094 {
00095 residue_values.insert(std::make_pair(allowed_characters[i], counter));
00096 ++counter;
00097 }
00098 for (int32_t k = k_mer_length - 1; k >= 0; k--)
00099 {
00100 oligo_value += factor * residue_values[sequence[k]];
00101 factor *= number_of_residues;
00102 }
00103 factor /= number_of_residues;
00104 counter = 0;
00105 values[counter].first = 1;
00106 values[counter].second = oligo_value;
00107 ++counter;
00108
00109 for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
00110 {
00111 oligo_value -= factor * residue_values[sequence[j - 1]];
00112 oligo_value = oligo_value * number_of_residues +
00113 residue_values[sequence[j + k_mer_length - 1]];
00114
00115 values[counter].first = j + 1;
00116 values[counter].second = oligo_value ;
00117 ++counter;
00118 }
00119 stable_sort(values.begin(), values.end(), cmpOligos_);
00120 }
00121 else
00122 {
00123 values.clear();
00124 }
00125 }
00126
00127 void COligoStringKernel::getSequences(
00128 const std::vector<std::string>& sequences, uint32_t k_mer_length,
00129 const std::string& allowed_characters,
00130 std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
00131 {
00132 std::vector< std::pair<int32_t, float64_t> > temp_vector;
00133 encoded_sequences.resize(sequences.size(),
00134 std::vector< std::pair<int32_t, float64_t> >());
00135
00136 for (uint32_t i = 0; i < sequences.size(); ++i)
00137 {
00138 encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
00139 encoded_sequences[i] = temp_vector;
00140 }
00141 }
00142
00143 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
00144 {
00145 SG_FREE(gauss_table);
00146 gauss_table=SG_MALLOC(float64_t, sequence_length);
00147
00148 gauss_table[0] = 1;
00149 for (uint32_t i = 1; i < sequence_length; i++)
00150 gauss_table[i] = exp((-1.0 / (CMath::sq(width))) * CMath::sq((float64_t) i));
00151
00152 gauss_table_len=sequence_length;
00153 }
00154
00155 float64_t COligoStringKernel::kernelOligoFast(
00156 const std::vector< std::pair<int32_t, float64_t> >& x,
00157 const std::vector< std::pair<int32_t, float64_t> >& y,
00158 int32_t max_distance)
00159 {
00160 float64_t result = 0;
00161 int32_t i1 = 0;
00162 int32_t i2 = 0;
00163 int32_t c1 = 0;
00164 uint32_t x_size = x.size();
00165 uint32_t y_size = y.size();
00166
00167 while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
00168 {
00169 if (x[i1].second == y[i2].second)
00170 {
00171 if (max_distance < 0
00172 || (abs(x[i1].first - y[i2].first)) <= max_distance)
00173 {
00174 result += gauss_table[abs((x[i1].first - y[i2].first))];
00175 if (x[i1].second == x[i1 + 1].second)
00176 {
00177 i1++;
00178 c1++;
00179 }
00180 else if (y[i2].second == y[i2 + 1].second)
00181 {
00182 i2++;
00183 i1 -= c1;
00184 c1 = 0;
00185 }
00186 else
00187 {
00188 i1++;
00189 i2++;
00190 }
00191 }
00192 else
00193 {
00194 if (x[i1].first < y[i2].first)
00195 {
00196 if (x[i1].second == x[i1 + 1].second)
00197 {
00198 i1++;
00199 }
00200 else if (y[i2].second == y[i2 + 1].second)
00201 {
00202 while (y[i2].second == y[i2+1].second)
00203 i2++;
00204 ++i1;
00205 c1 = 0;
00206 }
00207 else
00208 {
00209 i1++;
00210 i2++;
00211 c1 = 0;
00212 }
00213 }
00214 else
00215 {
00216 i2++;
00217 i1 -= c1;
00218 c1 = 0;
00219 }
00220 }
00221 }
00222 else
00223 {
00224 if (x[i1].second < y[i2].second)
00225 i1++;
00226 else
00227 i2++;
00228 c1 = 0;
00229 }
00230 }
00231 return result;
00232 }
00233
00234
00235 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
00236 {
00237 int32_t alen, blen;
00238 bool free_a, free_b;
00239 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a);
00240 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b);
00241 std::vector< std::pair<int32_t, float64_t> > aenc;
00242 std::vector< std::pair<int32_t, float64_t> > benc;
00243 encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
00244 encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
00245 float64_t result=kernelOligoFast(aenc, benc);
00246 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a);
00247 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b);
00248 return result;
00249 }
00250
00251 void COligoStringKernel::init()
00252 {
00253 k=0;
00254 width=0.0;
00255 gauss_table=NULL;
00256 gauss_table_len=0;
00257
00258 set_normalizer(new CSqrtDiagKernelNormalizer());
00259
00260 SG_ADD(&k, "k", "K-mer length.", MS_AVAILABLE);
00261 SG_ADD(&width, "width", "Width of Gaussian.", MS_AVAILABLE);
00262 m_parameters->add_vector(&gauss_table, &gauss_table_len, "gauss_table",
00263 "Gauss Cache Table.");
00264 }