OligoStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2008 Christian Igel, Tobias Glasmachers
00008  * Copyright (C) 2008 Christian Igel, Tobias Glasmachers
00009  *
00010  * Shogun adjustments (W) 2008-2009 Soeren Sonnenburg
00011  * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00012  *
00013  */
00014 #include <shogun/kernel/string/OligoStringKernel.h>
00015 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h>
00016 #include <shogun/features/StringFeatures.h>
00017 
00018 #include <map>
00019 #include <vector>
00020 #include <algorithm>
00021 
00022 using namespace shogun;
00023 
00024 COligoStringKernel::COligoStringKernel()
00025   : CStringKernel<char>()
00026 {
00027     SG_UNSTABLE("OligoStringKernel");
00028     init();
00029 }
00030 
00031 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
00032 : CStringKernel<char>(cache_sz)
00033 {
00034     SG_UNSTABLE("OligoStringKernel");
00035     init();
00036 
00037     k=kmer_len;
00038     width=w;
00039 }
00040 
00041 COligoStringKernel::~COligoStringKernel()
00042 {
00043     cleanup();
00044 }
00045 
00046 void COligoStringKernel::cleanup()
00047 {
00048     SG_FREE(gauss_table);
00049     gauss_table=NULL;
00050     gauss_table_len=0;
00051 
00052     CKernel::cleanup();
00053 }
00054 
00055 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
00056 {
00057     cleanup();
00058 
00059     CStringKernel<char>::init(l,r);
00060     int32_t max_len=CMath::max(
00061             ((CStringFeatures<char>*) l)->get_max_vector_length(),
00062             ((CStringFeatures<char>*) r)->get_max_vector_length()
00063             );
00064 
00065     getExpFunctionCache(max_len);
00066     return init_normalizer();
00067 }
00068 
00069 void COligoStringKernel::encodeOligo(
00070     const std::string& sequence, uint32_t k_mer_length,
00071     const std::string& allowed_characters,
00072     std::vector< std::pair<int32_t, float64_t> >& values)
00073 {
00074     float64_t oligo_value = 0.;
00075     float64_t factor      = 1.;
00076     std::map<std::string::value_type, uint32_t> residue_values;
00077     uint32_t counter = 0;
00078     uint32_t number_of_residues = allowed_characters.size();
00079     uint32_t sequence_length = sequence.size();
00080     bool sequence_ok = true;
00081 
00082     // checking if sequence contains illegal characters
00083     for (uint32_t i = 0; i < sequence.size(); ++i)
00084     {
00085         if (allowed_characters.find(sequence.at(i)) == std::string::npos)
00086             sequence_ok = false;
00087     }
00088 
00089     if (sequence_ok && k_mer_length <= sequence_length)
00090     {
00091         values.resize(sequence_length - k_mer_length + 1,
00092             std::pair<int32_t, float64_t>());
00093         for (uint32_t i = 0; i < number_of_residues; ++i)
00094         {
00095             residue_values.insert(std::make_pair(allowed_characters[i], counter));
00096             ++counter;
00097         }
00098         for (int32_t k = k_mer_length - 1; k >= 0; k--)
00099         {
00100             oligo_value += factor * residue_values[sequence[k]];
00101             factor *= number_of_residues;
00102         }
00103         factor /= number_of_residues;
00104         counter = 0;
00105         values[counter].first = 1;
00106         values[counter].second = oligo_value;
00107         ++counter;
00108 
00109         for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
00110         {
00111             oligo_value -= factor * residue_values[sequence[j - 1]];
00112             oligo_value = oligo_value * number_of_residues +
00113                 residue_values[sequence[j + k_mer_length - 1]];
00114 
00115             values[counter].first = j + 1;
00116             values[counter].second = oligo_value ;
00117             ++counter;
00118         }
00119         stable_sort(values.begin(), values.end(), cmpOligos_);
00120     }
00121     else
00122     {
00123         values.clear();
00124     }
00125 }
00126 
00127 void COligoStringKernel::getSequences(
00128     const std::vector<std::string>& sequences, uint32_t k_mer_length,
00129     const std::string& allowed_characters,
00130     std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
00131 {
00132     std::vector< std::pair<int32_t, float64_t> > temp_vector;
00133     encoded_sequences.resize(sequences.size(),
00134         std::vector< std::pair<int32_t, float64_t> >());
00135 
00136     for (uint32_t i = 0; i < sequences.size(); ++i)
00137     {
00138         encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
00139         encoded_sequences[i] = temp_vector;
00140     }
00141 }
00142 
00143 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
00144 {
00145     SG_FREE(gauss_table);
00146     gauss_table=SG_MALLOC(float64_t, sequence_length);
00147 
00148     gauss_table[0] = 1;
00149     for (uint32_t i = 1; i < sequence_length; i++)
00150         gauss_table[i] = exp((-1.0 / (CMath::sq(width))) * CMath::sq((float64_t) i));
00151 
00152     gauss_table_len=sequence_length;
00153 }
00154 
00155 float64_t COligoStringKernel::kernelOligoFast(
00156     const std::vector< std::pair<int32_t, float64_t> >& x,
00157     const std::vector< std::pair<int32_t, float64_t> >& y,
00158     int32_t max_distance)
00159 {
00160     float64_t result = 0;
00161     int32_t  i1     = 0;
00162     int32_t  i2     = 0;
00163     int32_t  c1     = 0;
00164     uint32_t x_size = x.size();
00165     uint32_t y_size = y.size();
00166 
00167     while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
00168     {
00169         if (x[i1].second == y[i2].second)
00170         {
00171             if (max_distance < 0
00172                     || (abs(x[i1].first - y[i2].first)) <= max_distance)
00173             {
00174                 result += gauss_table[abs((x[i1].first - y[i2].first))];
00175                 if (x[i1].second == x[i1 + 1].second)
00176                 {
00177                     i1++;
00178                     c1++;
00179                 }
00180                 else if (y[i2].second == y[i2 + 1].second)
00181                 {
00182                     i2++;
00183                     i1 -= c1;
00184                     c1 = 0;
00185                 }
00186                 else
00187                 {
00188                     i1++;
00189                     i2++;
00190                 }
00191             }
00192             else
00193             {
00194                 if (x[i1].first < y[i2].first)
00195                 {
00196                     if (x[i1].second == x[i1 + 1].second)
00197                     {
00198                         i1++;
00199                     }
00200                     else if (y[i2].second == y[i2 + 1].second)
00201                     {
00202                         while (y[i2].second == y[i2+1].second)
00203                             i2++;
00204                         ++i1;
00205                         c1 = 0;
00206                     }
00207                     else
00208                     {
00209                         i1++;
00210                         i2++;
00211                         c1 = 0;
00212                     }
00213                 }
00214                 else
00215                 {
00216                     i2++;
00217                     i1 -= c1;
00218                     c1 = 0;
00219                 }
00220             }
00221         }
00222         else
00223         {
00224             if (x[i1].second < y[i2].second)
00225                 i1++;
00226             else
00227                 i2++;
00228             c1 = 0;
00229         }
00230     }
00231     return result;
00232 }
00233 
00234 
00235 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
00236 {
00237     int32_t alen, blen;
00238     bool free_a, free_b;
00239     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a);
00240     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b);
00241     std::vector< std::pair<int32_t, float64_t> > aenc;
00242     std::vector< std::pair<int32_t, float64_t> > benc;
00243     encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
00244     encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
00245     float64_t result=kernelOligoFast(aenc, benc);
00246     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a);
00247     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b);
00248     return result;
00249 }
00250 
00251 void COligoStringKernel::init()
00252 {
00253     k=0;
00254     width=0.0;
00255     gauss_table=NULL;
00256     gauss_table_len=0;
00257 
00258     set_normalizer(new CSqrtDiagKernelNormalizer());
00259 
00260     SG_ADD(&k, "k", "K-mer length.", MS_AVAILABLE);
00261     SG_ADD(&width, "width", "Width of Gaussian.", MS_AVAILABLE);
00262     m_parameters->add_vector(&gauss_table, &gauss_table_len, "gauss_table",
00263         "Gauss Cache Table.");
00264 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation