SpectrumRBFKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <vector>
00013 
00014 #include "lib/common.h"
00015 #include "lib/io.h"
00016 #include "lib/Signal.h"
00017 #include "lib/Trie.h"
00018 #include "base/Parallel.h"
00019 
00020 #include "kernel/SpectrumRBFKernel.h"
00021 #include "features/Features.h"
00022 #include "features/StringFeatures.h"
00023 #include <math.h>
00024 
00025 #include <vector>
00026 #include <string>
00027 #include <fstream>
00028 #include <cmath>
00029 
00030 #include <assert.h>
00031 
00032 #ifndef WIN32
00033 #include <pthread.h>
00034 #endif
00035 
00036 
00037 using namespace shogun;
00038 
00039 CSpectrumRBFKernel::CSpectrumRBFKernel(void)
00040   : CStringKernel<char>(0)
00041 {
00042     SG_UNSTABLE("CSpectrumRBFKernel::CSpectrumRBFKernel(void)", "\n");
00043 
00044     alphabet = NULL;
00045     degree = 0;
00046     AA_matrix = NULL;
00047     width = 0.0;
00048     sequences = NULL;
00049     string_features = NULL;
00050     nof_sequences = 0;
00051     max_sequence_length = 0;
00052 
00053     initialized = false;
00054 
00055     max_mismatch = 0;
00056     target_letter_0 = 0;
00057 }
00058 
00059 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
00060   : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00061 {
00062     lhs=NULL;
00063     rhs=NULL;
00064 
00065     target_letter_0=-1 ;
00066 
00067     AA_matrix=new float64_t[128*128];
00068     
00069 
00070     memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00071 
00072     read_profiles_and_sequences();
00073 
00074     //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
00075     string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, IUPAC_AMINO_ACID);
00076     init(string_features, string_features);
00077 }
00078 
00079 CSpectrumRBFKernel::CSpectrumRBFKernel(
00080     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
00081 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00082 {
00083     target_letter_0=-1 ;
00084 
00085     AA_matrix=new float64_t[128*128];
00086     memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00087 
00088     init(l, r);
00089 }
00090 
00091 CSpectrumRBFKernel::~CSpectrumRBFKernel()
00092 {
00093     cleanup();
00094     delete[] AA_matrix ;
00095 }
00096 
00097 
00098 void CSpectrumRBFKernel::remove_lhs()
00099 {
00100 
00101     CKernel::remove_lhs();
00102 }
00103 
00104 void CSpectrumRBFKernel::read_profiles_and_sequences()
00105 {
00106 
00107         int32_t aa_to_index[128];//profile
00108         aa_to_index['A'] = 0;
00109         aa_to_index['R'] = 1;
00110         aa_to_index['N'] = 2;
00111         aa_to_index['D'] = 3;
00112         aa_to_index['C'] = 4;
00113         aa_to_index['Q'] = 5;
00114         aa_to_index['E'] = 6;
00115         aa_to_index['G'] = 7;
00116         aa_to_index['H'] = 8;
00117         aa_to_index['I'] = 9;
00118         aa_to_index['L'] = 10;
00119         aa_to_index['K'] = 11;
00120         aa_to_index['M'] = 12;
00121         aa_to_index['F'] = 13;
00122         aa_to_index['P'] = 14;
00123         aa_to_index['S'] = 15;
00124         aa_to_index['T'] = 16;
00125         aa_to_index['W'] = 17;
00126         aa_to_index['Y'] = 18;
00127         aa_to_index['V'] = 19;
00128     SG_DEBUG("initializing background\n");
00129     double background[20]; // profile
00130     background[0]=0.0799912015849807; //A
00131   background[1]=0.0484482507611578;//R
00132   background[2]=0.044293531582512;//N
00133   background[3]=0.0578891399707563;//D
00134   background[4]=0.0171846021407367;//C
00135   background[5]=0.0380578923048682;//Q
00136   background[6]=0.0638169929675978;//E
00137   background[7]=0.0760659374742852;//G
00138     background[8]=0.0223465499452473;//H
00139     background[9]=0.0550905793661343;//I
00140     background[10]=0.0866897071203864;//L
00141     background[11]=0.060458245507428;//K
00142     background[12]=0.0215379186368154;//M
00143     background[13]=0.0396348024787477;//F
00144     background[14]=0.0465746314476874;//P
00145     background[15]=0.0630028230885602;//S
00146     background[16]=0.0580394726014824;//T
00147     background[17]=0.0144991866213453;//W
00148     background[18]=0.03635438623143;//Y
00149     background[19]=0.0700241481678408;//V
00150 
00151 
00152     std::vector<std::string> seqs;
00153     //int32_t nof_sequences = 7329;
00154 
00155     double C = 0.8;
00156     const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
00157     std::ifstream fin(filename);    
00158     
00159     SG_DEBUG("Reading profiles from %s\n", filename);
00160     std::string line;
00161   while (!fin.eof())
00162     { 
00163         std::getline(fin, line);
00164 
00165         if (line[0] == '>') // new sequence
00166         { 
00167             int idx = line.find_first_of(' ');
00168             sequence_labels.push_back(line.substr(1,idx-1));
00169             std::getline(fin, line);
00170             std::string orig_sequence = line;
00171             std::string sequence="";
00172 
00173             int len_line = line.length();
00174 
00175             // skip 3 lines
00176 
00177             std::getline(fin, line);
00178             std::getline(fin, line);
00179             std::getline(fin, line);
00180 
00181             profiles.push_back(std::vector<double>());
00182 
00183             std::vector<double>& curr_profile = profiles.back();
00184             for (int i=0; i < len_line; ++i)
00185             { 
00186                     std::getline(fin, line);
00187                     int a = line.find_first_not_of(' '); // index position
00188                     int b = line.find_first_of(' ', a); // index position
00189                     a = line.find_first_not_of(' ', b); // aa position
00190                     b = line.find_first_of(' ', a); // aa position
00191                     std::string aa=line.substr(a,b-a);
00192                     if (0) //(aa =="B" || aa == "X" || aa == "Z")
00193                     {
00194                         int pos = seqs.size()+1;
00195                         SG_DEBUG("Skipping aa in sequence %d\n", pos);
00196                     continue;
00197             }
00198                     else
00199                     {
00200                         sequence += aa;
00201 
00202                         a = line.find_first_not_of(' ', b); // beginning of block to ignore
00203                         b = line.find_first_of(' ', a); // aa position
00204 
00205                         for (int j=0; j < 19; ++j)
00206                         { 
00207                             a = line.find_first_not_of(' ', b);
00208                             b = line.find_first_of(' ', a);
00209                         }
00210 
00211                         int all_zeros = 1;
00212                         // interesting block
00213                         for (int j=0; j < 20; ++j)
00214                         { 
00215                             a = line.find_first_not_of(' ', b);
00216                             b = line.find_first_of(' ', a);
00217                             double p = atof(line.substr(a, b-a).c_str());
00218                             if (p > 0)
00219                             {
00220                                 all_zeros = 0;
00221                             }
00222                             double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
00223                             curr_profile.push_back(value);
00224                             //SG_DEBUG("seq %d aa %d value %f p %f  bg %f\n", i, j, value,p, background[j]);
00225                         }
00226 
00227                         if (all_zeros)
00228                         {
00229                             SG_DEBUG(">>>>>>>>>>>>>>> all zeros");
00230                             if (aa !="B" && aa != "X" && aa != "Z")
00231                             {
00232                                 //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]);
00233                                 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
00234                                 double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
00235                                 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]);
00236                                 curr_profile[(i*20) + aa_index] = value;
00237                                 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value);
00238                                 
00239                                 /*
00240                                 for (int z=0; z <20; ++z)
00241                                 {
00242                                     SG_DEBUG(" %d \t %f\t", z, curr_profile[z]);
00243                                 }
00244                                 SG_DEBUG("\n");
00245                                 */
00246                             }
00247                         }
00248                     }
00249             }
00250 
00251             if (curr_profile.size() != 20 * sequence.length())
00252         { 
00253                 SG_ERROR("Something's wrong with the profile.\n");
00254                 break;
00255             }
00256 
00257             seqs.push_back(sequence);
00258 
00259 
00260             /*
00261             // 6 irrelevant lines
00262             for (int i=0; i < 6; ++i)
00263             { 
00264                 std::getline(fin, line);
00265             }
00266             //
00267             */
00268         }
00269     }
00270     
00271     fin.close();
00272 
00273     nof_sequences = seqs.size();
00274     sequences = new TString<char>[nof_sequences];
00275 
00276     int max_len = 0;
00277     for (int i=0; i < nof_sequences; ++i)
00278     {
00279         int len = seqs[i].length();
00280         sequences[i].string = new char[len+1];
00281         sequences[i].length = len;
00282         strcpy(sequences[i].string, seqs[i].c_str());
00283 
00284         if (len > max_len) max_len = len;
00285     }
00286 
00287     max_sequence_length = max_len;
00288     //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
00289 
00290 }
00291 
00292 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
00293 {
00294     // >> profile
00295 /*
00296     read_profiles_and_sequences();
00297     l = string_features;
00298     r = string_features;
00299     */
00300     // << profile
00301 
00302     int32_t lhs_changed=(lhs!=l);
00303     int32_t rhs_changed=(rhs!=r);
00304 
00305     CStringKernel<char>::init(l,r);
00306 
00307     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00308     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00309 
00310     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00311     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00312 
00313     SG_UNREF(alphabet);
00314     alphabet=sf_l->get_alphabet();
00315     CAlphabet* ralphabet=sf_r->get_alphabet();
00316 
00317     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00318         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00319 
00320     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00321     SG_UNREF(ralphabet);
00322 
00323 
00324     return init_normalizer();
00325 }
00326 
00327 void CSpectrumRBFKernel::cleanup()
00328 {
00329 
00330     SG_UNREF(alphabet);
00331     alphabet=NULL;
00332 
00333     CKernel::cleanup();
00334 }
00335 
00336 inline bool isaa(char c)
00337 {
00338   if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z')
00339     return false ;
00340   return true ;
00341 }
00342 
00343 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
00344 {
00345     //const char* AA = "ARNDCQEGHILKMFPSTWYV";
00346   float64_t diff=0.0 ;
00347   
00348   for (int i=0; i<seq_degree; i++)
00349     {
00350       if (!isaa(path[i])||!isaa(joint_seq[index+i]))
00351     diff+=1.4 ;
00352       else
00353     {
00354       diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ;
00355       diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
00356       diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
00357       if (CMath::is_nan(diff))
00358         fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
00359     }
00360     }
00361   
00362   return exp( - diff/width) ;
00363 }
00364 
00365 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
00366 {
00367     int32_t alen, blen;
00368     bool afree, bfree;
00369 
00370     char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
00371     char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
00372 
00373     float64_t result=0;
00374     for (int32_t i=0; i<alen; i++)
00375       {
00376         for (int32_t j=0; j<blen; j++)
00377           {
00378         if ((i+degree<=alen) && (j+degree<=blen))
00379           result += AA_helper(&(avec[i]), degree, bvec, j) ;
00380           }
00381       }
00382 
00383     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
00384     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00385     return result;
00386 }
00387 
00388 bool CSpectrumRBFKernel::set_AA_matrix(
00389     float64_t* AA_matrix_)
00390 {
00391 
00392     if (AA_matrix_)
00393     {
00394         SG_DEBUG("Setting AA_matrix\n") ;
00395         memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00396         return true ;
00397     }
00398 
00399     return false;
00400 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation