SpectrumRBFKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <vector>
00013 
00014 #include <shogun/lib/common.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/lib/Signal.h>
00017 #include <shogun/lib/Trie.h>
00018 #include <shogun/base/Parallel.h>
00019 
00020 #include <shogun/kernel/SpectrumRBFKernel.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/features/StringFeatures.h>
00023 #include <math.h>
00024 
00025 #include <vector>
00026 #include <string>
00027 #include <fstream>
00028 #include <cmath>
00029 
00030 #include <assert.h>
00031 
00032 #ifdef HAVE_PTHREAD
00033 #include <pthread.h>
00034 #endif
00035 
00036 
00037 using namespace shogun;
00038 
00039 CSpectrumRBFKernel::CSpectrumRBFKernel()
00040   : CStringKernel<char>(0)
00041 {
00042     init();
00043     register_param();
00044 }
00045 
00046 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
00047   : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00048 {
00049     lhs=NULL;
00050     rhs=NULL;
00051 
00052     target_letter_0=-1 ;
00053 
00054     AA_matrix=SG_MALLOC(float64_t, 128*128);
00055     
00056 
00057     memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00058 
00059     read_profiles_and_sequences();
00060     SGStringList<char> string_list;
00061     string_list.strings = sequences;
00062     string_list.num_strings = nof_sequences;
00063     string_list.max_string_length = max_sequence_length;
00064 
00065     //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
00066     string_features = new CStringFeatures<char>(string_list, IUPAC_AMINO_ACID);
00067     init(string_features, string_features);
00068     register_param();
00069 }
00070 
00071 CSpectrumRBFKernel::CSpectrumRBFKernel(
00072     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
00073 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00074 {
00075     target_letter_0=-1 ;
00076 
00077     AA_matrix=SG_MALLOC(float64_t, 128*128);
00078     memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00079 
00080     init(l, r);
00081     register_param();
00082 }
00083 
00084 CSpectrumRBFKernel::~CSpectrumRBFKernel()
00085 {
00086     cleanup();
00087     SG_FREE(AA_matrix);
00088 }
00089 
00090 
00091 void CSpectrumRBFKernel::remove_lhs()
00092 {
00093 
00094     CKernel::remove_lhs();
00095 }
00096 
00097 void CSpectrumRBFKernel::read_profiles_and_sequences()
00098 {
00099 
00100         int32_t aa_to_index[128];//profile
00101         aa_to_index[(uint8_t) 'A'] = 0;
00102         aa_to_index[(uint8_t) 'R'] = 1;
00103         aa_to_index[(uint8_t) 'N'] = 2;
00104         aa_to_index[(uint8_t) 'D'] = 3;
00105         aa_to_index[(uint8_t) 'C'] = 4;
00106         aa_to_index[(uint8_t) 'Q'] = 5;
00107         aa_to_index[(uint8_t) 'E'] = 6;
00108         aa_to_index[(uint8_t) 'G'] = 7;
00109         aa_to_index[(uint8_t) 'H'] = 8;
00110         aa_to_index[(uint8_t) 'I'] = 9;
00111         aa_to_index[(uint8_t) 'L'] = 10;
00112         aa_to_index[(uint8_t) 'K'] = 11;
00113         aa_to_index[(uint8_t) 'M'] = 12;
00114         aa_to_index[(uint8_t) 'F'] = 13;
00115         aa_to_index[(uint8_t) 'P'] = 14;
00116         aa_to_index[(uint8_t) 'S'] = 15;
00117         aa_to_index[(uint8_t) 'T'] = 16;
00118         aa_to_index[(uint8_t) 'W'] = 17;
00119         aa_to_index[(uint8_t) 'Y'] = 18;
00120         aa_to_index[(uint8_t) 'V'] = 19;
00121     SG_DEBUG("initializing background\n");
00122     double background[20]; // profile
00123     background[0]=0.0799912015849807; //A
00124     background[1]=0.0484482507611578;//R
00125     background[2]=0.044293531582512;//N
00126     background[3]=0.0578891399707563;//D
00127     background[4]=0.0171846021407367;//C
00128     background[5]=0.0380578923048682;//Q
00129     background[6]=0.0638169929675978;//E
00130     background[7]=0.0760659374742852;//G
00131     background[8]=0.0223465499452473;//H
00132     background[9]=0.0550905793661343;//I
00133     background[10]=0.0866897071203864;//L
00134     background[11]=0.060458245507428;//K
00135     background[12]=0.0215379186368154;//M
00136     background[13]=0.0396348024787477;//F
00137     background[14]=0.0465746314476874;//P
00138     background[15]=0.0630028230885602;//S
00139     background[16]=0.0580394726014824;//T
00140     background[17]=0.0144991866213453;//W
00141     background[18]=0.03635438623143;//Y
00142     background[19]=0.0700241481678408;//V
00143 
00144 
00145     std::vector<std::string> seqs;
00146     //int32_t nof_sequences = 7329;
00147 
00148     double C = 0.8;
00149     const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
00150     std::ifstream fin(filename);    
00151     
00152     SG_DEBUG("Reading profiles from %s\n", filename);
00153     std::string line;
00154     while (!fin.eof())
00155     { 
00156         std::getline(fin, line);
00157 
00158         if (line[0] == '>') // new sequence
00159         { 
00160             int idx = line.find_first_of(' ');
00161             sequence_labels.push_back(line.substr(1,idx-1));
00162             std::getline(fin, line);
00163             std::string orig_sequence = line;
00164             std::string sequence="";
00165 
00166             int len_line = line.length();
00167 
00168             // skip 3 lines
00169 
00170             std::getline(fin, line);
00171             std::getline(fin, line);
00172             std::getline(fin, line);
00173 
00174             profiles.push_back(std::vector<double>());
00175 
00176             std::vector<double>& curr_profile = profiles.back();
00177             for (int i=0; i < len_line; ++i)
00178             { 
00179                     std::getline(fin, line);
00180                     int a = line.find_first_not_of(' '); // index position
00181                     int b = line.find_first_of(' ', a); // index position
00182                     a = line.find_first_not_of(' ', b); // aa position
00183                     b = line.find_first_of(' ', a); // aa position
00184                     std::string aa=line.substr(a,b-a);
00185                     if (0) //(aa =="B" || aa == "X" || aa == "Z")
00186                     {
00187                         int pos = seqs.size()+1;
00188                         SG_DEBUG("Skipping aa in sequence %d\n", pos);
00189                     continue;
00190             }
00191                     else
00192                     {
00193                         sequence += aa;
00194 
00195                         a = line.find_first_not_of(' ', b); // beginning of block to ignore
00196                         b = line.find_first_of(' ', a); // aa position
00197 
00198                         for (int j=0; j < 19; ++j)
00199                         { 
00200                             a = line.find_first_not_of(' ', b);
00201                             b = line.find_first_of(' ', a);
00202                         }
00203 
00204                         int all_zeros = 1;
00205                         // interesting block
00206                         for (int j=0; j < 20; ++j)
00207                         { 
00208                             a = line.find_first_not_of(' ', b);
00209                             b = line.find_first_of(' ', a);
00210                             double p = atof(line.substr(a, b-a).c_str());
00211                             if (p > 0)
00212                             {
00213                                 all_zeros = 0;
00214                             }
00215                             double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
00216                             curr_profile.push_back(value);
00217                             //SG_DEBUG("seq %d aa %d value %f p %f  bg %f\n", i, j, value,p, background[j]);
00218                         }
00219 
00220                         if (all_zeros)
00221                         {
00222                             SG_DEBUG(">>>>>>>>>>>>>>> all zeros");
00223                             if (aa !="B" && aa != "X" && aa != "Z")
00224                             {
00225                                 //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]);
00226                                 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
00227                                 double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
00228                                 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]);
00229                                 curr_profile[(i*20) + aa_index] = value;
00230                                 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value);
00231                                 
00232                                 /*
00233                                 for (int z=0; z <20; ++z)
00234                                 {
00235                                     SG_DEBUG(" %d \t %f\t", z, curr_profile[z]);
00236                                 }
00237                                 SG_DEBUG("\n");
00238                                 */
00239                             }
00240                         }
00241                     }
00242             }
00243 
00244             if (curr_profile.size() != 20 * sequence.length())
00245         { 
00246                 SG_ERROR("Something's wrong with the profile.\n");
00247                 break;
00248             }
00249 
00250             seqs.push_back(sequence);
00251 
00252 
00253             /*
00254             // 6 irrelevant lines
00255             for (int i=0; i < 6; ++i)
00256             { 
00257                 std::getline(fin, line);
00258             }
00259             //
00260             */
00261         }
00262     }
00263     
00264     fin.close();
00265 
00266     nof_sequences = seqs.size();
00267     sequences = SG_MALLOC(SGString<char>, nof_sequences);
00268 
00269     int max_len = 0;
00270     for (int i=0; i < nof_sequences; ++i)
00271     {
00272         int len = seqs[i].length();
00273         sequences[i].string = SG_MALLOC(char, len+1);
00274         sequences[i].slen = len;
00275         strcpy(sequences[i].string, seqs[i].c_str());
00276 
00277         if (len > max_len) max_len = len;
00278     }
00279 
00280     max_sequence_length = max_len;
00281     //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
00282 
00283 }
00284 
00285 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
00286 {
00287     // >> profile
00288 /*
00289     read_profiles_and_sequences();
00290     l = string_features;
00291     r = string_features;
00292     */
00293     // << profile
00294 
00295     int32_t lhs_changed=(lhs!=l);
00296     int32_t rhs_changed=(rhs!=r);
00297 
00298     CStringKernel<char>::init(l,r);
00299 
00300     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00301     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00302 
00303     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00304     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00305 
00306     SG_UNREF(alphabet);
00307     alphabet=sf_l->get_alphabet();
00308     CAlphabet* ralphabet=sf_r->get_alphabet();
00309 
00310     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00311         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00312 
00313     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00314     SG_UNREF(ralphabet);
00315 
00316 
00317     return init_normalizer();
00318 }
00319 
00320 void CSpectrumRBFKernel::cleanup()
00321 {
00322 
00323     SG_UNREF(alphabet);
00324     alphabet=NULL;
00325 
00326     CKernel::cleanup();
00327 }
00328 
00329 inline bool isaa(char c)
00330 {
00331   if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z')
00332     return false ;
00333   return true ;
00334 }
00335 
00336 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
00337 {
00338     //const char* AA = "ARNDCQEGHILKMFPSTWYV";
00339   float64_t diff=0.0 ;
00340   
00341   for (int i=0; i<seq_degree; i++)
00342     {
00343       if (!isaa(path[i])||!isaa(joint_seq[index+i]))
00344     diff+=1.4 ;
00345       else
00346     {
00347       diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ;
00348       diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
00349       diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
00350       if (CMath::is_nan(diff))
00351         fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
00352     }
00353     }
00354   
00355   return exp( - diff/width) ;
00356 }
00357 
00358 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
00359 {
00360     int32_t alen, blen;
00361     bool afree, bfree;
00362 
00363     char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
00364     char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
00365 
00366     float64_t result=0;
00367     for (int32_t i=0; i<alen; i++)
00368       {
00369         for (int32_t j=0; j<blen; j++)
00370           {
00371         if ((i+degree<=alen) && (j+degree<=blen))
00372           result += AA_helper(&(avec[i]), degree, bvec, j) ;
00373           }
00374       }
00375 
00376     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
00377     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00378     return result;
00379 }
00380 
00381 bool CSpectrumRBFKernel::set_AA_matrix(
00382     float64_t* AA_matrix_)
00383 {
00384 
00385     if (AA_matrix_)
00386     {
00387         SG_DEBUG("Setting AA_matrix\n") ;
00388         memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00389         return true ;
00390     }
00391 
00392     return false;
00393 }
00394 
00395 void CSpectrumRBFKernel::register_param() 
00396 {
00397     m_parameters->add(&degree, "degree", "degree of the kernel");
00398     m_parameters->add(&AA_matrix_length, "AA_matrix_length", "the length of AA matrix");
00399     m_parameters->add_vector(&AA_matrix, &AA_matrix_length, "AA_matrix", "128*128 scalar product matrix");
00400     m_parameters->add(&width,"width","width of Gaussian");
00401     m_parameters->add(&nof_sequences, "nof_sequences","length of the sequence");
00402     m_parameters->add_vector(&sequences, &nof_sequences, "the sequences as a part of profile");
00403     m_parameters->add(&max_sequence_length,"max_sequence_length","max length of the sequence");
00404 }
00405 
00406 void CSpectrumRBFKernel::register_alphabet()
00407 {
00408     m_parameters->add((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel");
00409 }
00410 
00411 void CSpectrumRBFKernel::init() 
00412 {
00413     alphabet = NULL;
00414     degree = 0;
00415     AA_matrix = NULL;
00416     AA_matrix_length = 128*128;
00417     width = 0.0;
00418     sequences = NULL;
00419     string_features = NULL;
00420     nof_sequences = 0;
00421     max_sequence_length = 0;
00422     
00423     initialized = false;
00424     
00425     max_mismatch = 0;
00426     target_letter_0 = 0;    
00427 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation