00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <vector>
00013
00014 #include "lib/common.h"
00015 #include "lib/io.h"
00016 #include "lib/Signal.h"
00017 #include "lib/Trie.h"
00018 #include "base/Parallel.h"
00019
00020 #include "kernel/SpectrumRBFKernel.h"
00021 #include "features/Features.h"
00022 #include "features/StringFeatures.h"
00023 #include <math.h>
00024
00025 #include <vector>
00026 #include <string>
00027 #include <fstream>
00028 #include <cmath>
00029
00030 #include <assert.h>
00031
00032 #ifndef WIN32
00033 #include <pthread.h>
00034 #endif
00035
00036
00037 using namespace shogun;
00038
00039 CSpectrumRBFKernel::CSpectrumRBFKernel(void)
00040 : CStringKernel<char>(0)
00041 {
00042 SG_UNSTABLE("CSpectrumRBFKernel::CSpectrumRBFKernel(void)", "\n");
00043
00044 alphabet = NULL;
00045 degree = 0;
00046 AA_matrix = NULL;
00047 width = 0.0;
00048 sequences = NULL;
00049 string_features = NULL;
00050 nof_sequences = 0;
00051 max_sequence_length = 0;
00052
00053 initialized = false;
00054
00055 max_mismatch = 0;
00056 target_letter_0 = 0;
00057 }
00058
00059 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
00060 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00061 {
00062 lhs=NULL;
00063 rhs=NULL;
00064
00065 target_letter_0=-1 ;
00066
00067 AA_matrix=new float64_t[128*128];
00068
00069
00070 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00071
00072 read_profiles_and_sequences();
00073
00074
00075 string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, IUPAC_AMINO_ACID);
00076 init(string_features, string_features);
00077 }
00078
00079 CSpectrumRBFKernel::CSpectrumRBFKernel(
00080 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
00081 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00082 {
00083 target_letter_0=-1 ;
00084
00085 AA_matrix=new float64_t[128*128];
00086 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00087
00088 init(l, r);
00089 }
00090
00091 CSpectrumRBFKernel::~CSpectrumRBFKernel()
00092 {
00093 cleanup();
00094 delete[] AA_matrix ;
00095 }
00096
00097
00098 void CSpectrumRBFKernel::remove_lhs()
00099 {
00100
00101 CKernel::remove_lhs();
00102 }
00103
00104 void CSpectrumRBFKernel::read_profiles_and_sequences()
00105 {
00106
00107 int32_t aa_to_index[128];
00108 aa_to_index['A'] = 0;
00109 aa_to_index['R'] = 1;
00110 aa_to_index['N'] = 2;
00111 aa_to_index['D'] = 3;
00112 aa_to_index['C'] = 4;
00113 aa_to_index['Q'] = 5;
00114 aa_to_index['E'] = 6;
00115 aa_to_index['G'] = 7;
00116 aa_to_index['H'] = 8;
00117 aa_to_index['I'] = 9;
00118 aa_to_index['L'] = 10;
00119 aa_to_index['K'] = 11;
00120 aa_to_index['M'] = 12;
00121 aa_to_index['F'] = 13;
00122 aa_to_index['P'] = 14;
00123 aa_to_index['S'] = 15;
00124 aa_to_index['T'] = 16;
00125 aa_to_index['W'] = 17;
00126 aa_to_index['Y'] = 18;
00127 aa_to_index['V'] = 19;
00128 SG_DEBUG("initializing background\n");
00129 double background[20];
00130 background[0]=0.0799912015849807;
00131 background[1]=0.0484482507611578;
00132 background[2]=0.044293531582512;
00133 background[3]=0.0578891399707563;
00134 background[4]=0.0171846021407367;
00135 background[5]=0.0380578923048682;
00136 background[6]=0.0638169929675978;
00137 background[7]=0.0760659374742852;
00138 background[8]=0.0223465499452473;
00139 background[9]=0.0550905793661343;
00140 background[10]=0.0866897071203864;
00141 background[11]=0.060458245507428;
00142 background[12]=0.0215379186368154;
00143 background[13]=0.0396348024787477;
00144 background[14]=0.0465746314476874;
00145 background[15]=0.0630028230885602;
00146 background[16]=0.0580394726014824;
00147 background[17]=0.0144991866213453;
00148 background[18]=0.03635438623143;
00149 background[19]=0.0700241481678408;
00150
00151
00152 std::vector<std::string> seqs;
00153
00154
00155 double C = 0.8;
00156 const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
00157 std::ifstream fin(filename);
00158
00159 SG_DEBUG("Reading profiles from %s\n", filename);
00160 std::string line;
00161 while (!fin.eof())
00162 {
00163 std::getline(fin, line);
00164
00165 if (line[0] == '>')
00166 {
00167 int idx = line.find_first_of(' ');
00168 sequence_labels.push_back(line.substr(1,idx-1));
00169 std::getline(fin, line);
00170 std::string orig_sequence = line;
00171 std::string sequence="";
00172
00173 int len_line = line.length();
00174
00175
00176
00177 std::getline(fin, line);
00178 std::getline(fin, line);
00179 std::getline(fin, line);
00180
00181 profiles.push_back(std::vector<double>());
00182
00183 std::vector<double>& curr_profile = profiles.back();
00184 for (int i=0; i < len_line; ++i)
00185 {
00186 std::getline(fin, line);
00187 int a = line.find_first_not_of(' ');
00188 int b = line.find_first_of(' ', a);
00189 a = line.find_first_not_of(' ', b);
00190 b = line.find_first_of(' ', a);
00191 std::string aa=line.substr(a,b-a);
00192 if (0)
00193 {
00194 int pos = seqs.size()+1;
00195 SG_DEBUG("Skipping aa in sequence %d\n", pos);
00196 continue;
00197 }
00198 else
00199 {
00200 sequence += aa;
00201
00202 a = line.find_first_not_of(' ', b);
00203 b = line.find_first_of(' ', a);
00204
00205 for (int j=0; j < 19; ++j)
00206 {
00207 a = line.find_first_not_of(' ', b);
00208 b = line.find_first_of(' ', a);
00209 }
00210
00211 int all_zeros = 1;
00212
00213 for (int j=0; j < 20; ++j)
00214 {
00215 a = line.find_first_not_of(' ', b);
00216 b = line.find_first_of(' ', a);
00217 double p = atof(line.substr(a, b-a).c_str());
00218 if (p > 0)
00219 {
00220 all_zeros = 0;
00221 }
00222 double value = -1* std::log(C*(p/100)+(1-C)*background[j]);
00223 curr_profile.push_back(value);
00224
00225 }
00226
00227 if (all_zeros)
00228 {
00229 SG_DEBUG(">>>>>>>>>>>>>>> all zeros");
00230 if (aa !="B" && aa != "X" && aa != "Z")
00231 {
00232
00233 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
00234 double value = -1* std::log(C+(1-C)*background[aa_index]);
00235 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]);
00236 curr_profile[(i*20) + aa_index] = value;
00237 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value);
00238
00239
00240
00241
00242
00243
00244
00245
00246 }
00247 }
00248 }
00249 }
00250
00251 if (curr_profile.size() != 20 * sequence.length())
00252 {
00253 SG_ERROR("Something's wrong with the profile.\n");
00254 break;
00255 }
00256
00257 seqs.push_back(sequence);
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268 }
00269 }
00270
00271 fin.close();
00272
00273 nof_sequences = seqs.size();
00274 sequences = new TString<char>[nof_sequences];
00275
00276 int max_len = 0;
00277 for (int i=0; i < nof_sequences; ++i)
00278 {
00279 int len = seqs[i].length();
00280 sequences[i].string = new char[len+1];
00281 sequences[i].length = len;
00282 strcpy(sequences[i].string, seqs[i].c_str());
00283
00284 if (len > max_len) max_len = len;
00285 }
00286
00287 max_sequence_length = max_len;
00288
00289
00290 }
00291
00292 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
00293 {
00294
00295
00296
00297
00298
00299
00300
00301
00302 int32_t lhs_changed=(lhs!=l);
00303 int32_t rhs_changed=(rhs!=r);
00304
00305 CStringKernel<char>::init(l,r);
00306
00307 SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00308 SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00309
00310 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00311 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00312
00313 SG_UNREF(alphabet);
00314 alphabet=sf_l->get_alphabet();
00315 CAlphabet* ralphabet=sf_r->get_alphabet();
00316
00317 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00318 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00319
00320 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00321 SG_UNREF(ralphabet);
00322
00323
00324 return init_normalizer();
00325 }
00326
00327 void CSpectrumRBFKernel::cleanup()
00328 {
00329
00330 SG_UNREF(alphabet);
00331 alphabet=NULL;
00332
00333 CKernel::cleanup();
00334 }
00335
00336 inline bool isaa(char c)
00337 {
00338 if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z')
00339 return false ;
00340 return true ;
00341 }
00342
00343 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
00344 {
00345
00346 float64_t diff=0.0 ;
00347
00348 for (int i=0; i<seq_degree; i++)
00349 {
00350 if (!isaa(path[i])||!isaa(joint_seq[index+i]))
00351 diff+=1.4 ;
00352 else
00353 {
00354 diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ;
00355 diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
00356 diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
00357 if (CMath::is_nan(diff))
00358 fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
00359 }
00360 }
00361
00362 return exp( - diff/width) ;
00363 }
00364
00365 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
00366 {
00367 int32_t alen, blen;
00368 bool afree, bfree;
00369
00370 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
00371 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
00372
00373 float64_t result=0;
00374 for (int32_t i=0; i<alen; i++)
00375 {
00376 for (int32_t j=0; j<blen; j++)
00377 {
00378 if ((i+degree<=alen) && (j+degree<=blen))
00379 result += AA_helper(&(avec[i]), degree, bvec, j) ;
00380 }
00381 }
00382
00383 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
00384 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00385 return result;
00386 }
00387
00388 bool CSpectrumRBFKernel::set_AA_matrix(
00389 float64_t* AA_matrix_)
00390 {
00391
00392 if (AA_matrix_)
00393 {
00394 SG_DEBUG("Setting AA_matrix\n") ;
00395 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00396 return true ;
00397 }
00398
00399 return false;
00400 }