00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <vector>
00013
00014 #include <shogun/lib/common.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/lib/Signal.h>
00017 #include <shogun/lib/Trie.h>
00018 #include <shogun/base/Parallel.h>
00019
00020 #include <shogun/kernel/SpectrumRBFKernel.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/features/StringFeatures.h>
00023 #include <math.h>
00024
00025 #include <vector>
00026 #include <string>
00027 #include <fstream>
00028 #include <cmath>
00029
00030 #include <assert.h>
00031
00032 #ifdef HAVE_PTHREAD
00033 #include <pthread.h>
00034 #endif
00035
00036
00037 using namespace shogun;
00038
00039 CSpectrumRBFKernel::CSpectrumRBFKernel()
00040 : CStringKernel<char>(0)
00041 {
00042 init();
00043 register_param();
00044 }
00045
00046 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
00047 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00048 {
00049 lhs=NULL;
00050 rhs=NULL;
00051
00052 target_letter_0=-1 ;
00053
00054 AA_matrix=SG_MALLOC(float64_t, 128*128);
00055
00056
00057 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00058
00059 read_profiles_and_sequences();
00060 SGStringList<char> string_list;
00061 string_list.strings = sequences;
00062 string_list.num_strings = nof_sequences;
00063 string_list.max_string_length = max_sequence_length;
00064
00065
00066 string_features = new CStringFeatures<char>(string_list, IUPAC_AMINO_ACID);
00067 init(string_features, string_features);
00068 register_param();
00069 }
00070
00071 CSpectrumRBFKernel::CSpectrumRBFKernel(
00072 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
00073 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00074 {
00075 target_letter_0=-1 ;
00076
00077 AA_matrix=SG_MALLOC(float64_t, 128*128);
00078 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00079
00080 init(l, r);
00081 register_param();
00082 }
00083
00084 CSpectrumRBFKernel::~CSpectrumRBFKernel()
00085 {
00086 cleanup();
00087 SG_FREE(AA_matrix);
00088 }
00089
00090
00091 void CSpectrumRBFKernel::remove_lhs()
00092 {
00093
00094 CKernel::remove_lhs();
00095 }
00096
00097 void CSpectrumRBFKernel::read_profiles_and_sequences()
00098 {
00099
00100 int32_t aa_to_index[128];
00101 aa_to_index[(uint8_t) 'A'] = 0;
00102 aa_to_index[(uint8_t) 'R'] = 1;
00103 aa_to_index[(uint8_t) 'N'] = 2;
00104 aa_to_index[(uint8_t) 'D'] = 3;
00105 aa_to_index[(uint8_t) 'C'] = 4;
00106 aa_to_index[(uint8_t) 'Q'] = 5;
00107 aa_to_index[(uint8_t) 'E'] = 6;
00108 aa_to_index[(uint8_t) 'G'] = 7;
00109 aa_to_index[(uint8_t) 'H'] = 8;
00110 aa_to_index[(uint8_t) 'I'] = 9;
00111 aa_to_index[(uint8_t) 'L'] = 10;
00112 aa_to_index[(uint8_t) 'K'] = 11;
00113 aa_to_index[(uint8_t) 'M'] = 12;
00114 aa_to_index[(uint8_t) 'F'] = 13;
00115 aa_to_index[(uint8_t) 'P'] = 14;
00116 aa_to_index[(uint8_t) 'S'] = 15;
00117 aa_to_index[(uint8_t) 'T'] = 16;
00118 aa_to_index[(uint8_t) 'W'] = 17;
00119 aa_to_index[(uint8_t) 'Y'] = 18;
00120 aa_to_index[(uint8_t) 'V'] = 19;
00121 SG_DEBUG("initializing background\n");
00122 double background[20];
00123 background[0]=0.0799912015849807;
00124 background[1]=0.0484482507611578;
00125 background[2]=0.044293531582512;
00126 background[3]=0.0578891399707563;
00127 background[4]=0.0171846021407367;
00128 background[5]=0.0380578923048682;
00129 background[6]=0.0638169929675978;
00130 background[7]=0.0760659374742852;
00131 background[8]=0.0223465499452473;
00132 background[9]=0.0550905793661343;
00133 background[10]=0.0866897071203864;
00134 background[11]=0.060458245507428;
00135 background[12]=0.0215379186368154;
00136 background[13]=0.0396348024787477;
00137 background[14]=0.0465746314476874;
00138 background[15]=0.0630028230885602;
00139 background[16]=0.0580394726014824;
00140 background[17]=0.0144991866213453;
00141 background[18]=0.03635438623143;
00142 background[19]=0.0700241481678408;
00143
00144
00145 std::vector<std::string> seqs;
00146
00147
00148 double C = 0.8;
00149 const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
00150 std::ifstream fin(filename);
00151
00152 SG_DEBUG("Reading profiles from %s\n", filename);
00153 std::string line;
00154 while (!fin.eof())
00155 {
00156 std::getline(fin, line);
00157
00158 if (line[0] == '>')
00159 {
00160 int idx = line.find_first_of(' ');
00161 sequence_labels.push_back(line.substr(1,idx-1));
00162 std::getline(fin, line);
00163 std::string orig_sequence = line;
00164 std::string sequence="";
00165
00166 int len_line = line.length();
00167
00168
00169
00170 std::getline(fin, line);
00171 std::getline(fin, line);
00172 std::getline(fin, line);
00173
00174 profiles.push_back(std::vector<double>());
00175
00176 std::vector<double>& curr_profile = profiles.back();
00177 for (int i=0; i < len_line; ++i)
00178 {
00179 std::getline(fin, line);
00180 int a = line.find_first_not_of(' ');
00181 int b = line.find_first_of(' ', a);
00182 a = line.find_first_not_of(' ', b);
00183 b = line.find_first_of(' ', a);
00184 std::string aa=line.substr(a,b-a);
00185 if (0)
00186 {
00187 int pos = seqs.size()+1;
00188 SG_DEBUG("Skipping aa in sequence %d\n", pos);
00189 continue;
00190 }
00191 else
00192 {
00193 sequence += aa;
00194
00195 a = line.find_first_not_of(' ', b);
00196 b = line.find_first_of(' ', a);
00197
00198 for (int j=0; j < 19; ++j)
00199 {
00200 a = line.find_first_not_of(' ', b);
00201 b = line.find_first_of(' ', a);
00202 }
00203
00204 int all_zeros = 1;
00205
00206 for (int j=0; j < 20; ++j)
00207 {
00208 a = line.find_first_not_of(' ', b);
00209 b = line.find_first_of(' ', a);
00210 double p = atof(line.substr(a, b-a).c_str());
00211 if (p > 0)
00212 {
00213 all_zeros = 0;
00214 }
00215 double value = -1* std::log(C*(p/100)+(1-C)*background[j]);
00216 curr_profile.push_back(value);
00217
00218 }
00219
00220 if (all_zeros)
00221 {
00222 SG_DEBUG(">>>>>>>>>>>>>>> all zeros");
00223 if (aa !="B" && aa != "X" && aa != "Z")
00224 {
00225
00226 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
00227 double value = -1* std::log(C+(1-C)*background[aa_index]);
00228 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]);
00229 curr_profile[(i*20) + aa_index] = value;
00230 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value);
00231
00232
00233
00234
00235
00236
00237
00238
00239 }
00240 }
00241 }
00242 }
00243
00244 if (curr_profile.size() != 20 * sequence.length())
00245 {
00246 SG_ERROR("Something's wrong with the profile.\n");
00247 break;
00248 }
00249
00250 seqs.push_back(sequence);
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261 }
00262 }
00263
00264 fin.close();
00265
00266 nof_sequences = seqs.size();
00267 sequences = SG_MALLOC(SGString<char>, nof_sequences);
00268
00269 int max_len = 0;
00270 for (int i=0; i < nof_sequences; ++i)
00271 {
00272 int len = seqs[i].length();
00273 sequences[i].string = SG_MALLOC(char, len+1);
00274 sequences[i].slen = len;
00275 strcpy(sequences[i].string, seqs[i].c_str());
00276
00277 if (len > max_len) max_len = len;
00278 }
00279
00280 max_sequence_length = max_len;
00281
00282
00283 }
00284
00285 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
00286 {
00287
00288
00289
00290
00291
00292
00293
00294
00295 int32_t lhs_changed=(lhs!=l);
00296 int32_t rhs_changed=(rhs!=r);
00297
00298 CStringKernel<char>::init(l,r);
00299
00300 SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00301 SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00302
00303 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00304 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00305
00306 SG_UNREF(alphabet);
00307 alphabet=sf_l->get_alphabet();
00308 CAlphabet* ralphabet=sf_r->get_alphabet();
00309
00310 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00311 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00312
00313 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00314 SG_UNREF(ralphabet);
00315
00316
00317 return init_normalizer();
00318 }
00319
00320 void CSpectrumRBFKernel::cleanup()
00321 {
00322
00323 SG_UNREF(alphabet);
00324 alphabet=NULL;
00325
00326 CKernel::cleanup();
00327 }
00328
00329 inline bool isaa(char c)
00330 {
00331 if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z')
00332 return false ;
00333 return true ;
00334 }
00335
00336 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
00337 {
00338
00339 float64_t diff=0.0 ;
00340
00341 for (int i=0; i<seq_degree; i++)
00342 {
00343 if (!isaa(path[i])||!isaa(joint_seq[index+i]))
00344 diff+=1.4 ;
00345 else
00346 {
00347 diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ;
00348 diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
00349 diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
00350 if (CMath::is_nan(diff))
00351 fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
00352 }
00353 }
00354
00355 return exp( - diff/width) ;
00356 }
00357
00358 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
00359 {
00360 int32_t alen, blen;
00361 bool afree, bfree;
00362
00363 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
00364 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
00365
00366 float64_t result=0;
00367 for (int32_t i=0; i<alen; i++)
00368 {
00369 for (int32_t j=0; j<blen; j++)
00370 {
00371 if ((i+degree<=alen) && (j+degree<=blen))
00372 result += AA_helper(&(avec[i]), degree, bvec, j) ;
00373 }
00374 }
00375
00376 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
00377 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00378 return result;
00379 }
00380
00381 bool CSpectrumRBFKernel::set_AA_matrix(
00382 float64_t* AA_matrix_)
00383 {
00384
00385 if (AA_matrix_)
00386 {
00387 SG_DEBUG("Setting AA_matrix\n") ;
00388 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00389 return true ;
00390 }
00391
00392 return false;
00393 }
00394
00395 void CSpectrumRBFKernel::register_param()
00396 {
00397 m_parameters->add(°ree, "degree", "degree of the kernel");
00398 m_parameters->add(&AA_matrix_length, "AA_matrix_length", "the length of AA matrix");
00399 m_parameters->add_vector(&AA_matrix, &AA_matrix_length, "AA_matrix", "128*128 scalar product matrix");
00400 m_parameters->add(&width,"width","width of Gaussian");
00401 m_parameters->add(&nof_sequences, "nof_sequences","length of the sequence");
00402 m_parameters->add_vector(&sequences, &nof_sequences, "the sequences as a part of profile");
00403 m_parameters->add(&max_sequence_length,"max_sequence_length","max length of the sequence");
00404 }
00405
00406 void CSpectrumRBFKernel::register_alphabet()
00407 {
00408 m_parameters->add((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel");
00409 }
00410
00411 void CSpectrumRBFKernel::init()
00412 {
00413 alphabet = NULL;
00414 degree = 0;
00415 AA_matrix = NULL;
00416 AA_matrix_length = 128*128;
00417 width = 0.0;
00418 sequences = NULL;
00419 string_features = NULL;
00420 nof_sequences = 0;
00421 max_sequence_length = 0;
00422
00423 initialized = false;
00424
00425 max_mismatch = 0;
00426 target_letter_0 = 0;
00427 }