37 using namespace shogun;
47 :
CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
72 :
CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
93 int32_t aa_to_index[128];
94 aa_to_index[(uint8_t)
'A'] = 0;
95 aa_to_index[(uint8_t)
'R'] = 1;
96 aa_to_index[(uint8_t)
'N'] = 2;
97 aa_to_index[(uint8_t)
'D'] = 3;
98 aa_to_index[(uint8_t)
'C'] = 4;
99 aa_to_index[(uint8_t)
'Q'] = 5;
100 aa_to_index[(uint8_t)
'E'] = 6;
101 aa_to_index[(uint8_t)
'G'] = 7;
102 aa_to_index[(uint8_t)
'H'] = 8;
103 aa_to_index[(uint8_t)
'I'] = 9;
104 aa_to_index[(uint8_t)
'L'] = 10;
105 aa_to_index[(uint8_t)
'K'] = 11;
106 aa_to_index[(uint8_t)
'M'] = 12;
107 aa_to_index[(uint8_t)
'F'] = 13;
108 aa_to_index[(uint8_t)
'P'] = 14;
109 aa_to_index[(uint8_t)
'S'] = 15;
110 aa_to_index[(uint8_t)
'T'] = 16;
111 aa_to_index[(uint8_t)
'W'] = 17;
112 aa_to_index[(uint8_t)
'Y'] = 18;
113 aa_to_index[(uint8_t)
'V'] = 19;
114 SG_DEBUG(
"initializing background\n")
115 double background[20];
116 background[0]=0.0799912015849807;
117 background[1]=0.0484482507611578;
118 background[2]=0.044293531582512;
119 background[3]=0.0578891399707563;
120 background[4]=0.0171846021407367;
121 background[5]=0.0380578923048682;
122 background[6]=0.0638169929675978;
123 background[7]=0.0760659374742852;
124 background[8]=0.0223465499452473;
125 background[9]=0.0550905793661343;
126 background[10]=0.0866897071203864;
127 background[11]=0.060458245507428;
128 background[12]=0.0215379186368154;
129 background[13]=0.0396348024787477;
130 background[14]=0.0465746314476874;
131 background[15]=0.0630028230885602;
132 background[16]=0.0580394726014824;
133 background[17]=0.0144991866213453;
134 background[18]=0.03635438623143;
135 background[19]=0.0700241481678408;
138 std::vector<std::string> seqs;
142 const char *filename=
"/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
143 std::ifstream fin(filename);
145 SG_DEBUG(
"Reading profiles from %s\n", filename)
149 std::getline(fin, line);
153 int idx = line.find_first_of(
' ');
155 std::getline(fin, line);
156 std::string orig_sequence = line;
157 std::string sequence=
"";
159 int len_line = line.length();
163 std::getline(fin, line);
164 std::getline(fin, line);
165 std::getline(fin, line);
167 profiles.push_back(std::vector<double>());
169 std::vector<double>& curr_profile =
profiles.back();
170 for (
int i=0; i < len_line; ++i)
172 std::getline(fin, line);
173 int a = line.find_first_not_of(
' ');
174 int b = line.find_first_of(
' ', a);
175 a = line.find_first_not_of(
' ', b);
176 b = line.find_first_of(
' ', a);
177 std::string aa=line.substr(a,b-a);
180 int pos = seqs.size()+1;
181 SG_DEBUG(
"Skipping aa in sequence %d\n", pos)
188 a = line.find_first_not_of(
' ', b);
189 b = line.find_first_of(
' ', a);
191 for (
int j=0; j < 19; ++j)
193 a = line.find_first_not_of(
' ', b);
194 b = line.find_first_of(
' ', a);
199 for (
int j=0; j < 20; ++j)
201 a = line.find_first_not_of(
' ', b);
202 b = line.find_first_of(
' ', a);
203 double p = atof(line.substr(a, b-a).c_str());
208 double value = -1* std::log(C*(p/100)+(1-C)*background[j]);
209 curr_profile.push_back(value);
215 SG_DEBUG(
">>>>>>>>>>>>>>> all zeros")
216 if (aa !=
"B" && aa !=
"X" && aa !=
"Z")
219 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
220 double value = -1* std::log(C+(1-C)*background[aa_index]);
222 curr_profile[(i*20) + aa_index] = value;
223 SG_DEBUG(
">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value)
237 if (curr_profile.size() != 20 * sequence.length())
239 SG_ERROR(
"Something's wrong with the profile.\n")
243 seqs.push_back(sequence);
265 int len = seqs[i].length();
268 strcpy(
sequences[i].
string, seqs[i].c_str());
270 if (len > max_len) max_len = len;
288 int32_t lhs_changed=(
lhs!=l);
289 int32_t rhs_changed=(
rhs!=r);
293 SG_DEBUG(
"lhs_changed: %i\n", lhs_changed)
294 SG_DEBUG(
"rhs_changed: %i\n", rhs_changed)
324 if (c<65 || c>89 || c==
'B' || c==
'J' || c==
'O' || c==
'U' || c==
'X' || c==
'Z')
334 for (
int i=0; i<seq_degree; i++)
336 if (!
isaa(path[i])||!
isaa(joint_seq[index+i]))
341 diff -= 2*
AA_matrix.
matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
342 diff +=
AA_matrix.
matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
344 fprintf(stderr,
"nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
348 return exp( - diff/
width) ;
360 for (int32_t i=0; i<alen; i++)
362 for (int32_t j=0; j<blen; j++)
406 void CSpectrumRBFKernel::init()