48 :
CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
73 :
CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
94 int32_t aa_to_index[128];
95 aa_to_index[(uint8_t)
'A'] = 0;
96 aa_to_index[(uint8_t)
'R'] = 1;
97 aa_to_index[(uint8_t)
'N'] = 2;
98 aa_to_index[(uint8_t)
'D'] = 3;
99 aa_to_index[(uint8_t)
'C'] = 4;
100 aa_to_index[(uint8_t)
'Q'] = 5;
101 aa_to_index[(uint8_t)
'E'] = 6;
102 aa_to_index[(uint8_t)
'G'] = 7;
103 aa_to_index[(uint8_t)
'H'] = 8;
104 aa_to_index[(uint8_t)
'I'] = 9;
105 aa_to_index[(uint8_t)
'L'] = 10;
106 aa_to_index[(uint8_t)
'K'] = 11;
107 aa_to_index[(uint8_t)
'M'] = 12;
108 aa_to_index[(uint8_t)
'F'] = 13;
109 aa_to_index[(uint8_t)
'P'] = 14;
110 aa_to_index[(uint8_t)
'S'] = 15;
111 aa_to_index[(uint8_t)
'T'] = 16;
112 aa_to_index[(uint8_t)
'W'] = 17;
113 aa_to_index[(uint8_t)
'Y'] = 18;
114 aa_to_index[(uint8_t)
'V'] = 19;
115 SG_DEBUG(
"initializing background\n")
116 double background[20];
117 background[0]=0.0799912015849807;
118 background[1]=0.0484482507611578;
119 background[2]=0.044293531582512;
120 background[3]=0.0578891399707563;
121 background[4]=0.0171846021407367;
122 background[5]=0.0380578923048682;
123 background[6]=0.0638169929675978;
124 background[7]=0.0760659374742852;
125 background[8]=0.0223465499452473;
126 background[9]=0.0550905793661343;
127 background[10]=0.0866897071203864;
128 background[11]=0.060458245507428;
129 background[12]=0.0215379186368154;
130 background[13]=0.0396348024787477;
131 background[14]=0.0465746314476874;
132 background[15]=0.0630028230885602;
133 background[16]=0.0580394726014824;
134 background[17]=0.0144991866213453;
135 background[18]=0.03635438623143;
136 background[19]=0.0700241481678408;
139 std::vector<std::string> seqs;
143 const char *filename=
"/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
144 std::ifstream fin(filename);
146 SG_DEBUG(
"Reading profiles from %s\n", filename)
150 std::getline(fin, line);
154 int idx = line.find_first_of(
' ');
156 std::getline(fin, line);
157 std::string orig_sequence = line;
158 std::string sequence=
"";
160 int len_line = line.length();
164 std::getline(fin, line);
165 std::getline(fin, line);
166 std::getline(fin, line);
168 profiles.push_back(std::vector<double>());
170 std::vector<double>& curr_profile =
profiles.back();
171 for (
int i=0; i < len_line; ++i)
173 std::getline(fin, line);
174 int a = line.find_first_not_of(
' ');
175 int b = line.find_first_of(
' ', a);
176 a = line.find_first_not_of(
' ', b);
177 b = line.find_first_of(
' ', a);
178 std::string aa=line.substr(a,b-a);
181 int pos = seqs.size()+1;
182 SG_DEBUG(
"Skipping aa in sequence %d\n", pos)
189 a = line.find_first_not_of(
' ', b);
190 b = line.find_first_of(
' ', a);
192 for (
int j=0; j < 19; ++j)
194 a = line.find_first_not_of(
' ', b);
195 b = line.find_first_of(
' ', a);
200 for (
int j=0; j < 20; ++j)
202 a = line.find_first_not_of(
' ', b);
203 b = line.find_first_of(
' ', a);
204 double p = atof(line.substr(a, b-a).c_str());
209 double value = -1* std::log(C*(p/100)+(1-C)*background[j]);
210 curr_profile.push_back(value);
216 SG_DEBUG(
">>>>>>>>>>>>>>> all zeros")
217 if (aa !=
"B" && aa !=
"X" && aa !=
"Z")
220 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
221 double value = -1* std::log(C+(1-C)*background[aa_index]);
223 curr_profile[(i*20) + aa_index] = value;
224 SG_DEBUG(
">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value)
238 if (curr_profile.size() != 20 * sequence.length())
240 SG_ERROR(
"Something's wrong with the profile.\n")
244 seqs.push_back(sequence);
266 int len = seqs[i].length();
269 strcpy(
sequences[i].
string, seqs[i].c_str());
271 if (len > max_len) max_len = len;
289 int32_t lhs_changed=(
lhs!=l);
290 int32_t rhs_changed=(
rhs!=r);
294 SG_DEBUG(
"lhs_changed: %i\n", lhs_changed)
295 SG_DEBUG(
"rhs_changed: %i\n", rhs_changed)
325 if (c<65 || c>89 || c==
'B' || c==
'J' || c==
'O' || c==
'U' || c==
'X' || c==
'Z')
335 for (
int i=0; i<seq_degree; i++)
337 if (!
isaa(path[i])||!
isaa(joint_seq[index+i]))
342 diff -= 2*
AA_matrix.
matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
343 diff +=
AA_matrix.
matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
345 fprintf(stderr,
"nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
349 return exp( - diff/
width) ;
361 for (int32_t i=0; i<alen; i++)
363 for (int32_t j=0; j<blen; j++)
407 void CSpectrumRBFKernel::init()
template class SGStringList
virtual bool init(CFeatures *l, CFeatures *r)
EAlphabet get_alphabet() const
The class Alphabet implements an alphabet and alphabet utility functions.
int32_t max_sequence_length
float64_t AA_helper(const char *path, const int degree, const char *joint_seq, unsigned int index)
CStringFeatures< char > * string_features
bool set_AA_matrix(float64_t *AA_matrix_)
void read_profiles_and_sequences()
Class SGObject is the base class of all shogun objects.
SGString< char > * sequences
std::vector< std::vector< float64_t > > profiles
CAlphabet * get_alphabet()
virtual void register_param()
index_t max_string_length
virtual bool init_normalizer()
CFeatures * rhs
Feature vectors to occur on the right-hand side.
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
All classes and functions are contained in the shogun namespace.
CFeatures * lhs
Feature vectors to occur on the left-hand side.
static int is_nan(double f)
Checks whether a float is NaN.
The class Features is the base class of all feature objects.
virtual ~CSpectrumRBFKernel()
float64_t compute(int32_t idx_a, int32_t idx_b)
The template class StringKernel is the base class of all string kernels.
SGMatrix< float64_t > AA_matrix
std::vector< std::string > sequence_labels