00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/features/SNPFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/features/Alphabet.h>
00014
00015 using namespace shogun;
00016
00017 CSNPFeatures::CSNPFeatures()
00018 {
00019 SG_UNSTABLE("CSNPFeatures::CSNPFeatures(void)", "\n");
00020
00021 strings = NULL;
00022
00023 string_length = 0;
00024 num_strings = 0;
00025 w_dim = 0;
00026
00027 normalization_const = 0.0;
00028
00029 m_str_min = NULL;
00030 m_str_maj = NULL;
00031 }
00032
00033 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(),
00034 m_str_min(NULL), m_str_maj(NULL)
00035 {
00036 ASSERT(str);
00037 ASSERT(str->have_same_length());
00038 SG_REF(str);
00039
00040 strings=str;
00041 string_length=str->get_max_vector_length();
00042 ASSERT((string_length & 1) == 0);
00043 w_dim=3*string_length/2;
00044 num_strings=str->get_num_vectors();
00045 CAlphabet* alpha=str->get_alphabet();
00046 ASSERT(alpha->get_alphabet()==SNP);
00047 SG_UNREF(alpha);
00048
00049 obtain_base_strings();
00050 set_normalization_const();
00051
00052 }
00053
00054 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig)
00055 : CDotFeatures(orig), strings(orig.strings),
00056 normalization_const(orig.normalization_const),
00057 m_str_min(NULL), m_str_maj(NULL)
00058 {
00059 SG_REF(strings);
00060 string_length=strings->get_max_vector_length();
00061 ASSERT((string_length & 1) == 0);
00062 w_dim=3*string_length;
00063 num_strings=strings->get_num_vectors();
00064 CAlphabet* alpha=strings->get_alphabet();
00065 SG_UNREF(alpha);
00066
00067 obtain_base_strings();
00068 }
00069
00070 CSNPFeatures::~CSNPFeatures()
00071 {
00072 SG_UNREF(strings);
00073 }
00074
00075 float64_t CSNPFeatures::dot(int32_t idx_a, CDotFeatures* df, int32_t idx_b)
00076 {
00077 ASSERT(df);
00078 ASSERT(df->get_feature_type() == get_feature_type());
00079 ASSERT(df->get_feature_class() == get_feature_class());
00080 CSNPFeatures* sf=(CSNPFeatures*) df;
00081
00082 int32_t alen, blen;
00083 bool free_avec, free_bvec;
00084
00085 uint8_t* avec = strings->get_feature_vector(idx_a, alen, free_avec);
00086 uint8_t* bvec = sf->strings->get_feature_vector(idx_b, blen, free_bvec);
00087
00088 ASSERT(alen==blen);
00089 if (alen!=string_length)
00090 SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length);
00091 ASSERT(m_str_min);
00092 ASSERT(m_str_maj);
00093
00094 float64_t total=0;
00095
00096 for (int32_t i = 0; i<alen-1; i+=2)
00097 {
00098 int32_t sumaa=0;
00099 int32_t sumbb=0;
00100 int32_t sumab=0;
00101
00102 uint8_t a1=avec[i];
00103 uint8_t a2=avec[i+1];
00104 uint8_t b1=bvec[i];
00105 uint8_t b2=bvec[i+1];
00106
00107 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00108 sumab++;
00109 else if (a1==a2 && b1==b2)
00110 {
00111 if (a1!=b1)
00112 continue;
00113
00114 if (a1==m_str_min[i])
00115 sumaa++;
00116 else if (a1==m_str_maj[i])
00117 sumbb++;
00118 else
00119 {
00120 SG_ERROR("The impossible happened i=%d a1=%c "
00121 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]);
00122 }
00123
00124 }
00125 total+=sumaa+sumbb+sumab;
00126 }
00127
00128 strings->free_feature_vector(avec, idx_a, free_avec);
00129 sf->strings->free_feature_vector(bvec, idx_b, free_bvec);
00130 return total;
00131 }
00132
00133 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00134 {
00135 if (vec2_len != w_dim)
00136 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00137
00138 float64_t sum=0;
00139 int32_t len;
00140 bool free_vec1;
00141 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00142 int32_t offs=0;
00143
00144 for (int32_t i=0; i<len; i+=2)
00145 {
00146 int32_t dim=0;
00147
00148 char a1=vec[i];
00149 char a2=vec[i+1];
00150
00151 if (a1==a2 && a1!='0' && a2!='0')
00152 {
00153 if (a1==m_str_min[i])
00154 dim=1;
00155 else if (a1==m_str_maj[i])
00156 dim=2;
00157 else
00158 {
00159 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00160 i, a1,a2, m_str_min[i], m_str_maj[i]);
00161 }
00162 }
00163
00164 sum+=vec2[offs+dim];
00165 offs+=3;
00166 }
00167 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00168
00169 return sum/normalization_const;
00170 }
00171
00172 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00173 {
00174 if (vec2_len != w_dim)
00175 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00176
00177 int32_t len;
00178 bool free_vec1;
00179 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00180 int32_t offs=0;
00181
00182 if (abs_val)
00183 alpha=CMath::abs(alpha);
00184
00185 for (int32_t i=0; i<len; i+=2)
00186 {
00187 int32_t dim=0;
00188
00189 char a1=vec[i];
00190 char a2=vec[i+1];
00191
00192 if (a1==a2 && a1!='0' && a2!='0')
00193 {
00194 if (a1==m_str_min[i])
00195 dim=1;
00196 else if (a1==m_str_maj[i])
00197 dim=2;
00198 else
00199 {
00200 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00201 i, a1,a2, m_str_min[i], m_str_maj[i]);
00202 }
00203 }
00204
00205 vec2[offs+dim]+=alpha;
00206 offs+=3;
00207 }
00208 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00209 }
00210
00211 void CSNPFeatures::find_minor_major_strings(uint8_t* minor, uint8_t* major)
00212 {
00213 for (int32_t i=0; i<num_strings; i++)
00214 {
00215 int32_t len;
00216 bool free_vec;
00217 uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec);
00218 ASSERT(string_length==len);
00219
00220 for (int32_t j=0; j<len; j++)
00221 {
00222
00223 if (vec[j]=='0')
00224 continue;
00225
00226 if (minor[j]==0)
00227 minor[j]=vec[j];
00228 else if (major[j]==0 && vec[j]!=minor[j])
00229 major[j]=vec[j];
00230 }
00231
00232 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec);
00233 }
00234 }
00235
00236 void CSNPFeatures::obtain_base_strings(CSNPFeatures* snp)
00237 {
00238 SG_FREE(m_str_min);
00239 SG_FREE(m_str_maj);
00240 size_t tlen=(string_length+1)*sizeof(uint8_t);
00241
00242 m_str_min=SG_CALLOC(uint8_t, tlen);
00243 m_str_maj=SG_CALLOC(uint8_t, tlen);
00244
00245 find_minor_major_strings(m_str_min, m_str_maj);
00246
00247 if (snp)
00248 snp->find_minor_major_strings(m_str_min, m_str_maj);
00249
00250 for (int32_t j=0; j<string_length; j++)
00251 {
00252
00253 if (m_str_min[j]==0)
00254 m_str_min[j]='0';
00255 if (m_str_maj[j]==0)
00256 m_str_maj[j]='0';
00257
00258 if (m_str_min[j]>m_str_maj[j])
00259 CMath::swap(m_str_min[j], m_str_maj[j]);
00260 }
00261 }
00262
00263 void CSNPFeatures::set_normalization_const(float64_t n)
00264 {
00265 if (n==0)
00266 {
00267 normalization_const=string_length;
00268 normalization_const=CMath::sqrt(normalization_const);
00269 }
00270 else
00271 normalization_const=n;
00272
00273 SG_DEBUG("normalization_const:%f\n", normalization_const);
00274 }
00275
00276 void* CSNPFeatures::get_feature_iterator(int32_t vector_index)
00277 {
00278 return NULL;
00279 }
00280
00281 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00282 {
00283 return false;
00284 }
00285
00286 void CSNPFeatures::free_feature_iterator(void* iterator)
00287 {
00288 }
00289
00290 CFeatures* CSNPFeatures::duplicate() const
00291 {
00292 return new CSNPFeatures(*this);
00293 }
00294
00295 SGMatrix<float64_t> CSNPFeatures::get_histogram(bool normalize)
00296 {
00297 int32_t nsym=3;
00298 float64_t* h= SG_CALLOC(float64_t, size_t(nsym)*string_length/2);
00299
00300 float64_t* h_normalizer=SG_MALLOC(float64_t, string_length/2);
00301 memset(h_normalizer, 0, string_length/2*sizeof(float64_t));
00302 int32_t num_str=get_num_vectors();
00303 for (int32_t i=0; i<num_str; i++)
00304 {
00305 int32_t len;
00306 bool free_vec;
00307 uint8_t* vec = strings->get_feature_vector(i, len, free_vec);
00308
00309 for (int32_t j=0; j<len; j+=2)
00310 {
00311 int32_t dim=0;
00312
00313 char a1=vec[j];
00314 char a2=vec[j+1];
00315
00316 if (a1==a2 && a1!='0' && a2!='0')
00317 {
00318 if (a1==m_str_min[j])
00319 dim=1;
00320 else if (a1==m_str_maj[j])
00321 dim=2;
00322 else
00323 {
00324 SG_ERROR("The impossible happened j=%d a1=%c a2=%c min=%c maj=%c\n",
00325 j, a1,a2, m_str_min[j], m_str_maj[j]);
00326 }
00327 }
00328
00329 h[int64_t(j/2)*nsym+dim]++;
00330 h_normalizer[j/2]++;
00331 }
00332
00333 strings->free_feature_vector(vec, i, free_vec);
00334 }
00335
00336 if (normalize)
00337 {
00338 for (int32_t i=0; i<string_length/2; i++)
00339 {
00340 for (int32_t j=0; j<nsym; j++)
00341 {
00342 if (h_normalizer && h_normalizer[i])
00343 h[int64_t(i)*nsym+j]/=h_normalizer[i];
00344 }
00345 }
00346 }
00347 SG_FREE(h_normalizer);
00348
00349 return SGMatrix<float64_t>(h, nsym, string_length/2);
00350 }
00351
00352 SGMatrix<float64_t> CSNPFeatures::get_2x3_table(CSNPFeatures* pos, CSNPFeatures* neg)
00353 {
00354
00355 ASSERT(pos->strings->get_max_vector_length() == neg->strings->get_max_vector_length());
00356 int32_t len=pos->strings->get_max_vector_length();
00357
00358 float64_t* table=SG_MALLOC(float64_t, 3*2*len/2);
00359
00360 SGMatrix<float64_t> p_hist=pos->get_histogram(false);
00361 SGMatrix<float64_t> n_hist=neg->get_histogram(false);
00362
00363
00364 for (int32_t i=0; i<3*len/2; i++)
00365 {
00366 table[2*i]=p_hist.matrix[i];
00367 table[2*i+1]=n_hist.matrix[i];
00368 }
00369 return SGMatrix<float64_t>(table, 2,3*len/2);
00370 }