00001 #include <shogun/features/StringFeatures.h>
00002 #include <shogun/preprocessor/Preprocessor.h>
00003 #include <shogun/preprocessor/StringPreprocessor.h>
00004 #include <shogun/io/MemoryMappedFile.h>
00005 #include <shogun/io/SGIO.h>
00006 #include <shogun/mathematics/Math.h>
00007 #include <shogun/base/Parameter.h>
00008
00009 #include <sys/types.h>
00010 #include <sys/stat.h>
00011 #include <dirent.h>
00012 #include <stdio.h>
00013 #include <stdlib.h>
00014 #include <unistd.h>
00015
00016
00017 namespace shogun
00018 {
00019
00020 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0)
00021 {
00022 init();
00023 alphabet=new CAlphabet();
00024 }
00025
00026 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0)
00027 {
00028 init();
00029
00030 alphabet=new CAlphabet(alpha);
00031 SG_REF(alphabet);
00032 num_symbols=alphabet->get_num_symbols();
00033 original_num_symbols=num_symbols;
00034 }
00035
00036 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00037 : CFeatures(0)
00038 {
00039 init();
00040
00041 alphabet=new CAlphabet(alpha);
00042 SG_REF(alphabet);
00043 num_symbols=alphabet->get_num_symbols();
00044 original_num_symbols=num_symbols;
00045 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00046 }
00047
00048 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00049 : CFeatures(0)
00050 {
00051 init();
00052
00053 alphabet=new CAlphabet(alpha);
00054 SG_REF(alphabet);
00055 num_symbols=alphabet->get_num_symbols();
00056 original_num_symbols=num_symbols;
00057 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00058 }
00059
00060 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha)
00061 : CFeatures(0)
00062 {
00063 init();
00064
00065 ASSERT(alpha);
00066 SG_REF(alpha);
00067 alphabet=alpha;
00068 num_symbols=alphabet->get_num_symbols();
00069 original_num_symbols=num_symbols;
00070 }
00071
00072 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
00073 : CFeatures(orig), num_vectors(orig.num_vectors),
00074 single_string(orig.single_string),
00075 length_of_single_string(orig.length_of_single_string),
00076 max_string_length(orig.max_string_length),
00077 num_symbols(orig.num_symbols),
00078 original_num_symbols(orig.original_num_symbols),
00079 order(orig.order), preprocess_on_get(false),
00080 feature_cache(NULL)
00081 {
00082 init();
00083
00084 ASSERT(orig.single_string == NULL);
00085
00086 alphabet=orig.alphabet;
00087 SG_REF(alphabet);
00088
00089 if (orig.features)
00090 {
00091 features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00092
00093 for (int32_t i=0; i<num_vectors; i++)
00094 {
00095 features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00096 features[i].slen=orig.features[i].slen;
00097 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00098 }
00099 }
00100
00101 if (orig.symbol_mask_table)
00102 {
00103 symbol_mask_table=SG_MALLOC(ST, 256);
00104 for (int32_t i=0; i<256; i++)
00105 symbol_mask_table[i]=orig.symbol_mask_table[i];
00106 }
00107
00108 m_subset=orig.m_subset->duplicate();
00109 }
00110
00111 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
00112 : CFeatures(loader), num_vectors(0),
00113 features(NULL), single_string(NULL), length_of_single_string(0),
00114 max_string_length(0), order(0),
00115 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00116 {
00117 init();
00118
00119 alphabet=new CAlphabet(alpha);
00120 SG_REF(alphabet);
00121 num_symbols=alphabet->get_num_symbols();
00122 original_num_symbols=num_symbols;
00123 load(loader);
00124 }
00125
00126 template<class ST> CStringFeatures<ST>::~CStringFeatures()
00127 {
00128 cleanup();
00129
00130 SG_UNREF(alphabet);
00131 }
00132
00133 template<class ST> void CStringFeatures<ST>::cleanup()
00134 {
00135 remove_subset();
00136
00137 if (single_string)
00138 {
00139 SG_FREE(single_string);
00140 single_string=NULL;
00141 }
00142 else
00143 cleanup_feature_vectors(0, num_vectors-1);
00144
00145 num_vectors=0;
00146 SG_FREE(features);
00147 SG_FREE(symbol_mask_table);
00148 features=NULL;
00149 symbol_mask_table=NULL;
00150
00151
00152
00153
00154
00155 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00156 SG_UNREF(alphabet);
00157 alphabet=alpha;
00158 SG_REF(alphabet);
00159 }
00160
00161 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
00162 {
00163 ASSERT(num<get_num_vectors());
00164
00165 if (features)
00166 {
00167 int32_t real_num=subset_idx_conversion(num);
00168 SG_FREE(features[real_num].string);
00169 features[real_num].string=NULL;
00170 features[real_num].slen=0;
00171
00172 determine_maximum_string_length();
00173 }
00174 }
00175
00176 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
00177 {
00178 if (features && get_num_vectors())
00179 {
00180 ASSERT(start<get_num_vectors());
00181 ASSERT(stop<get_num_vectors());
00182
00183 for (int32_t i=start; i<=stop; i++)
00184 {
00185 int32_t real_num=subset_idx_conversion(i);
00186 SG_FREE(features[real_num].string);
00187 features[real_num].string=NULL;
00188 features[real_num].slen=0;
00189 }
00190 determine_maximum_string_length();
00191 }
00192 }
00193
00194 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() { return C_STRING; }
00195
00196 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() { return F_UNKNOWN; }
00197
00198 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet()
00199 {
00200 SG_REF(alphabet);
00201 return alphabet;
00202 }
00203
00204 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
00205 {
00206 return new CStringFeatures<ST>(*this);
00207 }
00208
00209 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num)
00210 {
00211 ASSERT(features);
00212 if (num>=get_num_vectors())
00213 {
00214 SG_ERROR("Index out of bounds (number of strings %d, you "
00215 "requested %d)\n", get_num_vectors(), num);
00216 }
00217
00218 int32_t l;
00219 bool free_vec;
00220 ST* vec=get_feature_vector(num, l, free_vec);
00221 ST* dst=SG_MALLOC(ST, l);
00222 memcpy(dst, vec, l*sizeof(ST));
00223 free_feature_vector(vec, num, free_vec);
00224 return SGVector<ST>(dst, l);
00225 }
00226
00227 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
00228 {
00229 ASSERT(features);
00230
00231 if (m_subset)
00232 SG_ERROR("A subset is set, cannot set feature vector\n");
00233
00234 if (num>=num_vectors)
00235 {
00236 SG_ERROR("Index out of bounds (number of strings %d, you "
00237 "requested %d)\n", num_vectors, num);
00238 }
00239
00240 if (vector.vlen<=0)
00241 SG_ERROR("String has zero or negative length\n");
00242
00243 cleanup_feature_vector(num);
00244 features[num].slen=vector.vlen;
00245 features[num].string=SG_MALLOC(ST, vector.vlen);
00246 memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00247
00248 determine_maximum_string_length();
00249 }
00250
00251 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing()
00252 {
00253 preprocess_on_get=true;
00254 }
00255
00256 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing()
00257 {
00258 preprocess_on_get=false;
00259 }
00260
00261 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00262 {
00263 ASSERT(features);
00264 ASSERT(num<get_num_vectors());
00265
00266
00267 int32_t real_num=subset_idx_conversion(num);
00268
00269 if (!preprocess_on_get)
00270 {
00271 dofree=false;
00272 len=features[real_num].slen;
00273 return features[real_num].string;
00274 }
00275 else
00276 {
00277 SG_DEBUG( "computing feature vector!\n") ;
00278 ST* feat=compute_feature_vector(num, len);
00279 dofree=true;
00280
00281 if (get_num_preprocessors())
00282 {
00283 ST* tmp_feat_before=feat;
00284
00285 for (int32_t i=0; i<get_num_preprocessors(); i++)
00286 {
00287 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00288 feat=p->apply_to_string(tmp_feat_before, len);
00289 SG_UNREF(p);
00290 SG_FREE(tmp_feat_before);
00291 tmp_feat_before=feat;
00292 }
00293 }
00294
00295 return feat;
00296 }
00297 }
00298
00299 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed()
00300 {
00301 int32_t num_feat;
00302 int32_t num_vec;
00303 SGString<ST>* s=get_transposed(num_feat, num_vec);
00304 SGStringList<ST> string_list;
00305 string_list.strings = s;
00306 string_list.num_strings = num_vec;
00307 string_list.max_string_length = num_feat;
00308
00309 return new CStringFeatures<ST>(string_list, alphabet);
00310 }
00311
00312 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00313 {
00314 num_feat=get_num_vectors();
00315 num_vec=get_max_vector_length();
00316 ASSERT(have_same_length());
00317
00318 SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00319 int64_t(num_feat)*num_vec);
00320
00321 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00322
00323 for (int32_t i=0; i<num_vec; i++)
00324 {
00325 sf[i].string=SG_MALLOC(ST, num_feat);
00326 sf[i].slen=num_feat;
00327 }
00328
00329 for (int32_t i=0; i<num_feat; i++)
00330 {
00331 int32_t len=0;
00332 bool free_vec=false;
00333 ST* vec=get_feature_vector(i, len, free_vec);
00334
00335 for (int32_t j=0; j<num_vec; j++)
00336 sf[j].string[i]=vec[j];
00337
00338 free_feature_vector(vec, i, free_vec);
00339 }
00340 return sf;
00341 }
00342
00343 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00344 {
00345 if (num>=get_num_vectors())
00346 {
00347 SG_ERROR(
00348 "Trying to access string[%d] but num_str=%d\n", num,
00349 get_num_vectors());
00350 }
00351
00352 int32_t real_num=subset_idx_conversion(num);
00353
00354 if (feature_cache)
00355 feature_cache->unlock_entry(real_num);
00356
00357 if (dofree)
00358 SG_FREE(feat_vec);
00359 }
00360
00361 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00362 {
00363 if (num>=get_num_vectors())
00364 {
00365 SG_ERROR(
00366 "Trying to access string[%d] but num_str=%d\n", num,
00367 get_num_vectors());
00368 }
00369
00370 int32_t real_num=subset_idx_conversion(num);
00371
00372 if (feature_cache)
00373 feature_cache->unlock_entry(real_num);
00374
00375 if (feat_vec.do_free)
00376 SG_FREE(feat_vec.vector);
00377 }
00378
00379 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
00380 {
00381 ASSERT(vec_num<get_num_vectors());
00382
00383 int32_t len;
00384 bool free_vec;
00385 ST* vec=get_feature_vector(vec_num, len, free_vec);
00386 ASSERT(feat_num<len);
00387 ST result=vec[feat_num];
00388 free_feature_vector(vec, vec_num, free_vec);
00389
00390 return result;
00391 }
00392
00393 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
00394 {
00395 ASSERT(vec_num<get_num_vectors());
00396
00397 int32_t len;
00398 bool free_vec;
00399 ST* vec=get_feature_vector(vec_num, len, free_vec);
00400 free_feature_vector(vec, vec_num, free_vec);
00401 return len;
00402 }
00403
00404 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
00405 {
00406 return max_string_length;
00407 }
00408
00409 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
00410 {
00411 return m_subset ? m_subset->get_size() : num_vectors;
00412 }
00413
00414 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
00415
00416 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00417
00418 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
00419
00420 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
00421
00422 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
00423 {
00424 ASSERT(symbol_mask_table);
00425 return symbol_mask_table[mask] & symbol;
00426 }
00427
00428 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
00429 {
00430 ASSERT(alphabet);
00431 return (offset << (amount*alphabet->get_num_bits()));
00432 }
00433
00434 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
00435 {
00436 ASSERT(alphabet);
00437 return (symbol >> (amount*alphabet->get_num_bits()));
00438 }
00439
00440 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
00441 EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00442 {
00443 remove_subset();
00444
00445 size_t blocksize=1024*1024;
00446 size_t required_blocksize=0;
00447 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00448 uint8_t* overflow=NULL;
00449 int32_t overflow_len=0;
00450
00451 cleanup();
00452
00453 CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00454 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00455
00456 FILE* f=fopen(fname, "ro");
00457
00458 if (f)
00459 {
00460 num_vectors=0;
00461 max_string_length=0;
00462
00463 SG_INFO("counting line numbers in file %s\n", fname);
00464 size_t block_offs=0;
00465 size_t old_block_offs=0;
00466 fseek(f, 0, SEEK_END);
00467 size_t fsize=ftell(f);
00468 rewind(f);
00469
00470 if (blocksize>fsize)
00471 blocksize=fsize;
00472
00473 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00474
00475 size_t sz=blocksize;
00476 while (sz == blocksize)
00477 {
00478 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00479 for (size_t i=0; i<sz; i++)
00480 {
00481 block_offs++;
00482 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00483 {
00484 num_vectors++;
00485 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00486 old_block_offs=block_offs;
00487 }
00488 }
00489 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00490 }
00491
00492 SG_INFO("found %d strings\n", num_vectors);
00493 SG_FREE(dummy);
00494 blocksize=required_blocksize;
00495 dummy=SG_MALLOC(uint8_t, blocksize);
00496 overflow=SG_MALLOC(uint8_t, blocksize);
00497 features=SG_MALLOC(SGString<ST>, num_vectors);
00498
00499 rewind(f);
00500 sz=blocksize;
00501 int32_t lines=0;
00502 while (sz == blocksize)
00503 {
00504 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00505
00506 size_t old_sz=0;
00507 for (size_t i=0; i<sz; i++)
00508 {
00509 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00510 {
00511 int32_t len=i-old_sz;
00512
00513 max_string_length=CMath::max(max_string_length, len+overflow_len);
00514
00515 features[lines].slen=len;
00516 features[lines].string=SG_MALLOC(ST, len);
00517
00518 if (remap_to_bin)
00519 {
00520 for (int32_t j=0; j<overflow_len; j++)
00521 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00522 for (int32_t j=0; j<len; j++)
00523 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00524 alpha->add_string_to_histogram(&dummy[old_sz], len);
00525 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00526 }
00527 else
00528 {
00529 for (int32_t j=0; j<overflow_len; j++)
00530 features[lines].string[j]=overflow[j];
00531 for (int32_t j=0; j<len; j++)
00532 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00533 alpha->add_string_to_histogram(&dummy[old_sz], len);
00534 alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00535 }
00536
00537
00538 overflow_len=0;
00539
00540
00541 old_sz=i+1;
00542 lines++;
00543 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00544 }
00545 }
00546 for (size_t i=old_sz; i<sz; i++)
00547 overflow[i-old_sz]=dummy[i];
00548
00549 overflow_len=sz-old_sz;
00550 }
00551
00552 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00553 {
00554 SG_INFO("file successfully read\n");
00555 SG_INFO("max_string_length=%d\n", max_string_length);
00556 SG_INFO("num_strings=%d\n", num_vectors);
00557 }
00558 fclose(f);
00559 }
00560
00561 SG_FREE(dummy);
00562
00563 SG_UNREF(alphabet);
00564
00565 if (remap_to_bin)
00566 alphabet=alpha_bin;
00567 else
00568 alphabet=alpha;
00569 SG_REF(alphabet);
00570 num_symbols=alphabet->get_num_symbols();
00571 }
00572
00573 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
00574 {
00575 remove_subset();
00576
00577 int32_t i=0;
00578 uint64_t len=0;
00579 uint64_t offs=0;
00580 int32_t num=0;
00581 int32_t max_len=0;
00582
00583 CMemoryMappedFile<char> f(fname);
00584
00585 while (true)
00586 {
00587 char* s=f.get_line(len, offs);
00588 if (!s)
00589 break;
00590
00591 if (len>0 && s[0]=='>')
00592 num++;
00593 }
00594
00595 if (num==0)
00596 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00597
00598 cleanup();
00599 SG_UNREF(alphabet);
00600 alphabet=new CAlphabet(DNA);
00601 num_symbols=alphabet->get_num_symbols();
00602
00603 SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00604 offs=0;
00605
00606 for (i=0;i<num; i++)
00607 {
00608 uint64_t id_len=0;
00609 char* id=f.get_line(id_len, offs);
00610
00611 char* fasta=f.get_line(len, offs);
00612 char* s=fasta;
00613 int32_t fasta_len=0;
00614 int32_t spanned_lines=0;
00615
00616 while (true)
00617 {
00618 if (!s || len==0)
00619 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00620
00621 if (s[0]=='>' || offs==f.get_size())
00622 {
00623 offs-=len+1;
00624 if (offs==f.get_size())
00625 {
00626 SG_DEBUG("at EOF\n");
00627 fasta_len+=len;
00628 }
00629
00630 len=fasta_len-spanned_lines;
00631 strings[i].string=SG_MALLOC(ST, len);
00632 strings[i].slen=len;
00633
00634 ST* str=strings[i].string;
00635 int32_t idx=0;
00636 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00637
00638 for (int32_t j=0; j<fasta_len; j++)
00639 {
00640 if (fasta[j]=='\n')
00641 continue;
00642
00643 ST c=(ST) fasta[j];
00644
00645 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
00646 c=(ST) 'A';
00647
00648 if (uint64_t(idx)>=len)
00649 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00650 str[idx++]=c;
00651 }
00652 max_len=CMath::max(max_len, strings[i].slen);
00653
00654
00655 break;
00656 }
00657
00658 spanned_lines++;
00659 fasta_len+=len+1;
00660 s=f.get_line(len, offs);
00661 }
00662 }
00663 return set_features(strings, num, max_len);
00664 }
00665
00666 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
00667 bool ignore_invalid, bool bitremap_in_single_string)
00668 {
00669 remove_subset();
00670
00671 CMemoryMappedFile<char> f(fname);
00672
00673 int32_t i=0;
00674 uint64_t len=0;
00675 uint64_t offs=0;
00676
00677 int32_t num=f.get_num_lines();
00678 int32_t max_len=0;
00679
00680 if (num%4)
00681 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00682 num/=4;
00683
00684 cleanup();
00685 SG_UNREF(alphabet);
00686 alphabet=new CAlphabet(DNA);
00687
00688 SGString<ST>* strings;
00689
00690 ST* str=NULL;
00691 if (bitremap_in_single_string)
00692 {
00693 strings=SG_MALLOC(SGString<ST>, 1);
00694 strings[0].string=SG_MALLOC(ST, num);
00695 strings[0].slen=num;
00696 f.get_line(len, offs);
00697 f.get_line(len, offs);
00698 order=len;
00699 max_len=num;
00700 offs=0;
00701 original_num_symbols=alphabet->get_num_symbols();
00702 str=SG_MALLOC(ST, len);
00703 }
00704 else
00705 strings=SG_MALLOC(SGString<ST>, num);
00706
00707 for (i=0;i<num; i++)
00708 {
00709 if (!f.get_line(len, offs))
00710 SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00711
00712 char* s=f.get_line(len, offs);
00713 if (!s || len==0)
00714 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00715
00716 if (bitremap_in_single_string)
00717 {
00718 if (len!=(uint64_t) order)
00719 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00720 for (int32_t j=0; j<order; j++)
00721 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00722
00723 strings[0].string[i]=embed_word(str, order);
00724 }
00725 else
00726 {
00727 strings[i].string=SG_MALLOC(ST, len);
00728 strings[i].slen=len;
00729 str=strings[i].string;
00730
00731 if (ignore_invalid)
00732 {
00733 for (uint64_t j=0; j<len; j++)
00734 {
00735 if (alphabet->is_valid((uint8_t) s[j]))
00736 str[j]= (ST) s[j];
00737 else
00738 str[j]= (ST) 'A';
00739 }
00740 }
00741 else
00742 {
00743 for (uint64_t j=0; j<len; j++)
00744 str[j]= (ST) s[j];
00745 }
00746 max_len=CMath::max(max_len, (int32_t) len);
00747 }
00748
00749
00750 if (!f.get_line(len, offs))
00751 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00752
00753 if (!f.get_line(len, offs))
00754 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00755 }
00756
00757 if (bitremap_in_single_string)
00758 num=1;
00759
00760 num_vectors=num;
00761 max_string_length=max_len;
00762 features=strings;
00763
00764 return true;
00765 }
00766
00767 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
00768 {
00769 remove_subset();
00770
00771 struct dirent **namelist;
00772 int32_t n;
00773
00774 SGIO::set_dirname(dirname);
00775
00776 SG_DEBUG("dirname '%s'\n", dirname);
00777
00778 n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
00779 if (n <= 0)
00780 {
00781 SG_ERROR("error calling scandir - no files found\n");
00782 return false;
00783 }
00784 else
00785 {
00786 SGString<ST>* strings=NULL;
00787
00788 int32_t num=0;
00789 int32_t max_len=-1;
00790
00791
00792
00793 strings=SG_MALLOC(SGString<ST>, n);
00794
00795 for (int32_t i=0; i<n; i++)
00796 {
00797 char* fname=SGIO::concat_filename(namelist[i]->d_name);
00798
00799 struct stat s;
00800 off_t filesize=0;
00801
00802 if (!stat(fname, &s) && s.st_size>0)
00803 {
00804 filesize=s.st_size/sizeof(ST);
00805
00806 FILE* f=fopen(fname, "ro");
00807 if (f)
00808 {
00809 ST* str=SG_MALLOC(ST, filesize);
00810 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00811 if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
00812 SG_ERROR("failed to read file\n");
00813 strings[num].string=str;
00814 strings[num].slen=filesize;
00815 max_len=CMath::max(max_len, strings[num].slen);
00816
00817 num++;
00818 fclose(f);
00819 }
00820 }
00821 else
00822 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00823
00824 SG_FREE(namelist[i]);
00825 }
00826 SG_FREE(namelist);
00827
00828 if (num>0 && strings)
00829 {
00830 set_features(strings, num, max_len);
00831 return true;
00832 }
00833 }
00834 return false;
00835 }
00836
00837 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats)
00838 {
00839 set_features(feats.strings, feats.num_strings, feats.max_string_length);
00840 }
00841
00842 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00843 {
00844 if (m_subset)
00845 SG_ERROR("Cannot call set_features() with subset.\n");
00846
00847 if (p_features)
00848 {
00849 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00850
00851
00852 for (int32_t i=0; i<p_num_vectors; i++)
00853 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00854
00855 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00856 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00857
00858 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00859 {
00860 cleanup();
00861 SG_UNREF(alphabet);
00862
00863 alphabet=alpha;
00864 SG_REF(alphabet);
00865
00866 features=p_features;
00867 num_vectors=p_num_vectors;
00868 max_string_length=p_max_string_length;
00869
00870 return true;
00871 }
00872 else
00873 SG_UNREF(alpha);
00874 }
00875
00876 return false;
00877 }
00878
00879 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf)
00880 {
00881 ASSERT(sf);
00882
00883 if (m_subset)
00884 SG_ERROR("Cannot call set_features() with subset.\n");
00885
00886 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
00887
00888 index_t sf_num_str=sf->get_num_vectors();
00889 for (int32_t i=0; i<sf_num_str; i++)
00890 {
00891 int32_t real_i = sf->subset_idx_conversion(i);
00892 int32_t length=sf->features[real_i].slen;
00893 new_features[i].string=SG_MALLOC(ST, length);
00894 memcpy(new_features[i].string, sf->features[real_i].string, length);
00895 new_features[i].slen=length;
00896 }
00897 return append_features(new_features, sf_num_str,
00898 sf->max_string_length);
00899 }
00900
00901 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00902 {
00903 if (m_subset)
00904 SG_ERROR("Cannot call set_features() with subset.\n");
00905
00906 if (!features)
00907 return set_features(p_features, p_num_vectors, p_max_string_length);
00908
00909 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00910
00911
00912 for (int32_t i=0; i<p_num_vectors; i++)
00913 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00914
00915 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00916 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00917
00918 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00919 {
00920 SG_UNREF(alpha);
00921 for (int32_t i=0; i<p_num_vectors; i++)
00922 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00923
00924 int32_t old_num_vectors=num_vectors;
00925 num_vectors=old_num_vectors+p_num_vectors;
00926 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
00927
00928 for (int32_t i=0; i<num_vectors; i++)
00929 {
00930 if (i<old_num_vectors)
00931 {
00932 new_features[i].string=features[i].string;
00933 new_features[i].slen=features[i].slen;
00934 }
00935 else
00936 {
00937 new_features[i].string=p_features[i-old_num_vectors].string;
00938 new_features[i].slen=p_features[i-old_num_vectors].slen;
00939 }
00940 }
00941 SG_FREE(features);
00942 SG_FREE(p_features);
00943
00944 this->features=new_features;
00945 max_string_length=CMath::max(max_string_length, p_max_string_length);
00946
00947 return true;
00948 }
00949 SG_UNREF(alpha);
00950
00951 return false;
00952 }
00953
00954 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features()
00955 {
00956 SGStringList<ST> sl;
00957
00958 sl.strings=get_features(sl.num_strings, sl.max_string_length);
00959 return sl;
00960 }
00961
00962 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
00963 {
00964 if (m_subset)
00965 SG_ERROR("get features() is not possible on subset");
00966
00967 num_str=num_vectors;
00968 max_str_len=max_string_length;
00969 return features;
00970 }
00971
00972 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
00973 {
00974 ASSERT(num_vectors>0);
00975
00976 num_str=get_num_vectors();
00977 max_str_len=max_string_length;
00978 SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
00979
00980 for (int32_t i=0; i<num_str; i++)
00981 {
00982 int32_t len;
00983 bool free_vec;
00984 ST* vec=get_feature_vector(i, len, free_vec);
00985 new_feat[i].string=SG_MALLOC(ST, len);
00986 new_feat[i].slen=len;
00987 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
00988 free_feature_vector(vec, i, free_vec);
00989 }
00990
00991 return new_feat;
00992 }
00993
00994 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
00995 {
00996 int32_t num_vec;
00997 int32_t max_str_len;
00998 *dst=copy_features(num_vec, max_str_len);
00999 *num_str=num_vec;
01000 }
01001
01002 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
01003 {
01004 remove_subset();
01005
01006 FILE* file=NULL;
01007
01008 if (!(file=fopen(src, "r")))
01009 return false;
01010 cleanup();
01011
01012
01013 char id[4];
01014 if (fread(&id[0], sizeof(char), 1, file)!=1)
01015 SG_ERROR("failed to read header");
01016 ASSERT(id[0]=='S');
01017 if (fread(&id[1], sizeof(char), 1, file)!=1)
01018 SG_ERROR("failed to read header");
01019 ASSERT(id[1]=='G');
01020 if (fread(&id[2], sizeof(char), 1, file)!=1)
01021 SG_ERROR("failed to read header");
01022 ASSERT(id[2]=='V');
01023 if (fread(&id[3], sizeof(char), 1, file)!=1)
01024 SG_ERROR("failed to read header");
01025 ASSERT(id[3]=='0');
01026
01027
01028 uint8_t c;
01029 if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01030 SG_ERROR("failed to read compression type");
01031 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01032
01033 uint8_t a;
01034 delete alphabet;
01035 if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01036 SG_ERROR("failed to read compression alphabet");
01037 alphabet=new CAlphabet((EAlphabet) a);
01038
01039 if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01040 SG_ERROR("failed to read compression number of vectors");
01041 ASSERT(num_vectors>0);
01042
01043 if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01044 SG_ERROR("failed to read maximum string length");
01045 ASSERT(max_string_length>0);
01046
01047 features=SG_MALLOC(SGString<ST>, num_vectors);
01048
01049
01050 for (int32_t i=0; i<num_vectors; i++)
01051 {
01052
01053 int32_t len_compressed;
01054 if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01055 SG_ERROR("failed to read vector length compressed");
01056
01057 int32_t len_uncompressed;
01058 if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01059 SG_ERROR("failed to read vector length uncompressed");
01060
01061
01062 if (decompress)
01063 {
01064 features[i].string=SG_MALLOC(ST, len_uncompressed);
01065 features[i].slen=len_uncompressed;
01066 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01067 if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01068 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
01069 uint64_t uncompressed_size=len_uncompressed;
01070 uncompressed_size*=sizeof(ST);
01071 compressor->decompress(compressed, len_compressed,
01072 (uint8_t*) features[i].string, uncompressed_size);
01073 SG_FREE(compressed);
01074 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01075 }
01076 else
01077 {
01078 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01079 features[i].string=SG_MALLOC(ST, len_compressed+offs);
01080 features[i].slen=len_compressed+offs;
01081 int32_t* feat32ptr=((int32_t*) (features[i].string));
01082 memset(features[i].string, 0, offs*sizeof(ST));
01083 feat32ptr[0]=(int32_t) len_compressed;
01084 feat32ptr[1]=(int32_t) len_uncompressed;
01085 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01086 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01087 SG_ERROR("failed to read uncompressed data");
01088 }
01089 }
01090
01091 delete compressor;
01092 fclose(file);
01093
01094 return false;
01095 }
01096
01097 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01098 {
01099 if (m_subset)
01100 SG_ERROR("save_compressed() is not possible on subset");
01101
01102 FILE* file=NULL;
01103
01104 if (!(file=fopen(dest, "wb")))
01105 return false;
01106
01107 CCompressor* compressor= new CCompressor(compression);
01108
01109
01110 const char* id="SGV0";
01111 fwrite(&id[0], sizeof(char), 1, file);
01112 fwrite(&id[1], sizeof(char), 1, file);
01113 fwrite(&id[2], sizeof(char), 1, file);
01114 fwrite(&id[3], sizeof(char), 1, file);
01115
01116
01117 uint8_t c=(uint8_t) compression;
01118 fwrite(&c, sizeof(uint8_t), 1, file);
01119
01120 uint8_t a=(uint8_t) alphabet->get_alphabet();
01121 fwrite(&a, sizeof(uint8_t), 1, file);
01122
01123 fwrite(&num_vectors, sizeof(int32_t), 1, file);
01124
01125 fwrite(&max_string_length, sizeof(int32_t), 1, file);
01126
01127
01128 for (int32_t i=0; i<num_vectors; i++)
01129 {
01130 int32_t len=-1;
01131 bool vfree;
01132 ST* vec=get_feature_vector(i, len, vfree);
01133
01134 uint8_t* compressed=NULL;
01135 uint64_t compressed_size=0;
01136
01137 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01138 compressed, compressed_size, level);
01139
01140 int32_t len_compressed=(int32_t) compressed_size;
01141
01142 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01143
01144 fwrite(&len, sizeof(int32_t), 1, file);
01145
01146 fwrite(compressed, compressed_size, 1, file);
01147 SG_FREE(compressed);
01148
01149 free_feature_vector(vec, i, vfree);
01150 }
01151
01152 delete compressor;
01153 fclose(file);
01154 return true;
01155 }
01156
01157 template<class ST> int32_t CStringFeatures<ST>::get_size() { return sizeof(ST); }
01158
01159 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
01160 {
01161 SG_DEBUG( "force: %d\n", force_preprocessing);
01162
01163 for (int32_t i=0; i<get_num_preprocessors(); i++)
01164 {
01165 if ( (!is_preprocessed(i) || force_preprocessing) )
01166 {
01167 set_preprocessed(i);
01168 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01169 SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01170
01171 if (!p->apply_to_string_features(this))
01172 {
01173 SG_UNREF(p);
01174 return false;
01175 }
01176 else
01177 SG_UNREF(p);
01178 }
01179 }
01180 return true;
01181 }
01182
01183 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
01184 {
01185 if (m_subset)
01186 SG_NOTIMPLEMENTED;
01187
01188 ASSERT(step_size>0);
01189 ASSERT(window_size>0);
01190 ASSERT(num_vectors==1 || single_string);
01191 ASSERT(max_string_length>=window_size ||
01192 (single_string && length_of_single_string>=window_size));
01193
01194
01195
01196 if (single_string)
01197 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01198 else if (num_vectors==1)
01199 {
01200 num_vectors= (max_string_length-window_size)/step_size + 1;
01201 length_of_single_string=max_string_length;
01202 }
01203
01204 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01205 int32_t offs=0;
01206 for (int32_t i=0; i<num_vectors; i++)
01207 {
01208 f[i].string=&features[0].string[offs+skip];
01209 f[i].slen=window_size-skip;
01210 offs+=step_size;
01211 }
01212 single_string=features[0].string;
01213 SG_FREE(features);
01214 features=f;
01215 max_string_length=window_size-skip;
01216
01217 return num_vectors;
01218 }
01219
01220 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
01221 int32_t skip)
01222 {
01223 if (m_subset)
01224 SG_NOTIMPLEMENTED;
01225
01226 ASSERT(positions);
01227 ASSERT(window_size>0);
01228 ASSERT(num_vectors==1 || single_string);
01229 ASSERT(max_string_length>=window_size ||
01230 (single_string && length_of_single_string>=window_size));
01231
01232 num_vectors= positions->get_num_elements();
01233 ASSERT(num_vectors>0);
01234
01235 int32_t len;
01236
01237
01238
01239 if (single_string)
01240 len=length_of_single_string;
01241 else
01242 {
01243 single_string=features[0].string;
01244 len=max_string_length;
01245 length_of_single_string=max_string_length;
01246 }
01247
01248 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01249 for (int32_t i=0; i<num_vectors; i++)
01250 {
01251 int32_t p=positions->get_element(i);
01252
01253 if (p>=0 && p<=len-window_size)
01254 {
01255 f[i].string=&features[0].string[p+skip];
01256 f[i].slen=window_size-skip;
01257 }
01258 else
01259 {
01260 num_vectors=1;
01261 max_string_length=len;
01262 features[0].slen=len;
01263 single_string=NULL;
01264 SG_FREE(f);
01265 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01266 window_size, i, p, len);
01267 return -1;
01268 }
01269 }
01270
01271 SG_FREE(features);
01272 features=f;
01273 max_string_length=window_size-skip;
01274
01275 return num_vectors;
01276 }
01277
01278 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01279 {
01280 return obtain_from_char_features(sf, start, p_order, gap, rev);
01281 }
01282
01283 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
01284 {
01285 if (len!=-1)
01286 {
01287 if (len!=max_string_length)
01288 return false;
01289 }
01290 len=max_string_length;
01291
01292 index_t num_str=get_num_vectors();
01293 for (int32_t i=0; i<num_str; i++)
01294 {
01295 if (get_vector_length(i)!=len)
01296 return false;
01297 }
01298
01299 return true;
01300 }
01301
01302 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
01303 {
01304 if (m_subset)
01305 SG_NOTIMPLEMENTED;
01306
01307 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01308
01309 order=p_order;
01310 original_num_symbols=alphabet->get_num_symbols();
01311 int32_t max_val=alphabet->get_num_bits();
01312
01313 if (p_order>1)
01314 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01315 else
01316 num_symbols=original_num_symbols;
01317
01318 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01319
01320 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01321 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01322
01323 ST mask=0;
01324 for (int32_t i=0; i<p_order*max_val; i++)
01325 mask= (mask<<1) | ((ST) 1);
01326
01327 for (int32_t i=0; i<num_vectors; i++)
01328 {
01329 int32_t len=features[i].slen;
01330
01331 if (len < p_order)
01332 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01333
01334 ST* str=features[i].string;
01335
01336
01337 for (int32_t j=0; j<p_order; j++)
01338 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01339 str[0]=embed_word(&str[0], p_order);
01340
01341
01342 int32_t idx=0;
01343 for (int32_t j=p_order; j<len; j++)
01344 {
01345 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01346 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01347 idx++;
01348 }
01349
01350 features[i].slen=len-p_order+1;
01351 }
01352
01353 compute_symbol_mask_table(max_val);
01354 }
01355
/** (Re)build the 256 entry symbol mask table.
 *
 * Entry i has, for every set bit j (0..7) of i, a block of max_val one-bits
 * placed at bit offset max_val*j.
 *
 * @param max_val number of bits per symbol
 */
template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
{
	if (m_subset)
		SG_NOTIMPLEMENTED;

	SG_FREE(symbol_mask_table);
	symbol_mask_table=SG_MALLOC(ST, 256);

	// max_val consecutive one-bits
	uint64_t symbol_bits=0;
	for (int32_t b=0; b< (int64_t) max_val; b++)
		symbol_bits=(symbol_bits<<1) | 1;

	for (int32_t entry=0; entry<256; entry++)
	{
		symbol_mask_table[entry]=0;

		for (int32_t pos=0; pos<8; pos++)
		{
			const bool selected=((entry>>pos) & 1)!=0;
			if (selected)
				symbol_mask_table[entry]|=symbol_bits<<(max_val*pos);
		}
	}
}
01382
/** Unpack an embedded word into its symbols.
 *
 * The lowest alphabet->get_num_bits() bits of the word give the last symbol;
 * the sequence is therefore filled from the back.
 *
 * @param word embedded word
 * @param seq output buffer receiving len characters
 * @param len number of symbols to extract
 */
template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
{
	const uint32_t nbits= (uint32_t) alphabet->get_num_bits();

	// mask selecting a single symbol
	ST symbol_mask=0;
	for (uint32_t b=0; b<nbits; b++)
		symbol_mask=(symbol_mask<<1) | (ST) 1;

	for (int32_t pos=len-1; pos>=0; pos--)
	{
		ST bin=(word & symbol_mask);
		seq[pos]=alphabet->remap_to_char((uint8_t) bin);
		word>>=nbits;
	}
}
01398
/** Pack len symbols into one word.
 *
 * The word is shifted left by alphabet->get_num_bits() before each symbol is
 * or-ed in, so seq[0] ends up in the most significant position.
 *
 * @param seq symbols to pack
 * @param len number of symbols
 * @return the packed word
 */
template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
{
	const uint32_t nbits= (uint32_t) alphabet->get_num_bits();

	ST packed=(ST) 0;
	int32_t pos=0;
	while (pos<len)
	{
		packed<<=nbits;
		packed|=seq[pos];
		pos++;
	}

	return packed;
}
01411
01412 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length()
01413 {
01414 max_string_length=0;
01415 index_t num_str=get_num_vectors();
01416
01417 for (int32_t i=0; i<num_str; i++)
01418 {
01419 max_string_length=CMath::max(max_string_length,
01420 features[subset_idx_conversion(i)].slen);
01421 }
01422 }
01423
01424 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str)
01425 {
01426 int32_t l=str.slen;
01427 ST* s=SG_MALLOC(ST, l+1);
01428 memcpy(s, str.string, sizeof(ST)*l);
01429 s[l]='\0';
01430 return s;
01431 }
01432
01433 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
01434 {
01435 ASSERT(features);
01436 ASSERT(num<get_num_vectors());
01437
01438 int32_t real_num=subset_idx_conversion(num);
01439
01440
01441 features[real_num].slen=len ;
01442 features[real_num].string=string ;
01443
01444 max_string_length=CMath::max(len, max_string_length);
01445 }
01446
01447 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
01448 {
01449 int32_t nsym=get_num_symbols();
01450 int32_t slen=get_max_vector_length();
01451 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01452 float64_t* h= SG_MALLOC(float64_t, sz);
01453 memset(h, 0, sz);
01454
01455 float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
01456 memset(h_normalizer, 0, slen*sizeof(float64_t));
01457 int32_t num_str=get_num_vectors();
01458 for (int32_t i=0; i<num_str; i++)
01459 {
01460 int32_t len;
01461 bool free_vec;
01462 ST* vec=get_feature_vector(i, len, free_vec);
01463 for (int32_t j=0; j<len; j++)
01464 {
01465 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01466 h_normalizer[j]++;
01467 }
01468 free_feature_vector(vec, i, free_vec);
01469 }
01470
01471 if (normalize)
01472 {
01473 for (int32_t i=0; i<slen; i++)
01474 {
01475 for (int32_t j=0; j<nsym; j++)
01476 {
01477 if (h_normalizer && h_normalizer[i])
01478 h[int64_t(i)*nsym+j]/=h_normalizer[i];
01479 }
01480 }
01481 }
01482 SG_FREE(h_normalizer);
01483
01484 *hist=h;
01485 *rows=nsym;
01486 *cols=slen;
01487 }
01488
/** Replace the features by random strings drawn from a histogram.
 *
 * For every position j a uniform random number in [0,1] is compared against
 * the cumulative sum of column j of hist; the first bin whose cumulative
 * value reaches the random number is chosen (the last bin if none does).
 *
 * @param hist matrix stored column by column, rows*cols entries
 * @param rows number of symbols; must equal get_num_symbols()
 * @param cols length of every generated string
 * @param num_vec number of strings to generate
 */
template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
{
	ASSERT(rows == get_num_symbols());
	cleanup();

	float64_t* uniform=SG_MALLOC(float64_t, cols);
	SGString<ST>* strings=SG_MALLOC(SGString<ST>, num_vec);

	for (int32_t vec=0; vec<num_vec; vec++)
	{
		strings[vec].string=SG_MALLOC(ST, cols);
		strings[vec].slen=cols;

		CMath::random_vector(uniform, cols, 0.0, 1.0);

		for (int32_t pos=0; pos<cols; pos++)
		{
			const float64_t* column=&hist[int64_t(pos)*rows];
			float64_t cumulative=column[0];

			// walk the cumulative distribution of this position
			int32_t bin=0;
			while (bin<rows-1 && !(uniform[pos]<=cumulative))
			{
				cumulative+=column[bin+1];
				bin++;
			}

			strings[vec].string[pos]=alphabet->remap_to_char(bin);
		}
	}

	SG_FREE(uniform);
	set_features(strings, num_vec, cols);
}
01520
01521
01522
01523
01524
01525
01526
01527
01528
01529
01530
01531
01532
01533
01534
01535
01536
01537
01538
01539
01540
01541
01542
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557
01558
01559
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573
01574
01575
01576
01577
01578
01579
01580
01581
01582
01583
01584
01585
01586
01587
01588
01589
01590
01591 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(SGVector<index_t> indices)
01592 {
01593
01594 SGStringList<ST> list_copy(indices.vlen, max_string_length);
01595
01596
01597 for (index_t i=0; i<indices.vlen; ++i)
01598 {
01599
01600 index_t real_idx=subset_idx_conversion(indices.vector[i]);
01601
01602
01603 SGString<ST> current_string=features[real_idx];
01604 SGString<ST> string_copy(current_string.slen);
01605 memcpy(string_copy.string, current_string.string,
01606 current_string.slen*sizeof(ST));
01607 list_copy.strings[i]=string_copy;
01608 }
01609
01610
01611 CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
01612
01613
01614 result->determine_maximum_string_length();
01615
01616 return result;
01617 }
01618
01619 template<class ST> void CStringFeatures<ST>::subset_changed_post()
01620 {
01621
01622 determine_maximum_string_length();
01623 }
01624
01625 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
01626 {
01627 ASSERT(features && num<get_num_vectors());
01628
01629 int32_t real_num=subset_idx_conversion(num);
01630
01631 len=features[real_num].slen;
01632 if (len<=0)
01633 return NULL;
01634
01635 ST* target=SG_MALLOC(ST, len);
01636 memcpy(target, features[real_num].string, len*sizeof(ST));
01637 return target;
01638 }
01639
01640 template<class ST> void CStringFeatures<ST>::init()
01641 {
01642 set_generic<ST>();
01643
01644 alphabet=NULL;
01645 num_vectors=0;
01646 features=NULL;
01647 single_string=NULL;
01648 length_of_single_string=0;
01649 max_string_length=0;
01650 order=0;
01651 symbol_mask_table=0;
01652 preprocess_on_get=false;
01653 feature_cache=NULL;
01654
01655 m_parameters->add((CSGObject**) &alphabet, "alphabet");
01656 m_parameters->add_vector(&features, &num_vectors, "features",
01657 "This contains the array of features.");
01658 m_parameters->add_vector(&single_string,
01659 &length_of_single_string,
01660 "single_string",
01661 "Created by sliding window.");
01662 m_parameters->add(&max_string_length, "max_string_length",
01663 "Length of longest string.");
01664 m_parameters->add(&num_symbols, "num_symbols",
01665 "Number of used symbols.");
01666 m_parameters->add(&original_num_symbols, "original_num_symbols",
01667 "Original number of used symbols.");
01668 m_parameters->add(&order, "order",
01669 "Order used in higher order mapping.");
01670 m_parameters->add(&preprocess_on_get, "preprocess_on_get",
01671 "Preprocess on-the-fly?");
01672
01673
01674
01675
01676
01677 }
01678
01683 template<> EFeatureType CStringFeatures<bool>::get_feature_type()
01684 {
01685 return F_BOOL;
01686 }
01687
01692 template<> EFeatureType CStringFeatures<char>::get_feature_type()
01693 {
01694 return F_CHAR;
01695 }
01696
01701 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01702 {
01703 return F_BYTE;
01704 }
01705
01710 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type()
01711 {
01712 return F_SHORT;
01713 }
01714
01719 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01720 {
01721 return F_WORD;
01722 }
01723
01728 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type()
01729 {
01730 return F_INT;
01731 }
01732
01737 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01738 {
01739 return F_UINT;
01740 }
01741
01746 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type()
01747 {
01748 return F_LONG;
01749 }
01750
01755 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01756 {
01757 return F_ULONG;
01758 }
01759
01764 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type()
01765 {
01766 return F_SHORTREAL;
01767 }
01768
01773 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type()
01774 {
01775 return F_DREAL;
01776 }
01777
01782 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
01783 {
01784 return F_LONGREAL;
01785 }
01786
01787 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01788 {
01789 return symbol;
01790 }
01791 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01792 {
01793 return symbol;
01794 }
01795 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01796 {
01797 return symbol;
01798 }
01799 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01800 {
01801 return symbol;
01802 }
01803
01804 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01805 {
01806 return false;
01807 }
01808 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01809 {
01810 return 0;
01811 }
01812 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01813 {
01814 return 0;
01815 }
01816 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01817 {
01818 return 0;
01819 }
01820
01821 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01822 {
01823 return symbol;
01824 }
01825 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01826 {
01827 return symbol;
01828 }
01829 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01830 {
01831 return symbol;
01832 }
01833 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01834 {
01835 return symbol;
01836 }
01837
01838 #ifndef SUNOS
01839 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01840 {
01841 return false;
01842 }
01843 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01844 {
01845 return false;
01846 }
01847 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01848 {
01849 return false;
01850 }
01851 #endif
01852
01853 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01854 {
01855 }
01856 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01857 {
01858 }
01859 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01860 {
01861 }
01862
01863 template<> void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01864 {
01865 }
01866 template<> void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01867 {
01868 }
01869 template<> void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01870 {
01871 }
01872
01873 template<> float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01874 {
01875 return 0;
01876 }
01877 template<> float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01878 {
01879 return 0;
01880 }
01881 template<> floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01882 {
01883 return 0;
01884 }
01885
01886 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01887 {
01888 }
01889 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01890 {
01891 }
01892 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01893 {
01894 }
01895 #define LOAD(f_load, sg_type) \
01896 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
01897 { \
01898 SG_INFO( "loading...\n"); \
01899 \
01900 SG_SET_LOCALE_C; \
01901 SGString<sg_type>* strs; \
01902 int32_t num_str; \
01903 int32_t max_len; \
01904 loader->f_load(strs, num_str, max_len); \
01905 set_features(strs, num_str, max_len); \
01906 SG_RESET_LOCALE; \
01907 }
01908
01909 LOAD(get_string_list, bool)
01910 LOAD(get_string_list, char)
01911 LOAD(get_int8_string_list, int8_t)
01912 LOAD(get_string_list, uint8_t)
01913 LOAD(get_string_list, int16_t)
01914 LOAD(get_string_list, uint16_t)
01915 LOAD(get_string_list, int32_t)
01916 LOAD(get_uint_string_list, uint32_t)
01917 LOAD(get_long_string_list, int64_t)
01918 LOAD(get_ulong_string_list, uint64_t)
01919 LOAD(get_string_list, float32_t)
01920 LOAD(get_string_list, float64_t)
01921 LOAD(get_longreal_string_list, floatmax_t)
01922 #undef LOAD
01923
01924 #define SAVE(f_write, sg_type) \
01925 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
01926 { \
01927 if (m_subset) \
01928 SG_ERROR("save() is not possible on subset"); \
01929 SG_SET_LOCALE_C; \
01930 ASSERT(writer); \
01931 writer->f_write(features, num_vectors); \
01932 SG_RESET_LOCALE; \
01933 }
01934
01935 SAVE(set_string_list, bool)
01936 SAVE(set_string_list, char)
01937 SAVE(set_int8_string_list, int8_t)
01938 SAVE(set_string_list, uint8_t)
01939 SAVE(set_string_list, int16_t)
01940 SAVE(set_string_list, uint16_t)
01941 SAVE(set_string_list, int32_t)
01942 SAVE(set_uint_string_list, uint32_t)
01943 SAVE(set_long_string_list, int64_t)
01944 SAVE(set_ulong_string_list, uint64_t)
01945 SAVE(set_string_list, float32_t)
01946 SAVE(set_string_list, float64_t)
01947 SAVE(set_longreal_string_list, floatmax_t)
01948 #undef SAVE
01949
/** Build these features from character-like features of element type CT.
 *
 * Any subset is removed and the current contents are discarded via
 * cleanup(). Every symbol of sf is mapped to its bin index using sf's
 * alphabet; afterwards each string is translated in place to words of order
 * p_order by CAlphabet::translate_from_single_order() (or its reversed
 * variant), and the string lengths are reduced by start+gap (never below 0).
 *
 * @param sf source features; their alphabet must have a non-empty histogram
 * @param start first position to use
 * @param p_order order of the words
 * @param gap gap passed on to the translation
 * @param rev use the reversed translation
 * @return true on success, false if the resulting number of symbols does not
 *         fit into ST (after SG_ERROR has been raised)
 */
template <class ST> template <class CT>
bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
		int32_t p_order, int32_t gap, bool rev)
{
	remove_subset();
	ASSERT(sf);

	CAlphabet* alpha=sf->get_alphabet();
	ASSERT(alpha->get_num_symbols_in_histogram() > 0);

	this->order=p_order;
	cleanup();

	num_vectors=sf->get_num_vectors();
	ASSERT(num_vectors>0);
	max_string_length=sf->get_max_vector_length()-start;
	features=SG_MALLOC(SGString<ST>, num_vectors);

	// get_num_symbols() yields a floatmax_t, which needs the 'L' length
	// modifier ('ll' is only defined for integer conversions)
	SG_DEBUG( "%1.0Lf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
			alpha->get_num_symbols_in_histogram());

	// copy the source strings, replacing every symbol by its bin index
	for (int32_t i=0; i<num_vectors; i++)
	{
		int32_t len=-1;
		bool vfree;
		CT* c=sf->get_feature_vector(i, len, vfree);
		ASSERT(!vfree); // won't work when preprocessors are attached

		features[i].string=SG_MALLOC(ST, len);
		features[i].slen=len;

		ST* str=features[i].string;
		for (int32_t j=0; j<len; j++)
			str[j]=(ST) alpha->remap_to_bin(c[j]);
	}

	original_num_symbols=alpha->get_num_symbols();
	int32_t max_val=alpha->get_num_bits();

	SG_UNREF(alpha);

	if (p_order>1)
		num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
	else
		num_symbols=original_num_symbols;
	SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);

	if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
	{
		SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
		return false;
	}

	// sizeof yields a size_t; cast it so that it matches the %i conversion
	SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, (int) sizeof(ST)) ;
	for (int32_t line=0; line<num_vectors; line++)
	{
		int32_t len=0;
		bool vfree;
		ST* fv=get_feature_vector(line, len, vfree);
		ASSERT(!vfree); // won't work when preprocessors are attached

		if (rev)
			CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
		else
			CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);

		// the first start+gap positions are consumed by the translation
		features[line].slen-=start+gap ;
		if (features[line].slen<0)
			features[line].slen=0 ;
	}

	compute_symbol_mask_table(max_val);

	return true;
}
02026
02027 template class CStringFeatures<bool>;
02028 template class CStringFeatures<char>;
02029 template class CStringFeatures<int8_t>;
02030 template class CStringFeatures<uint8_t>;
02031 template class CStringFeatures<int16_t>;
02032 template class CStringFeatures<uint16_t>;
02033 template class CStringFeatures<int32_t>;
02034 template class CStringFeatures<uint32_t>;
02035 template class CStringFeatures<int64_t>;
02036 template class CStringFeatures<uint64_t>;
02037 template class CStringFeatures<float32_t>;
02038 template class CStringFeatures<float64_t>;
02039 template class CStringFeatures<floatmax_t>;
02040
02041 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02042 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02043 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02044
02045 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02046 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02047 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02048 }