00001 #include <shogun/features/StringFeatures.h>
00002 #include <shogun/preprocessor/Preprocessor.h>
00003 #include <shogun/preprocessor/StringPreprocessor.h>
00004 #include <shogun/io/MemoryMappedFile.h>
00005 #include <shogun/io/SGIO.h>
00006 #include <shogun/mathematics/Math.h>
00007 #include <shogun/base/Parameter.h>
00008
00009 #include <sys/types.h>
00010 #include <sys/stat.h>
00011 #include <dirent.h>
00012 #include <stdio.h>
00013 #include <stdlib.h>
00014 #include <unistd.h>
00015
00016
00017 namespace shogun
00018 {
00019
00020 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0)
00021 {
00022 init();
00023 alphabet=new CAlphabet();
00024 }
00025
00026 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0)
00027 {
00028 init();
00029
00030 alphabet=new CAlphabet(alpha);
00031 SG_REF(alphabet);
00032 num_symbols=alphabet->get_num_symbols();
00033 original_num_symbols=num_symbols;
00034 }
00035
00036 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00037 : CFeatures(0)
00038 {
00039 init();
00040
00041 alphabet=new CAlphabet(alpha);
00042 SG_REF(alphabet);
00043 num_symbols=alphabet->get_num_symbols();
00044 original_num_symbols=num_symbols;
00045 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00046 }
00047
00048 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00049 : CFeatures(0)
00050 {
00051 init();
00052
00053 alphabet=new CAlphabet(alpha);
00054 SG_REF(alphabet);
00055 num_symbols=alphabet->get_num_symbols();
00056 original_num_symbols=num_symbols;
00057 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00058 }
00059
00060 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha)
00061 : CFeatures(0)
00062 {
00063 init();
00064
00065 ASSERT(alpha);
00066 SG_REF(alpha);
00067 alphabet=alpha;
00068 num_symbols=alphabet->get_num_symbols();
00069 original_num_symbols=num_symbols;
00070 }
00071
00072 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
00073 : CFeatures(orig), num_vectors(orig.num_vectors),
00074 single_string(orig.single_string),
00075 length_of_single_string(orig.length_of_single_string),
00076 max_string_length(orig.max_string_length),
00077 num_symbols(orig.num_symbols),
00078 original_num_symbols(orig.original_num_symbols),
00079 order(orig.order), preprocess_on_get(false),
00080 feature_cache(NULL)
00081 {
00082 init();
00083
00084 ASSERT(orig.single_string == NULL);
00085
00086 alphabet=orig.alphabet;
00087 SG_REF(alphabet);
00088
00089 if (orig.features)
00090 {
00091 features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00092
00093 for (int32_t i=0; i<num_vectors; i++)
00094 {
00095 features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00096 features[i].slen=orig.features[i].slen;
00097 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00098 }
00099 }
00100
00101 if (orig.symbol_mask_table)
00102 {
00103 symbol_mask_table=SG_MALLOC(ST, 256);
00104 for (int32_t i=0; i<256; i++)
00105 symbol_mask_table[i]=orig.symbol_mask_table[i];
00106 }
00107
00108 m_subset_stack=orig.m_subset_stack;
00109 SG_REF(m_subset_stack);
00110 }
00111
00112 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
00113 : CFeatures(loader), num_vectors(0),
00114 features(NULL), single_string(NULL), length_of_single_string(0),
00115 max_string_length(0), order(0),
00116 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00117 {
00118 init();
00119
00120 alphabet=new CAlphabet(alpha);
00121 SG_REF(alphabet);
00122 num_symbols=alphabet->get_num_symbols();
00123 original_num_symbols=num_symbols;
00124 load(loader);
00125 }
00126
00127 template<class ST> CStringFeatures<ST>::~CStringFeatures()
00128 {
00129 cleanup();
00130
00131 SG_UNREF(alphabet);
00132 }
00133
00134 template<class ST> void CStringFeatures<ST>::cleanup()
00135 {
00136 remove_all_subsets();
00137
00138 if (single_string)
00139 {
00140 SG_FREE(single_string);
00141 single_string=NULL;
00142 }
00143 else
00144 cleanup_feature_vectors(0, num_vectors-1);
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156 num_vectors=0;
00157 SG_FREE(features);
00158 SG_FREE(symbol_mask_table);
00159 features=NULL;
00160 symbol_mask_table=NULL;
00161
00162
00163
00164
00165
00166 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00167 SG_UNREF(alphabet);
00168 alphabet=alpha;
00169 SG_REF(alphabet);
00170 }
00171
00172 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
00173 {
00174 ASSERT(num<get_num_vectors());
00175
00176 if (features)
00177 {
00178 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00179 SG_FREE(features[real_num].string);
00180 features[real_num].string=NULL;
00181 features[real_num].slen=0;
00182
00183 determine_maximum_string_length();
00184 }
00185 }
00186
00187 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
00188 {
00189 if (features && get_num_vectors())
00190 {
00191 ASSERT(start<get_num_vectors());
00192 ASSERT(stop<get_num_vectors());
00193
00194 for (int32_t i=start; i<=stop; i++)
00195 {
00196 int32_t real_num=m_subset_stack->subset_idx_conversion(i);
00197 SG_FREE(features[real_num].string);
00198 features[real_num].string=NULL;
00199 features[real_num].slen=0;
00200 }
00201 determine_maximum_string_length();
00202 }
00203 }
00204
00205 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
00206
00207 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
00208
00209 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet()
00210 {
00211 SG_REF(alphabet);
00212 return alphabet;
00213 }
00214
00215 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
00216 {
00217 return new CStringFeatures<ST>(*this);
00218 }
00219
00220 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num)
00221 {
00222 ASSERT(features);
00223 if (num>=get_num_vectors())
00224 {
00225 SG_ERROR("Index out of bounds (number of strings %d, you "
00226 "requested %d)\n", get_num_vectors(), num);
00227 }
00228
00229 int32_t l;
00230 bool free_vec;
00231 ST* vec=get_feature_vector(num, l, free_vec);
00232 ST* dst=SG_MALLOC(ST, l);
00233 memcpy(dst, vec, l*sizeof(ST));
00234 free_feature_vector(vec, num, free_vec);
00235 return SGVector<ST>(dst, l, true);
00236 }
00237
00238 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
00239 {
00240 ASSERT(features);
00241
00242 if (m_subset_stack->has_subsets())
00243 SG_ERROR("A subset is set, cannot set feature vector\n");
00244
00245 if (num>=num_vectors)
00246 {
00247 SG_ERROR("Index out of bounds (number of strings %d, you "
00248 "requested %d)\n", num_vectors, num);
00249 }
00250
00251 if (vector.vlen<=0)
00252 SG_ERROR("String has zero or negative length\n");
00253
00254 cleanup_feature_vector(num);
00255 features[num].slen=vector.vlen;
00256 features[num].string=SG_MALLOC(ST, vector.vlen);
00257 memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00258
00259 determine_maximum_string_length();
00260 }
00261
00262 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing()
00263 {
00264 preprocess_on_get=true;
00265 }
00266
00267 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing()
00268 {
00269 preprocess_on_get=false;
00270 }
00271
00272 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00273 {
00274 ASSERT(features);
00275 if (num>=get_num_vectors())
00276 SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors());
00277
00278 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00279
00280 if (!preprocess_on_get)
00281 {
00282 dofree=false;
00283 len=features[real_num].slen;
00284 return features[real_num].string;
00285 }
00286 else
00287 {
00288 SG_DEBUG( "computing feature vector!\n") ;
00289 ST* feat=compute_feature_vector(num, len);
00290 dofree=true;
00291
00292 if (get_num_preprocessors())
00293 {
00294 ST* tmp_feat_before=feat;
00295
00296 for (int32_t i=0; i<get_num_preprocessors(); i++)
00297 {
00298 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00299 feat=p->apply_to_string(tmp_feat_before, len);
00300 SG_UNREF(p);
00301 SG_FREE(tmp_feat_before);
00302 tmp_feat_before=feat;
00303 }
00304 }
00305
00306 return feat;
00307 }
00308 }
00309
00310 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed()
00311 {
00312 int32_t num_feat;
00313 int32_t num_vec;
00314 SGString<ST>* s=get_transposed(num_feat, num_vec);
00315 SGStringList<ST> string_list;
00316 string_list.strings = s;
00317 string_list.num_strings = num_vec;
00318 string_list.max_string_length = num_feat;
00319
00320 return new CStringFeatures<ST>(string_list, alphabet);
00321 }
00322
00323 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00324 {
00325 num_feat=get_num_vectors();
00326 num_vec=get_max_vector_length();
00327 ASSERT(have_same_length());
00328
00329 SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00330 int64_t(num_feat)*num_vec);
00331
00332 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00333
00334 for (int32_t i=0; i<num_vec; i++)
00335 {
00336 sf[i].string=SG_MALLOC(ST, num_feat);
00337 sf[i].slen=num_feat;
00338 }
00339
00340 for (int32_t i=0; i<num_feat; i++)
00341 {
00342 int32_t len=0;
00343 bool free_vec=false;
00344 ST* vec=get_feature_vector(i, len, free_vec);
00345
00346 for (int32_t j=0; j<num_vec; j++)
00347 sf[j].string[i]=vec[j];
00348
00349 free_feature_vector(vec, i, free_vec);
00350 }
00351 return sf;
00352 }
00353
00354 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00355 {
00356 if (num>=get_num_vectors())
00357 {
00358 SG_ERROR(
00359 "Trying to access string[%d] but num_str=%d\n", num,
00360 get_num_vectors());
00361 }
00362
00363 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00364
00365 if (feature_cache)
00366 feature_cache->unlock_entry(real_num);
00367
00368 if (dofree)
00369 SG_FREE(feat_vec);
00370 }
00371
00372 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00373 {
00374 if (num>=get_num_vectors())
00375 {
00376 SG_ERROR(
00377 "Trying to access string[%d] but num_str=%d\n", num,
00378 get_num_vectors());
00379 }
00380
00381 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00382
00383 if (feature_cache)
00384 feature_cache->unlock_entry(real_num);
00385 }
00386
00387 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
00388 {
00389 ASSERT(vec_num<get_num_vectors());
00390
00391 int32_t len;
00392 bool free_vec;
00393 ST* vec=get_feature_vector(vec_num, len, free_vec);
00394 ASSERT(feat_num<len);
00395 ST result=vec[feat_num];
00396 free_feature_vector(vec, vec_num, free_vec);
00397
00398 return result;
00399 }
00400
00401 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
00402 {
00403 ASSERT(vec_num<get_num_vectors());
00404
00405 int32_t len;
00406 bool free_vec;
00407 ST* vec=get_feature_vector(vec_num, len, free_vec);
00408 free_feature_vector(vec, vec_num, free_vec);
00409 return len;
00410 }
00411
00412 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
00413 {
00414 return max_string_length;
00415 }
00416
00417 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
00418 {
00419 return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
00420 }
00421
00422 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
00423
00424 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00425
00426 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
00427
00428 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
00429
00430 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
00431 {
00432 ASSERT(symbol_mask_table);
00433 return symbol_mask_table[mask] & symbol;
00434 }
00435
00436 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
00437 {
00438 ASSERT(alphabet);
00439 return (offset << (amount*alphabet->get_num_bits()));
00440 }
00441
00442 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
00443 {
00444 ASSERT(alphabet);
00445 return (symbol >> (amount*alphabet->get_num_bits()));
00446 }
00447
00448 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
00449 EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00450 {
00451 remove_all_subsets();
00452
00453 size_t blocksize=1024*1024;
00454 size_t required_blocksize=0;
00455 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00456 uint8_t* overflow=NULL;
00457 int32_t overflow_len=0;
00458
00459 cleanup();
00460
00461 CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00462 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00463
00464 FILE* f=fopen(fname, "ro");
00465
00466 if (f)
00467 {
00468 num_vectors=0;
00469 max_string_length=0;
00470
00471 SG_INFO("counting line numbers in file %s\n", fname);
00472 size_t block_offs=0;
00473 size_t old_block_offs=0;
00474 fseek(f, 0, SEEK_END);
00475 size_t fsize=ftell(f);
00476 rewind(f);
00477
00478 if (blocksize>fsize)
00479 blocksize=fsize;
00480
00481 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00482
00483 size_t sz=blocksize;
00484 while (sz == blocksize)
00485 {
00486 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00487 for (size_t i=0; i<sz; i++)
00488 {
00489 block_offs++;
00490 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00491 {
00492 num_vectors++;
00493 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00494 old_block_offs=block_offs;
00495 }
00496 }
00497 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00498 }
00499
00500 SG_INFO("found %d strings\n", num_vectors);
00501 SG_FREE(dummy);
00502 blocksize=required_blocksize;
00503 dummy=SG_MALLOC(uint8_t, blocksize);
00504 overflow=SG_MALLOC(uint8_t, blocksize);
00505 features=SG_MALLOC(SGString<ST>, num_vectors);
00506
00507 rewind(f);
00508 sz=blocksize;
00509 int32_t lines=0;
00510 while (sz == blocksize)
00511 {
00512 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00513
00514 size_t old_sz=0;
00515 for (size_t i=0; i<sz; i++)
00516 {
00517 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00518 {
00519 int32_t len=i-old_sz;
00520
00521 max_string_length=CMath::max(max_string_length, len+overflow_len);
00522
00523 features[lines].slen=len;
00524 features[lines].string=SG_MALLOC(ST, len);
00525
00526 if (remap_to_bin)
00527 {
00528 for (int32_t j=0; j<overflow_len; j++)
00529 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00530 for (int32_t j=0; j<len; j++)
00531 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00532 alpha->add_string_to_histogram(&dummy[old_sz], len);
00533 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00534 }
00535 else
00536 {
00537 for (int32_t j=0; j<overflow_len; j++)
00538 features[lines].string[j]=overflow[j];
00539 for (int32_t j=0; j<len; j++)
00540 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00541 alpha->add_string_to_histogram(&dummy[old_sz], len);
00542 alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00543 }
00544
00545
00546 overflow_len=0;
00547
00548
00549 old_sz=i+1;
00550 lines++;
00551 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00552 }
00553 }
00554 for (size_t i=old_sz; i<sz; i++)
00555 overflow[i-old_sz]=dummy[i];
00556
00557 overflow_len=sz-old_sz;
00558 }
00559
00560 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00561 {
00562 SG_INFO("file successfully read\n");
00563 SG_INFO("max_string_length=%d\n", max_string_length);
00564 SG_INFO("num_strings=%d\n", num_vectors);
00565 }
00566 fclose(f);
00567 }
00568
00569 SG_FREE(dummy);
00570
00571 SG_UNREF(alphabet);
00572
00573 if (remap_to_bin)
00574 alphabet=alpha_bin;
00575 else
00576 alphabet=alpha;
00577 SG_REF(alphabet);
00578 num_symbols=alphabet->get_num_symbols();
00579 }
00580
00581 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
00582 {
00583 remove_all_subsets();
00584
00585 int32_t i=0;
00586 uint64_t len=0;
00587 uint64_t offs=0;
00588 int32_t num=0;
00589 int32_t max_len=0;
00590
00591 CMemoryMappedFile<char> f(fname);
00592
00593 while (true)
00594 {
00595 char* s=f.get_line(len, offs);
00596 if (!s)
00597 break;
00598
00599 if (len>0 && s[0]=='>')
00600 num++;
00601 }
00602
00603 if (num==0)
00604 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00605
00606 cleanup();
00607 SG_UNREF(alphabet);
00608 alphabet=new CAlphabet(DNA);
00609 num_symbols=alphabet->get_num_symbols();
00610
00611 SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00612 offs=0;
00613
00614 for (i=0;i<num; i++)
00615 {
00616 uint64_t id_len=0;
00617 char* id=f.get_line(id_len, offs);
00618
00619 char* fasta=f.get_line(len, offs);
00620 char* s=fasta;
00621 int32_t fasta_len=0;
00622 int32_t spanned_lines=0;
00623
00624 while (true)
00625 {
00626 if (!s || len==0)
00627 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00628
00629 if (s[0]=='>' || offs==f.get_size())
00630 {
00631 offs-=len+1;
00632 if (offs==f.get_size())
00633 {
00634 SG_DEBUG("at EOF\n");
00635 fasta_len+=len;
00636 }
00637
00638 len=fasta_len-spanned_lines;
00639 strings[i].string=SG_MALLOC(ST, len);
00640 strings[i].slen=len;
00641
00642 ST* str=strings[i].string;
00643 int32_t idx=0;
00644 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00645
00646 for (int32_t j=0; j<fasta_len; j++)
00647 {
00648 if (fasta[j]=='\n')
00649 continue;
00650
00651 ST c=(ST) fasta[j];
00652
00653 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
00654 c=(ST) 'A';
00655
00656 if (uint64_t(idx)>=len)
00657 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00658 str[idx++]=c;
00659 }
00660 max_len=CMath::max(max_len, strings[i].slen);
00661
00662
00663 break;
00664 }
00665
00666 spanned_lines++;
00667 fasta_len+=len+1;
00668 s=f.get_line(len, offs);
00669 }
00670 }
00671 return set_features(strings, num, max_len);
00672 }
00673
00674 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
00675 bool ignore_invalid, bool bitremap_in_single_string)
00676 {
00677 remove_all_subsets();
00678
00679 CMemoryMappedFile<char> f(fname);
00680
00681 int32_t i=0;
00682 uint64_t len=0;
00683 uint64_t offs=0;
00684
00685 int32_t num=f.get_num_lines();
00686 int32_t max_len=0;
00687
00688 if (num%4)
00689 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00690 num/=4;
00691
00692 cleanup();
00693 SG_UNREF(alphabet);
00694 alphabet=new CAlphabet(DNA);
00695
00696 SGString<ST>* strings;
00697
00698 ST* str=NULL;
00699 if (bitremap_in_single_string)
00700 {
00701 strings=SG_MALLOC(SGString<ST>, 1);
00702 strings[0].string=SG_MALLOC(ST, num);
00703 strings[0].slen=num;
00704 f.get_line(len, offs);
00705 f.get_line(len, offs);
00706 order=len;
00707 max_len=num;
00708 offs=0;
00709 original_num_symbols=alphabet->get_num_symbols();
00710 str=SG_MALLOC(ST, len);
00711 }
00712 else
00713 strings=SG_MALLOC(SGString<ST>, num);
00714
00715 for (i=0;i<num; i++)
00716 {
00717 if (!f.get_line(len, offs))
00718 SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00719
00720 char* s=f.get_line(len, offs);
00721 if (!s || len==0)
00722 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00723
00724 if (bitremap_in_single_string)
00725 {
00726 if (len!=(uint64_t) order)
00727 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00728 for (int32_t j=0; j<order; j++)
00729 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00730
00731 strings[0].string[i]=embed_word(str, order);
00732 }
00733 else
00734 {
00735 strings[i].string=SG_MALLOC(ST, len);
00736 strings[i].slen=len;
00737 str=strings[i].string;
00738
00739 if (ignore_invalid)
00740 {
00741 for (uint64_t j=0; j<len; j++)
00742 {
00743 if (alphabet->is_valid((uint8_t) s[j]))
00744 str[j]= (ST) s[j];
00745 else
00746 str[j]= (ST) 'A';
00747 }
00748 }
00749 else
00750 {
00751 for (uint64_t j=0; j<len; j++)
00752 str[j]= (ST) s[j];
00753 }
00754 max_len=CMath::max(max_len, (int32_t) len);
00755 }
00756
00757
00758 if (!f.get_line(len, offs))
00759 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00760
00761 if (!f.get_line(len, offs))
00762 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00763 }
00764
00765 if (bitremap_in_single_string)
00766 num=1;
00767
00768 num_vectors=num;
00769 max_string_length=max_len;
00770 features=strings;
00771
00772 return true;
00773 }
00774
00775 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
00776 {
00777 remove_all_subsets();
00778
00779 struct dirent **namelist;
00780 int32_t n;
00781
00782 SGIO::set_dirname(dirname);
00783
00784 SG_DEBUG("dirname '%s'\n", dirname);
00785
00786 n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
00787 if (n <= 0)
00788 {
00789 SG_ERROR("error calling scandir - no files found\n");
00790 return false;
00791 }
00792 else
00793 {
00794 SGString<ST>* strings=NULL;
00795
00796 int32_t num=0;
00797 int32_t max_len=-1;
00798
00799
00800
00801 strings=SG_MALLOC(SGString<ST>, n);
00802
00803 for (int32_t i=0; i<n; i++)
00804 {
00805 char* fname=SGIO::concat_filename(namelist[i]->d_name);
00806
00807 struct stat s;
00808 off_t filesize=0;
00809
00810 if (!stat(fname, &s) && s.st_size>0)
00811 {
00812 filesize=s.st_size/sizeof(ST);
00813
00814 FILE* f=fopen(fname, "ro");
00815 if (f)
00816 {
00817 ST* str=SG_MALLOC(ST, filesize);
00818 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00819 if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
00820 SG_ERROR("failed to read file\n");
00821 strings[num].string=str;
00822 strings[num].slen=filesize;
00823 max_len=CMath::max(max_len, strings[num].slen);
00824
00825 num++;
00826 fclose(f);
00827 }
00828 }
00829 else
00830 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00831
00832 SG_FREE(namelist[i]);
00833 }
00834 SG_FREE(namelist);
00835
00836 if (num>0 && strings)
00837 {
00838 set_features(strings, num, max_len);
00839 return true;
00840 }
00841 }
00842 return false;
00843 }
00844
00845 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats)
00846 {
00847 set_features(feats.strings, feats.num_strings, feats.max_string_length);
00848 }
00849
00850 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00851 {
00852 if (m_subset_stack->has_subsets())
00853 SG_ERROR("Cannot call set_features() with subset.\n");
00854
00855 if (p_features)
00856 {
00857 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00858
00859
00860 for (int32_t i=0; i<p_num_vectors; i++)
00861 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00862
00863 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00864 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00865
00866 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00867 {
00868 cleanup();
00869 SG_UNREF(alphabet);
00870
00871 alphabet=alpha;
00872 SG_REF(alphabet);
00873
00874
00875 features = SG_MALLOC(SGString<ST>,p_num_vectors);
00876 memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors);
00877 num_vectors = p_num_vectors;
00878 max_string_length = p_max_string_length;
00879
00880 return true;
00881 }
00882 else
00883 SG_UNREF(alpha);
00884 }
00885
00886 return false;
00887 }
00888
00889 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf)
00890 {
00891 ASSERT(sf);
00892
00893 if (m_subset_stack->has_subsets())
00894 SG_ERROR("Cannot call set_features() with subset.\n");
00895
00896 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
00897
00898 index_t sf_num_str=sf->get_num_vectors();
00899 for (int32_t i=0; i<sf_num_str; i++)
00900 {
00901 int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
00902 int32_t length=sf->features[real_i].slen;
00903 new_features[i].string=SG_MALLOC(ST, length);
00904 memcpy(new_features[i].string, sf->features[real_i].string, length);
00905 new_features[i].slen=length;
00906 }
00907 return append_features(new_features, sf_num_str,
00908 sf->max_string_length);
00909 }
00910
00911 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00912 {
00913 if (m_subset_stack->has_subsets())
00914 SG_ERROR("Cannot call set_features() with subset.\n");
00915
00916 if (!features)
00917 return set_features(p_features, p_num_vectors, p_max_string_length);
00918
00919 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00920
00921
00922 for (int32_t i=0; i<p_num_vectors; i++)
00923 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00924
00925 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00926 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00927
00928 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00929 {
00930 SG_UNREF(alpha);
00931 for (int32_t i=0; i<p_num_vectors; i++)
00932 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00933
00934 int32_t old_num_vectors=num_vectors;
00935 num_vectors=old_num_vectors+p_num_vectors;
00936 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
00937
00938 for (int32_t i=0; i<num_vectors; i++)
00939 {
00940 if (i<old_num_vectors)
00941 {
00942 new_features[i].string=features[i].string;
00943 new_features[i].slen=features[i].slen;
00944 }
00945 else
00946 {
00947 new_features[i].string=p_features[i-old_num_vectors].string;
00948 new_features[i].slen=p_features[i-old_num_vectors].slen;
00949 }
00950 }
00951 SG_FREE(features);
00952 SG_FREE(p_features);
00953
00954 this->features=new_features;
00955 max_string_length=CMath::max(max_string_length, p_max_string_length);
00956
00957 return true;
00958 }
00959 SG_UNREF(alpha);
00960
00961 return false;
00962 }
00963
00964 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features()
00965 {
00966 SGStringList<ST> sl(NULL,0,0,false);
00967
00968 sl.strings=get_features(sl.num_strings, sl.max_string_length);
00969 return sl;
00970 }
00971
00972 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
00973 {
00974 if (m_subset_stack->has_subsets())
00975 SG_ERROR("get features() is not possible on subset");
00976
00977 num_str=num_vectors;
00978 max_str_len=max_string_length;
00979 return features;
00980 }
00981
00982 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
00983 {
00984 ASSERT(num_vectors>0);
00985
00986 num_str=get_num_vectors();
00987 max_str_len=max_string_length;
00988 SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
00989
00990 for (int32_t i=0; i<num_str; i++)
00991 {
00992 int32_t len;
00993 bool free_vec;
00994 ST* vec=get_feature_vector(i, len, free_vec);
00995 new_feat[i].string=SG_MALLOC(ST, len);
00996 new_feat[i].slen=len;
00997 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
00998 free_feature_vector(vec, i, free_vec);
00999 }
01000
01001 return new_feat;
01002 }
01003
01004 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
01005 {
01006 int32_t num_vec;
01007 int32_t max_str_len;
01008 *dst=copy_features(num_vec, max_str_len);
01009 *num_str=num_vec;
01010 }
01011
01012 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
01013 {
01014 remove_all_subsets();
01015
01016 FILE* file=NULL;
01017
01018 if (!(file=fopen(src, "r")))
01019 return false;
01020 cleanup();
01021
01022
01023 char id[4];
01024 if (fread(&id[0], sizeof(char), 1, file)!=1)
01025 SG_ERROR("failed to read header");
01026 ASSERT(id[0]=='S');
01027 if (fread(&id[1], sizeof(char), 1, file)!=1)
01028 SG_ERROR("failed to read header");
01029 ASSERT(id[1]=='G');
01030 if (fread(&id[2], sizeof(char), 1, file)!=1)
01031 SG_ERROR("failed to read header");
01032 ASSERT(id[2]=='V');
01033 if (fread(&id[3], sizeof(char), 1, file)!=1)
01034 SG_ERROR("failed to read header");
01035 ASSERT(id[3]=='0');
01036
01037
01038 uint8_t c;
01039 if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01040 SG_ERROR("failed to read compression type");
01041 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01042
01043 uint8_t a;
01044 delete alphabet;
01045 if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01046 SG_ERROR("failed to read compression alphabet");
01047 alphabet=new CAlphabet((EAlphabet) a);
01048
01049 if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01050 SG_ERROR("failed to read compression number of vectors");
01051 ASSERT(num_vectors>0);
01052
01053 if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01054 SG_ERROR("failed to read maximum string length");
01055 ASSERT(max_string_length>0);
01056
01057 features=SG_MALLOC(SGString<ST>, num_vectors);
01058
01059
01060 for (int32_t i=0; i<num_vectors; i++)
01061 {
01062
01063 int32_t len_compressed;
01064 if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01065 SG_ERROR("failed to read vector length compressed");
01066
01067 int32_t len_uncompressed;
01068 if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01069 SG_ERROR("failed to read vector length uncompressed");
01070
01071
01072 if (decompress)
01073 {
01074 features[i].string=SG_MALLOC(ST, len_uncompressed);
01075 features[i].slen=len_uncompressed;
01076 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01077 if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01078 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
01079 uint64_t uncompressed_size=len_uncompressed;
01080 uncompressed_size*=sizeof(ST);
01081 compressor->decompress(compressed, len_compressed,
01082 (uint8_t*) features[i].string, uncompressed_size);
01083 SG_FREE(compressed);
01084 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01085 }
01086 else
01087 {
01088 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01089 features[i].string=SG_MALLOC(ST, len_compressed+offs);
01090 features[i].slen=len_compressed+offs;
01091 int32_t* feat32ptr=((int32_t*) (features[i].string));
01092 memset(features[i].string, 0, offs*sizeof(ST));
01093 feat32ptr[0]=(int32_t) len_compressed;
01094 feat32ptr[1]=(int32_t) len_uncompressed;
01095 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01096 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01097 SG_ERROR("failed to read uncompressed data");
01098 }
01099 }
01100
01101 delete compressor;
01102 fclose(file);
01103
01104 return false;
01105 }
01106
01107 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01108 {
01109 if (m_subset_stack->has_subsets())
01110 SG_ERROR("save_compressed() is not possible on subset");
01111
01112 FILE* file=NULL;
01113
01114 if (!(file=fopen(dest, "wb")))
01115 return false;
01116
01117 CCompressor* compressor= new CCompressor(compression);
01118
01119
01120 const char* id="SGV0";
01121 fwrite(&id[0], sizeof(char), 1, file);
01122 fwrite(&id[1], sizeof(char), 1, file);
01123 fwrite(&id[2], sizeof(char), 1, file);
01124 fwrite(&id[3], sizeof(char), 1, file);
01125
01126
01127 uint8_t c=(uint8_t) compression;
01128 fwrite(&c, sizeof(uint8_t), 1, file);
01129
01130 uint8_t a=(uint8_t) alphabet->get_alphabet();
01131 fwrite(&a, sizeof(uint8_t), 1, file);
01132
01133 fwrite(&num_vectors, sizeof(int32_t), 1, file);
01134
01135 fwrite(&max_string_length, sizeof(int32_t), 1, file);
01136
01137
01138 for (int32_t i=0; i<num_vectors; i++)
01139 {
01140 int32_t len=-1;
01141 bool vfree;
01142 ST* vec=get_feature_vector(i, len, vfree);
01143
01144 uint8_t* compressed=NULL;
01145 uint64_t compressed_size=0;
01146
01147 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01148 compressed, compressed_size, level);
01149
01150 int32_t len_compressed=(int32_t) compressed_size;
01151
01152 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01153
01154 fwrite(&len, sizeof(int32_t), 1, file);
01155
01156 fwrite(compressed, compressed_size, 1, file);
01157 SG_FREE(compressed);
01158
01159 free_feature_vector(vec, i, vfree);
01160 }
01161
01162 delete compressor;
01163 fclose(file);
01164 return true;
01165 }
01166
01167 template<class ST> int32_t CStringFeatures<ST>::get_size() const { return sizeof(ST); }
01168
01169 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
01170 {
01171 SG_DEBUG( "force: %d\n", force_preprocessing);
01172
01173 for (int32_t i=0; i<get_num_preprocessors(); i++)
01174 {
01175 if ( (!is_preprocessed(i) || force_preprocessing) )
01176 {
01177 set_preprocessed(i);
01178 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01179 SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01180
01181 if (!p->apply_to_string_features(this))
01182 {
01183 SG_UNREF(p);
01184 return false;
01185 }
01186 else
01187 SG_UNREF(p);
01188 }
01189 }
01190 return true;
01191 }
01192
01193 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
01194 {
01195 if (m_subset_stack->has_subsets())
01196 SG_NOTIMPLEMENTED;
01197
01198 ASSERT(step_size>0);
01199 ASSERT(window_size>0);
01200 ASSERT(num_vectors==1 || single_string);
01201 ASSERT(max_string_length>=window_size ||
01202 (single_string && length_of_single_string>=window_size));
01203
01204
01205
01206 if (single_string)
01207 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01208 else if (num_vectors==1)
01209 {
01210 num_vectors= (max_string_length-window_size)/step_size + 1;
01211 length_of_single_string=max_string_length;
01212 }
01213
01214 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01215 int32_t offs=0;
01216 for (int32_t i=0; i<num_vectors; i++)
01217 {
01218 f[i].string=&features[0].string[offs+skip];
01219 f[i].slen=window_size-skip;
01220 offs+=step_size;
01221 }
01222 single_string=features[0].string;
01223 SG_FREE(features);
01224 features=f;
01225 max_string_length=window_size-skip;
01226
01227 return num_vectors;
01228 }
01229
01230 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
01231 int32_t skip)
01232 {
01233 if (m_subset_stack->has_subsets())
01234 SG_NOTIMPLEMENTED;
01235
01236 ASSERT(positions);
01237 ASSERT(window_size>0);
01238 ASSERT(num_vectors==1 || single_string);
01239 ASSERT(max_string_length>=window_size ||
01240 (single_string && length_of_single_string>=window_size));
01241
01242 num_vectors= positions->get_num_elements();
01243 ASSERT(num_vectors>0);
01244
01245 int32_t len;
01246
01247
01248
01249 if (single_string)
01250 len=length_of_single_string;
01251 else
01252 {
01253 single_string=features[0].string;
01254 len=max_string_length;
01255 length_of_single_string=max_string_length;
01256 }
01257
01258 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01259 for (int32_t i=0; i<num_vectors; i++)
01260 {
01261 int32_t p=positions->get_element(i);
01262
01263 if (p>=0 && p<=len-window_size)
01264 {
01265 f[i].string=&features[0].string[p+skip];
01266 f[i].slen=window_size-skip;
01267 }
01268 else
01269 {
01270 num_vectors=1;
01271 max_string_length=len;
01272 features[0].slen=len;
01273 single_string=NULL;
01274 SG_FREE(f);
01275 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01276 window_size, i, p, len);
01277 return -1;
01278 }
01279 }
01280
01281 SG_FREE(features);
01282 features=f;
01283 max_string_length=window_size-skip;
01284
01285 return num_vectors;
01286 }
01287
01288 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01289 {
01290 return obtain_from_char_features(sf, start, p_order, gap, rev);
01291 }
01292
01293 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
01294 {
01295 if (len!=-1)
01296 {
01297 if (len!=max_string_length)
01298 return false;
01299 }
01300 len=max_string_length;
01301
01302 index_t num_str=get_num_vectors();
01303 for (int32_t i=0; i<num_str; i++)
01304 {
01305 if (get_vector_length(i)!=len)
01306 return false;
01307 }
01308
01309 return true;
01310 }
01311
01312 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
01313 {
01314 if (m_subset_stack->has_subsets())
01315 SG_NOTIMPLEMENTED;
01316
01317 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01318
01319 order=p_order;
01320 original_num_symbols=alphabet->get_num_symbols();
01321 int32_t max_val=alphabet->get_num_bits();
01322
01323 if (p_order>1)
01324 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01325 else
01326 num_symbols=original_num_symbols;
01327
01328 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01329
01330 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01331 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01332
01333 ST mask=0;
01334 for (int32_t i=0; i<p_order*max_val; i++)
01335 mask= (mask<<1) | ((ST) 1);
01336
01337 for (int32_t i=0; i<num_vectors; i++)
01338 {
01339 int32_t len=features[i].slen;
01340
01341 if (len < p_order)
01342 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01343
01344 ST* str=features[i].string;
01345
01346
01347 for (int32_t j=0; j<p_order; j++)
01348 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01349 str[0]=embed_word(&str[0], p_order);
01350
01351
01352 int32_t idx=0;
01353 for (int32_t j=p_order; j<len; j++)
01354 {
01355 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01356 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01357 idx++;
01358 }
01359
01360 features[i].slen=len-p_order+1;
01361 }
01362
01363 compute_symbol_mask_table(max_val);
01364 }
01365
01366 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
01367 {
01368 if (m_subset_stack->has_subsets())
01369 SG_NOTIMPLEMENTED;
01370
01371 SG_FREE(symbol_mask_table);
01372 symbol_mask_table=SG_MALLOC(ST, 256);
01373
01374 uint64_t mask=0;
01375 for (int32_t i=0; i< (int64_t) max_val; i++)
01376 mask=(mask<<1) | 1;
01377
01378 for (int32_t i=0; i<256; i++)
01379 {
01380 uint8_t bits=(uint8_t) i;
01381 symbol_mask_table[i]=0;
01382
01383 for (int32_t j=0; j<8; j++)
01384 {
01385 if (bits & 1)
01386 symbol_mask_table[i]|=mask<<(max_val*j);
01387
01388 bits>>=1;
01389 }
01390 }
01391 }
01392
01393 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
01394 {
01395 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01396
01397 ST mask=0;
01398 for (uint32_t i=0; i<nbits; i++)
01399 mask=(mask<<1) | (ST) 1;
01400
01401 for (int32_t i=0; i<len; i++)
01402 {
01403 ST w=(word & mask);
01404 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01405 word>>=nbits;
01406 }
01407 }
01408
01409 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
01410 {
01411 ST value=(ST) 0;
01412 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01413 for (int32_t i=0; i<len; i++)
01414 {
01415 value<<=nbits;
01416 value|=seq[i];
01417 }
01418
01419 return value;
01420 }
01421
01422 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length()
01423 {
01424 max_string_length=0;
01425 index_t num_str=get_num_vectors();
01426
01427 for (int32_t i=0; i<num_str; i++)
01428 {
01429 max_string_length=CMath::max(max_string_length,
01430 features[m_subset_stack->subset_idx_conversion(i)].slen);
01431 }
01432 }
01433
01434 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str)
01435 {
01436 int32_t l=str.slen;
01437 ST* s=SG_MALLOC(ST, l+1);
01438 memcpy(s, str.string, sizeof(ST)*l);
01439 s[l]='\0';
01440 return s;
01441 }
01442
01443 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
01444 {
01445 ASSERT(features);
01446 ASSERT(num<get_num_vectors());
01447
01448 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01449
01450
01451 features[real_num].slen=len ;
01452 features[real_num].string=string ;
01453
01454 max_string_length=CMath::max(len, max_string_length);
01455 }
01456
01457 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
01458 {
01459 int32_t nsym=get_num_symbols();
01460 int32_t slen=get_max_vector_length();
01461 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01462 float64_t* h= SG_MALLOC(float64_t, sz);
01463 memset(h, 0, sz);
01464
01465 float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
01466 memset(h_normalizer, 0, slen*sizeof(float64_t));
01467 int32_t num_str=get_num_vectors();
01468 for (int32_t i=0; i<num_str; i++)
01469 {
01470 int32_t len;
01471 bool free_vec;
01472 ST* vec=get_feature_vector(i, len, free_vec);
01473 for (int32_t j=0; j<len; j++)
01474 {
01475 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01476 h_normalizer[j]++;
01477 }
01478 free_feature_vector(vec, i, free_vec);
01479 }
01480
01481 if (normalize)
01482 {
01483 for (int32_t i=0; i<slen; i++)
01484 {
01485 for (int32_t j=0; j<nsym; j++)
01486 {
01487 if (h_normalizer && h_normalizer[i])
01488 h[int64_t(i)*nsym+j]/=h_normalizer[i];
01489 }
01490 }
01491 }
01492 SG_FREE(h_normalizer);
01493
01494 *hist=h;
01495 *rows=nsym;
01496 *cols=slen;
01497 }
01498
01499 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01500 {
01501 ASSERT(rows == get_num_symbols());
01502 cleanup();
01503 float64_t* randoms=SG_MALLOC(float64_t, cols);
01504 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
01505
01506 for (int32_t i=0; i<num_vec; i++)
01507 {
01508 sf[i].string=SG_MALLOC(ST, cols);
01509 sf[i].slen=cols;
01510
01511 SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
01512
01513 for (int32_t j=0; j<cols; j++)
01514 {
01515 float64_t lik=hist[int64_t(j)*rows+0];
01516
01517 int32_t c;
01518 for (c=0; c<rows-1; c++)
01519 {
01520 if (randoms[j]<=lik)
01521 break;
01522 lik+=hist[int64_t(j)*rows+c+1];
01523 }
01524 sf[i].string[j]=alphabet->remap_to_char(c);
01525 }
01526 }
01527 SG_FREE(randoms);
01528 set_features(sf, num_vec, cols);
01529 }
01530
01531
01532
01533
01534
01535
01536
01537
01538
01539
01540
01541
01542
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557
01558
01559
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573
01574
01575
01576
01577
01578
01579
01580
01581
01582
01583
01584
01585
01586
01587
01588
01589
01590
01591
01592
01593
01594
01595
01596
01597
01598
01599
01600
01601 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(
01602 SGVector<index_t> indices)
01603 {
01604
01605 SGStringList<ST> list_copy(indices.vlen, max_string_length);
01606
01607
01608 for (index_t i=0; i<indices.vlen; ++i)
01609 {
01610
01611 index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
01612
01613
01614 SGString<ST> current_string=features[real_idx];
01615 SGString<ST> string_copy(current_string.slen);
01616 memcpy(string_copy.string, current_string.string,
01617 current_string.slen*sizeof(ST));
01618 list_copy.strings[i]=string_copy;
01619 }
01620
01621
01622 CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
01623
01624
01625 result->determine_maximum_string_length();
01626
01627
01628 result->order=order;
01629 result->compute_symbol_mask_table(result->alphabet->get_num_symbols());
01630
01631 SG_REF(result);
01632
01633 return result;
01634 }
01635
01636 template<class ST> void CStringFeatures<ST>::subset_changed_post()
01637 {
01638
01639 determine_maximum_string_length();
01640 }
01641
01642 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
01643 {
01644 ASSERT(features && num<get_num_vectors());
01645
01646 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01647
01648 len=features[real_num].slen;
01649 if (len<=0)
01650 return NULL;
01651
01652 ST* target=SG_MALLOC(ST, len);
01653 memcpy(target, features[real_num].string, len*sizeof(ST));
01654 return target;
01655 }
01656
01657 template<class ST> void CStringFeatures<ST>::init()
01658 {
01659 set_generic<ST>();
01660
01661 alphabet=NULL;
01662 num_vectors=0;
01663 features=NULL;
01664 single_string=NULL;
01665 length_of_single_string=0;
01666 max_string_length=0;
01667 order=0;
01668 symbol_mask_table=0;
01669 preprocess_on_get=false;
01670 feature_cache=NULL;
01671 symbol_mask_table_len=256;
01672
01673 m_parameters->add((CSGObject**) &alphabet, "alphabet");
01674 m_parameters->add_vector(&features, &num_vectors, "features",
01675 "This contains the array of features.");
01676 m_parameters->add_vector(&single_string,
01677 &length_of_single_string,
01678 "single_string",
01679 "Created by sliding window.");
01680 m_parameters->add(&max_string_length, "max_string_length",
01681 "Length of longest string.");
01682 m_parameters->add(&num_symbols, "num_symbols",
01683 "Number of used symbols.");
01684 m_parameters->add(&original_num_symbols, "original_num_symbols",
01685 "Original number of used symbols.");
01686 m_parameters->add(&order, "order",
01687 "Order used in higher order mapping.");
01688 m_parameters->add(&preprocess_on_get, "preprocess_on_get",
01689 "Preprocess on-the-fly?");
01690
01691
01692
01693
01694
01695 m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask table", "fuck you");
01696 }
01697
01702 template<> EFeatureType CStringFeatures<bool>::get_feature_type() const
01703 {
01704 return F_BOOL;
01705 }
01706
01711 template<> EFeatureType CStringFeatures<char>::get_feature_type() const
01712 {
01713 return F_CHAR;
01714 }
01715
01720 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type() const
01721 {
01722 return F_BYTE;
01723 }
01724
01729 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type() const
01730 {
01731 return F_SHORT;
01732 }
01733
01738 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type() const
01739 {
01740 return F_WORD;
01741 }
01742
01747 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type() const
01748 {
01749 return F_INT;
01750 }
01751
01756 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type() const
01757 {
01758 return F_UINT;
01759 }
01760
01765 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type() const
01766 {
01767 return F_LONG;
01768 }
01769
01774 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type() const
01775 {
01776 return F_ULONG;
01777 }
01778
01783 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type() const
01784 {
01785 return F_SHORTREAL;
01786 }
01787
01792 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type() const
01793 {
01794 return F_DREAL;
01795 }
01796
01801 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type() const
01802 {
01803 return F_LONGREAL;
01804 }
01805
01806 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01807 {
01808 return symbol;
01809 }
01810 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01811 {
01812 return symbol;
01813 }
01814 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01815 {
01816 return symbol;
01817 }
01818 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01819 {
01820 return symbol;
01821 }
01822
01823 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01824 {
01825 return false;
01826 }
01827 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01828 {
01829 return 0;
01830 }
01831 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01832 {
01833 return 0;
01834 }
01835 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01836 {
01837 return 0;
01838 }
01839
01840 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01841 {
01842 return symbol;
01843 }
01844 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01845 {
01846 return symbol;
01847 }
01848 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01849 {
01850 return symbol;
01851 }
01852 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01853 {
01854 return symbol;
01855 }
01856
01857 #ifndef SUNOS
01858 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01859 {
01860 return false;
01861 }
01862 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01863 {
01864 return false;
01865 }
01866 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01867 {
01868 return false;
01869 }
01870 #endif
01871
01872 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01873 {
01874 }
01875 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01876 {
01877 }
01878 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01879 {
01880 }
01881
01882 template<> void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01883 {
01884 }
01885 template<> void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01886 {
01887 }
01888 template<> void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01889 {
01890 }
01891
01892 template<> float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01893 {
01894 return 0;
01895 }
01896 template<> float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01897 {
01898 return 0;
01899 }
01900 template<> floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01901 {
01902 return 0;
01903 }
01904
01905 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01906 {
01907 }
01908 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01909 {
01910 }
01911 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01912 {
01913 }
01914 #define LOAD(f_load, sg_type) \
01915 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
01916 { \
01917 SG_INFO( "loading...\n"); \
01918 \
01919 SG_SET_LOCALE_C; \
01920 SGString<sg_type>* strs; \
01921 int32_t num_str; \
01922 int32_t max_len; \
01923 loader->f_load(strs, num_str, max_len); \
01924 set_features(strs, num_str, max_len); \
01925 SG_RESET_LOCALE; \
01926 }
01927
01928 LOAD(get_string_list, bool)
01929 LOAD(get_string_list, char)
01930 LOAD(get_int8_string_list, int8_t)
01931 LOAD(get_string_list, uint8_t)
01932 LOAD(get_string_list, int16_t)
01933 LOAD(get_string_list, uint16_t)
01934 LOAD(get_string_list, int32_t)
01935 LOAD(get_uint_string_list, uint32_t)
01936 LOAD(get_long_string_list, int64_t)
01937 LOAD(get_ulong_string_list, uint64_t)
01938 LOAD(get_string_list, float32_t)
01939 LOAD(get_string_list, float64_t)
01940 LOAD(get_longreal_string_list, floatmax_t)
01941 #undef LOAD
01942
01943 #define SAVE(f_write, sg_type) \
01944 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
01945 { \
01946 if (m_subset_stack->has_subsets()) \
01947 SG_ERROR("save() is not possible on subset"); \
01948 SG_SET_LOCALE_C; \
01949 ASSERT(writer); \
01950 writer->f_write(features, num_vectors); \
01951 SG_RESET_LOCALE; \
01952 }
01953
01954 SAVE(set_string_list, bool)
01955 SAVE(set_string_list, char)
01956 SAVE(set_int8_string_list, int8_t)
01957 SAVE(set_string_list, uint8_t)
01958 SAVE(set_string_list, int16_t)
01959 SAVE(set_string_list, uint16_t)
01960 SAVE(set_string_list, int32_t)
01961 SAVE(set_uint_string_list, uint32_t)
01962 SAVE(set_long_string_list, int64_t)
01963 SAVE(set_ulong_string_list, uint64_t)
01964 SAVE(set_string_list, float32_t)
01965 SAVE(set_string_list, float64_t)
01966 SAVE(set_longreal_string_list, floatmax_t)
01967 #undef SAVE
01968
01969 template <class ST> template <class CT>
01970 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
01971 int32_t p_order, int32_t gap, bool rev)
01972 {
01973 remove_all_subsets();
01974 ASSERT(sf);
01975
01976 CAlphabet* alpha=sf->get_alphabet();
01977 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01978
01979 this->order=p_order;
01980 cleanup();
01981
01982 num_vectors=sf->get_num_vectors();
01983 ASSERT(num_vectors>0);
01984 max_string_length=sf->get_max_vector_length()-start;
01985 features=SG_MALLOC(SGString<ST>, num_vectors);
01986
01987 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01988 alpha->get_num_symbols_in_histogram());
01989
01990 for (int32_t i=0; i<num_vectors; i++)
01991 {
01992 int32_t len=-1;
01993 bool vfree;
01994 CT* c=sf->get_feature_vector(i, len, vfree);
01995 ASSERT(!vfree);
01996
01997 features[i].string=SG_MALLOC(ST, len);
01998 features[i].slen=len;
01999
02000 ST* str=features[i].string;
02001 for (int32_t j=0; j<len; j++)
02002 str[j]=(ST) alpha->remap_to_bin(c[j]);
02003 }
02004
02005 original_num_symbols=alpha->get_num_symbols();
02006 int32_t max_val=alpha->get_num_bits();
02007
02008 SG_UNREF(alpha);
02009
02010 if (p_order>1)
02011 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
02012 else
02013 num_symbols=original_num_symbols;
02014 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
02015
02016 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
02017 {
02018 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
02019 return false;
02020 }
02021
02022 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
02023 for (int32_t line=0; line<num_vectors; line++)
02024 {
02025 int32_t len=0;
02026 bool vfree;
02027 ST* fv=get_feature_vector(line, len, vfree);
02028 ASSERT(!vfree);
02029
02030 if (rev)
02031 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
02032 else
02033 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
02034
02035
02036 features[line].slen-=start+gap ;
02037 if (features[line].slen<0)
02038 features[line].slen=0 ;
02039 }
02040
02041 compute_symbol_mask_table(max_val);
02042
02043 return true;
02044 }
02045
02046 template class CStringFeatures<bool>;
02047 template class CStringFeatures<char>;
02048 template class CStringFeatures<int8_t>;
02049 template class CStringFeatures<uint8_t>;
02050 template class CStringFeatures<int16_t>;
02051 template class CStringFeatures<uint16_t>;
02052 template class CStringFeatures<int32_t>;
02053 template class CStringFeatures<uint32_t>;
02054 template class CStringFeatures<int64_t>;
02055 template class CStringFeatures<uint64_t>;
02056 template class CStringFeatures<float32_t>;
02057 template class CStringFeatures<float64_t>;
02058 template class CStringFeatures<floatmax_t>;
02059
02060 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02061 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02062 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02063
02064 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02065 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02066 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02067 }