00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Cache.h"
00018 #include "lib/DynamicArray.h"
00019 #include "lib/File.h"
00020 #include "lib/MemoryMappedFile.h"
00021 #include "lib/Mathematics.h"
00022 #include "lib/Compressor.h"
00023 #include "base/Parameter.h"
00024
00025 #include "preproc/PreProc.h"
00026 #include "preproc/StringPreProc.h"
00027 #include "features/Features.h"
00028 #include "features/Alphabet.h"
00029
00030 #include <sys/types.h>
00031 #include <sys/stat.h>
00032 #include <dirent.h>
00033 #include <stdio.h>
00034 #include <stdlib.h>
00035 #include <unistd.h>
00036
00037 namespace shogun
00038 {
00039 class CCompressor;
00040 enum E_COMPRESSION_TYPE;
00041 class CAlphabet;
00042 enum EAlphabet;
00043 template <class T> class CDynamicArray;
00044 class CFile;
00045 template <class T> class CMemoryMappedFile;
00046 class CMath;
00047 template <class ST> class CStringPreProc;
00048 template <class T> class TString;
00049
/** Plain aggregate holding two feature ints and a group int.
 *
 * NOTE(review): the meaning of the fields is not visible in this header -
 * confirm against the code that fills them.
 */
struct SSKDoubleFeature {
    /** first feature */
    int feature1;
    /** second feature */
    int feature2;
    /** group */
    int group;
};
00056
/** Plain aggregate holding three feature ints and a group int.
 *
 * NOTE(review): the meaning of the fields is not visible in this header -
 * confirm against the code that fills them.
 */
struct SSKTripleFeature {
    /** first feature */
    int feature1;
    /** second feature */
    int feature2;
    /** third feature */
    int feature3;
    /** group */
    int group;
};
00064
00083 template <class ST> class CStringFeatures : public CFeatures
00084 {
00085 public:
00089 CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0),
00090 features(NULL), single_string(NULL),length_of_single_string(0),
00091 max_string_length(0), order(0), symbol_mask_table(NULL),
00092 preprocess_on_get(false), feature_cache(NULL)
00093 {
00094 init();
00095 alphabet=new CAlphabet();
00096 }
00097
00102 CStringFeatures(EAlphabet alpha)
00103 : CFeatures(0), num_vectors(0), features(NULL),
00104 single_string(NULL),length_of_single_string(0),
00105 max_string_length(0), order(0), symbol_mask_table(NULL),
00106 preprocess_on_get(false), feature_cache(NULL)
00107 {
00108 init();
00109
00110 alphabet=new CAlphabet(alpha);
00111 SG_REF(alphabet);
00112 num_symbols=alphabet->get_num_symbols();
00113 original_num_symbols=num_symbols;
00114 }
00115
00123 CStringFeatures(TString<ST>* p_features, int32_t p_num_vectors,
00124 int32_t p_max_string_length, EAlphabet alpha)
00125 : CFeatures(0), num_vectors(0), features(NULL),
00126 single_string(NULL),length_of_single_string(0),
00127 max_string_length(0), order(0), symbol_mask_table(NULL),
00128 preprocess_on_get(false), feature_cache(NULL)
00129 {
00130 init();
00131
00132 alphabet=new CAlphabet(alpha);
00133 SG_REF(alphabet);
00134 num_symbols=alphabet->get_num_symbols();
00135 original_num_symbols=num_symbols;
00136 set_features(p_features, p_num_vectors, p_max_string_length);
00137 }
00138
00146 CStringFeatures(TString<ST>* p_features, int32_t p_num_vectors,
00147 int32_t p_max_string_length, CAlphabet* alpha)
00148 : CFeatures(0), num_vectors(0), features(NULL),
00149 single_string(NULL),length_of_single_string(0),
00150 max_string_length(0), order(0), symbol_mask_table(NULL),
00151 preprocess_on_get(false), feature_cache(NULL)
00152 {
00153 init();
00154
00155 alphabet=new CAlphabet(alpha);
00156 SG_REF(alphabet);
00157 num_symbols=alphabet->get_num_symbols();
00158 original_num_symbols=num_symbols;
00159 set_features(p_features, p_num_vectors, p_max_string_length);
00160 }
00161
00166 CStringFeatures(CAlphabet* alpha)
00167 : CFeatures(0), num_vectors(0), features(NULL),
00168 single_string(NULL),length_of_single_string(0),
00169 max_string_length(0), order(0), symbol_mask_table(NULL),
00170 preprocess_on_get(false), feature_cache(NULL)
00171 {
00172 init();
00173
00174 ASSERT(alpha);
00175 SG_REF(alpha);
00176 alphabet=alpha;
00177 num_symbols=alphabet->get_num_symbols();
00178 original_num_symbols=num_symbols;
00179 }
00180
00182 CStringFeatures(const CStringFeatures & orig)
00183 : CFeatures(orig), num_vectors(orig.num_vectors),
00184 single_string(orig.single_string),
00185 length_of_single_string(orig.length_of_single_string),
00186 max_string_length(orig.max_string_length),
00187 num_symbols(orig.num_symbols),
00188 original_num_symbols(orig.original_num_symbols),
00189 order(orig.order), preprocess_on_get(false),
00190 feature_cache(NULL)
00191 {
00192 init();
00193
00194 ASSERT(orig.single_string == NULL);
00195
00196 alphabet=orig.alphabet;
00197 SG_REF(alphabet);
00198
00199 if (orig.features)
00200 {
00201 features=new TString<ST>[orig.num_vectors];
00202
00203 for (int32_t i=0; i<num_vectors; i++)
00204 {
00205 features[i].string=new ST[orig.features[i].length];
00206 features[i].length=orig.features[i].length;
00207 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00208 }
00209 }
00210
00211 if (orig.symbol_mask_table)
00212 {
00213 symbol_mask_table=new ST[256];
00214 for (int32_t i=0; i<256; i++)
00215 symbol_mask_table[i]=orig.symbol_mask_table[i];
00216 }
00217 }
00218
00224 CStringFeatures(CFile* loader, EAlphabet alpha=DNA)
00225 : CFeatures(loader), num_vectors(0), features(NULL), single_string(NULL),
00226 length_of_single_string(0), max_string_length(0), order(0),
00227 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00228 {
00229 init();
00230
00231 alphabet=new CAlphabet(alpha);
00232 SG_REF(alphabet);
00233 num_symbols=alphabet->get_num_symbols();
00234 original_num_symbols=num_symbols;
00235 load(loader);
00236 }
00237
00238 virtual ~CStringFeatures()
00239 {
00240 cleanup();
00241
00242 SG_UNREF(alphabet);
00243 }
00244
00246 virtual void cleanup()
00247 {
00248 if (single_string)
00249 {
00250 delete[] single_string;
00251 single_string=NULL;
00252 }
00253 else
00254 {
00255 for (int32_t i=0; i<num_vectors; i++)
00256 cleanup_feature_vector(i);
00257 }
00258
00259 num_vectors=0;
00260 delete[] features;
00261 delete[] symbol_mask_table;
00262 features=NULL;
00263 symbol_mask_table=NULL;
00264
00265
00266
00267
00268
00269 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00270 SG_UNREF(alphabet);
00271 alphabet=alpha;
00272 SG_REF(alphabet);
00273 }
00274
00276 virtual void cleanup_feature_vector(int32_t num)
00277 {
00278 ASSERT(num<num_vectors);
00279 if (features)
00280 {
00281 delete[] features[num].string;
00282 features[num].string=NULL;
00283 features[num].length=0;
00284 }
00285 }
00286
00291 inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00292
00297 inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00298
00303 inline CAlphabet* get_alphabet()
00304 {
00305 SG_REF(alphabet);
00306 return alphabet;
00307 }
00308
00313 virtual CFeatures* duplicate() const
00314 {
00315 return new CStringFeatures<ST>(*this);
00316 }
00317
00324 void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00325 {
00326 ASSERT(features);
00327 if (num>=num_vectors)
00328 {
00329 SG_ERROR("Index out of bounds (number of strings %d, you "
00330 "requested %d)\n", num_vectors, num);
00331 }
00332
00333 int32_t l;
00334 bool free_vec;
00335 ST* vec=get_feature_vector(num, l, free_vec);
00336 *len=l;
00337 *dst=(ST*) malloc(*len * sizeof(ST));
00338 ASSERT(*dst);
00339 memcpy(*dst, vec, *len * sizeof(ST));
00340 free_feature_vector(vec, num, free_vec);
00341 }
00342
00349 void set_feature_vector(ST* src, int32_t len, int32_t num)
00350 {
00351 ASSERT(features);
00352 if (num>=num_vectors)
00353 {
00354 SG_ERROR("Index out of bounds (number of strings %d, you "
00355 "requested %d)\n", num_vectors, num);
00356 }
00357
00358 if (len<=0)
00359 SG_ERROR("String has zero or negative length\n");
00360
00361
00362 cleanup_feature_vector(num);
00363 features[num].length=len;
00364 features[num].string=new ST[len];
00365 memcpy(features[num].string, src, len*sizeof(ST));
00366
00367 determine_maximum_string_length();
00368 }
00369
00372 void enable_on_the_fly_preprocessing()
00373 {
00374 preprocess_on_get=true;
00375 }
00376
00380 void disable_on_the_fly_preprocessing()
00381 {
00382 preprocess_on_get=false;
00383 }
00384
00393 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00394 {
00395 ASSERT(features);
00396 ASSERT(num<num_vectors);
00397
00398 if (!preprocess_on_get)
00399 {
00400 dofree=false;
00401 len=features[num].length;
00402 return features[num].string;
00403 }
00404 else
00405 {
00406 SG_DEBUG( "computing feature vector!\n") ;
00407 ST* feat=compute_feature_vector(num, len);
00408 dofree=true;
00409
00410 if (get_num_preproc())
00411 {
00412 ST* tmp_feat_before = feat;
00413
00414 for (int32_t i=0; i<get_num_preproc(); i++)
00415 {
00416 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00417 feat=p->apply_to_string(tmp_feat_before, len);
00418 SG_UNREF(p);
00419 delete[] tmp_feat_before;
00420 tmp_feat_before=feat;
00421 }
00422 }
00423
00424 return feat;
00425 }
00426 }
00427
00432 CStringFeatures<ST>* get_transposed()
00433 {
00434 int32_t num_feat;
00435 int32_t num_vec;
00436 TString<ST>* s=get_transposed(num_feat, num_vec);
00437
00438 return new CStringFeatures<ST>(s, num_vec, num_feat, alphabet);
00439 }
00440
00452 TString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00453 {
00454 num_feat=num_vectors;
00455 num_vec=get_max_vector_length();
00456 ASSERT(have_same_length());
00457
00458 SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00459 int64_t(num_feat)*num_vec);
00460
00461 TString<ST>* sf=new TString<ST>[num_vec];
00462
00463 for (int32_t i=0; i<num_vec; i++)
00464 {
00465 sf[i].string=new ST[num_feat];
00466 sf[i].length=num_feat;
00467 }
00468
00469 for (int32_t i=0; i<num_feat; i++)
00470 {
00471 int32_t len=0;
00472 bool free_vec=false;
00473 ST* vec=get_feature_vector(i, len, free_vec);
00474
00475 for (int32_t j=0; j<num_vec; j++)
00476 sf[j].string[i]=vec[j];
00477
00478 free_feature_vector(vec, i, free_vec);
00479 }
00480 return sf;
00481 }
00482
00489 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00490 {
00491 if (feature_cache)
00492 feature_cache->unlock_entry(num);
00493
00494 if (dofree)
00495 delete[] feat_vec ;
00496 }
00497
00504 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00505 {
00506 int32_t len;
00507 bool free_vec;
00508 ST* vec=get_feature_vector(vec_num, len, free_vec);
00509 ASSERT(feat_num<len);
00510 ST result=vec[feat_num];
00511 free_feature_vector(vec, vec_num, free_vec);
00512
00513 return result;
00514 }
00515
00521 virtual inline int32_t get_vector_length(int32_t vec_num)
00522 {
00523 int32_t len;
00524 bool free_vec;
00525 ST* vec=get_feature_vector(vec_num, len, free_vec);
00526 free_feature_vector(vec, vec_num, free_vec);
00527 return len;
00528 }
00529
00534 virtual inline int32_t get_max_vector_length()
00535 {
00536 return max_string_length;
00537 }
00538
00543 virtual inline int32_t get_num_vectors() { return num_vectors; }
00544
00551 inline floatmax_t get_num_symbols() { return num_symbols; }
00552
00560 inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00561
00562
00563
00568 inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00569
00574 inline int32_t get_order() { return order; }
00575
00583 inline ST get_masked_symbols(ST symbol, uint8_t mask)
00584 {
00585 ASSERT(symbol_mask_table);
00586 return symbol_mask_table[mask] & symbol;
00587 }
00588
00595 inline ST shift_offset(ST offset, int32_t amount)
00596 {
00597 ASSERT(alphabet);
00598 return (offset << (amount*alphabet->get_num_bits()));
00599 }
00600
00607 inline ST shift_symbol(ST symbol, int32_t amount)
00608 {
00609 ASSERT(alphabet);
00610 return (symbol >> (amount*alphabet->get_num_bits()));
00611 }
00612
00617 virtual inline void load(CFile* loader);
00618
00627 void load_ascii_file(char* fname, bool remap_to_bin=true,
00628 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00629 {
00630 size_t blocksize=1024*1024;
00631 size_t required_blocksize=0;
00632 uint8_t* dummy=new uint8_t[blocksize];
00633 uint8_t* overflow=NULL;
00634 int32_t overflow_len=0;
00635
00636 cleanup();
00637
00638 CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00639 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00640
00641 FILE* f=fopen(fname, "ro");
00642
00643 if (f)
00644 {
00645 num_vectors=0;
00646 max_string_length=0;
00647
00648 SG_INFO("counting line numbers in file %s\n", fname);
00649 size_t block_offs=0;
00650 size_t old_block_offs=0;
00651 fseek(f, 0, SEEK_END);
00652 size_t fsize=ftell(f);
00653 rewind(f);
00654
00655 if (blocksize>fsize)
00656 blocksize=fsize;
00657
00658 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00659
00660 size_t sz=blocksize;
00661 while (sz == blocksize)
00662 {
00663 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00664 bool contains_cr=false;
00665 for (size_t i=0; i<sz; i++)
00666 {
00667 block_offs++;
00668 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00669 {
00670 num_vectors++;
00671 contains_cr=true;
00672 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00673 old_block_offs=block_offs;
00674 }
00675 }
00676 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00677 }
00678
00679 SG_INFO("found %d strings\n", num_vectors);
00680 delete[] dummy;
00681 blocksize=required_blocksize;
00682 dummy = new uint8_t[blocksize];
00683 overflow = new uint8_t[blocksize];
00684 features=new TString<ST>[num_vectors];
00685
00686 rewind(f);
00687 sz=blocksize;
00688 int32_t lines=0;
00689 while (sz == blocksize)
00690 {
00691 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00692
00693 size_t old_sz=0;
00694 for (size_t i=0; i<sz; i++)
00695 {
00696 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00697 {
00698 int32_t len=i-old_sz;
00699
00700 max_string_length=CMath::max(max_string_length, len+overflow_len);
00701
00702 features[lines].length=len;
00703 features[lines].string=new ST[len];
00704
00705 if (remap_to_bin)
00706 {
00707 for (int32_t j=0; j<overflow_len; j++)
00708 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00709 for (int32_t j=0; j<len; j++)
00710 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00711 alpha->add_string_to_histogram(&dummy[old_sz], len);
00712 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00713 }
00714 else
00715 {
00716 for (int32_t j=0; j<overflow_len; j++)
00717 features[lines].string[j]=overflow[j];
00718 for (int32_t j=0; j<len; j++)
00719 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00720 alpha->add_string_to_histogram(&dummy[old_sz], len);
00721 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00722 }
00723
00724
00725 overflow_len=0;
00726
00727
00728 old_sz=i+1;
00729 lines++;
00730 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00731 }
00732 }
00733 for (size_t i=old_sz; i<sz; i++)
00734 overflow[i-old_sz]=dummy[i];
00735
00736 overflow_len=sz-old_sz;
00737 }
00738
00739 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00740 {
00741 SG_INFO("file successfully read\n");
00742 SG_INFO("max_string_length=%d\n", max_string_length);
00743 SG_INFO("num_strings=%d\n", num_vectors);
00744 }
00745 fclose(f);
00746 }
00747
00748 delete[] dummy;
00749
00750 SG_UNREF(alphabet);
00751
00752 if (remap_to_bin)
00753 alphabet = alpha_bin;
00754 else
00755 alphabet = alpha;
00756 SG_REF(alphabet);
00757 num_symbols=alphabet->get_num_symbols();
00758 }
00759
00766 bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00767 {
00768 int32_t i=0;
00769 uint64_t len=0;
00770 uint64_t offs=0;
00771 int32_t num=0;
00772 int32_t max_len=0;
00773
00774 CMemoryMappedFile<char> f(fname);
00775
00776 while (true)
00777 {
00778 char* s=f.get_line(len, offs);
00779 if (!s)
00780 break;
00781
00782 if (len>0 && s[0]=='>')
00783 num++;
00784 }
00785
00786 if (num==0)
00787 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00788
00789 cleanup();
00790 SG_UNREF(alphabet);
00791 alphabet=new CAlphabet(DNA);
00792 num_symbols=alphabet->get_num_symbols();
00793
00794 TString<ST>* strings=new TString<ST>[num];
00795 offs=0;
00796
00797 for (i=0;i<num; i++)
00798 {
00799 uint64_t id_len=0;
00800 char* id=f.get_line(id_len, offs);
00801
00802 char* fasta=f.get_line(len, offs);
00803 char* s=fasta;
00804 int32_t fasta_len=0;
00805 int32_t spanned_lines=0;
00806
00807 while (true)
00808 {
00809 if (!s || len==0)
00810 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00811
00812 if (s[0]=='>' || offs==f.get_size())
00813 {
00814 offs-=len+1;
00815 if (offs==f.get_size())
00816 {
00817 SG_DEBUG("at EOF\n");
00818 fasta_len+=len;
00819 }
00820
00821 len = fasta_len-spanned_lines;
00822 strings[i].string=new ST[len];
00823 strings[i].length=len;
00824
00825 ST* str=strings[i].string;
00826 int32_t idx=0;
00827 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00828
00829 for (int32_t j=0; j<fasta_len; j++)
00830 {
00831 if (fasta[j]=='\n')
00832 continue;
00833
00834 ST c = (ST) fasta[j];
00835
00836 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
00837 c = (ST) 'A';
00838
00839 if (idx>=len)
00840 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00841 str[idx++]=c;
00842 }
00843 max_len=CMath::max(max_len, strings[i].length);
00844
00845
00846 break;
00847 }
00848
00849 spanned_lines++;
00850 fasta_len+=len+1;
00851 s=f.get_line(len, offs);
00852 }
00853 }
00854
00855 return set_features(strings, num, max_len);
00856 }
00857
00865 bool load_fastq_file(const char* fname,
00866 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00867 {
00868 CMemoryMappedFile<char> f(fname);
00869
00870 int32_t i=0;
00871 uint64_t len=0;
00872 uint64_t offs=0;
00873
00874 int32_t num=f.get_num_lines();
00875 int32_t max_len=0;
00876
00877 if (num%4)
00878 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00879 num/=4;
00880
00881 cleanup();
00882 SG_UNREF(alphabet);
00883 alphabet=new CAlphabet(DNA);
00884
00885 TString<ST>* strings;
00886
00887 ST* str;
00888 if (bitremap_in_single_string)
00889 {
00890 strings=new TString<ST>[1];
00891 strings[0].string=new ST[num];
00892 strings[0].length=num;
00893 f.get_line(len, offs);
00894 f.get_line(len, offs);
00895 order=len;
00896 max_len=num;
00897 offs=0;
00898 original_num_symbols=alphabet->get_num_symbols();
00899 int32_t max_val=alphabet->get_num_bits();
00900 str=new ST[len];
00901 }
00902 else
00903 strings=new TString<ST>[num];
00904
00905 for (i=0;i<num; i++)
00906 {
00907 if (!f.get_line(len, offs))
00908 SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00909
00910 char* s=f.get_line(len, offs);
00911 if (!s || len==0)
00912 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00913
00914 if (bitremap_in_single_string)
00915 {
00916 if (len!=order)
00917 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00918 for (int32_t j=0; j<order; j++)
00919 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00920
00921 strings[0].string[i]=embed_word(str, order);
00922 }
00923 else
00924 {
00925 strings[i].string=new ST[len];
00926 strings[i].length=len;
00927 str=strings[i].string;
00928
00929 if (ignore_invalid)
00930 {
00931 for (int32_t j=0; j<len; j++)
00932 {
00933 if (alphabet->is_valid((uint8_t) s[j]))
00934 str[j]= (ST) s[j];
00935 else
00936 str[j]= (ST) 'A';
00937 }
00938 }
00939 else
00940 {
00941 for (int32_t j=0; j<len; j++)
00942 str[j]= (ST) s[j];
00943 }
00944 max_len=CMath::max(max_len, (int32_t) len);
00945 }
00946
00947
00948 if (!f.get_line(len, offs))
00949 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00950
00951 if (!f.get_line(len, offs))
00952 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00953 }
00954
00955 if (bitremap_in_single_string)
00956 num=1;
00957
00958 num_vectors=num;
00959 max_string_length=max_len;
00960 features=strings;
00961
00962 return true;
00963 }
00964
00970 bool load_from_directory(char* dirname)
00971 {
00972 struct dirent **namelist;
00973 int32_t n;
00974
00975 IO::set_dirname(dirname);
00976
00977 SG_DEBUG("dirname '%s'\n", dirname);
00978
00979 n = scandir(dirname, &namelist, &IO::filter, alphasort);
00980 if (n <= 0)
00981 {
00982 SG_ERROR("error calling scandir - no files found\n");
00983 return false;
00984 }
00985 else
00986 {
00987 TString<ST>* strings=NULL;
00988
00989 int32_t num=0;
00990 int32_t max_len=-1;
00991
00992
00993
00994 strings=new TString<ST>[n];
00995
00996 for (int32_t i=0; i<n; i++)
00997 {
00998 char* fname=IO::concat_filename(namelist[i]->d_name);
00999
01000 struct stat s;
01001 off_t filesize=0;
01002
01003 if (!stat(fname, &s) && s.st_size>0)
01004 {
01005 filesize=s.st_size/sizeof(ST);
01006
01007 FILE* f=fopen(fname, "ro");
01008 if (f)
01009 {
01010 ST* str=new ST[filesize];
01011 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01012 fread(str, sizeof(ST), filesize, f);
01013 strings[num].string=str;
01014 strings[num].length=filesize;
01015 max_len=CMath::max(max_len, strings[num].length);
01016
01017 num++;
01018 fclose(f);
01019 }
01020 }
01021 else
01022 SG_ERROR("empty or non readable file \'%s\'\n", fname);
01023
01024 free(namelist[i]);
01025 }
01026 free(namelist);
01027
01028 if (num>0 && strings)
01029 {
01030 set_features(strings, num, max_len);
01031 return true;
01032 }
01033 }
01034 return false;
01035 }
01036
01044 bool set_features(TString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01045 {
01046 if (p_features)
01047 {
01048 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01049
01050
01051 for (int32_t i=0; i<p_num_vectors; i++)
01052 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01053
01054 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01055 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01056
01057 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01058 {
01059 cleanup();
01060 SG_UNREF(alphabet);
01061
01062 alphabet=alpha;
01063 SG_REF(alphabet);
01064
01065 this->features=p_features;
01066 this->num_vectors=p_num_vectors;
01067 this->max_string_length=p_max_string_length;
01068
01069 return true;
01070 }
01071 else
01072 SG_UNREF(alpha);
01073 }
01074
01075 return false;
01076 }
01077
01083 bool append_features(CStringFeatures<ST>* sf)
01084 {
01085 ASSERT(sf);
01086 TString<ST>* new_features = new TString<ST>[sf->num_vectors];
01087
01088 for (int32_t i=0; i<sf->num_vectors; i++)
01089 {
01090 int32_t length=sf->features[i].length;
01091 new_features[i].string=new ST[length];
01092 memcpy(new_features[i].string, sf->features[i].string, length);
01093 new_features[i].length=length;
01094 }
01095 return append_features(new_features, sf->num_vectors,
01096 sf->max_string_length);
01097 }
01098
01109 bool append_features(TString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01110 {
01111 if (!features)
01112 return set_features(p_features, p_num_vectors, p_max_string_length);
01113
01114 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01115
01116
01117 for (int32_t i=0; i<p_num_vectors; i++)
01118 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01119
01120 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01121 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01122
01123 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01124 {
01125 SG_UNREF(alpha);
01126 for (int32_t i=0; i<p_num_vectors; i++)
01127 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].length);
01128
01129 int32_t old_num_vectors=num_vectors;
01130 num_vectors=old_num_vectors+p_num_vectors;
01131 TString<ST>* new_features = new TString<ST>[num_vectors];
01132
01133 for (int32_t i=0; i<num_vectors; i++)
01134 {
01135 if (i<old_num_vectors)
01136 {
01137 new_features[i].string=features[i].string;
01138 new_features[i].length=features[i].length;
01139 }
01140 else
01141 {
01142 new_features[i].string=p_features[i-old_num_vectors].string;
01143 new_features[i].length=p_features[i-old_num_vectors].length;
01144 }
01145 }
01146 delete[] features;
01147 delete[] p_features;
01148
01149 this->features=new_features;
01150 this->max_string_length=CMath::max(max_string_length, p_max_string_length);
01151
01152 return true;
01153 }
01154 SG_UNREF(alpha);
01155
01156 return false;
01157 }
01158
01165 virtual TString<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01166 {
01167 num_str=num_vectors;
01168 max_str_len=max_string_length;
01169 return features;
01170 }
01171
01178 virtual TString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01179 {
01180 ASSERT(num_vectors>0);
01181
01182 num_str=num_vectors;
01183 max_str_len=max_string_length;
01184 TString<ST>* new_feat=new TString<ST>[num_str];
01185
01186 for (int32_t i=0; i<num_str; i++)
01187 {
01188 int32_t len;
01189 bool free_vec;
01190 ST* vec=get_feature_vector(i, len, free_vec);
01191 new_feat[i].string=new ST[len];
01192 new_feat[i].length=len;
01193 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01194 free_feature_vector(vec, i, free_vec);
01195 }
01196
01197 return new_feat;
01198 }
01199
01205 virtual void get_features(TString<ST>** dst, int32_t* num_str)
01206 {
01207 int32_t num_vec;
01208 int32_t max_str_len;
01209 *dst=copy_features(num_vec, max_str_len);
01210 *num_str=num_vec;
01211 }
01212
01217 virtual inline void save(CFile* writer);
01218
01225 virtual bool load_compressed(char* src, bool decompress)
01226 {
01227 FILE* file=NULL;
01228
01229 if (!(file=fopen(src, "r")))
01230 return false;
01231 cleanup();
01232
01233
01234 char id[4];
01235 fread(&id[0], sizeof(char), 1, file);
01236 ASSERT(id[0]=='S');
01237 fread(&id[1], sizeof(char), 1, file);
01238 ASSERT(id[1]=='G');
01239 fread(&id[2], sizeof(char), 1, file);
01240 ASSERT(id[2]=='V');
01241 fread(&id[3], sizeof(char), 1, file);
01242 ASSERT(id[3]=='0');
01243
01244
01245 uint8_t c;
01246 fread(&c, sizeof(uint8_t), 1, file);
01247 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01248
01249 uint8_t a;
01250 delete alphabet;
01251 fread(&a, sizeof(uint8_t), 1, file);
01252 alphabet=new CAlphabet((EAlphabet) a);
01253
01254 fread(&num_vectors, sizeof(int32_t), 1, file);
01255 ASSERT(num_vectors>0);
01256
01257 fread(&max_string_length, sizeof(int32_t), 1, file);
01258 ASSERT(max_string_length>0);
01259
01260 features=new TString<ST>[num_vectors];
01261
01262
01263 for (int32_t i=0; i<num_vectors; i++)
01264 {
01265
01266 int32_t len_compressed;
01267 fread(&len_compressed, sizeof(int32_t), 1, file);
01268
01269 int32_t len_uncompressed;
01270 fread(&len_uncompressed, sizeof(int32_t), 1, file);
01271
01272
01273 if (decompress)
01274 {
01275 features[i].string=new ST[len_uncompressed];
01276 features[i].length=len_uncompressed;
01277 uint8_t* compressed=new uint8_t[len_compressed];
01278 fread(compressed, len_compressed, 1, file);
01279 uint64_t uncompressed_size=len_uncompressed;
01280 uncompressed_size*=sizeof(ST);
01281 compressor->decompress(compressed, len_compressed,
01282 (uint8_t*) features[i].string, uncompressed_size);
01283 delete[] compressed;
01284 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01285 }
01286 else
01287 {
01288 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01289 features[i].string=new ST[len_compressed+offs];
01290 features[i].length=len_compressed+offs;
01291 int32_t* feat32ptr=((int32_t*) (features[i].string));
01292 memset(features[i].string, 0, offs*sizeof(ST));
01293 feat32ptr[0]=(int32_t) len_compressed;
01294 feat32ptr[1]=(int32_t) len_uncompressed;
01295 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01296 fread(compressed, len_compressed, 1, file);
01297 }
01298 }
01299
01300 delete compressor;
01301 fclose(file);
01302 return false;
01303 }
01304
01312 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01313 {
01314 FILE* file=NULL;
01315
01316 if (!(file=fopen(dest, "wb")))
01317 return false;
01318
01319 CCompressor* compressor= new CCompressor(compression);
01320
01321
01322 const char* id="SGV0";
01323 fwrite(&id[0], sizeof(char), 1, file);
01324 fwrite(&id[1], sizeof(char), 1, file);
01325 fwrite(&id[2], sizeof(char), 1, file);
01326 fwrite(&id[3], sizeof(char), 1, file);
01327
01328
01329 uint8_t c=(uint8_t) compression;
01330 fwrite(&c, sizeof(uint8_t), 1, file);
01331
01332 uint8_t a=(uint8_t) alphabet->get_alphabet();
01333 fwrite(&a, sizeof(uint8_t), 1, file);
01334
01335 fwrite(&num_vectors, sizeof(int32_t), 1, file);
01336
01337 fwrite(&max_string_length, sizeof(int32_t), 1, file);
01338
01339
01340 for (int32_t i=0; i<num_vectors; i++)
01341 {
01342 int32_t len=-1;
01343 bool vfree;
01344 ST* vec=get_feature_vector(i, len, vfree);
01345
01346 uint8_t* compressed=NULL;
01347 uint64_t compressed_size=0;
01348
01349 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01350 compressed, compressed_size, level);
01351
01352 int32_t len_compressed = (int32_t) compressed_size;
01353
01354 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01355
01356 fwrite(&len, sizeof(int32_t), 1, file);
01357
01358 fwrite(compressed, compressed_size, 1, file);
01359 delete[] compressed;
01360
01361 free_feature_vector(vec, i, vfree);
01362 }
01363
01364 delete compressor;
01365 fclose(file);
01366 return true;
01367 }
01368
01369
01374 virtual int32_t get_size() { return sizeof(ST); }
01375
01381 virtual bool apply_preproc(bool force_preprocessing=false)
01382 {
01383 SG_DEBUG( "force: %d\n", force_preprocessing);
01384
01385 for (int32_t i=0; i<get_num_preproc(); i++)
01386 {
01387 if ( (!is_preprocessed(i) || force_preprocessing) )
01388 {
01389 set_preprocessed(i);
01390 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
01391 SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01392
01393 if (!p->apply_to_string_features(this))
01394 {
01395 SG_UNREF(p);
01396 return false;
01397 }
01398 else
01399 SG_UNREF(p);
01400 }
01401 }
01402 return true;
01403 }
01404
01414 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01415 {
01416 ASSERT(step_size>0);
01417 ASSERT(window_size>0);
01418 ASSERT(num_vectors==1 || single_string);
01419 ASSERT(max_string_length>=window_size ||
01420 (single_string && length_of_single_string>=window_size));
01421
01422
01423
01424 if (single_string)
01425 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01426 else if (num_vectors==1)
01427 {
01428 num_vectors= (max_string_length-window_size)/step_size + 1;
01429 length_of_single_string=max_string_length;
01430 }
01431
01432 TString<ST>* f=new TString<ST>[num_vectors];
01433 int32_t offs=0;
01434 for (int32_t i=0; i<num_vectors; i++)
01435 {
01436 f[i].string=&features[0].string[offs+skip];
01437 f[i].length=window_size-skip;
01438 offs+=step_size;
01439 }
01440 single_string=features[0].string;
01441 delete[] features;
01442 features=f;
01443 max_string_length=window_size-skip;
01444
01445 return num_vectors;
01446 }
01447
01456 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01457 {
01458 ASSERT(positions);
01459 ASSERT(window_size>0);
01460 ASSERT(num_vectors==1 || single_string);
01461 ASSERT(max_string_length>=window_size ||
01462 (single_string && length_of_single_string>=window_size));
01463
01464 num_vectors= positions->get_num_elements();
01465 ASSERT(num_vectors>0);
01466
01467 int32_t len;
01468
01469
01470
01471 if (single_string)
01472 len=length_of_single_string;
01473 else
01474 {
01475 single_string=features[0].string;
01476 len=max_string_length;
01477 length_of_single_string=max_string_length;
01478 }
01479
01480 TString<ST>* f=new TString<ST>[num_vectors];
01481 for (int32_t i=0; i<num_vectors; i++)
01482 {
01483 int32_t p=positions->get_element(i);
01484
01485 if (p>=0 && p<=len-window_size)
01486 {
01487 f[i].string=&features[0].string[p+skip];
01488 f[i].length=window_size-skip;
01489 }
01490 else
01491 {
01492 num_vectors=1;
01493 max_string_length=len;
01494 features[0].length=len;
01495 single_string=NULL;
01496 delete[] f;
01497 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01498 window_size, i, p, len);
01499 return -1;
01500 }
01501 }
01502
01503 delete[] features;
01504 features=f;
01505 max_string_length=window_size-skip;
01506
01507 return num_vectors;
01508 }
01509
01521 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01522 {
01523 return obtain_from_char_features(sf, start, p_order, gap, rev);
01524 }
01525
01535 template <class CT>
01536 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01537 {
01538 ASSERT(sf);
01539
01540 CAlphabet* alpha=sf->get_alphabet();
01541 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01542
01543 this->order=p_order;
01544 cleanup();
01545
01546 num_vectors=sf->get_num_vectors();
01547 ASSERT(num_vectors>0);
01548 max_string_length=sf->get_max_vector_length()-start;
01549 features=new TString<ST>[num_vectors];
01550
01551 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01552 alpha->get_num_symbols_in_histogram());
01553
01554 for (int32_t i=0; i<num_vectors; i++)
01555 {
01556 int32_t len=-1;
01557 bool vfree;
01558 CT* c=sf->get_feature_vector(i, len, vfree);
01559 ASSERT(!vfree);
01560
01561 features[i].string=new ST[len];
01562 features[i].length=len;
01563
01564 ST* str=features[i].string;
01565 for (int32_t j=0; j<len; j++)
01566 str[j]=(ST) alpha->remap_to_bin(c[j]);
01567 }
01568
01569 original_num_symbols=alpha->get_num_symbols();
01570 int32_t max_val=alpha->get_num_bits();
01571
01572 SG_UNREF(alpha);
01573
01574 if (p_order>1)
01575 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01576 else
01577 num_symbols=original_num_symbols;
01578 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01579
01580 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01581 {
01582 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01583 return false;
01584 }
01585
01586 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01587 for (int32_t line=0; line<num_vectors; line++)
01588 {
01589 int32_t len=0;
01590 bool vfree;
01591 ST* fv=get_feature_vector(line, len, vfree);
01592 ASSERT(!vfree);
01593
01594 if (rev)
01595 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01596 else
01597 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01598
01599
01600 features[line].length-=start+gap ;
01601 if (features[line].length<0)
01602 features[line].length=0 ;
01603 }
01604
01605 compute_symbol_mask_table(max_val);
01606
01607 return true;
01608 }
01609
01617 bool have_same_length(int32_t len=-1)
01618 {
01619 if (len!=-1)
01620 {
01621 if (len!=get_max_vector_length())
01622 return false;
01623 }
01624 len = get_max_vector_length();
01625
01626 for (int32_t i=0; i<num_vectors; i++)
01627 {
01628 if (get_vector_length(i)!=len)
01629 return false;
01630 }
01631
01632 return true;
01633 }
01634
01639 inline void embed_features(int32_t p_order)
01640 {
01641 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01642
01643 order=p_order;
01644 original_num_symbols=alphabet->get_num_symbols();
01645 int32_t max_val=alphabet->get_num_bits();
01646
01647 if (p_order>1)
01648 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01649 else
01650 num_symbols=original_num_symbols;
01651
01652 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01653
01654 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01655 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01656
01657 ST mask=0;
01658 for (int32_t i=0; i<p_order*max_val; i++)
01659 mask= (mask<<1) | ((ST) 1);
01660
01661 for (int32_t i=0; i<num_vectors; i++)
01662 {
01663 int32_t len=features[i].length;
01664
01665 if (len < p_order)
01666 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01667
01668 ST* str = features[i].string;
01669
01670
01671 for (int32_t j=0; j<p_order; j++)
01672 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01673 str[0]=embed_word(&str[0], p_order);
01674
01675
01676 int32_t idx=0;
01677 for (int32_t j=p_order; j<len; j++)
01678 {
01679 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01680 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01681 idx++;
01682 }
01683
01684 features[i].length=len-p_order+1;
01685 }
01686
01687 compute_symbol_mask_table(max_val);
01688 }
01689
01694 inline void compute_symbol_mask_table(int64_t max_val)
01695 {
01696 delete[] symbol_mask_table;
01697 symbol_mask_table=new ST[256];
01698
01699 uint64_t mask=0;
01700 for (int32_t i=0; i< (int64_t) max_val; i++)
01701 mask=(mask<<1) | 1;
01702
01703 for (int32_t i=0; i<256; i++)
01704 {
01705 uint8_t bits=(uint8_t) i;
01706 symbol_mask_table[i]=0;
01707
01708 for (int32_t j=0; j<8; j++)
01709 {
01710 if (bits & 1)
01711 symbol_mask_table[i]|=mask<<(max_val*j);
01712
01713 bits>>=1;
01714 }
01715 }
01716 }
01717
01724 inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01725 {
01726 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01727
01728 ST mask=0;
01729 for (int32_t i=0; i<nbits; i++)
01730 mask=(mask<<1) | (ST) 1;
01731
01732 for (int32_t i=0; i<len; i++)
01733 {
01734 ST w=(word & mask);
01735 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01736 word>>=nbits;
01737 }
01738 }
01739
01745 inline ST embed_word(ST* seq, int32_t len)
01746 {
01747 ST value=(ST) 0;
01748 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01749 for (int32_t i=0; i<len; i++)
01750 {
01751 value<<=nbits;
01752 value|=seq[i];
01753 }
01754
01755 return value;
01756 }
01757
01760 void determine_maximum_string_length()
01761 {
01762 max_string_length=0;
01763
01764 for (int32_t i=0; i<num_vectors; i++)
01765 max_string_length=CMath::max(max_string_length, features[i].length);
01766 }
01767
01775 static ST* get_zero_terminated_string_copy(TString<ST> str)
01776 {
01777 int32_t l=str.length;
01778 ST* s=new ST[l+1];
01779 memcpy(s, str.string, sizeof(ST)*l);
01780 s[l]='\0';
01781 return s;
01782 }
01783
01790 virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01791 {
01792 ASSERT(features);
01793 ASSERT(num<num_vectors);
01794
01795 features[num].length=len ;
01796 features[num].string=string ;
01797
01798 max_string_length=CMath::max(len, max_string_length);
01799 }
01800
01801
01804 virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true)
01805 {
01806 int32_t nsym=get_num_symbols();
01807 int32_t slen=get_max_vector_length();
01808 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01809 float64_t* h= (float64_t*) malloc(sz);
01810 ASSERT(h);
01811 memset(h, 0, sz);
01812
01813 float64_t* h_normalizer=new float64_t[slen];
01814 memset(h_normalizer, 0, slen*sizeof(float64_t));
01815 int32_t num_str=get_num_vectors();
01816 for (int32_t i=0; i<num_str; i++)
01817 {
01818 int32_t len;
01819 bool free_vec;
01820 ST* vec=get_feature_vector(i, len, free_vec);
01821 for (int32_t j=0; j<len; j++)
01822 {
01823 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01824 h_normalizer[j]++;
01825 }
01826 free_feature_vector(vec, i, free_vec);
01827 }
01828
01829 if (normalize)
01830 {
01831 for (int32_t i=0; i<slen; i++)
01832 {
01833 for (int32_t j=0; j<nsym; j++)
01834 {
01835 if (h_normalizer && h_normalizer[i])
01836 h[int64_t(i)*nsym+j]/=h_normalizer[i];
01837 }
01838 }
01839 }
01840 delete[] h_normalizer;
01841
01842 *hist=h;
01843 *rows=nsym;
01844 *cols=slen;
01845 }
01846
01849 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01850 {
01851 ASSERT(rows == get_num_symbols());
01852 cleanup();
01853 float64_t* randoms=new float64_t[cols];
01854 TString<ST>* sf=new TString<ST>[num_vec];
01855
01856 for (int32_t i=0; i<num_vec; i++)
01857 {
01858 sf[i].string=new ST[cols];
01859 sf[i].length=cols;
01860
01861 CMath::random_vector(randoms, cols, 0.0, 1.0);
01862
01863 for (int32_t j=0; j<cols; j++)
01864 {
01865 float64_t lik=hist[int64_t(j)*rows+0];
01866
01867 int32_t c;
01868 for (c=0; c<rows-1; c++)
01869 {
01870 if (randoms[j]<=lik)
01871 break;
01872 lik+=hist[int64_t(j)*rows+c+1];
01873 }
01874 sf[i].string[j]=alphabet->remap_to_char(c);
01875 }
01876 }
01877 delete[] randoms;
01878 set_features(sf, num_vec, cols);
01879 }
01880
01881
01882
01883
01884
01885
01886
01887
01888
01889
01890
01891
01892
01893
01894
01895
01896
01897
01898
01899
01900
01901
01902
01903
01904
01905
01906
01907
01908
01909
01910
01911
01912
01913
01914
01915
01916
01917
01918
01919
01920
01921
01922
01923
01924
01925
01926
01927
01928
01929
01930
01931
01932
01933
01934
01935
01936
01937
01938
01939
01940
01941
01942
01943
01944
01945
01946
01947
01948
01949
01950
01951
01952
01954 inline virtual const char* get_name() const { return "StringFeatures"; }
01955
01956 protected:
01957
01968 virtual ST* compute_feature_vector(int32_t num, int32_t& len)
01969 {
01970 ASSERT(features && num<num_vectors);
01971
01972 len=features[num].length;
01973 if (len<=0)
01974 return NULL;
01975
01976 ST* target=new ST[len];
01977 memcpy(target, features[num].string, len*sizeof(ST));
01978 return target;
01979 }
01980
01981 private:
01982 void init(void)
01983 {
01984 set_generic<ST>();
01985
01986 m_parameters->add((CSGObject**) &alphabet, "alphabet");
01987 m_parameters->add_vector(&features, &num_vectors, "features",
01988 "This contains the array of features.");
01989 m_parameters->add_vector(&single_string,
01990 &length_of_single_string,
01991 "single_string",
01992 "Created by sliding window.");
01993 m_parameters->add(&max_string_length, "max_string_length",
01994 "Length of longest string.");
01995 m_parameters->add(&num_symbols, "num_symbols",
01996 "Number of used symbols.");
01997 m_parameters->add(&original_num_symbols, "original_num_symbols",
01998 "Original number of used symbols.");
01999 m_parameters->add(&order, "order",
02000 "Order used in higher order mapping.");
02001 m_parameters->add(&preprocess_on_get, "preprocess_on_get",
02002 "Preprocess on-the-fly?");
02003
02004
02005
02006
02007
02008 }
02009
02010
02011 protected:
02012
/** alphabet used to remap symbols (remap_to_bin / remap_to_char) and to
 * query the number of bits per symbol */
02014 CAlphabet* alphabet;
02015
/** number of entries in the features array */
02017 int32_t num_vectors;
02018
/** array of num_vectors strings, the actual feature vectors */
02020 TString<ST>* features;
02021
/** start of the underlying single string; set once the object has been
 * cut into windows (sliding window or position list), NULL otherwise */
02023 ST* single_string;
02024
/** number of symbols in single_string */
02026 int32_t length_of_single_string;
02027
/** length of the longest string */
02029 int32_t max_string_length;
02030
/** number of used symbols; 2^(bits per symbol * order) for order > 1 */
02032 floatmax_t num_symbols;
02033
/** number of symbols of the alphabet before any higher order mapping */
02035 floatmax_t original_num_symbols;
02036
/** order used in the higher order mapping */
02038 int32_t order;
02039
/** 256 entry lookup table built by compute_symbol_mask_table() */
02041 ST* symbol_mask_table;
02042
/** whether to preprocess on-the-fly */
02044 bool preprocess_on_get;
02045
/** feature cache; NOTE(review): presumably caches vectors handed out by
 * get_feature_vector() - its use is not visible in this part of the file */
02047 CCache<ST>* feature_cache;
02048 };
02049
02050 #ifndef DOXYGEN_SHOULD_SKIP_THIS
02051
02055 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
02056 {
02057 return F_BOOL;
02058 }
02059
02064 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
02065 {
02066 return F_CHAR;
02067 }
02068
02073 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
02074 {
02075 return F_BYTE;
02076 }
02077
02082 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
02083 {
02084 return F_SHORT;
02085 }
02086
02091 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
02092 {
02093 return F_WORD;
02094 }
02095
02100 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
02101 {
02102 return F_INT;
02103 }
02104
02109 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
02110 {
02111 return F_UINT;
02112 }
02113
02118 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
02119 {
02120 return F_LONG;
02121 }
02122
02127 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
02128 {
02129 return F_ULONG;
02130 }
02131
02136 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
02137 {
02138 return F_SHORTREAL;
02139 }
02140
02145 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
02146 {
02147 return F_DREAL;
02148 }
02149
02154 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
02155 {
02156 return F_LONGREAL;
02157 }
02158
02159 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
02160 {
02161 return symbol;
02162 }
02163 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
02164 {
02165 return symbol;
02166 }
02167 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
02168 {
02169 return symbol;
02170 }
02171 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
02172 {
02173 return symbol;
02174 }
02175
02176 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02177 {
02178 return false;
02179 }
02180 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02181 {
02182 return 0;
02183 }
02184 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02185 {
02186 return 0;
02187 }
02188 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02189 {
02190 return 0;
02191 }
02192
02193 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02194 {
02195 return symbol;
02196 }
02197 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02198 {
02199 return symbol;
02200 }
02201 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02202 {
02203 return symbol;
02204 }
02205 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02206 {
02207 return symbol;
02208 }
02209
02210 #ifndef SUNOS
02211 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02212 {
02213 return false;
02214 }
02215 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02216 {
02217 return false;
02218 }
02219 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02220 {
02221 return false;
02222 }
02223 #endif
02224
02225 template<> inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02226 {
02227 }
02228 template<> inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02229 {
02230 }
02231 template<> inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02232 {
02233 }
02234
02235 template<> inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02236 {
02237 }
02238 template<> inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02239 {
02240 }
02241 template<> inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02242 {
02243 }
02244
02245 template<> inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02246 {
02247 return 0;
02248 }
02249 template<> inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02250 {
02251 return 0;
02252 }
02253 template<> inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02254 {
02255 return 0;
02256 }
02257
02258 template<> inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02259 {
02260 }
02261 template<> inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02262 {
02263 }
02264 template<> inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02265 {
02266 }
02267 #define LOAD(f_load, sg_type) \
02268 template<> inline void CStringFeatures<sg_type>::load(CFile* loader) \
02269 { \
02270 SG_INFO( "loading...\n"); \
02271 \
02272 SG_SET_LOCALE_C; \
02273 TString<sg_type>* strs; \
02274 int32_t num_str; \
02275 int32_t max_len; \
02276 loader->f_load(strs, num_str, max_len); \
02277 set_features(strs, num_str, max_len); \
02278 SG_RESET_LOCALE; \
02279 }
02280
02281 LOAD(get_bool_string_list, bool)
02282 LOAD(get_char_string_list, char)
02283 LOAD(get_int8_string_list, int8_t)
02284 LOAD(get_byte_string_list, uint8_t)
02285 LOAD(get_short_string_list, int16_t)
02286 LOAD(get_word_string_list, uint16_t)
02287 LOAD(get_int_string_list, int32_t)
02288 LOAD(get_uint_string_list, uint32_t)
02289 LOAD(get_long_string_list, int64_t)
02290 LOAD(get_ulong_string_list, uint64_t)
02291 LOAD(get_shortreal_string_list, float32_t)
02292 LOAD(get_real_string_list, float64_t)
02293 LOAD(get_longreal_string_list, floatmax_t)
02294 #undef LOAD
02295
02296 #define SAVE(f_write, sg_type) \
02297 template<> inline void CStringFeatures<sg_type>::save(CFile* writer) \
02298 { \
02299 SG_SET_LOCALE_C; \
02300 ASSERT(writer); \
02301 writer->f_write(features, num_vectors); \
02302 SG_RESET_LOCALE; \
02303 }
02304
02305 SAVE(set_bool_string_list, bool)
02306 SAVE(set_char_string_list, char)
02307 SAVE(set_int8_string_list, int8_t)
02308 SAVE(set_byte_string_list, uint8_t)
02309 SAVE(set_short_string_list, int16_t)
02310 SAVE(set_word_string_list, uint16_t)
02311 SAVE(set_int_string_list, int32_t)
02312 SAVE(set_uint_string_list, uint32_t)
02313 SAVE(set_long_string_list, int64_t)
02314 SAVE(set_ulong_string_list, uint64_t)
02315 SAVE(set_shortreal_string_list, float32_t)
02316 SAVE(set_real_string_list, float64_t)
02317 SAVE(set_longreal_string_list, floatmax_t)
02318 #undef SAVE
02319 #endif // DOXYGEN_SHOULD_SKIP_THIS
02320 }
02321 #endif // _CSTRINGFEATURES__H__