00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _CSTRINGFEATURES__H__
00014 #define _CSTRINGFEATURES__H__
00015
00016 #include <shogun/lib/common.h>
00017 #include <shogun/io/SGIO.h>
00018 #include <shogun/lib/Cache.h>
00019 #include <shogun/lib/DynamicArray.h>
00020 #include <shogun/io/File.h>
00021 #include <shogun/io/MemoryMappedFile.h>
00022 #include <shogun/mathematics/Math.h>
00023 #include <shogun/lib/Compressor.h>
00024 #include <shogun/base/Parameter.h>
00025
00026 #include <shogun/preprocessor/Preprocessor.h>
00027 #include <shogun/preprocessor/StringPreprocessor.h>
00028 #include <shogun/features/Features.h>
00029 #include <shogun/features/Alphabet.h>
00030
00031 #include <sys/types.h>
00032 #include <sys/stat.h>
00033 #include <dirent.h>
00034 #include <stdio.h>
00035 #include <stdlib.h>
00036 #include <unistd.h>
00037
00038 namespace shogun
00039 {
00040 class CCompressor;
00041 enum E_COMPRESSION_TYPE;
00042 class CAlphabet;
00043 enum EAlphabet;
00044 template <class T> class CDynamicArray;
00045 class CFile;
00046 template <class T> class CMemoryMappedFile;
00047 class CMath;
00048 template <class ST> class CStringPreprocessor;
00049 template <class T> class SGString;
00050
00051 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/** Plain record with two feature fields and one group field.
 *
 * NOTE(review): the meaning of the fields is not visible in this file;
 * presumably two feature indices plus a group id - confirm against users.
 */
struct SSKDoubleFeature
{
    /** first feature entry */
    int feature1;
    /** second feature entry */
    int feature2;
    /** group entry */
    int group;
};
00058
/** Plain record with three feature fields and one group field.
 *
 * NOTE(review): the meaning of the fields is not visible in this file;
 * presumably three feature indices plus a group id - confirm against users.
 */
struct SSKTripleFeature
{
    /** first feature entry */
    int feature1;
    /** second feature entry */
    int feature2;
    /** third feature entry */
    int feature3;
    /** group entry */
    int group;
};
00066 #endif
00067
00090 template <class ST> class CStringFeatures : public CFeatures
00091 {
00092 public:
00096 CStringFeatures() : CFeatures(0)
00097 {
00098 init();
00099 alphabet=new CAlphabet();
00100 }
00101
00106 CStringFeatures(EAlphabet alpha) : CFeatures(0)
00107 {
00108 init();
00109
00110 alphabet=new CAlphabet(alpha);
00111 SG_REF(alphabet);
00112 num_symbols=alphabet->get_num_symbols();
00113 original_num_symbols=num_symbols;
00114 }
00115
00120 CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00121 : CFeatures(0)
00122 {
00123 init();
00124
00125 alphabet=new CAlphabet(alpha);
00126 SG_REF(alphabet);
00127 num_symbols=alphabet->get_num_symbols();
00128 original_num_symbols=num_symbols;
00129 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00130 }
00131
00136 CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00137 : CFeatures(0)
00138 {
00139 init();
00140
00141 alphabet=new CAlphabet(alpha);
00142 SG_REF(alphabet);
00143 num_symbols=alphabet->get_num_symbols();
00144 original_num_symbols=num_symbols;
00145 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00146 }
00147
00152 CStringFeatures(CAlphabet* alpha)
00153 : CFeatures(0)
00154 {
00155 init();
00156
00157 ASSERT(alpha);
00158 SG_REF(alpha);
00159 alphabet=alpha;
00160 num_symbols=alphabet->get_num_symbols();
00161 original_num_symbols=num_symbols;
00162 }
00163
00165 CStringFeatures(const CStringFeatures & orig)
00166 : CFeatures(orig), num_vectors(orig.num_vectors),
00167 single_string(orig.single_string),
00168 length_of_single_string(orig.length_of_single_string),
00169 max_string_length(orig.max_string_length),
00170 num_symbols(orig.num_symbols),
00171 original_num_symbols(orig.original_num_symbols),
00172 order(orig.order), preprocess_on_get(false),
00173 feature_cache(NULL)
00174 {
00175 init();
00176
00177 ASSERT(orig.single_string == NULL);
00178
00179 alphabet=orig.alphabet;
00180 SG_REF(alphabet);
00181
00182 if (orig.features)
00183 {
00184 features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00185
00186 for (int32_t i=0; i<num_vectors; i++)
00187 {
00188 features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00189 features[i].slen=orig.features[i].slen;
00190 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00191 }
00192 }
00193
00194 if (orig.symbol_mask_table)
00195 {
00196 symbol_mask_table=SG_MALLOC(ST, 256);
00197 for (int32_t i=0; i<256; i++)
00198 symbol_mask_table[i]=orig.symbol_mask_table[i];
00199 }
00200
00201 m_subset=orig.m_subset->duplicate();
00202 }
00203
00209 CStringFeatures(CFile* loader, EAlphabet alpha=DNA)
00210 : CFeatures(loader), num_vectors(0),
00211 features(NULL), single_string(NULL), length_of_single_string(0),
00212 max_string_length(0), order(0),
00213 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00214 {
00215 init();
00216
00217 alphabet=new CAlphabet(alpha);
00218 SG_REF(alphabet);
00219 num_symbols=alphabet->get_num_symbols();
00220 original_num_symbols=num_symbols;
00221 load(loader);
00222 }
00223
00224 virtual ~CStringFeatures()
00225 {
00226 cleanup();
00227
00228 SG_UNREF(alphabet);
00229 }
00230
00236 virtual void cleanup()
00237 {
00238 remove_subset();
00239
00240 if (single_string)
00241 {
00242 SG_FREE(single_string);
00243 single_string=NULL;
00244 }
00245 else
00246 {
00247 for (int32_t i=0; i<num_vectors; i++)
00248 cleanup_feature_vector(i);
00249 }
00250
00251 num_vectors=0;
00252 SG_FREE(features);
00253 SG_FREE(symbol_mask_table);
00254 features=NULL;
00255 symbol_mask_table=NULL;
00256
00257
00258
00259
00260
00261 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00262 SG_UNREF(alphabet);
00263 alphabet=alpha;
00264 SG_REF(alphabet);
00265 }
00266
00273 virtual void cleanup_feature_vector(int32_t num)
00274 {
00275 ASSERT(num<get_num_vectors());
00276
00277 if (features)
00278 {
00279 int32_t real_num=subset_idx_conversion(num);
00280 SG_FREE(features[real_num].string);
00281 features[real_num].string=NULL;
00282 features[real_num].slen=0;
00283
00284 determine_maximum_string_length();
00285 }
00286 }
00287
00292 inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00293
00298 inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00299
00304 inline CAlphabet* get_alphabet()
00305 {
00306 SG_REF(alphabet);
00307 return alphabet;
00308 }
00309
00314 virtual CFeatures* duplicate() const
00315 {
00316 return new CStringFeatures<ST>(*this);
00317 }
00318
00325 SGVector<ST> get_feature_vector(int32_t num)
00326 {
00327 ASSERT(features);
00328 if (num>=get_num_vectors())
00329 {
00330 SG_ERROR("Index out of bounds (number of strings %d, you "
00331 "requested %d)\n", get_num_vectors(), num);
00332 }
00333
00334 int32_t l;
00335 bool free_vec;
00336 ST* vec=get_feature_vector(num, l, free_vec);
00337 ST* dst=SG_MALLOC(ST, l);
00338 memcpy(dst, vec, l*sizeof(ST));
00339 free_feature_vector(vec, num, free_vec);
00340 return SGVector<ST>(dst, l);
00341 }
00342
00350 void set_feature_vector(SGVector<ST> vector, int32_t num)
00351 {
00352 ASSERT(features);
00353
00354 if (m_subset)
00355 SG_ERROR("A subset is set, cannot set feature vector\n");
00356
00357 if (num>=num_vectors)
00358 {
00359 SG_ERROR("Index out of bounds (number of strings %d, you "
00360 "requested %d)\n", num_vectors, num);
00361 }
00362
00363 if (vector.vlen<=0)
00364 SG_ERROR("String has zero or negative length\n");
00365
00366 cleanup_feature_vector(num);
00367 features[num].slen=vector.vlen;
00368 features[num].string=SG_MALLOC(ST, vector.vlen);
00369 memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00370
00371 determine_maximum_string_length();
00372 }
00373
00376 void enable_on_the_fly_preprocessing()
00377 {
00378 preprocess_on_get=true;
00379 }
00380
00384 void disable_on_the_fly_preprocessing()
00385 {
00386 preprocess_on_get=false;
00387 }
00388
00399 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00400 {
00401 ASSERT(features);
00402 ASSERT(num<get_num_vectors());
00403
00404
00405 int32_t real_num=subset_idx_conversion(num);
00406
00407 if (!preprocess_on_get)
00408 {
00409 dofree=false;
00410 len=features[real_num].slen;
00411 return features[real_num].string;
00412 }
00413 else
00414 {
00415 SG_DEBUG( "computing feature vector!\n") ;
00416 ST* feat=compute_feature_vector(num, len);
00417 dofree=true;
00418
00419 if (get_num_preprocessors())
00420 {
00421 ST* tmp_feat_before=feat;
00422
00423 for (int32_t i=0; i<get_num_preprocessors(); i++)
00424 {
00425 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00426 feat=p->apply_to_string(tmp_feat_before, len);
00427 SG_UNREF(p);
00428 SG_FREE(tmp_feat_before);
00429 tmp_feat_before=feat;
00430 }
00431 }
00432
00433 return feat;
00434 }
00435 }
00436
00443 CStringFeatures<ST>* get_transposed()
00444 {
00445 int32_t num_feat;
00446 int32_t num_vec;
00447 SGString<ST>* s=get_transposed(num_feat, num_vec);
00448 SGStringList<ST> string_list;
00449 string_list.strings = s;
00450 string_list.num_strings = num_vec;
00451 string_list.max_string_length = num_feat;
00452
00453 return new CStringFeatures<ST>(string_list, alphabet);
00454 }
00455
00469 SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00470 {
00471 num_feat=get_num_vectors();
00472 num_vec=get_max_vector_length();
00473 ASSERT(have_same_length());
00474
00475 SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00476 int64_t(num_feat)*num_vec);
00477
00478 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00479
00480 for (int32_t i=0; i<num_vec; i++)
00481 {
00482 sf[i].string=SG_MALLOC(ST, num_feat);
00483 sf[i].slen=num_feat;
00484 }
00485
00486 for (int32_t i=0; i<num_feat; i++)
00487 {
00488 int32_t len=0;
00489 bool free_vec=false;
00490 ST* vec=get_feature_vector(i, len, free_vec);
00491
00492 for (int32_t j=0; j<num_vec; j++)
00493 sf[j].string[i]=vec[j];
00494
00495 free_feature_vector(vec, i, free_vec);
00496 }
00497 return sf;
00498 }
00499
00508 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00509 {
00510 if (num>=get_num_vectors())
00511 {
00512 SG_ERROR(
00513 "Trying to access string[%d] but num_str=%d\n", num,
00514 get_num_vectors());
00515 }
00516
00517 int32_t real_num=subset_idx_conversion(num);
00518
00519 if (feature_cache)
00520 feature_cache->unlock_entry(real_num);
00521
00522 if (dofree)
00523 SG_FREE(feat_vec);
00524 }
00525
00533 void free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00534 {
00535 if (num>=get_num_vectors())
00536 {
00537 SG_ERROR(
00538 "Trying to access string[%d] but num_str=%d\n", num,
00539 get_num_vectors());
00540 }
00541
00542 int32_t real_num=subset_idx_conversion(num);
00543
00544 if (feature_cache)
00545 feature_cache->unlock_entry(real_num);
00546
00547 if (feat_vec.do_free)
00548 SG_FREE(feat_vec.vector);
00549 }
00550
00559 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00560 {
00561 ASSERT(vec_num<get_num_vectors());
00562
00563 int32_t len;
00564 bool free_vec;
00565 ST* vec=get_feature_vector(vec_num, len, free_vec);
00566 ASSERT(feat_num<len);
00567 ST result=vec[feat_num];
00568 free_feature_vector(vec, vec_num, free_vec);
00569
00570 return result;
00571 }
00572
00580 virtual inline int32_t get_vector_length(int32_t vec_num)
00581 {
00582 ASSERT(vec_num<get_num_vectors());
00583
00584 int32_t len;
00585 bool free_vec;
00586 ST* vec=get_feature_vector(vec_num, len, free_vec);
00587 free_feature_vector(vec, vec_num, free_vec);
00588 return len;
00589 }
00590
00597 virtual inline int32_t get_max_vector_length()
00598 {
00599 return max_string_length;
00600 }
00601
00603 virtual inline int32_t get_num_vectors() const
00604 {
00605 return m_subset ? m_subset->get_size() : num_vectors;
00606 }
00607
00614 inline floatmax_t get_num_symbols() { return num_symbols; }
00615
00623 inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00624
00625
00626
00631 inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00632
00637 inline int32_t get_order() { return order; }
00638
00646 inline ST get_masked_symbols(ST symbol, uint8_t mask)
00647 {
00648 ASSERT(symbol_mask_table);
00649 return symbol_mask_table[mask] & symbol;
00650 }
00651
00658 inline ST shift_offset(ST offset, int32_t amount)
00659 {
00660 ASSERT(alphabet);
00661 return (offset << (amount*alphabet->get_num_bits()));
00662 }
00663
00670 inline ST shift_symbol(ST symbol, int32_t amount)
00671 {
00672 ASSERT(alphabet);
00673 return (symbol >> (amount*alphabet->get_num_bits()));
00674 }
00675
/** load features from a file
 *
 * Only declared at this point; no body is given here. It is also called
 * by the constructor that takes a CFile.
 *
 * @param loader file object to load the features from
 */
00680 virtual inline void load(CFile* loader);
00681
00692 void load_ascii_file(char* fname, bool remap_to_bin=true,
00693 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00694 {
00695 remove_subset();
00696
00697 size_t blocksize=1024*1024;
00698 size_t required_blocksize=0;
00699 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00700 uint8_t* overflow=NULL;
00701 int32_t overflow_len=0;
00702
00703 cleanup();
00704
00705 CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00706 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00707
00708 FILE* f=fopen(fname, "ro");
00709
00710 if (f)
00711 {
00712 num_vectors=0;
00713 max_string_length=0;
00714
00715 SG_INFO("counting line numbers in file %s\n", fname);
00716 size_t block_offs=0;
00717 size_t old_block_offs=0;
00718 fseek(f, 0, SEEK_END);
00719 size_t fsize=ftell(f);
00720 rewind(f);
00721
00722 if (blocksize>fsize)
00723 blocksize=fsize;
00724
00725 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00726
00727 size_t sz=blocksize;
00728 while (sz == blocksize)
00729 {
00730 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00731 for (size_t i=0; i<sz; i++)
00732 {
00733 block_offs++;
00734 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00735 {
00736 num_vectors++;
00737 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00738 old_block_offs=block_offs;
00739 }
00740 }
00741 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00742 }
00743
00744 SG_INFO("found %d strings\n", num_vectors);
00745 SG_FREE(dummy);
00746 blocksize=required_blocksize;
00747 dummy=SG_MALLOC(uint8_t, blocksize);
00748 overflow=SG_MALLOC(uint8_t, blocksize);
00749 features=SG_MALLOC(SGString<ST>, num_vectors);
00750
00751 rewind(f);
00752 sz=blocksize;
00753 int32_t lines=0;
00754 while (sz == blocksize)
00755 {
00756 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00757
00758 size_t old_sz=0;
00759 for (size_t i=0; i<sz; i++)
00760 {
00761 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00762 {
00763 int32_t len=i-old_sz;
00764
00765 max_string_length=CMath::max(max_string_length, len+overflow_len);
00766
00767 features[lines].slen=len;
00768 features[lines].string=SG_MALLOC(ST, len);
00769
00770 if (remap_to_bin)
00771 {
00772 for (int32_t j=0; j<overflow_len; j++)
00773 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00774 for (int32_t j=0; j<len; j++)
00775 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00776 alpha->add_string_to_histogram(&dummy[old_sz], len);
00777 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00778 }
00779 else
00780 {
00781 for (int32_t j=0; j<overflow_len; j++)
00782 features[lines].string[j]=overflow[j];
00783 for (int32_t j=0; j<len; j++)
00784 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00785 alpha->add_string_to_histogram(&dummy[old_sz], len);
00786 alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00787 }
00788
00789
00790 overflow_len=0;
00791
00792
00793 old_sz=i+1;
00794 lines++;
00795 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00796 }
00797 }
00798 for (size_t i=old_sz; i<sz; i++)
00799 overflow[i-old_sz]=dummy[i];
00800
00801 overflow_len=sz-old_sz;
00802 }
00803
00804 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00805 {
00806 SG_INFO("file successfully read\n");
00807 SG_INFO("max_string_length=%d\n", max_string_length);
00808 SG_INFO("num_strings=%d\n", num_vectors);
00809 }
00810 fclose(f);
00811 }
00812
00813 SG_FREE(dummy);
00814
00815 SG_UNREF(alphabet);
00816
00817 if (remap_to_bin)
00818 alphabet=alpha_bin;
00819 else
00820 alphabet=alpha;
00821 SG_REF(alphabet);
00822 num_symbols=alphabet->get_num_symbols();
00823 }
00824
00833 bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00834 {
00835 remove_subset();
00836
00837 int32_t i=0;
00838 uint64_t len=0;
00839 uint64_t offs=0;
00840 int32_t num=0;
00841 int32_t max_len=0;
00842
00843 CMemoryMappedFile<char> f(fname);
00844
00845 while (true)
00846 {
00847 char* s=f.get_line(len, offs);
00848 if (!s)
00849 break;
00850
00851 if (len>0 && s[0]=='>')
00852 num++;
00853 }
00854
00855 if (num==0)
00856 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00857
00858 cleanup();
00859 SG_UNREF(alphabet);
00860 alphabet=new CAlphabet(DNA);
00861 num_symbols=alphabet->get_num_symbols();
00862
00863 SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00864 offs=0;
00865
00866 for (i=0;i<num; i++)
00867 {
00868 uint64_t id_len=0;
00869 char* id=f.get_line(id_len, offs);
00870
00871 char* fasta=f.get_line(len, offs);
00872 char* s=fasta;
00873 int32_t fasta_len=0;
00874 int32_t spanned_lines=0;
00875
00876 while (true)
00877 {
00878 if (!s || len==0)
00879 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00880
00881 if (s[0]=='>' || offs==f.get_size())
00882 {
00883 offs-=len+1;
00884 if (offs==f.get_size())
00885 {
00886 SG_DEBUG("at EOF\n");
00887 fasta_len+=len;
00888 }
00889
00890 len=fasta_len-spanned_lines;
00891 strings[i].string=SG_MALLOC(ST, len);
00892 strings[i].slen=len;
00893
00894 ST* str=strings[i].string;
00895 int32_t idx=0;
00896 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00897
00898 for (int32_t j=0; j<fasta_len; j++)
00899 {
00900 if (fasta[j]=='\n')
00901 continue;
00902
00903 ST c=(ST) fasta[j];
00904
00905 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
00906 c=(ST) 'A';
00907
00908 if (idx>=len)
00909 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00910 str[idx++]=c;
00911 }
00912 max_len=CMath::max(max_len, strings[i].slen);
00913
00914
00915 break;
00916 }
00917
00918 spanned_lines++;
00919 fasta_len+=len+1;
00920 s=f.get_line(len, offs);
00921 }
00922 }
00923 return set_features(strings, num, max_len);
00924 }
00925
00935 bool load_fastq_file(const char* fname,
00936 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00937 {
00938 remove_subset();
00939
00940 CMemoryMappedFile<char> f(fname);
00941
00942 int32_t i=0;
00943 uint64_t len=0;
00944 uint64_t offs=0;
00945
00946 int32_t num=f.get_num_lines();
00947 int32_t max_len=0;
00948
00949 if (num%4)
00950 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00951 num/=4;
00952
00953 cleanup();
00954 SG_UNREF(alphabet);
00955 alphabet=new CAlphabet(DNA);
00956
00957 SGString<ST>* strings;
00958
00959 ST* str;
00960 if (bitremap_in_single_string)
00961 {
00962 strings=SG_MALLOC(SGString<ST>, 1);
00963 strings[0].string=SG_MALLOC(ST, num);
00964 strings[0].slen=num;
00965 f.get_line(len, offs);
00966 f.get_line(len, offs);
00967 order=len;
00968 max_len=num;
00969 offs=0;
00970 original_num_symbols=alphabet->get_num_symbols();
00971 int32_t max_val=alphabet->get_num_bits();
00972 str=SG_MALLOC(ST, len);
00973 }
00974 else
00975 strings=SG_MALLOC(SGString<ST>, num);
00976
00977 for (i=0;i<num; i++)
00978 {
00979 if (!f.get_line(len, offs))
00980 SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00981
00982 char* s=f.get_line(len, offs);
00983 if (!s || len==0)
00984 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00985
00986 if (bitremap_in_single_string)
00987 {
00988 if (len!=order)
00989 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00990 for (int32_t j=0; j<order; j++)
00991 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00992
00993 strings[0].string[i]=embed_word(str, order);
00994 }
00995 else
00996 {
00997 strings[i].string=SG_MALLOC(ST, len);
00998 strings[i].slen=len;
00999 str=strings[i].string;
01000
01001 if (ignore_invalid)
01002 {
01003 for (int32_t j=0; j<len; j++)
01004 {
01005 if (alphabet->is_valid((uint8_t) s[j]))
01006 str[j]= (ST) s[j];
01007 else
01008 str[j]= (ST) 'A';
01009 }
01010 }
01011 else
01012 {
01013 for (int32_t j=0; j<len; j++)
01014 str[j]= (ST) s[j];
01015 }
01016 max_len=CMath::max(max_len, (int32_t) len);
01017 }
01018
01019
01020 if (!f.get_line(len, offs))
01021 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
01022
01023 if (!f.get_line(len, offs))
01024 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
01025 }
01026
01027 if (bitremap_in_single_string)
01028 num=1;
01029
01030 num_vectors=num;
01031 max_string_length=max_len;
01032 features=strings;
01033
01034 return true;
01035 }
01036
01044 bool load_from_directory(char* dirname)
01045 {
01046 remove_subset();
01047
01048 struct dirent **namelist;
01049 int32_t n;
01050
01051 SGIO::set_dirname(dirname);
01052
01053 SG_DEBUG("dirname '%s'\n", dirname);
01054
01055 n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
01056 if (n <= 0)
01057 {
01058 SG_ERROR("error calling scandir - no files found\n");
01059 return false;
01060 }
01061 else
01062 {
01063 SGString<ST>* strings=NULL;
01064
01065 int32_t num=0;
01066 int32_t max_len=-1;
01067
01068
01069
01070 strings=SG_MALLOC(SGString<ST>, n);
01071
01072 for (int32_t i=0; i<n; i++)
01073 {
01074 char* fname=SGIO::concat_filename(namelist[i]->d_name);
01075
01076 struct stat s;
01077 off_t filesize=0;
01078
01079 if (!stat(fname, &s) && s.st_size>0)
01080 {
01081 filesize=s.st_size/sizeof(ST);
01082
01083 FILE* f=fopen(fname, "ro");
01084 if (f)
01085 {
01086 ST* str=SG_MALLOC(ST, filesize);
01087 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01088 if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
01089 SG_ERROR("failed to read file\n");
01090 strings[num].string=str;
01091 strings[num].slen=filesize;
01092 max_len=CMath::max(max_len, strings[num].slen);
01093
01094 num++;
01095 fclose(f);
01096 }
01097 }
01098 else
01099 SG_ERROR("empty or non readable file \'%s\'\n", fname);
01100
01101 SG_FREE(namelist[i]);
01102 }
01103 SG_FREE(namelist);
01104
01105 if (num>0 && strings)
01106 {
01107 set_features(strings, num, max_len);
01108 return true;
01109 }
01110 }
01111 return false;
01112 }
01113
01119 void set_features(SGStringList<ST> feats)
01120 {
01121 set_features(feats.strings, feats.num_strings, feats.max_string_length);
01122 }
01123
01133 bool set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01134 {
01135 if (m_subset)
01136 SG_ERROR("Cannot call set_features() with subset.\n");
01137
01138 if (p_features)
01139 {
01140 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01141
01142
01143 for (int32_t i=0; i<p_num_vectors; i++)
01144 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
01145
01146 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01147 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01148
01149 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01150 {
01151 cleanup();
01152 SG_UNREF(alphabet);
01153
01154 alphabet=alpha;
01155 SG_REF(alphabet);
01156
01157 features=p_features;
01158 num_vectors=p_num_vectors;
01159 max_string_length=p_max_string_length;
01160
01161 return true;
01162 }
01163 else
01164 SG_UNREF(alpha);
01165 }
01166
01167 return false;
01168 }
01169
01178 bool append_features(CStringFeatures<ST>* sf)
01179 {
01180 ASSERT(sf);
01181
01182 if (m_subset)
01183 SG_ERROR("Cannot call set_features() with subset.\n");
01184
01185 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
01186
01187 index_t sf_num_str=sf->get_num_vectors();
01188 for (int32_t i=0; i<sf_num_str; i++)
01189 {
01190 int32_t real_i = sf->subset_idx_conversion(i);
01191 int32_t length=sf->features[real_i].slen;
01192 new_features[i].string=SG_MALLOC(ST, length);
01193 memcpy(new_features[i].string, sf->features[real_i].string, length);
01194 new_features[i].slen=length;
01195 }
01196 return append_features(new_features, sf_num_str,
01197 sf->max_string_length);
01198 }
01199
01212 bool append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01213 {
01214 if (m_subset)
01215 SG_ERROR("Cannot call set_features() with subset.\n");
01216
01217 if (!features)
01218 return set_features(p_features, p_num_vectors, p_max_string_length);
01219
01220 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01221
01222
01223 for (int32_t i=0; i<p_num_vectors; i++)
01224 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
01225
01226 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01227 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01228
01229 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01230 {
01231 SG_UNREF(alpha);
01232 for (int32_t i=0; i<p_num_vectors; i++)
01233 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
01234
01235 int32_t old_num_vectors=num_vectors;
01236 num_vectors=old_num_vectors+p_num_vectors;
01237 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
01238
01239 for (int32_t i=0; i<num_vectors; i++)
01240 {
01241 if (i<old_num_vectors)
01242 {
01243 new_features[i].string=features[i].string;
01244 new_features[i].slen=features[i].slen;
01245 }
01246 else
01247 {
01248 new_features[i].string=p_features[i-old_num_vectors].string;
01249 new_features[i].slen=p_features[i-old_num_vectors].slen;
01250 }
01251 }
01252 SG_FREE(features);
01253 SG_FREE(p_features);
01254
01255 this->features=new_features;
01256 max_string_length=CMath::max(max_string_length, p_max_string_length);
01257
01258 return true;
01259 }
01260 SG_UNREF(alpha);
01261
01262 return false;
01263 }
01264
01268 SGStringList<ST> get_features()
01269 {
01270 SGStringList<ST> sl;
01271
01272 sl.strings=get_features(sl.num_strings, sl.max_string_length);
01273 return sl;
01274 }
01275
01284 virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01285 {
01286 if (m_subset)
01287 SG_ERROR("get features() is not possible on subset");
01288
01289 num_str=num_vectors;
01290 max_str_len=max_string_length;
01291 return features;
01292 }
01293
01302 virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01303 {
01304 ASSERT(num_vectors>0);
01305
01306 num_str=get_num_vectors();
01307 max_str_len=max_string_length;
01308 SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
01309
01310 for (int32_t i=0; i<num_str; i++)
01311 {
01312 int32_t len;
01313 bool free_vec;
01314 ST* vec=get_feature_vector(i, len, free_vec);
01315 new_feat[i].string=SG_MALLOC(ST, len);
01316 new_feat[i].slen=len;
01317 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01318 free_feature_vector(vec, i, free_vec);
01319 }
01320
01321 return new_feat;
01322 }
01323
01331 virtual void get_features(SGString<ST>** dst, int32_t* num_str)
01332 {
01333 int32_t num_vec;
01334 int32_t max_str_len;
01335 *dst=copy_features(num_vec, max_str_len);
01336 *num_str=num_vec;
01337 }
01338
/** save features to a file
 *
 * Only declared at this point; no body is given here.
 *
 * @param writer file object to save the features to
 */
01345 virtual inline void save(CFile* writer);
01346
01355 virtual bool load_compressed(char* src, bool decompress)
01356 {
01357 remove_subset();
01358
01359 FILE* file=NULL;
01360
01361 if (!(file=fopen(src, "r")))
01362 return false;
01363 cleanup();
01364
01365
01366 char id[4];
01367 if (fread(&id[0], sizeof(char), 1, file)!=1)
01368 SG_ERROR("failed to read header");
01369 ASSERT(id[0]=='S');
01370 if (fread(&id[1], sizeof(char), 1, file)!=1)
01371 SG_ERROR("failed to read header");
01372 ASSERT(id[1]=='G');
01373 if (fread(&id[2], sizeof(char), 1, file)!=1)
01374 SG_ERROR("failed to read header");
01375 ASSERT(id[2]=='V');
01376 if (fread(&id[3], sizeof(char), 1, file)!=1)
01377 SG_ERROR("failed to read header");
01378 ASSERT(id[3]=='0');
01379
01380
01381 uint8_t c;
01382 if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01383 SG_ERROR("failed to read compression type");
01384 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01385
01386 uint8_t a;
01387 delete alphabet;
01388 if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01389 SG_ERROR("failed to read compression alphabet");
01390 alphabet=new CAlphabet((EAlphabet) a);
01391
01392 if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01393 SG_ERROR("failed to read compression number of vectors");
01394 ASSERT(num_vectors>0);
01395
01396 if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01397 SG_ERROR("failed to read maximum string length");
01398 ASSERT(max_string_length>0);
01399
01400 features=SG_MALLOC(SGString<ST>, num_vectors);
01401
01402
01403 for (int32_t i=0; i<num_vectors; i++)
01404 {
01405
01406 int32_t len_compressed;
01407 if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01408 SG_ERROR("failed to read vector length compressed");
01409
01410 int32_t len_uncompressed;
01411 if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01412 SG_ERROR("failed to read vector length uncompressed");
01413
01414
01415 if (decompress)
01416 {
01417 features[i].string=SG_MALLOC(ST, len_uncompressed);
01418 features[i].slen=len_uncompressed;
01419 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01420 if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01421 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
01422 uint64_t uncompressed_size=len_uncompressed;
01423 uncompressed_size*=sizeof(ST);
01424 compressor->decompress(compressed, len_compressed,
01425 (uint8_t*) features[i].string, uncompressed_size);
01426 SG_FREE(compressed);
01427 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01428 }
01429 else
01430 {
01431 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01432 features[i].string=SG_MALLOC(ST, len_compressed+offs);
01433 features[i].slen=len_compressed+offs;
01434 int32_t* feat32ptr=((int32_t*) (features[i].string));
01435 memset(features[i].string, 0, offs*sizeof(ST));
01436 feat32ptr[0]=(int32_t) len_compressed;
01437 feat32ptr[1]=(int32_t) len_uncompressed;
01438 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01439 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01440 SG_ERROR("failed to read uncompressed data");
01441 }
01442 }
01443
01444 delete compressor;
01445 fclose(file);
01446
01447 return false;
01448 }
01449
01459 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01460 {
01461 if (m_subset)
01462 SG_ERROR("save_compressed() is not possible on subset");
01463
01464 FILE* file=NULL;
01465
01466 if (!(file=fopen(dest, "wb")))
01467 return false;
01468
01469 CCompressor* compressor= new CCompressor(compression);
01470
01471
01472 const char* id="SGV0";
01473 fwrite(&id[0], sizeof(char), 1, file);
01474 fwrite(&id[1], sizeof(char), 1, file);
01475 fwrite(&id[2], sizeof(char), 1, file);
01476 fwrite(&id[3], sizeof(char), 1, file);
01477
01478
01479 uint8_t c=(uint8_t) compression;
01480 fwrite(&c, sizeof(uint8_t), 1, file);
01481
01482 uint8_t a=(uint8_t) alphabet->get_alphabet();
01483 fwrite(&a, sizeof(uint8_t), 1, file);
01484
01485 fwrite(&num_vectors, sizeof(int32_t), 1, file);
01486
01487 fwrite(&max_string_length, sizeof(int32_t), 1, file);
01488
01489
01490 for (int32_t i=0; i<num_vectors; i++)
01491 {
01492 int32_t len=-1;
01493 bool vfree;
01494 ST* vec=get_feature_vector(i, len, vfree);
01495
01496 uint8_t* compressed=NULL;
01497 uint64_t compressed_size=0;
01498
01499 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01500 compressed, compressed_size, level);
01501
01502 int32_t len_compressed=(int32_t) compressed_size;
01503
01504 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01505
01506 fwrite(&len, sizeof(int32_t), 1, file);
01507
01508 fwrite(compressed, compressed_size, 1, file);
01509 SG_FREE(compressed);
01510
01511 free_feature_vector(vec, i, vfree);
01512 }
01513
01514 delete compressor;
01515 fclose(file);
01516 return true;
01517 }
01518
01519
01524 virtual int32_t get_size() { return sizeof(ST); }
01525
01531 virtual bool apply_preprocessor(bool force_preprocessing=false)
01532 {
01533 SG_DEBUG( "force: %d\n", force_preprocessing);
01534
01535 for (int32_t i=0; i<get_num_preprocessors(); i++)
01536 {
01537 if ( (!is_preprocessed(i) || force_preprocessing) )
01538 {
01539 set_preprocessed(i);
01540 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01541 SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01542
01543 if (!p->apply_to_string_features(this))
01544 {
01545 SG_UNREF(p);
01546 return false;
01547 }
01548 else
01549 SG_UNREF(p);
01550 }
01551 }
01552 return true;
01553 }
01554
01567 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01568 {
01569 if (m_subset)
01570 SG_NOTIMPLEMENTED;
01571
01572 ASSERT(step_size>0);
01573 ASSERT(window_size>0);
01574 ASSERT(num_vectors==1 || single_string);
01575 ASSERT(max_string_length>=window_size ||
01576 (single_string && length_of_single_string>=window_size));
01577
01578
01579
01580 if (single_string)
01581 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01582 else if (num_vectors==1)
01583 {
01584 num_vectors= (max_string_length-window_size)/step_size + 1;
01585 length_of_single_string=max_string_length;
01586 }
01587
01588 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01589 int32_t offs=0;
01590 for (int32_t i=0; i<num_vectors; i++)
01591 {
01592 f[i].string=&features[0].string[offs+skip];
01593 f[i].slen=window_size-skip;
01594 offs+=step_size;
01595 }
01596 single_string=features[0].string;
01597 SG_FREE(features);
01598 features=f;
01599 max_string_length=window_size-skip;
01600
01601 return num_vectors;
01602 }
01603
01614 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01615 {
01616 if (m_subset)
01617 SG_NOTIMPLEMENTED;
01618
01619 ASSERT(positions);
01620 ASSERT(window_size>0);
01621 ASSERT(num_vectors==1 || single_string);
01622 ASSERT(max_string_length>=window_size ||
01623 (single_string && length_of_single_string>=window_size));
01624
01625 num_vectors= positions->get_num_elements();
01626 ASSERT(num_vectors>0);
01627
01628 int32_t len;
01629
01630
01631
01632 if (single_string)
01633 len=length_of_single_string;
01634 else
01635 {
01636 single_string=features[0].string;
01637 len=max_string_length;
01638 length_of_single_string=max_string_length;
01639 }
01640
01641 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01642 for (int32_t i=0; i<num_vectors; i++)
01643 {
01644 int32_t p=positions->get_element(i);
01645
01646 if (p>=0 && p<=len-window_size)
01647 {
01648 f[i].string=&features[0].string[p+skip];
01649 f[i].slen=window_size-skip;
01650 }
01651 else
01652 {
01653 num_vectors=1;
01654 max_string_length=len;
01655 features[0].slen=len;
01656 single_string=NULL;
01657 SG_FREE(f);
01658 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01659 window_size, i, p, len);
01660 return -1;
01661 }
01662 }
01663
01664 SG_FREE(features);
01665 features=f;
01666 max_string_length=window_size-skip;
01667
01668 return num_vectors;
01669 }
01670
01684 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01685 {
01686 return obtain_from_char_features(sf, start, p_order, gap, rev);
01687 }
01688
01700 template <class CT>
01701 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01702 {
01703 remove_subset();
01704 ASSERT(sf);
01705
01706 CAlphabet* alpha=sf->get_alphabet();
01707 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01708
01709 this->order=p_order;
01710 cleanup();
01711
01712 num_vectors=sf->get_num_vectors();
01713 ASSERT(num_vectors>0);
01714 max_string_length=sf->get_max_vector_length()-start;
01715 features=SG_MALLOC(SGString<ST>, num_vectors);
01716
01717 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01718 alpha->get_num_symbols_in_histogram());
01719
01720 for (int32_t i=0; i<num_vectors; i++)
01721 {
01722 int32_t len=-1;
01723 bool vfree;
01724 CT* c=sf->get_feature_vector(i, len, vfree);
01725 ASSERT(!vfree);
01726
01727 features[i].string=SG_MALLOC(ST, len);
01728 features[i].slen=len;
01729
01730 ST* str=features[i].string;
01731 for (int32_t j=0; j<len; j++)
01732 str[j]=(ST) alpha->remap_to_bin(c[j]);
01733 }
01734
01735 original_num_symbols=alpha->get_num_symbols();
01736 int32_t max_val=alpha->get_num_bits();
01737
01738 SG_UNREF(alpha);
01739
01740 if (p_order>1)
01741 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01742 else
01743 num_symbols=original_num_symbols;
01744 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01745
01746 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01747 {
01748 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01749 return false;
01750 }
01751
01752 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01753 for (int32_t line=0; line<num_vectors; line++)
01754 {
01755 int32_t len=0;
01756 bool vfree;
01757 ST* fv=get_feature_vector(line, len, vfree);
01758 ASSERT(!vfree);
01759
01760 if (rev)
01761 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01762 else
01763 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01764
01765
01766 features[line].slen-=start+gap ;
01767 if (features[line].slen<0)
01768 features[line].slen=0 ;
01769 }
01770
01771 compute_symbol_mask_table(max_val);
01772
01773 return true;
01774 }
01775
01785 bool have_same_length(int32_t len=-1)
01786 {
01787 if (len!=-1)
01788 {
01789 if (len!=max_string_length)
01790 return false;
01791 }
01792 len=max_string_length;
01793
01794 index_t num_str=get_num_vectors();
01795 for (int32_t i=0; i<num_str; i++)
01796 {
01797 if (get_vector_length(i)!=len)
01798 return false;
01799 }
01800
01801 return true;
01802 }
01803
01809 inline void embed_features(int32_t p_order)
01810 {
01811 if (m_subset)
01812 SG_NOTIMPLEMENTED;
01813
01814 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01815
01816 order=p_order;
01817 original_num_symbols=alphabet->get_num_symbols();
01818 int32_t max_val=alphabet->get_num_bits();
01819
01820 if (p_order>1)
01821 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01822 else
01823 num_symbols=original_num_symbols;
01824
01825 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01826
01827 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01828 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01829
01830 ST mask=0;
01831 for (int32_t i=0; i<p_order*max_val; i++)
01832 mask= (mask<<1) | ((ST) 1);
01833
01834 for (int32_t i=0; i<num_vectors; i++)
01835 {
01836 int32_t len=features[i].slen;
01837
01838 if (len < p_order)
01839 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01840
01841 ST* str=features[i].string;
01842
01843
01844 for (int32_t j=0; j<p_order; j++)
01845 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01846 str[0]=embed_word(&str[0], p_order);
01847
01848
01849 int32_t idx=0;
01850 for (int32_t j=p_order; j<len; j++)
01851 {
01852 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01853 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01854 idx++;
01855 }
01856
01857 features[i].slen=len-p_order+1;
01858 }
01859
01860 compute_symbol_mask_table(max_val);
01861 }
01862
01869 inline void compute_symbol_mask_table(int64_t max_val)
01870 {
01871 if (m_subset)
01872 SG_NOTIMPLEMENTED;
01873
01874 SG_FREE(symbol_mask_table);
01875 symbol_mask_table=SG_MALLOC(ST, 256);
01876
01877 uint64_t mask=0;
01878 for (int32_t i=0; i< (int64_t) max_val; i++)
01879 mask=(mask<<1) | 1;
01880
01881 for (int32_t i=0; i<256; i++)
01882 {
01883 uint8_t bits=(uint8_t) i;
01884 symbol_mask_table[i]=0;
01885
01886 for (int32_t j=0; j<8; j++)
01887 {
01888 if (bits & 1)
01889 symbol_mask_table[i]|=mask<<(max_val*j);
01890
01891 bits>>=1;
01892 }
01893 }
01894 }
01895
01902 inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01903 {
01904 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01905
01906 ST mask=0;
01907 for (int32_t i=0; i<nbits; i++)
01908 mask=(mask<<1) | (ST) 1;
01909
01910 for (int32_t i=0; i<len; i++)
01911 {
01912 ST w=(word & mask);
01913 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01914 word>>=nbits;
01915 }
01916 }
01917
01923 inline ST embed_word(ST* seq, int32_t len)
01924 {
01925 ST value=(ST) 0;
01926 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01927 for (int32_t i=0; i<len; i++)
01928 {
01929 value<<=nbits;
01930 value|=seq[i];
01931 }
01932
01933 return value;
01934 }
01935
01940 void determine_maximum_string_length()
01941 {
01942 max_string_length=0;
01943 index_t num_str=get_num_vectors();
01944 for (int32_t i=0; i<num_str; i++)
01945 {
01946 max_string_length=CMath::max(max_string_length,
01947 features[subset_idx_conversion(i)].slen);
01948 }
01949 }
01950
01958 static ST* get_zero_terminated_string_copy(SGString<ST> str)
01959 {
01960 int32_t l=str.slen;
01961 ST* s=SG_MALLOC(ST, l+1);
01962 memcpy(s, str.string, sizeof(ST)*l);
01963 s[l]='\0';
01964 return s;
01965 }
01966
01975 virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01976 {
01977 ASSERT(features);
01978 ASSERT(num<get_num_vectors());
01979
01980 int32_t real_num=subset_idx_conversion(num);
01981
01982
01983 features[real_num].slen=len ;
01984 features[real_num].string=string ;
01985
01986 max_string_length=CMath::max(len, max_string_length);
01987 }
01988
01989
01994 virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true)
01995 {
01996 int32_t nsym=get_num_symbols();
01997 int32_t slen=get_max_vector_length();
01998 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01999 float64_t* h= SG_MALLOC(float64_t, sz);
02000 memset(h, 0, sz);
02001
02002 float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
02003 memset(h_normalizer, 0, slen*sizeof(float64_t));
02004 int32_t num_str=get_num_vectors();
02005 for (int32_t i=0; i<num_str; i++)
02006 {
02007 int32_t len;
02008 bool free_vec;
02009 ST* vec=get_feature_vector(i, len, free_vec);
02010 for (int32_t j=0; j<len; j++)
02011 {
02012 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
02013 h_normalizer[j]++;
02014 }
02015 free_feature_vector(vec, i, free_vec);
02016 }
02017
02018 if (normalize)
02019 {
02020 for (int32_t i=0; i<slen; i++)
02021 {
02022 for (int32_t j=0; j<nsym; j++)
02023 {
02024 if (h_normalizer && h_normalizer[i])
02025 h[int64_t(i)*nsym+j]/=h_normalizer[i];
02026 }
02027 }
02028 }
02029 SG_FREE(h_normalizer);
02030
02031 *hist=h;
02032 *rows=nsym;
02033 *cols=slen;
02034 }
02035
02040 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
02041 {
02042 ASSERT(rows == get_num_symbols());
02043 cleanup();
02044 float64_t* randoms=SG_MALLOC(float64_t, cols);
02045 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
02046
02047 for (int32_t i=0; i<num_vec; i++)
02048 {
02049 sf[i].string=SG_MALLOC(ST, cols);
02050 sf[i].slen=cols;
02051
02052 CMath::random_vector(randoms, cols, 0.0, 1.0);
02053
02054 for (int32_t j=0; j<cols; j++)
02055 {
02056 float64_t lik=hist[int64_t(j)*rows+0];
02057
02058 int32_t c;
02059 for (c=0; c<rows-1; c++)
02060 {
02061 if (randoms[j]<=lik)
02062 break;
02063 lik+=hist[int64_t(j)*rows+c+1];
02064 }
02065 sf[i].string[j]=alphabet->remap_to_char(c);
02066 }
02067 }
02068 SG_FREE(randoms);
02069 set_features(sf, num_vec, cols);
02070 }
02071
02072
02073
02074
02075
02076
02077
02078
02079
02080
02081
02082
02083
02084
02085
02086
02087
02088
02089
02090
02091
02092
02093
02094
02095
02096
02097
02098
02099
02100
02101
02102
02103
02104
02105
02106
02107
02108
02109
02110
02111
02112
02113
02114
02115
02116
02117
02118
02119
02120
02121
02122
02123
02124
02125
02126
02127
02128
02129
02130
02131
02132
02133
02134
02135
02136
02137
02138
02139
02140
02141
02150 virtual CFeatures* copy_subset(SGVector<index_t> indices)
02151 {
02152
02153 SGStringList<ST> list_copy(indices.vlen, max_string_length);
02154
02155
02156 for (index_t i=0; i<indices.vlen; ++i)
02157 {
02158
02159 index_t real_idx=subset_idx_conversion(indices.vector[i]);
02160
02161
02162 SGString<ST> current_string=features[real_idx];
02163 SGString<ST> string_copy(current_string.slen);
02164 memcpy(string_copy.string, current_string.string,
02165 current_string.slen*sizeof(ST));
02166 list_copy.strings[i]=string_copy;
02167 }
02168
02169
02170 CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
02171
02172
02173 result->determine_maximum_string_length();
02174
02175 return result;
02176 }
02177
02179 inline virtual const char* get_name() const { return "StringFeatures"; }
02180
02182 virtual void subset_changed_post()
02183 {
02184
02185 determine_maximum_string_length();
02186 }
02187 protected:
02188
02199 virtual ST* compute_feature_vector(int32_t num, int32_t& len)
02200 {
02201 ASSERT(features && num<get_num_vectors());
02202
02203 int32_t real_num=subset_idx_conversion(num);
02204
02205 len=features[real_num].slen;
02206 if (len<=0)
02207 return NULL;
02208
02209 ST* target=SG_MALLOC(ST, len);
02210 memcpy(target, features[real_num].string, len*sizeof(ST));
02211 return target;
02212 }
02213
02214 private:
02215 void init()
02216 {
02217 set_generic<ST>();
02218
02219 alphabet=NULL;
02220 num_vectors=0;
02221 features=NULL;
02222 single_string=NULL;
02223 length_of_single_string=0;
02224 max_string_length=0;
02225 order=0;
02226 symbol_mask_table=0;
02227 preprocess_on_get=false;
02228 feature_cache=NULL;
02229
02230 m_parameters->add((CSGObject**) &alphabet, "alphabet");
02231 m_parameters->add_vector(&features, &num_vectors, "features",
02232 "This contains the array of features.");
02233 m_parameters->add_vector(&single_string,
02234 &length_of_single_string,
02235 "single_string",
02236 "Created by sliding window.");
02237 m_parameters->add(&max_string_length, "max_string_length",
02238 "Length of longest string.");
02239 m_parameters->add(&num_symbols, "num_symbols",
02240 "Number of used symbols.");
02241 m_parameters->add(&original_num_symbols, "original_num_symbols",
02242 "Original number of used symbols.");
02243 m_parameters->add(&order, "order",
02244 "Order used in higher order mapping.");
02245 m_parameters->add(&preprocess_on_get, "preprocess_on_get",
02246 "Preprocess on-the-fly?");
02247
02248
02249
02250
02251
02252 }
02253
02254
02255 protected:
02256
02258 CAlphabet* alphabet;
02259
02261 int32_t num_vectors;
02262
02264 SGString<ST>* features;
02265
02267 ST* single_string;
02268
02270 int32_t length_of_single_string;
02271
02273 int32_t max_string_length;
02274
02276 floatmax_t num_symbols;
02277
02279 floatmax_t original_num_symbols;
02280
02282 int32_t order;
02283
02285 ST* symbol_mask_table;
02286
02288 bool preprocess_on_get;
02289
02291 CCache<ST>* feature_cache;
02292 };
02293
02294 #ifndef DOXYGEN_SHOULD_SKIP_THIS
02295
02299 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
02300 {
02301 return F_BOOL;
02302 }
02303
02308 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
02309 {
02310 return F_CHAR;
02311 }
02312
02317 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
02318 {
02319 return F_BYTE;
02320 }
02321
02326 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
02327 {
02328 return F_SHORT;
02329 }
02330
02335 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
02336 {
02337 return F_WORD;
02338 }
02339
02344 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
02345 {
02346 return F_INT;
02347 }
02348
02353 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
02354 {
02355 return F_UINT;
02356 }
02357
02362 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
02363 {
02364 return F_LONG;
02365 }
02366
02371 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
02372 {
02373 return F_ULONG;
02374 }
02375
02380 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
02381 {
02382 return F_SHORTREAL;
02383 }
02384
02389 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
02390 {
02391 return F_DREAL;
02392 }
02393
02398 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
02399 {
02400 return F_LONGREAL;
02401 }
02402
02403 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
02404 {
02405 return symbol;
02406 }
02407 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
02408 {
02409 return symbol;
02410 }
02411 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
02412 {
02413 return symbol;
02414 }
02415 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
02416 {
02417 return symbol;
02418 }
02419
02420 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02421 {
02422 return false;
02423 }
02424 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02425 {
02426 return 0;
02427 }
02428 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02429 {
02430 return 0;
02431 }
02432 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02433 {
02434 return 0;
02435 }
02436
02437 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02438 {
02439 return symbol;
02440 }
02441 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02442 {
02443 return symbol;
02444 }
02445 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02446 {
02447 return symbol;
02448 }
02449 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02450 {
02451 return symbol;
02452 }
02453
02454 #ifndef SUNOS
02455 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02456 {
02457 return false;
02458 }
02459 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02460 {
02461 return false;
02462 }
02463 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02464 {
02465 return false;
02466 }
02467 #endif
02468
02469 template<> inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02470 {
02471 }
02472 template<> inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02473 {
02474 }
02475 template<> inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02476 {
02477 }
02478
02479 template<> inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02480 {
02481 }
02482 template<> inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02483 {
02484 }
02485 template<> inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02486 {
02487 }
02488
02489 template<> inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02490 {
02491 return 0;
02492 }
02493 template<> inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02494 {
02495 return 0;
02496 }
02497 template<> inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02498 {
02499 return 0;
02500 }
02501
02502 template<> inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02503 {
02504 }
02505 template<> inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02506 {
02507 }
02508 template<> inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02509 {
02510 }
02511 #define LOAD(f_load, sg_type) \
02512 template<> inline void CStringFeatures<sg_type>::load(CFile* loader) \
02513 { \
02514 SG_INFO( "loading...\n"); \
02515 \
02516 SG_SET_LOCALE_C; \
02517 SGString<sg_type>* strs; \
02518 int32_t num_str; \
02519 int32_t max_len; \
02520 loader->f_load(strs, num_str, max_len); \
02521 set_features(strs, num_str, max_len); \
02522 SG_RESET_LOCALE; \
02523 }
02524
02525 LOAD(get_string_list, bool)
02526 LOAD(get_string_list, char)
02527 LOAD(get_int8_string_list, int8_t)
02528 LOAD(get_string_list, uint8_t)
02529 LOAD(get_string_list, int16_t)
02530 LOAD(get_string_list, uint16_t)
02531 LOAD(get_string_list, int32_t)
02532 LOAD(get_uint_string_list, uint32_t)
02533 LOAD(get_long_string_list, int64_t)
02534 LOAD(get_ulong_string_list, uint64_t)
02535 LOAD(get_string_list, float32_t)
02536 LOAD(get_string_list, float64_t)
02537 LOAD(get_longreal_string_list, floatmax_t)
02538 #undef LOAD
02539
02540 #define SAVE(f_write, sg_type) \
02541 template<> inline void CStringFeatures<sg_type>::save(CFile* writer) \
02542 { \
02543 if (m_subset) \
02544 SG_ERROR("save() is not possible on subset"); \
02545 SG_SET_LOCALE_C; \
02546 ASSERT(writer); \
02547 writer->f_write(features, num_vectors); \
02548 SG_RESET_LOCALE; \
02549 }
02550
02551 SAVE(set_string_list, bool)
02552 SAVE(set_string_list, char)
02553 SAVE(set_int8_string_list, int8_t)
02554 SAVE(set_string_list, uint8_t)
02555 SAVE(set_string_list, int16_t)
02556 SAVE(set_string_list, uint16_t)
02557 SAVE(set_string_list, int32_t)
02558 SAVE(set_uint_string_list, uint32_t)
02559 SAVE(set_long_string_list, int64_t)
02560 SAVE(set_ulong_string_list, uint64_t)
02561 SAVE(set_string_list, float32_t)
02562 SAVE(set_string_list, float64_t)
02563 SAVE(set_longreal_string_list, floatmax_t)
02564 #undef SAVE
02565 #endif // DOXYGEN_SHOULD_SKIP_THIS
02566 }
02567 #endif // _CSTRINGFEATURES__H__