StringFeatures.cpp

Go to the documentation of this file.
00001 #include <shogun/features/StringFeatures.h>
00002 #include <shogun/preprocessor/Preprocessor.h>
00003 #include <shogun/preprocessor/StringPreprocessor.h>
00004 #include <shogun/io/MemoryMappedFile.h>
00005 #include <shogun/io/SGIO.h>
00006 #include <shogun/mathematics/Math.h>
00007 #include <shogun/base/Parameter.h>
00008 
00009 #include <sys/types.h>
00010 #include <sys/stat.h>
00011 #include <dirent.h>
00012 #include <stdio.h>
00013 #include <stdlib.h>
00014 #include <unistd.h>
00015 
00016 
00017 namespace shogun
00018 {
00019 
00020 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0)
00021 {
00022     init();
00023     alphabet=new CAlphabet();
00024 }
00025 
00026 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0)
00027 {
00028     init();
00029 
00030     alphabet=new CAlphabet(alpha);
00031     SG_REF(alphabet);
00032     num_symbols=alphabet->get_num_symbols();
00033     original_num_symbols=num_symbols;
00034 }
00035 
00036 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00037 : CFeatures(0)
00038 {
00039     init();
00040 
00041     alphabet=new CAlphabet(alpha);
00042     SG_REF(alphabet);
00043     num_symbols=alphabet->get_num_symbols();
00044     original_num_symbols=num_symbols;
00045     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00046 }
00047 
00048 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00049 : CFeatures(0)
00050 {
00051     init();
00052 
00053     alphabet=new CAlphabet(alpha);
00054     SG_REF(alphabet);
00055     num_symbols=alphabet->get_num_symbols();
00056     original_num_symbols=num_symbols;
00057     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00058 }
00059 
00060 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha)
00061 : CFeatures(0)
00062 {
00063     init();
00064 
00065     ASSERT(alpha);
00066     SG_REF(alpha);
00067     alphabet=alpha;
00068     num_symbols=alphabet->get_num_symbols();
00069     original_num_symbols=num_symbols;
00070 }
00071 
00072 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
00073 : CFeatures(orig), num_vectors(orig.num_vectors),
00074     single_string(orig.single_string),
00075     length_of_single_string(orig.length_of_single_string),
00076     max_string_length(orig.max_string_length),
00077     num_symbols(orig.num_symbols),
00078     original_num_symbols(orig.original_num_symbols),
00079     order(orig.order), preprocess_on_get(false),
00080     feature_cache(NULL)
00081 {
00082     init();
00083 
00084     ASSERT(orig.single_string == NULL); //not implemented
00085 
00086     alphabet=orig.alphabet;
00087     SG_REF(alphabet);
00088 
00089     if (orig.features)
00090     {
00091         features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00092 
00093         for (int32_t i=0; i<num_vectors; i++)
00094         {
00095             features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00096             features[i].slen=orig.features[i].slen;
00097             memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00098         }
00099     }
00100 
00101     if (orig.symbol_mask_table)
00102     {
00103         symbol_mask_table=SG_MALLOC(ST, 256);
00104         for (int32_t i=0; i<256; i++)
00105             symbol_mask_table[i]=orig.symbol_mask_table[i];
00106     }
00107 
00108     m_subset_stack=orig.m_subset_stack;
00109     SG_REF(m_subset_stack);
00110 }
00111 
00112 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
00113 : CFeatures(loader), num_vectors(0),
00114   features(NULL), single_string(NULL), length_of_single_string(0),
00115   max_string_length(0), order(0),
00116   symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00117 {
00118     init();
00119 
00120     alphabet=new CAlphabet(alpha);
00121     SG_REF(alphabet);
00122     num_symbols=alphabet->get_num_symbols();
00123     original_num_symbols=num_symbols;
00124     load(loader);
00125 }
00126 
00127 template<class ST> CStringFeatures<ST>::~CStringFeatures()
00128 {
00129     cleanup();
00130 
00131     SG_UNREF(alphabet);
00132 }
00133 
00134 template<class ST> void CStringFeatures<ST>::cleanup()
00135 {
00136     remove_all_subsets();
00137 
00138     if (single_string)
00139     {
00140         SG_FREE(single_string);
00141         single_string=NULL;
00142     }
00143     else
00144         cleanup_feature_vectors(0, num_vectors-1);
00145 
00146     /*
00147     if (single_string)
00148     {
00149         SG_FREE(single_string);
00150         single_string=NULL;
00151     }
00152     else
00153         cleanup_feature_vectors(0, num_vectors-1);
00154     */
00155 
00156     num_vectors=0;
00157     SG_FREE(features);
00158     SG_FREE(symbol_mask_table);
00159     features=NULL;
00160     symbol_mask_table=NULL;
00161 
00162     /* start with a fresh alphabet, but instead of emptying the histogram
00163      * create a new object (to leave the alphabet object alone if it is used
00164      * by others)
00165      */
00166     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00167     SG_UNREF(alphabet);
00168     alphabet=alpha;
00169     SG_REF(alphabet);
00170 }
00171 
00172 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
00173 {
00174     ASSERT(num<get_num_vectors());
00175 
00176     if (features)
00177     {
00178         int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00179         SG_FREE(features[real_num].string);
00180         features[real_num].string=NULL;
00181         features[real_num].slen=0;
00182 
00183         determine_maximum_string_length();
00184     }
00185 }
00186 
00187 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
00188 {
00189     if (features && get_num_vectors())
00190     {
00191         ASSERT(start<get_num_vectors());
00192         ASSERT(stop<get_num_vectors());
00193 
00194         for (int32_t i=start; i<=stop; i++)
00195         {
00196             int32_t real_num=m_subset_stack->subset_idx_conversion(i);
00197             SG_FREE(features[real_num].string);
00198             features[real_num].string=NULL;
00199             features[real_num].slen=0;
00200         }
00201         determine_maximum_string_length();
00202     }
00203 }
00204 
00205 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
00206 
00207 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
00208 
00209 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet()
00210 {
00211     SG_REF(alphabet);
00212     return alphabet;
00213 }
00214 
00215 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
00216 {
00217     return new CStringFeatures<ST>(*this);
00218 }
00219 
00220 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num)
00221 {
00222     ASSERT(features);
00223     if (num>=get_num_vectors())
00224     {
00225         SG_ERROR("Index out of bounds (number of strings %d, you "
00226                 "requested %d)\n", get_num_vectors(), num);
00227     }
00228 
00229     int32_t l;
00230     bool free_vec;
00231     ST* vec=get_feature_vector(num, l, free_vec);
00232     ST* dst=SG_MALLOC(ST, l);
00233     memcpy(dst, vec, l*sizeof(ST));
00234     free_feature_vector(vec, num, free_vec);
00235     return SGVector<ST>(dst, l, true);
00236 }
00237 
00238 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
00239 {
00240     ASSERT(features);
00241 
00242     if (m_subset_stack->has_subsets())
00243         SG_ERROR("A subset is set, cannot set feature vector\n");
00244 
00245     if (num>=num_vectors)
00246     {
00247         SG_ERROR("Index out of bounds (number of strings %d, you "
00248                 "requested %d)\n", num_vectors, num);
00249     }
00250 
00251     if (vector.vlen<=0)
00252         SG_ERROR("String has zero or negative length\n");
00253 
00254     cleanup_feature_vector(num);
00255     features[num].slen=vector.vlen;
00256     features[num].string=SG_MALLOC(ST, vector.vlen);
00257     memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00258 
00259     determine_maximum_string_length();
00260 }
00261 
00262 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing()
00263 {
00264     preprocess_on_get=true;
00265 }
00266 
00267 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing()
00268 {
00269     preprocess_on_get=false;
00270 }
00271 
00272 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00273 {
00274     ASSERT(features);
00275     if (num>=get_num_vectors())
00276         SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors());
00277 
00278     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00279 
00280     if (!preprocess_on_get)
00281     {
00282         dofree=false;
00283         len=features[real_num].slen;
00284         return features[real_num].string;
00285     }
00286     else
00287     {
00288         SG_DEBUG( "computing feature vector!\n") ;
00289         ST* feat=compute_feature_vector(num, len);
00290         dofree=true;
00291 
00292         if (get_num_preprocessors())
00293         {
00294             ST* tmp_feat_before=feat;
00295 
00296             for (int32_t i=0; i<get_num_preprocessors(); i++)
00297             {
00298                 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00299                 feat=p->apply_to_string(tmp_feat_before, len);
00300                 SG_UNREF(p);
00301                 SG_FREE(tmp_feat_before);
00302                 tmp_feat_before=feat;
00303             }
00304         }
00305         // TODO: implement caching
00306         return feat;
00307     }
00308 }
00309 
00310 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed()
00311 {
00312     int32_t num_feat;
00313     int32_t num_vec;
00314     SGString<ST>* s=get_transposed(num_feat, num_vec);
00315     SGStringList<ST> string_list;
00316     string_list.strings = s;
00317     string_list.num_strings = num_vec;
00318     string_list.max_string_length = num_feat;
00319 
00320     return new CStringFeatures<ST>(string_list, alphabet);
00321 }
00322 
00323 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00324 {
00325     num_feat=get_num_vectors();
00326     num_vec=get_max_vector_length();
00327     ASSERT(have_same_length());
00328 
00329     SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00330             int64_t(num_feat)*num_vec);
00331 
00332     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00333 
00334     for (int32_t i=0; i<num_vec; i++)
00335     {
00336         sf[i].string=SG_MALLOC(ST, num_feat);
00337         sf[i].slen=num_feat;
00338     }
00339 
00340     for (int32_t i=0; i<num_feat; i++)
00341     {
00342         int32_t len=0;
00343         bool free_vec=false;
00344         ST* vec=get_feature_vector(i, len, free_vec);
00345 
00346         for (int32_t j=0; j<num_vec; j++)
00347             sf[j].string[i]=vec[j];
00348 
00349         free_feature_vector(vec, i, free_vec);
00350     }
00351     return sf;
00352 }
00353 
00354 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00355 {
00356     if (num>=get_num_vectors())
00357     {
00358         SG_ERROR(
00359             "Trying to access string[%d] but num_str=%d\n", num,
00360             get_num_vectors());
00361     }
00362 
00363     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00364 
00365     if (feature_cache)
00366         feature_cache->unlock_entry(real_num);
00367 
00368     if (dofree)
00369         SG_FREE(feat_vec);
00370 }
00371 
00372 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00373 {
00374     if (num>=get_num_vectors())
00375     {
00376         SG_ERROR(
00377             "Trying to access string[%d] but num_str=%d\n", num,
00378             get_num_vectors());
00379     }
00380 
00381     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00382 
00383     if (feature_cache)
00384         feature_cache->unlock_entry(real_num);
00385 }
00386 
00387 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
00388 {
00389     ASSERT(vec_num<get_num_vectors());
00390 
00391     int32_t len;
00392     bool free_vec;
00393     ST* vec=get_feature_vector(vec_num, len, free_vec);
00394     ASSERT(feat_num<len);
00395     ST result=vec[feat_num];
00396     free_feature_vector(vec, vec_num, free_vec);
00397 
00398     return result;
00399 }
00400 
00401 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
00402 {
00403     ASSERT(vec_num<get_num_vectors());
00404 
00405     int32_t len;
00406     bool free_vec;
00407     ST* vec=get_feature_vector(vec_num, len, free_vec);
00408     free_feature_vector(vec, vec_num, free_vec);
00409     return len;
00410 }
00411 
00412 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
00413 {
00414     return max_string_length;
00415 }
00416 
00417 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
00418 {
00419     return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
00420 }
00421 
00422 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
00423 
00424 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00425 
00426 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
00427 
00428 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
00429 
00430 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
00431 {
00432     ASSERT(symbol_mask_table);
00433     return symbol_mask_table[mask] & symbol;
00434 }
00435 
00436 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
00437 {
00438     ASSERT(alphabet);
00439     return (offset << (amount*alphabet->get_num_bits()));
00440 }
00441 
00442 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
00443 {
00444     ASSERT(alphabet);
00445     return (symbol >> (amount*alphabet->get_num_bits()));
00446 }
00447 
00448 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
00449         EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00450 {
00451     remove_all_subsets();
00452 
00453     size_t blocksize=1024*1024;
00454     size_t required_blocksize=0;
00455     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00456     uint8_t* overflow=NULL;
00457     int32_t overflow_len=0;
00458 
00459     cleanup();
00460 
00461     CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00462     CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00463 
00464     FILE* f=fopen(fname, "ro");
00465 
00466     if (f)
00467     {
00468         num_vectors=0;
00469         max_string_length=0;
00470 
00471         SG_INFO("counting line numbers in file %s\n", fname);
00472         size_t block_offs=0;
00473         size_t old_block_offs=0;
00474         fseek(f, 0, SEEK_END);
00475         size_t fsize=ftell(f);
00476         rewind(f);
00477 
00478         if (blocksize>fsize)
00479             blocksize=fsize;
00480 
00481         SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00482 
00483         size_t sz=blocksize;
00484         while (sz == blocksize)
00485         {
00486             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00487             for (size_t i=0; i<sz; i++)
00488             {
00489                 block_offs++;
00490                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00491                 {
00492                     num_vectors++;
00493                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00494                     old_block_offs=block_offs;
00495                 }
00496             }
00497             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00498         }
00499 
00500         SG_INFO("found %d strings\n", num_vectors);
00501         SG_FREE(dummy);
00502         blocksize=required_blocksize;
00503         dummy=SG_MALLOC(uint8_t, blocksize);
00504         overflow=SG_MALLOC(uint8_t, blocksize);
00505         features=SG_MALLOC(SGString<ST>, num_vectors);
00506 
00507         rewind(f);
00508         sz=blocksize;
00509         int32_t lines=0;
00510         while (sz == blocksize)
00511         {
00512             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00513 
00514             size_t old_sz=0;
00515             for (size_t i=0; i<sz; i++)
00516             {
00517                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00518                 {
00519                     int32_t len=i-old_sz;
00520                     //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00521                     max_string_length=CMath::max(max_string_length, len+overflow_len);
00522 
00523                     features[lines].slen=len;
00524                     features[lines].string=SG_MALLOC(ST, len);
00525 
00526                     if (remap_to_bin)
00527                     {
00528                         for (int32_t j=0; j<overflow_len; j++)
00529                             features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00530                         for (int32_t j=0; j<len; j++)
00531                             features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00532                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00533                         alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00534                     }
00535                     else
00536                     {
00537                         for (int32_t j=0; j<overflow_len; j++)
00538                             features[lines].string[j]=overflow[j];
00539                         for (int32_t j=0; j<len; j++)
00540                             features[lines].string[j+overflow_len]=dummy[old_sz+j];
00541                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00542                         alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00543                     }
00544 
00545                     // clear overflow
00546                     overflow_len=0;
00547 
00548                     //CMath::display_vector(features[lines].string, len);
00549                     old_sz=i+1;
00550                     lines++;
00551                     SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00552                 }
00553             }
00554             for (size_t i=old_sz; i<sz; i++)
00555                 overflow[i-old_sz]=dummy[i];
00556 
00557             overflow_len=sz-old_sz;
00558         }
00559 
00560         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00561         {
00562             SG_INFO("file successfully read\n");
00563             SG_INFO("max_string_length=%d\n", max_string_length);
00564             SG_INFO("num_strings=%d\n", num_vectors);
00565         }
00566         fclose(f);
00567     }
00568 
00569     SG_FREE(dummy);
00570 
00571     SG_UNREF(alphabet);
00572 
00573     if (remap_to_bin)
00574         alphabet=alpha_bin;
00575     else
00576         alphabet=alpha;
00577     SG_REF(alphabet);
00578     num_symbols=alphabet->get_num_symbols();
00579 }
00580 
00581 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
00582 {
00583     remove_all_subsets();
00584 
00585     int32_t i=0;
00586     uint64_t len=0;
00587     uint64_t offs=0;
00588     int32_t num=0;
00589     int32_t max_len=0;
00590 
00591     CMemoryMappedFile<char> f(fname);
00592 
00593     while (true)
00594     {
00595         char* s=f.get_line(len, offs);
00596         if (!s)
00597             break;
00598 
00599         if (len>0 && s[0]=='>')
00600             num++;
00601     }
00602 
00603     if (num==0)
00604         SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00605 
00606     cleanup();
00607     SG_UNREF(alphabet);
00608     alphabet=new CAlphabet(DNA);
00609     num_symbols=alphabet->get_num_symbols();
00610 
00611     SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00612     offs=0;
00613 
00614     for (i=0;i<num; i++)
00615     {
00616         uint64_t id_len=0;
00617         char* id=f.get_line(id_len, offs);
00618 
00619         char* fasta=f.get_line(len, offs);
00620         char* s=fasta;
00621         int32_t fasta_len=0;
00622         int32_t spanned_lines=0;
00623 
00624         while (true)
00625         {
00626             if (!s || len==0)
00627                 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00628 
00629             if (s[0]=='>' || offs==f.get_size())
00630             {
00631                 offs-=len+1; // seek to beginning
00632                 if (offs==f.get_size())
00633                 {
00634                     SG_DEBUG("at EOF\n");
00635                     fasta_len+=len;
00636                 }
00637 
00638                 len=fasta_len-spanned_lines;
00639                 strings[i].string=SG_MALLOC(ST, len);
00640                 strings[i].slen=len;
00641 
00642                 ST* str=strings[i].string;
00643                 int32_t idx=0;
00644                 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00645 
00646                 for (int32_t j=0; j<fasta_len; j++)
00647                 {
00648                     if (fasta[j]=='\n')
00649                         continue;
00650 
00651                     ST c=(ST) fasta[j];
00652 
00653                     if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00654                         c=(ST) 'A';
00655 
00656                     if (uint64_t(idx)>=len)
00657                         SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00658                     str[idx++]=c;
00659                 }
00660                 max_len=CMath::max(max_len, strings[i].slen);
00661 
00662 
00663                 break;
00664             }
00665 
00666             spanned_lines++;
00667             fasta_len+=len+1; // including '\n'
00668             s=f.get_line(len, offs);
00669         }
00670     }
00671     return set_features(strings, num, max_len);
00672 }
00673 
00674 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
00675         bool ignore_invalid, bool bitremap_in_single_string)
00676 {
00677     remove_all_subsets();
00678 
00679     CMemoryMappedFile<char> f(fname);
00680 
00681     int32_t i=0;
00682     uint64_t len=0;
00683     uint64_t offs=0;
00684 
00685     int32_t num=f.get_num_lines();
00686     int32_t max_len=0;
00687 
00688     if (num%4)
00689         SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00690     num/=4;
00691 
00692     cleanup();
00693     SG_UNREF(alphabet);
00694     alphabet=new CAlphabet(DNA);
00695 
00696     SGString<ST>* strings;
00697 
00698     ST* str=NULL;
00699     if (bitremap_in_single_string)
00700     {
00701         strings=SG_MALLOC(SGString<ST>, 1);
00702         strings[0].string=SG_MALLOC(ST, num);
00703         strings[0].slen=num;
00704         f.get_line(len, offs);
00705         f.get_line(len, offs);
00706         order=len;
00707         max_len=num;
00708         offs=0;
00709         original_num_symbols=alphabet->get_num_symbols();
00710         str=SG_MALLOC(ST, len);
00711     }
00712     else
00713         strings=SG_MALLOC(SGString<ST>, num);
00714 
00715     for (i=0;i<num; i++)
00716     {
00717         if (!f.get_line(len, offs))
00718             SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00719 
00720         char* s=f.get_line(len, offs);
00721         if (!s || len==0)
00722             SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00723 
00724         if (bitremap_in_single_string)
00725         {
00726             if (len!=(uint64_t) order)
00727                 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00728             for (int32_t j=0; j<order; j++)
00729                 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00730 
00731             strings[0].string[i]=embed_word(str, order);
00732         }
00733         else
00734         {
00735             strings[i].string=SG_MALLOC(ST, len);
00736             strings[i].slen=len;
00737             str=strings[i].string;
00738 
00739             if (ignore_invalid)
00740             {
00741                 for (uint64_t j=0; j<len; j++)
00742                 {
00743                     if (alphabet->is_valid((uint8_t) s[j]))
00744                         str[j]= (ST) s[j];
00745                     else
00746                         str[j]= (ST) 'A';
00747                 }
00748             }
00749             else
00750             {
00751                 for (uint64_t j=0; j<len; j++)
00752                     str[j]= (ST) s[j];
00753             }
00754             max_len=CMath::max(max_len, (int32_t) len);
00755         }
00756 
00757 
00758         if (!f.get_line(len, offs))
00759             SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00760 
00761         if (!f.get_line(len, offs))
00762             SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00763     }
00764 
00765     if (bitremap_in_single_string)
00766         num=1;
00767 
00768     num_vectors=num;
00769     max_string_length=max_len;
00770     features=strings;
00771 
00772     return true;
00773 }
00774 
00775 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
00776 {
00777     remove_all_subsets();
00778 
00779     struct dirent **namelist;
00780     int32_t n;
00781 
00782     SGIO::set_dirname(dirname);
00783 
00784     SG_DEBUG("dirname '%s'\n", dirname);
00785 
00786     n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
00787     if (n <= 0)
00788     {
00789         SG_ERROR("error calling scandir - no files found\n");
00790         return false;
00791     }
00792     else
00793     {
00794         SGString<ST>* strings=NULL;
00795 
00796         int32_t num=0;
00797         int32_t max_len=-1;
00798 
00799         //usually n==num_vec, but it might not in race conditions
00800         //(file perms modified, file erased)
00801         strings=SG_MALLOC(SGString<ST>, n);
00802 
00803         for (int32_t i=0; i<n; i++)
00804         {
00805             char* fname=SGIO::concat_filename(namelist[i]->d_name);
00806 
00807             struct stat s;
00808             off_t filesize=0;
00809 
00810             if (!stat(fname, &s) && s.st_size>0)
00811             {
00812                 filesize=s.st_size/sizeof(ST);
00813 
00814                 FILE* f=fopen(fname, "ro");
00815                 if (f)
00816                 {
00817                     ST* str=SG_MALLOC(ST, filesize);
00818                     SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00819                     if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
00820                         SG_ERROR("failed to read file\n");
00821                     strings[num].string=str;
00822                     strings[num].slen=filesize;
00823                     max_len=CMath::max(max_len, strings[num].slen);
00824 
00825                     num++;
00826                     fclose(f);
00827                 }
00828             }
00829             else
00830                 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00831 
00832             SG_FREE(namelist[i]);
00833         }
00834         SG_FREE(namelist);
00835 
00836         if (num>0 && strings)
00837         {
00838             set_features(strings, num, max_len);
00839             return true;
00840         }
00841     }
00842     return false;
00843 }
00844 
00845 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats)
00846 {
00847     set_features(feats.strings, feats.num_strings, feats.max_string_length);
00848 }
00849 
00850 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00851 {
00852     if (m_subset_stack->has_subsets())
00853         SG_ERROR("Cannot call set_features() with subset.\n");
00854 
00855     if (p_features)
00856     {
00857         CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00858 
00859         //compute histogram for char/byte
00860         for (int32_t i=0; i<p_num_vectors; i++)
00861             alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00862 
00863         SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00864         SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00865 
00866         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00867         {
00868             cleanup();
00869             SG_UNREF(alphabet);
00870 
00871             alphabet=alpha;
00872             SG_REF(alphabet);
00873 
00874             // TODO remove copying
00875             features = SG_MALLOC(SGString<ST>,p_num_vectors);
00876             memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors);
00877             num_vectors = p_num_vectors;
00878             max_string_length = p_max_string_length;
00879 
00880             return true;
00881         }
00882         else
00883             SG_UNREF(alpha);
00884     }
00885 
00886     return false;
00887 }
00888 
00889 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf)
00890 {
00891     ASSERT(sf);
00892 
00893     if (m_subset_stack->has_subsets())
00894         SG_ERROR("Cannot call set_features() with subset.\n");
00895 
00896     SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
00897 
00898     index_t sf_num_str=sf->get_num_vectors();
00899     for (int32_t i=0; i<sf_num_str; i++)
00900     {
00901         int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
00902         int32_t length=sf->features[real_i].slen;
00903         new_features[i].string=SG_MALLOC(ST, length);
00904         memcpy(new_features[i].string, sf->features[real_i].string, length);
00905         new_features[i].slen=length;
00906     }
00907     return append_features(new_features, sf_num_str,
00908             sf->max_string_length);
00909 }
00910 
00911 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00912 {
00913     if (m_subset_stack->has_subsets())
00914         SG_ERROR("Cannot call set_features() with subset.\n");
00915 
00916     if (!features)
00917         return set_features(p_features, p_num_vectors, p_max_string_length);
00918 
00919     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00920 
00921     //compute histogram for char/byte
00922     for (int32_t i=0; i<p_num_vectors; i++)
00923         alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00924 
00925     SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00926     SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00927 
00928     if (alpha->check_alphabet_size() && alpha->check_alphabet())
00929     {
00930         SG_UNREF(alpha);
00931         for (int32_t i=0; i<p_num_vectors; i++)
00932             alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00933 
00934         int32_t old_num_vectors=num_vectors;
00935         num_vectors=old_num_vectors+p_num_vectors;
00936         SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
00937 
00938         for (int32_t i=0; i<num_vectors; i++)
00939         {
00940             if (i<old_num_vectors)
00941             {
00942                 new_features[i].string=features[i].string;
00943                 new_features[i].slen=features[i].slen;
00944             }
00945             else
00946             {
00947                 new_features[i].string=p_features[i-old_num_vectors].string;
00948                 new_features[i].slen=p_features[i-old_num_vectors].slen;
00949             }
00950         }
00951         SG_FREE(features);
00952         SG_FREE(p_features); // free now obsolete features
00953 
00954         this->features=new_features;
00955         max_string_length=CMath::max(max_string_length, p_max_string_length);
00956 
00957         return true;
00958     }
00959     SG_UNREF(alpha);
00960 
00961     return false;
00962 }
00963 
00964 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features()
00965 {
00966     SGStringList<ST> sl(NULL,0,0,false);
00967 
00968     sl.strings=get_features(sl.num_strings, sl.max_string_length);
00969     return sl;
00970 }
00971 
00972 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
00973 {
00974     if (m_subset_stack->has_subsets())
00975         SG_ERROR("get features() is not possible on subset");
00976 
00977     num_str=num_vectors;
00978     max_str_len=max_string_length;
00979     return features;
00980 }
00981 
00982 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
00983 {
00984     ASSERT(num_vectors>0);
00985 
00986     num_str=get_num_vectors();
00987     max_str_len=max_string_length;
00988     SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
00989 
00990     for (int32_t i=0; i<num_str; i++)
00991     {
00992         int32_t len;
00993         bool free_vec;
00994         ST* vec=get_feature_vector(i, len, free_vec);
00995         new_feat[i].string=SG_MALLOC(ST, len);
00996         new_feat[i].slen=len;
00997         memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
00998         free_feature_vector(vec, i, free_vec);
00999     }
01000 
01001     return new_feat;
01002 }
01003 
01004 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
01005 {
01006     int32_t num_vec;
01007     int32_t max_str_len;
01008     *dst=copy_features(num_vec, max_str_len);
01009     *num_str=num_vec;
01010 }
01011 
01012 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
01013 {
01014     remove_all_subsets();
01015 
01016     FILE* file=NULL;
01017 
01018     if (!(file=fopen(src, "r")))
01019         return false;
01020     cleanup();
01021 
01022     // header shogun v0
01023     char id[4];
01024     if (fread(&id[0], sizeof(char), 1, file)!=1)
01025         SG_ERROR("failed to read header");
01026     ASSERT(id[0]=='S');
01027     if (fread(&id[1], sizeof(char), 1, file)!=1)
01028         SG_ERROR("failed to read header");
01029     ASSERT(id[1]=='G');
01030     if (fread(&id[2], sizeof(char), 1, file)!=1)
01031         SG_ERROR("failed to read header");
01032     ASSERT(id[2]=='V');
01033     if (fread(&id[3], sizeof(char), 1, file)!=1)
01034         SG_ERROR("failed to read header");
01035     ASSERT(id[3]=='0');
01036 
01037     //compression type
01038     uint8_t c;
01039     if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01040         SG_ERROR("failed to read compression type");
01041     CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01042     //alphabet
01043     uint8_t a;
01044     delete alphabet;
01045     if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01046         SG_ERROR("failed to read compression alphabet");
01047     alphabet=new CAlphabet((EAlphabet) a);
01048     // number of vectors
01049     if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01050         SG_ERROR("failed to read compression number of vectors");
01051     ASSERT(num_vectors>0);
01052     // maximum string length
01053     if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01054         SG_ERROR("failed to read maximum string length");
01055     ASSERT(max_string_length>0);
01056 
01057     features=SG_MALLOC(SGString<ST>, num_vectors);
01058 
01059     // vectors
01060     for (int32_t i=0; i<num_vectors; i++)
01061     {
01062         // vector len compressed
01063         int32_t len_compressed;
01064         if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01065             SG_ERROR("failed to read vector length compressed");
01066         // vector len uncompressed
01067         int32_t len_uncompressed;
01068         if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01069             SG_ERROR("failed to read vector length uncompressed");
01070 
01071         // vector raw data
01072         if (decompress)
01073         {
01074             features[i].string=SG_MALLOC(ST, len_uncompressed);
01075             features[i].slen=len_uncompressed;
01076             uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01077             if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01078                 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
01079             uint64_t uncompressed_size=len_uncompressed;
01080             uncompressed_size*=sizeof(ST);
01081             compressor->decompress(compressed, len_compressed,
01082                     (uint8_t*) features[i].string, uncompressed_size);
01083             SG_FREE(compressed);
01084             ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01085         }
01086         else
01087         {
01088             int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01089             features[i].string=SG_MALLOC(ST, len_compressed+offs);
01090             features[i].slen=len_compressed+offs;
01091             int32_t* feat32ptr=((int32_t*) (features[i].string));
01092             memset(features[i].string, 0, offs*sizeof(ST));
01093             feat32ptr[0]=(int32_t) len_compressed;
01094             feat32ptr[1]=(int32_t) len_uncompressed;
01095             uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01096             if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01097                 SG_ERROR("failed to read uncompressed data");
01098         }
01099     }
01100 
01101     delete compressor;
01102     fclose(file);
01103 
01104     return false;
01105 }
01106 
01107 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01108 {
01109     if (m_subset_stack->has_subsets())
01110         SG_ERROR("save_compressed() is not possible on subset");
01111 
01112     FILE* file=NULL;
01113 
01114     if (!(file=fopen(dest, "wb")))
01115         return false;
01116 
01117     CCompressor* compressor= new CCompressor(compression);
01118 
01119     // header shogun v0
01120     const char* id="SGV0";
01121     fwrite(&id[0], sizeof(char), 1, file);
01122     fwrite(&id[1], sizeof(char), 1, file);
01123     fwrite(&id[2], sizeof(char), 1, file);
01124     fwrite(&id[3], sizeof(char), 1, file);
01125 
01126     //compression type
01127     uint8_t c=(uint8_t) compression;
01128     fwrite(&c, sizeof(uint8_t), 1, file);
01129     //alphabet
01130     uint8_t a=(uint8_t) alphabet->get_alphabet();
01131     fwrite(&a, sizeof(uint8_t), 1, file);
01132     // number of vectors
01133     fwrite(&num_vectors, sizeof(int32_t), 1, file);
01134     // maximum string length
01135     fwrite(&max_string_length, sizeof(int32_t), 1, file);
01136 
01137     // vectors
01138     for (int32_t i=0; i<num_vectors; i++)
01139     {
01140         int32_t len=-1;
01141         bool vfree;
01142         ST* vec=get_feature_vector(i, len, vfree);
01143 
01144         uint8_t* compressed=NULL;
01145         uint64_t compressed_size=0;
01146 
01147         compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01148                 compressed, compressed_size, level);
01149 
01150         int32_t len_compressed=(int32_t) compressed_size;
01151         // vector len compressed in bytes
01152         fwrite(&len_compressed, sizeof(int32_t), 1, file);
01153         // vector len uncompressed in number of elements of type ST
01154         fwrite(&len, sizeof(int32_t), 1, file);
01155         // vector raw data
01156         fwrite(compressed, compressed_size, 1, file);
01157         SG_FREE(compressed);
01158 
01159         free_feature_vector(vec, i, vfree);
01160     }
01161 
01162     delete compressor;
01163     fclose(file);
01164     return true;
01165 }
01166 
01167 template<class ST> int32_t CStringFeatures<ST>::get_size() const { return sizeof(ST); }
01168 
01169 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
01170 {
01171     SG_DEBUG( "force: %d\n", force_preprocessing);
01172 
01173     for (int32_t i=0; i<get_num_preprocessors(); i++)
01174     {
01175         if ( (!is_preprocessed(i) || force_preprocessing) )
01176         {
01177             set_preprocessed(i);
01178             CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01179             SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01180 
01181             if (!p->apply_to_string_features(this))
01182             {
01183                 SG_UNREF(p);
01184                 return false;
01185             }
01186             else
01187                 SG_UNREF(p);
01188         }
01189     }
01190     return true;
01191 }
01192 
01193 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
01194 {
01195     if (m_subset_stack->has_subsets())
01196         SG_NOTIMPLEMENTED;
01197 
01198     ASSERT(step_size>0);
01199     ASSERT(window_size>0);
01200     ASSERT(num_vectors==1 || single_string);
01201     ASSERT(max_string_length>=window_size ||
01202             (single_string && length_of_single_string>=window_size));
01203 
01204     //in case we are dealing with a single remapped string
01205     //allow remapping
01206     if (single_string)
01207         num_vectors= (length_of_single_string-window_size)/step_size + 1;
01208     else if (num_vectors==1)
01209     {
01210         num_vectors= (max_string_length-window_size)/step_size + 1;
01211         length_of_single_string=max_string_length;
01212     }
01213 
01214     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01215     int32_t offs=0;
01216     for (int32_t i=0; i<num_vectors; i++)
01217     {
01218         f[i].string=&features[0].string[offs+skip];
01219         f[i].slen=window_size-skip;
01220         offs+=step_size;
01221     }
01222     single_string=features[0].string;
01223     SG_FREE(features);
01224     features=f;
01225     max_string_length=window_size-skip;
01226 
01227     return num_vectors;
01228 }
01229 
01230 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
01231         int32_t skip)
01232 {
01233     if (m_subset_stack->has_subsets())
01234         SG_NOTIMPLEMENTED;
01235 
01236     ASSERT(positions);
01237     ASSERT(window_size>0);
01238     ASSERT(num_vectors==1 || single_string);
01239     ASSERT(max_string_length>=window_size ||
01240             (single_string && length_of_single_string>=window_size));
01241 
01242     num_vectors= positions->get_num_elements();
01243     ASSERT(num_vectors>0);
01244 
01245     int32_t len;
01246 
01247     //in case we are dealing with a single remapped string
01248     //allow remapping
01249     if (single_string)
01250         len=length_of_single_string;
01251     else
01252     {
01253         single_string=features[0].string;
01254         len=max_string_length;
01255         length_of_single_string=max_string_length;
01256     }
01257 
01258     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01259     for (int32_t i=0; i<num_vectors; i++)
01260     {
01261         int32_t p=positions->get_element(i);
01262 
01263         if (p>=0 && p<=len-window_size)
01264         {
01265             f[i].string=&features[0].string[p+skip];
01266             f[i].slen=window_size-skip;
01267         }
01268         else
01269         {
01270             num_vectors=1;
01271             max_string_length=len;
01272             features[0].slen=len;
01273             single_string=NULL;
01274             SG_FREE(f);
01275             SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01276                     window_size, i, p, len);
01277             return -1;
01278         }
01279     }
01280 
01281     SG_FREE(features);
01282     features=f;
01283     max_string_length=window_size-skip;
01284 
01285     return num_vectors;
01286 }
01287 
01288 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01289 {
01290     return obtain_from_char_features(sf, start, p_order, gap, rev);
01291 }
01292 
01293 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
01294 {
01295     if (len!=-1)
01296     {
01297         if (len!=max_string_length)
01298             return false;
01299     }
01300     len=max_string_length;
01301 
01302     index_t num_str=get_num_vectors();
01303     for (int32_t i=0; i<num_str; i++)
01304     {
01305         if (get_vector_length(i)!=len)
01306             return false;
01307     }
01308 
01309     return true;
01310 }
01311 
01312 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
01313 {
01314     if (m_subset_stack->has_subsets())
01315         SG_NOTIMPLEMENTED;
01316 
01317     ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01318 
01319     order=p_order;
01320     original_num_symbols=alphabet->get_num_symbols();
01321     int32_t max_val=alphabet->get_num_bits();
01322 
01323     if (p_order>1)
01324         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01325     else
01326         num_symbols=original_num_symbols;
01327 
01328     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01329 
01330     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01331         SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01332 
01333     ST mask=0;
01334     for (int32_t i=0; i<p_order*max_val; i++)
01335         mask= (mask<<1) | ((ST) 1);
01336 
01337     for (int32_t i=0; i<num_vectors; i++)
01338     {
01339         int32_t len=features[i].slen;
01340 
01341         if (len < p_order)
01342             SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01343 
01344         ST* str=features[i].string;
01345 
01346         // convert first word
01347         for (int32_t j=0; j<p_order; j++)
01348             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01349         str[0]=embed_word(&str[0], p_order);
01350 
01351         // convert the rest
01352         int32_t idx=0;
01353         for (int32_t j=p_order; j<len; j++)
01354         {
01355             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01356             str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01357             idx++;
01358         }
01359 
01360         features[i].slen=len-p_order+1;
01361     }
01362 
01363     compute_symbol_mask_table(max_val);
01364 }
01365 
01366 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
01367 {
01368     if (m_subset_stack->has_subsets())
01369         SG_NOTIMPLEMENTED;
01370 
01371     SG_FREE(symbol_mask_table);
01372     symbol_mask_table=SG_MALLOC(ST, 256);
01373 
01374     uint64_t mask=0;
01375     for (int32_t i=0; i< (int64_t) max_val; i++)
01376         mask=(mask<<1) | 1;
01377 
01378     for (int32_t i=0; i<256; i++)
01379     {
01380         uint8_t bits=(uint8_t) i;
01381         symbol_mask_table[i]=0;
01382 
01383         for (int32_t j=0; j<8; j++)
01384         {
01385             if (bits & 1)
01386                 symbol_mask_table[i]|=mask<<(max_val*j);
01387 
01388             bits>>=1;
01389         }
01390     }
01391 }
01392 
01393 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
01394 {
01395     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01396 
01397     ST mask=0;
01398     for (uint32_t i=0; i<nbits; i++)
01399         mask=(mask<<1) | (ST) 1;
01400 
01401     for (int32_t i=0; i<len; i++)
01402     {
01403         ST w=(word & mask);
01404         seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01405         word>>=nbits;
01406     }
01407 }
01408 
01409 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
01410 {
01411     ST value=(ST) 0;
01412     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01413     for (int32_t i=0; i<len; i++)
01414     {
01415         value<<=nbits;
01416         value|=seq[i];
01417     }
01418 
01419     return value;
01420 }
01421 
01422 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length()
01423 {
01424     max_string_length=0;
01425     index_t num_str=get_num_vectors();
01426 
01427     for (int32_t i=0; i<num_str; i++)
01428     {
01429         max_string_length=CMath::max(max_string_length,
01430             features[m_subset_stack->subset_idx_conversion(i)].slen);
01431     }
01432 }
01433 
01434 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str)
01435 {
01436     int32_t l=str.slen;
01437     ST* s=SG_MALLOC(ST, l+1);
01438     memcpy(s, str.string, sizeof(ST)*l);
01439     s[l]='\0';
01440     return s;
01441 }
01442 
01443 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
01444 {
01445     ASSERT(features);
01446     ASSERT(num<get_num_vectors());
01447 
01448     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01449 
01450 
01451     features[real_num].slen=len ;
01452     features[real_num].string=string ;
01453 
01454     max_string_length=CMath::max(len, max_string_length);
01455 }
01456 
01457 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
01458 {
01459     int32_t nsym=get_num_symbols();
01460     int32_t slen=get_max_vector_length();
01461     int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01462     float64_t* h= SG_MALLOC(float64_t, sz);
01463     memset(h, 0, sz);
01464 
01465     float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
01466     memset(h_normalizer, 0, slen*sizeof(float64_t));
01467     int32_t num_str=get_num_vectors();
01468     for (int32_t i=0; i<num_str; i++)
01469     {
01470         int32_t len;
01471         bool free_vec;
01472         ST* vec=get_feature_vector(i, len, free_vec);
01473         for (int32_t j=0; j<len; j++)
01474         {
01475             h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01476             h_normalizer[j]++;
01477         }
01478         free_feature_vector(vec, i, free_vec);
01479     }
01480 
01481     if (normalize)
01482     {
01483         for (int32_t i=0; i<slen; i++)
01484         {
01485             for (int32_t j=0; j<nsym; j++)
01486             {
01487                 if (h_normalizer && h_normalizer[i])
01488                     h[int64_t(i)*nsym+j]/=h_normalizer[i];
01489             }
01490         }
01491     }
01492     SG_FREE(h_normalizer);
01493 
01494     *hist=h;
01495     *rows=nsym;
01496     *cols=slen;
01497 }
01498 
01499 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01500 {
01501     ASSERT(rows == get_num_symbols());
01502     cleanup();
01503     float64_t* randoms=SG_MALLOC(float64_t, cols);
01504     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
01505 
01506     for (int32_t i=0; i<num_vec; i++)
01507     {
01508         sf[i].string=SG_MALLOC(ST, cols);
01509         sf[i].slen=cols;
01510 
01511         SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
01512 
01513         for (int32_t j=0; j<cols; j++)
01514         {
01515             float64_t lik=hist[int64_t(j)*rows+0];
01516 
01517             int32_t c;
01518             for (c=0; c<rows-1; c++)
01519             {
01520                 if (randoms[j]<=lik)
01521                     break;
01522                 lik+=hist[int64_t(j)*rows+c+1];
01523             }
01524             sf[i].string[j]=alphabet->remap_to_char(c);
01525         }
01526     }
01527     SG_FREE(randoms);
01528     set_features(sf, num_vec, cols);
01529 }
01530 
01531 /*
01532 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
01533 {
01534     int *s;
01535     int32_t nStr=get_num_vectors();
01536 
01537     int32_t nfeat=0;
01538     for (int32_t i=0; i < nStr; ++i)
01539         nfeat += get_vector_length[i] - d1 -d2;
01540     SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
01541     int32_t c=0;
01542     for (int32_t i=0; i < nStr; ++i)
01543     {
01544     int32_t len;
01545     bool free_vec;
01546     ST* S=get_feature_vector(vec_num, len, free_vec);
01547     free_feature_vector(vec, vec_num, free_vec);
01548         int32_t n=len - d1 - d2;
01549         s=S[i];
01550         for (int32_t j=0; j < n; ++j)
01551         {
01552             F[c].feature1=s[j];
01553             F[c].feature2=s[j+d1];
01554             F[c].feature3=s[j+d1+d2];
01555             F[c].group=i;
01556             c++;
01557         }
01558     }
01559     ASSERT(nfeat==c);
01560     return F;
01561 }
01562 
01563 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
01564 {
01565     int i, j;
01566     int n, nfeat;
01567     int *group;
01568     int *features;
01569     int *s;
01570     int c;
01571     SSKFeatures *F;
01572 
01573     nfeat=0;
01574     for (i=0; i < nStr; ++i)
01575         nfeat += len[i] - d1;
01576     group=(int *)SG_MALLOC(nfeat*sizeof(int));
01577     features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
01578     c=0;
01579     for (i=0; i < nStr; ++i)
01580     {
01581         n=len[i] - d1;
01582         s=S[i];
01583         for (j=0; j < n; ++j)
01584         {
01585             features[c]=s[j];
01586             features[c+nfeat]=s[j+d1];
01587             group[c]=i;
01588             c++;
01589         }
01590     }
01591     if (nfeat!=c)
01592         printf("Something is wrong...\n");
01593     F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
01594     (*F).features=features;
01595     (*F).group=group;
01596     (*F).n=nfeat;
01597     return F;
01598 }
01599 */
01600 
01601 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(
01602         SGVector<index_t> indices)
01603 {
01604     /* string list to create new CStringFeatures from */
01605     SGStringList<ST> list_copy(indices.vlen, max_string_length);
01606 
01607     /* copy all features */
01608     for (index_t i=0; i<indices.vlen; ++i)
01609     {
01610         /* index with respect to possible subset */
01611         index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
01612 
01613         /* copy string */
01614         SGString<ST> current_string=features[real_idx];
01615         SGString<ST> string_copy(current_string.slen);
01616         memcpy(string_copy.string, current_string.string,
01617             current_string.slen*sizeof(ST));
01618         list_copy.strings[i]=string_copy;
01619     }
01620 
01621     /* create copy instance */
01622     CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
01623 
01624     /* max string length may have changed */
01625     result->determine_maximum_string_length();
01626 
01627     /* keep things from original features (otherwise assertions in x-val) */
01628     result->order=order;
01629     result->compute_symbol_mask_table(result->alphabet->get_num_symbols());
01630 
01631     SG_REF(result);
01632 
01633     return result;
01634 }
01635 
01636 template<class ST> void CStringFeatures<ST>::subset_changed_post()
01637 {
01638     /* max string length has to be updated */
01639     determine_maximum_string_length();
01640 }
01641 
01642 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
01643 {
01644     ASSERT(features && num<get_num_vectors());
01645 
01646     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01647 
01648     len=features[real_num].slen;
01649     if (len<=0)
01650         return NULL;
01651 
01652     ST* target=SG_MALLOC(ST, len);
01653     memcpy(target, features[real_num].string, len*sizeof(ST));
01654     return target;
01655 }
01656 
01657 template<class ST> void CStringFeatures<ST>::init()
01658 {
01659     set_generic<ST>();
01660 
01661     alphabet=NULL;
01662     num_vectors=0;
01663     features=NULL;
01664     single_string=NULL;
01665     length_of_single_string=0;
01666     max_string_length=0;
01667     order=0;
01668     symbol_mask_table=0;
01669     preprocess_on_get=false;
01670     feature_cache=NULL;
01671     symbol_mask_table_len=256;
01672 
01673     m_parameters->add((CSGObject**) &alphabet, "alphabet");
01674     m_parameters->add_vector(&features, &num_vectors, "features",
01675             "This contains the array of features.");
01676     m_parameters->add_vector(&single_string,
01677             &length_of_single_string,
01678             "single_string",
01679             "Created by sliding window.");
01680     m_parameters->add(&max_string_length, "max_string_length",
01681             "Length of longest string.");
01682     m_parameters->add(&num_symbols, "num_symbols",
01683             "Number of used symbols.");
01684     m_parameters->add(&original_num_symbols, "original_num_symbols",
01685             "Original number of used symbols.");
01686     m_parameters->add(&order, "order",
01687             "Order used in higher order mapping.");
01688     m_parameters->add(&preprocess_on_get, "preprocess_on_get",
01689             "Preprocess on-the-fly?");
01690 
01691     /* TODO M_PARAMETERS->ADD?
01692      * /// order used in higher order mapping
01693      * ST* symbol_mask_table;
01694      */
01695     m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask table", "fuck you");
01696 }
01697 
01702 template<> EFeatureType CStringFeatures<bool>::get_feature_type() const
01703 {
01704     return F_BOOL;
01705 }
01706 
01711 template<> EFeatureType CStringFeatures<char>::get_feature_type() const
01712 {
01713     return F_CHAR;
01714 }
01715 
01720 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type() const
01721 {
01722     return F_BYTE;
01723 }
01724 
01729 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type() const
01730 {
01731     return F_SHORT;
01732 }
01733 
01738 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type() const
01739 {
01740     return F_WORD;
01741 }
01742 
01747 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type() const
01748 {
01749     return F_INT;
01750 }
01751 
01756 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type() const
01757 {
01758     return F_UINT;
01759 }
01760 
01765 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type() const
01766 {
01767     return F_LONG;
01768 }
01769 
01774 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type() const
01775 {
01776     return F_ULONG;
01777 }
01778 
01783 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type() const
01784 {
01785     return F_SHORTREAL;
01786 }
01787 
01792 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type() const
01793 {
01794     return F_DREAL;
01795 }
01796 
01801 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type() const
01802 {
01803     return F_LONGREAL;
01804 }
01805 
01806 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01807 {
01808     return symbol;
01809 }
01810 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01811 {
01812     return symbol;
01813 }
01814 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01815 {
01816     return symbol;
01817 }
01818 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01819 {
01820     return symbol;
01821 }
01822 
01823 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01824 {
01825     return false;
01826 }
01827 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01828 {
01829     return 0;
01830 }
01831 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01832 {
01833     return 0;
01834 }
01835 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01836 {
01837     return 0;
01838 }
01839 
01840 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01841 {
01842     return symbol;
01843 }
01844 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01845 {
01846     return symbol;
01847 }
01848 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01849 {
01850     return symbol;
01851 }
01852 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01853 {
01854     return symbol;
01855 }
01856 
01857 #ifndef SUNOS
01858 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01859 {
01860     return false;
01861 }
01862 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01863 {
01864     return false;
01865 }
01866 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01867 {
01868     return false;
01869 }
01870 #endif
01871 
01872 template<>  void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01873 {
01874 }
01875 template<>  void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01876 {
01877 }
01878 template<>  void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01879 {
01880 }
01881 
01882 template<>  void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01883 {
01884 }
01885 template<>  void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01886 {
01887 }
01888 template<>  void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01889 {
01890 }
01891 
01892 template<>  float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01893 {
01894     return 0;
01895 }
01896 template<>  float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01897 {
01898     return 0;
01899 }
01900 template<>  floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01901 {
01902     return 0;
01903 }
01904 
01905 template<>  void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01906 {
01907 }
01908 template<>  void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01909 {
01910 }
01911 template<>  void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01912 {
01913 }
01914 #define LOAD(f_load, sg_type)                                               \
01915 template<> void CStringFeatures<sg_type>::load(CFile* loader)       \
01916 {                                                                           \
01917     SG_INFO( "loading...\n");                                               \
01918                                                                             \
01919     SG_SET_LOCALE_C;                                                    \
01920     SGString<sg_type>* strs;                                                \
01921     int32_t num_str;                                                        \
01922     int32_t max_len;                                                        \
01923     loader->f_load(strs, num_str, max_len);                                 \
01924     set_features(strs, num_str, max_len);                                   \
01925     SG_RESET_LOCALE;                                                    \
01926 }
01927 
01928 LOAD(get_string_list, bool)
01929 LOAD(get_string_list, char)
01930 LOAD(get_int8_string_list, int8_t)
01931 LOAD(get_string_list, uint8_t)
01932 LOAD(get_string_list, int16_t)
01933 LOAD(get_string_list, uint16_t)
01934 LOAD(get_string_list, int32_t)
01935 LOAD(get_uint_string_list, uint32_t)
01936 LOAD(get_long_string_list, int64_t)
01937 LOAD(get_ulong_string_list, uint64_t)
01938 LOAD(get_string_list, float32_t)
01939 LOAD(get_string_list, float64_t)
01940 LOAD(get_longreal_string_list, floatmax_t)
01941 #undef LOAD
01942 
01943 #define SAVE(f_write, sg_type)                                              \
01944 template<> void CStringFeatures<sg_type>::save(CFile* writer)       \
01945 {                                                                           \
01946     if (m_subset_stack->has_subsets())                                                          \
01947         SG_ERROR("save() is not possible on subset");                       \
01948     SG_SET_LOCALE_C;                                                    \
01949     ASSERT(writer);                                                         \
01950     writer->f_write(features, num_vectors);                                 \
01951     SG_RESET_LOCALE;                                                    \
01952 }
01953 
01954 SAVE(set_string_list, bool)
01955 SAVE(set_string_list, char)
01956 SAVE(set_int8_string_list, int8_t)
01957 SAVE(set_string_list, uint8_t)
01958 SAVE(set_string_list, int16_t)
01959 SAVE(set_string_list, uint16_t)
01960 SAVE(set_string_list, int32_t)
01961 SAVE(set_uint_string_list, uint32_t)
01962 SAVE(set_long_string_list, int64_t)
01963 SAVE(set_ulong_string_list, uint64_t)
01964 SAVE(set_string_list, float32_t)
01965 SAVE(set_string_list, float64_t)
01966 SAVE(set_longreal_string_list, floatmax_t)
01967 #undef SAVE
01968 
01969 template <class ST> template <class CT>
01970 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
01971         int32_t p_order, int32_t gap, bool rev)
01972 {
01973     remove_all_subsets();
01974     ASSERT(sf);
01975 
01976     CAlphabet* alpha=sf->get_alphabet();
01977     ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01978 
01979     this->order=p_order;
01980     cleanup();
01981 
01982     num_vectors=sf->get_num_vectors();
01983     ASSERT(num_vectors>0);
01984     max_string_length=sf->get_max_vector_length()-start;
01985     features=SG_MALLOC(SGString<ST>, num_vectors);
01986 
01987     SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01988             alpha->get_num_symbols_in_histogram());
01989 
01990     for (int32_t i=0; i<num_vectors; i++)
01991     {
01992         int32_t len=-1;
01993         bool vfree;
01994         CT* c=sf->get_feature_vector(i, len, vfree);
01995         ASSERT(!vfree); // won't work when preprocessors are attached
01996 
01997         features[i].string=SG_MALLOC(ST, len);
01998         features[i].slen=len;
01999 
02000         ST* str=features[i].string;
02001         for (int32_t j=0; j<len; j++)
02002             str[j]=(ST) alpha->remap_to_bin(c[j]);
02003     }
02004 
02005     original_num_symbols=alpha->get_num_symbols();
02006     int32_t max_val=alpha->get_num_bits();
02007 
02008     SG_UNREF(alpha);
02009 
02010     if (p_order>1)
02011         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
02012     else
02013         num_symbols=original_num_symbols;
02014     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
02015 
02016     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
02017     {
02018         SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
02019         return false;
02020     }
02021 
02022     SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
02023     for (int32_t line=0; line<num_vectors; line++)
02024     {
02025         int32_t len=0;
02026         bool vfree;
02027         ST* fv=get_feature_vector(line, len, vfree);
02028         ASSERT(!vfree); // won't work when preprocessors are attached
02029 
02030         if (rev)
02031             CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
02032         else
02033             CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
02034 
02035         /* fix the length of the string -- hacky */
02036         features[line].slen-=start+gap ;
02037         if (features[line].slen<0)
02038             features[line].slen=0 ;
02039     }
02040 
02041     compute_symbol_mask_table(max_val);
02042 
02043     return true;
02044 }
02045 
02046 template class CStringFeatures<bool>;
02047 template class CStringFeatures<char>;
02048 template class CStringFeatures<int8_t>;
02049 template class CStringFeatures<uint8_t>;
02050 template class CStringFeatures<int16_t>;
02051 template class CStringFeatures<uint16_t>;
02052 template class CStringFeatures<int32_t>;
02053 template class CStringFeatures<uint32_t>;
02054 template class CStringFeatures<int64_t>;
02055 template class CStringFeatures<uint64_t>;
02056 template class CStringFeatures<float32_t>;
02057 template class CStringFeatures<float64_t>;
02058 template class CStringFeatures<floatmax_t>;
02059 
02060 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02061 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02062 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02063 
02064 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02065 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02066 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02067 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation