StringFeatures.cpp

Go to the documentation of this file.
00001 #include <shogun/features/StringFeatures.h>
00002 #include <shogun/preprocessor/Preprocessor.h>
00003 #include <shogun/preprocessor/StringPreprocessor.h>
00004 #include <shogun/io/MemoryMappedFile.h>
00005 #include <shogun/io/SGIO.h>
00006 #include <shogun/mathematics/Math.h>
00007 #include <shogun/base/Parameter.h>
00008 
00009 #include <sys/types.h>
00010 #include <sys/stat.h>
00011 #include <dirent.h>
00012 #include <stdio.h>
00013 #include <stdlib.h>
00014 #include <unistd.h>
00015 
00016 
00017 namespace shogun
00018 {
00019 
00020 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0)
00021 {
00022     init();
00023     alphabet=new CAlphabet();
00024 }
00025 
00026 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0)
00027 {
00028     init();
00029 
00030     alphabet=new CAlphabet(alpha);
00031     SG_REF(alphabet);
00032     num_symbols=alphabet->get_num_symbols();
00033     original_num_symbols=num_symbols;
00034 }
00035 
00036 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00037 : CFeatures(0)
00038 {
00039     init();
00040 
00041     alphabet=new CAlphabet(alpha);
00042     SG_REF(alphabet);
00043     num_symbols=alphabet->get_num_symbols();
00044     original_num_symbols=num_symbols;
00045     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00046 }
00047 
00048 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00049 : CFeatures(0)
00050 {
00051     init();
00052 
00053     alphabet=new CAlphabet(alpha);
00054     SG_REF(alphabet);
00055     num_symbols=alphabet->get_num_symbols();
00056     original_num_symbols=num_symbols;
00057     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00058 }
00059 
00060 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha)
00061 : CFeatures(0)
00062 {
00063     init();
00064 
00065     ASSERT(alpha);
00066     SG_REF(alpha);
00067     alphabet=alpha;
00068     num_symbols=alphabet->get_num_symbols();
00069     original_num_symbols=num_symbols;
00070 }
00071 
00072 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
00073 : CFeatures(orig), num_vectors(orig.num_vectors),
00074     single_string(orig.single_string),
00075     length_of_single_string(orig.length_of_single_string),
00076     max_string_length(orig.max_string_length),
00077     num_symbols(orig.num_symbols),
00078     original_num_symbols(orig.original_num_symbols),
00079     order(orig.order), preprocess_on_get(false),
00080     feature_cache(NULL)
00081 {
00082     init();
00083 
00084     ASSERT(orig.single_string == NULL); //not implemented
00085 
00086     alphabet=orig.alphabet;
00087     SG_REF(alphabet);
00088 
00089     if (orig.features)
00090     {
00091         features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00092 
00093         for (int32_t i=0; i<num_vectors; i++)
00094         {
00095             features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00096             features[i].slen=orig.features[i].slen;
00097             memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00098         }
00099     }
00100 
00101     if (orig.symbol_mask_table)
00102     {
00103         symbol_mask_table=SG_MALLOC(ST, 256);
00104         for (int32_t i=0; i<256; i++)
00105             symbol_mask_table[i]=orig.symbol_mask_table[i];
00106     }
00107 
00108     m_subset=orig.m_subset->duplicate();
00109 }
00110 
00111 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
00112 : CFeatures(loader), num_vectors(0),
00113   features(NULL), single_string(NULL), length_of_single_string(0),
00114   max_string_length(0), order(0),
00115   symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00116 {
00117     init();
00118 
00119     alphabet=new CAlphabet(alpha);
00120     SG_REF(alphabet);
00121     num_symbols=alphabet->get_num_symbols();
00122     original_num_symbols=num_symbols;
00123     load(loader);
00124 }
00125 
00126 template<class ST> CStringFeatures<ST>::~CStringFeatures()
00127 {
00128     cleanup();
00129 
00130     SG_UNREF(alphabet);
00131 }
00132 
00133 template<class ST> void CStringFeatures<ST>::cleanup()
00134 {
00135     remove_subset();
00136 
00137     if (single_string)
00138     {
00139         SG_FREE(single_string);
00140         single_string=NULL;
00141     }
00142     else
00143         cleanup_feature_vectors(0, num_vectors-1);
00144 
00145     num_vectors=0;
00146     SG_FREE(features);
00147     SG_FREE(symbol_mask_table);
00148     features=NULL;
00149     symbol_mask_table=NULL;
00150 
00151     /* start with a fresh alphabet, but instead of emptying the histogram
00152      * create a new object (to leave the alphabet object alone if it is used
00153      * by others)
00154      */
00155     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00156     SG_UNREF(alphabet);
00157     alphabet=alpha;
00158     SG_REF(alphabet);
00159 }
00160 
00161 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
00162 {
00163     ASSERT(num<get_num_vectors());
00164 
00165     if (features)
00166     {
00167         int32_t real_num=subset_idx_conversion(num);
00168         SG_FREE(features[real_num].string);
00169         features[real_num].string=NULL;
00170         features[real_num].slen=0;
00171 
00172         determine_maximum_string_length();
00173     }
00174 }
00175 
00176 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
00177 {
00178     if (features && get_num_vectors())
00179     {
00180         ASSERT(start<get_num_vectors());
00181         ASSERT(stop<get_num_vectors());
00182 
00183         for (int32_t i=start; i<=stop; i++)
00184         {
00185             int32_t real_num=subset_idx_conversion(i);
00186             SG_FREE(features[real_num].string);
00187             features[real_num].string=NULL;
00188             features[real_num].slen=0;
00189         }
00190         determine_maximum_string_length();
00191     }
00192 }
00193 
00194 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() { return C_STRING; }
00195 
00196 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() { return F_UNKNOWN; }
00197 
00198 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet()
00199 {
00200     SG_REF(alphabet);
00201     return alphabet;
00202 }
00203 
00204 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
00205 {
00206     return new CStringFeatures<ST>(*this);
00207 }
00208 
00209 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num)
00210 {
00211     ASSERT(features);
00212     if (num>=get_num_vectors())
00213     {
00214         SG_ERROR("Index out of bounds (number of strings %d, you "
00215                 "requested %d)\n", get_num_vectors(), num);
00216     }
00217 
00218     int32_t l;
00219     bool free_vec;
00220     ST* vec=get_feature_vector(num, l, free_vec);
00221     ST* dst=SG_MALLOC(ST, l);
00222     memcpy(dst, vec, l*sizeof(ST));
00223     free_feature_vector(vec, num, free_vec);
00224     return SGVector<ST>(dst, l);
00225 }
00226 
00227 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
00228 {
00229     ASSERT(features);
00230 
00231     if (m_subset)
00232         SG_ERROR("A subset is set, cannot set feature vector\n");
00233 
00234     if (num>=num_vectors)
00235     {
00236         SG_ERROR("Index out of bounds (number of strings %d, you "
00237                 "requested %d)\n", num_vectors, num);
00238     }
00239 
00240     if (vector.vlen<=0)
00241         SG_ERROR("String has zero or negative length\n");
00242 
00243     cleanup_feature_vector(num);
00244     features[num].slen=vector.vlen;
00245     features[num].string=SG_MALLOC(ST, vector.vlen);
00246     memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00247 
00248     determine_maximum_string_length();
00249 }
00250 
00251 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing()
00252 {
00253     preprocess_on_get=true;
00254 }
00255 
00256 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing()
00257 {
00258     preprocess_on_get=false;
00259 }
00260 
00261 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00262 {
00263     ASSERT(features);
00264     ASSERT(num<get_num_vectors());
00265 
00266 
00267     int32_t real_num=subset_idx_conversion(num);
00268 
00269     if (!preprocess_on_get)
00270     {
00271         dofree=false;
00272         len=features[real_num].slen;
00273         return features[real_num].string;
00274     }
00275     else
00276     {
00277         SG_DEBUG( "computing feature vector!\n") ;
00278         ST* feat=compute_feature_vector(num, len);
00279         dofree=true;
00280 
00281         if (get_num_preprocessors())
00282         {
00283             ST* tmp_feat_before=feat;
00284 
00285             for (int32_t i=0; i<get_num_preprocessors(); i++)
00286             {
00287                 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00288                 feat=p->apply_to_string(tmp_feat_before, len);
00289                 SG_UNREF(p);
00290                 SG_FREE(tmp_feat_before);
00291                 tmp_feat_before=feat;
00292             }
00293         }
00294         // TODO: implement caching
00295         return feat;
00296     }
00297 }
00298 
00299 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed()
00300 {
00301     int32_t num_feat;
00302     int32_t num_vec;
00303     SGString<ST>* s=get_transposed(num_feat, num_vec);
00304     SGStringList<ST> string_list;
00305     string_list.strings = s;
00306     string_list.num_strings = num_vec;
00307     string_list.max_string_length = num_feat;
00308 
00309     return new CStringFeatures<ST>(string_list, alphabet);
00310 }
00311 
00312 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00313 {
00314     num_feat=get_num_vectors();
00315     num_vec=get_max_vector_length();
00316     ASSERT(have_same_length());
00317 
00318     SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00319             int64_t(num_feat)*num_vec);
00320 
00321     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00322 
00323     for (int32_t i=0; i<num_vec; i++)
00324     {
00325         sf[i].string=SG_MALLOC(ST, num_feat);
00326         sf[i].slen=num_feat;
00327     }
00328 
00329     for (int32_t i=0; i<num_feat; i++)
00330     {
00331         int32_t len=0;
00332         bool free_vec=false;
00333         ST* vec=get_feature_vector(i, len, free_vec);
00334 
00335         for (int32_t j=0; j<num_vec; j++)
00336             sf[j].string[i]=vec[j];
00337 
00338         free_feature_vector(vec, i, free_vec);
00339     }
00340     return sf;
00341 }
00342 
00343 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00344 {
00345     if (num>=get_num_vectors())
00346     {
00347         SG_ERROR(
00348             "Trying to access string[%d] but num_str=%d\n", num,
00349             get_num_vectors());
00350     }
00351 
00352     int32_t real_num=subset_idx_conversion(num);
00353 
00354     if (feature_cache)
00355         feature_cache->unlock_entry(real_num);
00356 
00357     if (dofree)
00358         SG_FREE(feat_vec);
00359 }
00360 
00361 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00362 {
00363     if (num>=get_num_vectors())
00364     {
00365         SG_ERROR(
00366             "Trying to access string[%d] but num_str=%d\n", num,
00367             get_num_vectors());
00368     }
00369 
00370     int32_t real_num=subset_idx_conversion(num);
00371 
00372     if (feature_cache)
00373         feature_cache->unlock_entry(real_num);
00374 
00375     if (feat_vec.do_free)
00376         SG_FREE(feat_vec.vector);
00377 }
00378 
00379 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
00380 {
00381     ASSERT(vec_num<get_num_vectors());
00382 
00383     int32_t len;
00384     bool free_vec;
00385     ST* vec=get_feature_vector(vec_num, len, free_vec);
00386     ASSERT(feat_num<len);
00387     ST result=vec[feat_num];
00388     free_feature_vector(vec, vec_num, free_vec);
00389 
00390     return result;
00391 }
00392 
00393 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
00394 {
00395     ASSERT(vec_num<get_num_vectors());
00396 
00397     int32_t len;
00398     bool free_vec;
00399     ST* vec=get_feature_vector(vec_num, len, free_vec);
00400     free_feature_vector(vec, vec_num, free_vec);
00401     return len;
00402 }
00403 
00404 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
00405 {
00406     return max_string_length;
00407 }
00408 
00409 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
00410 {
00411     return m_subset ? m_subset->get_size() : num_vectors;
00412 }
00413 
00414 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
00415 
00416 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00417 
00418 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
00419 
00420 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
00421 
00422 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
00423 {
00424     ASSERT(symbol_mask_table);
00425     return symbol_mask_table[mask] & symbol;
00426 }
00427 
00428 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
00429 {
00430     ASSERT(alphabet);
00431     return (offset << (amount*alphabet->get_num_bits()));
00432 }
00433 
00434 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
00435 {
00436     ASSERT(alphabet);
00437     return (symbol >> (amount*alphabet->get_num_bits()));
00438 }
00439 
00440 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
00441         EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00442 {
00443     remove_subset();
00444 
00445     size_t blocksize=1024*1024;
00446     size_t required_blocksize=0;
00447     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00448     uint8_t* overflow=NULL;
00449     int32_t overflow_len=0;
00450 
00451     cleanup();
00452 
00453     CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00454     CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00455 
00456     FILE* f=fopen(fname, "ro");
00457 
00458     if (f)
00459     {
00460         num_vectors=0;
00461         max_string_length=0;
00462 
00463         SG_INFO("counting line numbers in file %s\n", fname);
00464         size_t block_offs=0;
00465         size_t old_block_offs=0;
00466         fseek(f, 0, SEEK_END);
00467         size_t fsize=ftell(f);
00468         rewind(f);
00469 
00470         if (blocksize>fsize)
00471             blocksize=fsize;
00472 
00473         SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00474 
00475         size_t sz=blocksize;
00476         while (sz == blocksize)
00477         {
00478             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00479             for (size_t i=0; i<sz; i++)
00480             {
00481                 block_offs++;
00482                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00483                 {
00484                     num_vectors++;
00485                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00486                     old_block_offs=block_offs;
00487                 }
00488             }
00489             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00490         }
00491 
00492         SG_INFO("found %d strings\n", num_vectors);
00493         SG_FREE(dummy);
00494         blocksize=required_blocksize;
00495         dummy=SG_MALLOC(uint8_t, blocksize);
00496         overflow=SG_MALLOC(uint8_t, blocksize);
00497         features=SG_MALLOC(SGString<ST>, num_vectors);
00498 
00499         rewind(f);
00500         sz=blocksize;
00501         int32_t lines=0;
00502         while (sz == blocksize)
00503         {
00504             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00505 
00506             size_t old_sz=0;
00507             for (size_t i=0; i<sz; i++)
00508             {
00509                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00510                 {
00511                     int32_t len=i-old_sz;
00512                     //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00513                     max_string_length=CMath::max(max_string_length, len+overflow_len);
00514 
00515                     features[lines].slen=len;
00516                     features[lines].string=SG_MALLOC(ST, len);
00517 
00518                     if (remap_to_bin)
00519                     {
00520                         for (int32_t j=0; j<overflow_len; j++)
00521                             features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00522                         for (int32_t j=0; j<len; j++)
00523                             features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00524                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00525                         alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00526                     }
00527                     else
00528                     {
00529                         for (int32_t j=0; j<overflow_len; j++)
00530                             features[lines].string[j]=overflow[j];
00531                         for (int32_t j=0; j<len; j++)
00532                             features[lines].string[j+overflow_len]=dummy[old_sz+j];
00533                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00534                         alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00535                     }
00536 
00537                     // clear overflow
00538                     overflow_len=0;
00539 
00540                     //CMath::display_vector(features[lines].string, len);
00541                     old_sz=i+1;
00542                     lines++;
00543                     SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00544                 }
00545             }
00546             for (size_t i=old_sz; i<sz; i++)
00547                 overflow[i-old_sz]=dummy[i];
00548 
00549             overflow_len=sz-old_sz;
00550         }
00551 
00552         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00553         {
00554             SG_INFO("file successfully read\n");
00555             SG_INFO("max_string_length=%d\n", max_string_length);
00556             SG_INFO("num_strings=%d\n", num_vectors);
00557         }
00558         fclose(f);
00559     }
00560 
00561     SG_FREE(dummy);
00562 
00563     SG_UNREF(alphabet);
00564 
00565     if (remap_to_bin)
00566         alphabet=alpha_bin;
00567     else
00568         alphabet=alpha;
00569     SG_REF(alphabet);
00570     num_symbols=alphabet->get_num_symbols();
00571 }
00572 
00573 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
00574 {
00575     remove_subset();
00576 
00577     int32_t i=0;
00578     uint64_t len=0;
00579     uint64_t offs=0;
00580     int32_t num=0;
00581     int32_t max_len=0;
00582 
00583     CMemoryMappedFile<char> f(fname);
00584 
00585     while (true)
00586     {
00587         char* s=f.get_line(len, offs);
00588         if (!s)
00589             break;
00590 
00591         if (len>0 && s[0]=='>')
00592             num++;
00593     }
00594 
00595     if (num==0)
00596         SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00597 
00598     cleanup();
00599     SG_UNREF(alphabet);
00600     alphabet=new CAlphabet(DNA);
00601     num_symbols=alphabet->get_num_symbols();
00602 
00603     SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00604     offs=0;
00605 
00606     for (i=0;i<num; i++)
00607     {
00608         uint64_t id_len=0;
00609         char* id=f.get_line(id_len, offs);
00610 
00611         char* fasta=f.get_line(len, offs);
00612         char* s=fasta;
00613         int32_t fasta_len=0;
00614         int32_t spanned_lines=0;
00615 
00616         while (true)
00617         {
00618             if (!s || len==0)
00619                 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00620 
00621             if (s[0]=='>' || offs==f.get_size())
00622             {
00623                 offs-=len+1; // seek to beginning
00624                 if (offs==f.get_size())
00625                 {
00626                     SG_DEBUG("at EOF\n");
00627                     fasta_len+=len;
00628                 }
00629 
00630                 len=fasta_len-spanned_lines;
00631                 strings[i].string=SG_MALLOC(ST, len);
00632                 strings[i].slen=len;
00633 
00634                 ST* str=strings[i].string;
00635                 int32_t idx=0;
00636                 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00637 
00638                 for (int32_t j=0; j<fasta_len; j++)
00639                 {
00640                     if (fasta[j]=='\n')
00641                         continue;
00642 
00643                     ST c=(ST) fasta[j];
00644 
00645                     if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00646                         c=(ST) 'A';
00647 
00648                     if (uint64_t(idx)>=len)
00649                         SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00650                     str[idx++]=c;
00651                 }
00652                 max_len=CMath::max(max_len, strings[i].slen);
00653 
00654 
00655                 break;
00656             }
00657 
00658             spanned_lines++;
00659             fasta_len+=len+1; // including '\n'
00660             s=f.get_line(len, offs);
00661         }
00662     }
00663     return set_features(strings, num, max_len);
00664 }
00665 
00666 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
00667         bool ignore_invalid, bool bitremap_in_single_string)
00668 {
00669     remove_subset();
00670 
00671     CMemoryMappedFile<char> f(fname);
00672 
00673     int32_t i=0;
00674     uint64_t len=0;
00675     uint64_t offs=0;
00676 
00677     int32_t num=f.get_num_lines();
00678     int32_t max_len=0;
00679 
00680     if (num%4)
00681         SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00682     num/=4;
00683 
00684     cleanup();
00685     SG_UNREF(alphabet);
00686     alphabet=new CAlphabet(DNA);
00687 
00688     SGString<ST>* strings;
00689 
00690     ST* str=NULL;
00691     if (bitremap_in_single_string)
00692     {
00693         strings=SG_MALLOC(SGString<ST>, 1);
00694         strings[0].string=SG_MALLOC(ST, num);
00695         strings[0].slen=num;
00696         f.get_line(len, offs);
00697         f.get_line(len, offs);
00698         order=len;
00699         max_len=num;
00700         offs=0;
00701         original_num_symbols=alphabet->get_num_symbols();
00702         str=SG_MALLOC(ST, len);
00703     }
00704     else
00705         strings=SG_MALLOC(SGString<ST>, num);
00706 
00707     for (i=0;i<num; i++)
00708     {
00709         if (!f.get_line(len, offs))
00710             SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00711 
00712         char* s=f.get_line(len, offs);
00713         if (!s || len==0)
00714             SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00715 
00716         if (bitremap_in_single_string)
00717         {
00718             if (len!=(uint64_t) order)
00719                 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00720             for (int32_t j=0; j<order; j++)
00721                 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00722 
00723             strings[0].string[i]=embed_word(str, order);
00724         }
00725         else
00726         {
00727             strings[i].string=SG_MALLOC(ST, len);
00728             strings[i].slen=len;
00729             str=strings[i].string;
00730 
00731             if (ignore_invalid)
00732             {
00733                 for (uint64_t j=0; j<len; j++)
00734                 {
00735                     if (alphabet->is_valid((uint8_t) s[j]))
00736                         str[j]= (ST) s[j];
00737                     else
00738                         str[j]= (ST) 'A';
00739                 }
00740             }
00741             else
00742             {
00743                 for (uint64_t j=0; j<len; j++)
00744                     str[j]= (ST) s[j];
00745             }
00746             max_len=CMath::max(max_len, (int32_t) len);
00747         }
00748 
00749 
00750         if (!f.get_line(len, offs))
00751             SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00752 
00753         if (!f.get_line(len, offs))
00754             SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00755     }
00756 
00757     if (bitremap_in_single_string)
00758         num=1;
00759 
00760     num_vectors=num;
00761     max_string_length=max_len;
00762     features=strings;
00763 
00764     return true;
00765 }
00766 
00767 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
00768 {
00769     remove_subset();
00770 
00771     struct dirent **namelist;
00772     int32_t n;
00773 
00774     SGIO::set_dirname(dirname);
00775 
00776     SG_DEBUG("dirname '%s'\n", dirname);
00777 
00778     n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
00779     if (n <= 0)
00780     {
00781         SG_ERROR("error calling scandir - no files found\n");
00782         return false;
00783     }
00784     else
00785     {
00786         SGString<ST>* strings=NULL;
00787 
00788         int32_t num=0;
00789         int32_t max_len=-1;
00790 
00791         //usually n==num_vec, but it might not in race conditions
00792         //(file perms modified, file erased)
00793         strings=SG_MALLOC(SGString<ST>, n);
00794 
00795         for (int32_t i=0; i<n; i++)
00796         {
00797             char* fname=SGIO::concat_filename(namelist[i]->d_name);
00798 
00799             struct stat s;
00800             off_t filesize=0;
00801 
00802             if (!stat(fname, &s) && s.st_size>0)
00803             {
00804                 filesize=s.st_size/sizeof(ST);
00805 
00806                 FILE* f=fopen(fname, "ro");
00807                 if (f)
00808                 {
00809                     ST* str=SG_MALLOC(ST, filesize);
00810                     SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00811                     if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
00812                         SG_ERROR("failed to read file\n");
00813                     strings[num].string=str;
00814                     strings[num].slen=filesize;
00815                     max_len=CMath::max(max_len, strings[num].slen);
00816 
00817                     num++;
00818                     fclose(f);
00819                 }
00820             }
00821             else
00822                 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00823 
00824             SG_FREE(namelist[i]);
00825         }
00826         SG_FREE(namelist);
00827 
00828         if (num>0 && strings)
00829         {
00830             set_features(strings, num, max_len);
00831             return true;
00832         }
00833     }
00834     return false;
00835 }
00836 
00837 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats)
00838 {
00839     set_features(feats.strings, feats.num_strings, feats.max_string_length);
00840 }
00841 
00842 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00843 {
00844     if (m_subset)
00845         SG_ERROR("Cannot call set_features() with subset.\n");
00846 
00847     if (p_features)
00848     {
00849         CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00850 
00851         //compute histogram for char/byte
00852         for (int32_t i=0; i<p_num_vectors; i++)
00853             alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00854 
00855         SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00856         SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00857 
00858         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00859         {
00860             cleanup();
00861             SG_UNREF(alphabet);
00862 
00863             alphabet=alpha;
00864             SG_REF(alphabet);
00865 
00866             features=p_features;
00867             num_vectors=p_num_vectors;
00868             max_string_length=p_max_string_length;
00869 
00870             return true;
00871         }
00872         else
00873             SG_UNREF(alpha);
00874     }
00875 
00876     return false;
00877 }
00878 
00879 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf)
00880 {
00881     ASSERT(sf);
00882 
00883     if (m_subset)
00884         SG_ERROR("Cannot call set_features() with subset.\n");
00885 
00886     SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
00887 
00888     index_t sf_num_str=sf->get_num_vectors();
00889     for (int32_t i=0; i<sf_num_str; i++)
00890     {
00891         int32_t real_i = sf->subset_idx_conversion(i);
00892         int32_t length=sf->features[real_i].slen;
00893         new_features[i].string=SG_MALLOC(ST, length);
00894         memcpy(new_features[i].string, sf->features[real_i].string, length);
00895         new_features[i].slen=length;
00896     }
00897     return append_features(new_features, sf_num_str,
00898             sf->max_string_length);
00899 }
00900 
00901 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00902 {
00903     if (m_subset)
00904         SG_ERROR("Cannot call set_features() with subset.\n");
00905 
00906     if (!features)
00907         return set_features(p_features, p_num_vectors, p_max_string_length);
00908 
00909     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00910 
00911     //compute histogram for char/byte
00912     for (int32_t i=0; i<p_num_vectors; i++)
00913         alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00914 
00915     SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00916     SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00917 
00918     if (alpha->check_alphabet_size() && alpha->check_alphabet())
00919     {
00920         SG_UNREF(alpha);
00921         for (int32_t i=0; i<p_num_vectors; i++)
00922             alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00923 
00924         int32_t old_num_vectors=num_vectors;
00925         num_vectors=old_num_vectors+p_num_vectors;
00926         SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
00927 
00928         for (int32_t i=0; i<num_vectors; i++)
00929         {
00930             if (i<old_num_vectors)
00931             {
00932                 new_features[i].string=features[i].string;
00933                 new_features[i].slen=features[i].slen;
00934             }
00935             else
00936             {
00937                 new_features[i].string=p_features[i-old_num_vectors].string;
00938                 new_features[i].slen=p_features[i-old_num_vectors].slen;
00939             }
00940         }
00941         SG_FREE(features);
00942         SG_FREE(p_features); // free now obsolete features
00943 
00944         this->features=new_features;
00945         max_string_length=CMath::max(max_string_length, p_max_string_length);
00946 
00947         return true;
00948     }
00949     SG_UNREF(alpha);
00950 
00951     return false;
00952 }
00953 
00954 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features()
00955 {
00956     SGStringList<ST> sl;
00957 
00958     sl.strings=get_features(sl.num_strings, sl.max_string_length);
00959     return sl;
00960 }
00961 
00962 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
00963 {
00964     if (m_subset)
00965         SG_ERROR("get features() is not possible on subset");
00966 
00967     num_str=num_vectors;
00968     max_str_len=max_string_length;
00969     return features;
00970 }
00971 
00972 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
00973 {
00974     ASSERT(num_vectors>0);
00975 
00976     num_str=get_num_vectors();
00977     max_str_len=max_string_length;
00978     SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
00979 
00980     for (int32_t i=0; i<num_str; i++)
00981     {
00982         int32_t len;
00983         bool free_vec;
00984         ST* vec=get_feature_vector(i, len, free_vec);
00985         new_feat[i].string=SG_MALLOC(ST, len);
00986         new_feat[i].slen=len;
00987         memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
00988         free_feature_vector(vec, i, free_vec);
00989     }
00990 
00991     return new_feat;
00992 }
00993 
00994 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
00995 {
00996     int32_t num_vec;
00997     int32_t max_str_len;
00998     *dst=copy_features(num_vec, max_str_len);
00999     *num_str=num_vec;
01000 }
01001 
01002 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
01003 {
01004     remove_subset();
01005 
01006     FILE* file=NULL;
01007 
01008     if (!(file=fopen(src, "r")))
01009         return false;
01010     cleanup();
01011 
01012     // header shogun v0
01013     char id[4];
01014     if (fread(&id[0], sizeof(char), 1, file)!=1)
01015         SG_ERROR("failed to read header");
01016     ASSERT(id[0]=='S');
01017     if (fread(&id[1], sizeof(char), 1, file)!=1)
01018         SG_ERROR("failed to read header");
01019     ASSERT(id[1]=='G');
01020     if (fread(&id[2], sizeof(char), 1, file)!=1)
01021         SG_ERROR("failed to read header");
01022     ASSERT(id[2]=='V');
01023     if (fread(&id[3], sizeof(char), 1, file)!=1)
01024         SG_ERROR("failed to read header");
01025     ASSERT(id[3]=='0');
01026 
01027     //compression type
01028     uint8_t c;
01029     if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01030         SG_ERROR("failed to read compression type");
01031     CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01032     //alphabet
01033     uint8_t a;
01034     delete alphabet;
01035     if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01036         SG_ERROR("failed to read compression alphabet");
01037     alphabet=new CAlphabet((EAlphabet) a);
01038     // number of vectors
01039     if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01040         SG_ERROR("failed to read compression number of vectors");
01041     ASSERT(num_vectors>0);
01042     // maximum string length
01043     if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01044         SG_ERROR("failed to read maximum string length");
01045     ASSERT(max_string_length>0);
01046 
01047     features=SG_MALLOC(SGString<ST>, num_vectors);
01048 
01049     // vectors
01050     for (int32_t i=0; i<num_vectors; i++)
01051     {
01052         // vector len compressed
01053         int32_t len_compressed;
01054         if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01055             SG_ERROR("failed to read vector length compressed");
01056         // vector len uncompressed
01057         int32_t len_uncompressed;
01058         if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01059             SG_ERROR("failed to read vector length uncompressed");
01060 
01061         // vector raw data
01062         if (decompress)
01063         {
01064             features[i].string=SG_MALLOC(ST, len_uncompressed);
01065             features[i].slen=len_uncompressed;
01066             uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01067             if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01068                 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
01069             uint64_t uncompressed_size=len_uncompressed;
01070             uncompressed_size*=sizeof(ST);
01071             compressor->decompress(compressed, len_compressed,
01072                     (uint8_t*) features[i].string, uncompressed_size);
01073             SG_FREE(compressed);
01074             ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01075         }
01076         else
01077         {
01078             int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01079             features[i].string=SG_MALLOC(ST, len_compressed+offs);
01080             features[i].slen=len_compressed+offs;
01081             int32_t* feat32ptr=((int32_t*) (features[i].string));
01082             memset(features[i].string, 0, offs*sizeof(ST));
01083             feat32ptr[0]=(int32_t) len_compressed;
01084             feat32ptr[1]=(int32_t) len_uncompressed;
01085             uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01086             if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01087                 SG_ERROR("failed to read uncompressed data");
01088         }
01089     }
01090 
01091     delete compressor;
01092     fclose(file);
01093 
01094     return false;
01095 }
01096 
01097 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01098 {
01099     if (m_subset)
01100         SG_ERROR("save_compressed() is not possible on subset");
01101 
01102     FILE* file=NULL;
01103 
01104     if (!(file=fopen(dest, "wb")))
01105         return false;
01106 
01107     CCompressor* compressor= new CCompressor(compression);
01108 
01109     // header shogun v0
01110     const char* id="SGV0";
01111     fwrite(&id[0], sizeof(char), 1, file);
01112     fwrite(&id[1], sizeof(char), 1, file);
01113     fwrite(&id[2], sizeof(char), 1, file);
01114     fwrite(&id[3], sizeof(char), 1, file);
01115 
01116     //compression type
01117     uint8_t c=(uint8_t) compression;
01118     fwrite(&c, sizeof(uint8_t), 1, file);
01119     //alphabet
01120     uint8_t a=(uint8_t) alphabet->get_alphabet();
01121     fwrite(&a, sizeof(uint8_t), 1, file);
01122     // number of vectors
01123     fwrite(&num_vectors, sizeof(int32_t), 1, file);
01124     // maximum string length
01125     fwrite(&max_string_length, sizeof(int32_t), 1, file);
01126 
01127     // vectors
01128     for (int32_t i=0; i<num_vectors; i++)
01129     {
01130         int32_t len=-1;
01131         bool vfree;
01132         ST* vec=get_feature_vector(i, len, vfree);
01133 
01134         uint8_t* compressed=NULL;
01135         uint64_t compressed_size=0;
01136 
01137         compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01138                 compressed, compressed_size, level);
01139 
01140         int32_t len_compressed=(int32_t) compressed_size;
01141         // vector len compressed in bytes
01142         fwrite(&len_compressed, sizeof(int32_t), 1, file);
01143         // vector len uncompressed in number of elements of type ST
01144         fwrite(&len, sizeof(int32_t), 1, file);
01145         // vector raw data
01146         fwrite(compressed, compressed_size, 1, file);
01147         SG_FREE(compressed);
01148 
01149         free_feature_vector(vec, i, vfree);
01150     }
01151 
01152     delete compressor;
01153     fclose(file);
01154     return true;
01155 }
01156 
01157 template<class ST> int32_t CStringFeatures<ST>::get_size() { return sizeof(ST); }
01158 
01159 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
01160 {
01161     SG_DEBUG( "force: %d\n", force_preprocessing);
01162 
01163     for (int32_t i=0; i<get_num_preprocessors(); i++)
01164     {
01165         if ( (!is_preprocessed(i) || force_preprocessing) )
01166         {
01167             set_preprocessed(i);
01168             CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01169             SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01170 
01171             if (!p->apply_to_string_features(this))
01172             {
01173                 SG_UNREF(p);
01174                 return false;
01175             }
01176             else
01177                 SG_UNREF(p);
01178         }
01179     }
01180     return true;
01181 }
01182 
01183 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
01184 {
01185     if (m_subset)
01186         SG_NOTIMPLEMENTED;
01187 
01188     ASSERT(step_size>0);
01189     ASSERT(window_size>0);
01190     ASSERT(num_vectors==1 || single_string);
01191     ASSERT(max_string_length>=window_size ||
01192             (single_string && length_of_single_string>=window_size));
01193 
01194     //in case we are dealing with a single remapped string
01195     //allow remapping
01196     if (single_string)
01197         num_vectors= (length_of_single_string-window_size)/step_size + 1;
01198     else if (num_vectors==1)
01199     {
01200         num_vectors= (max_string_length-window_size)/step_size + 1;
01201         length_of_single_string=max_string_length;
01202     }
01203 
01204     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01205     int32_t offs=0;
01206     for (int32_t i=0; i<num_vectors; i++)
01207     {
01208         f[i].string=&features[0].string[offs+skip];
01209         f[i].slen=window_size-skip;
01210         offs+=step_size;
01211     }
01212     single_string=features[0].string;
01213     SG_FREE(features);
01214     features=f;
01215     max_string_length=window_size-skip;
01216 
01217     return num_vectors;
01218 }
01219 
01220 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
01221         int32_t skip)
01222 {
01223     if (m_subset)
01224         SG_NOTIMPLEMENTED;
01225 
01226     ASSERT(positions);
01227     ASSERT(window_size>0);
01228     ASSERT(num_vectors==1 || single_string);
01229     ASSERT(max_string_length>=window_size ||
01230             (single_string && length_of_single_string>=window_size));
01231 
01232     num_vectors= positions->get_num_elements();
01233     ASSERT(num_vectors>0);
01234 
01235     int32_t len;
01236 
01237     //in case we are dealing with a single remapped string
01238     //allow remapping
01239     if (single_string)
01240         len=length_of_single_string;
01241     else
01242     {
01243         single_string=features[0].string;
01244         len=max_string_length;
01245         length_of_single_string=max_string_length;
01246     }
01247 
01248     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01249     for (int32_t i=0; i<num_vectors; i++)
01250     {
01251         int32_t p=positions->get_element(i);
01252 
01253         if (p>=0 && p<=len-window_size)
01254         {
01255             f[i].string=&features[0].string[p+skip];
01256             f[i].slen=window_size-skip;
01257         }
01258         else
01259         {
01260             num_vectors=1;
01261             max_string_length=len;
01262             features[0].slen=len;
01263             single_string=NULL;
01264             SG_FREE(f);
01265             SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01266                     window_size, i, p, len);
01267             return -1;
01268         }
01269     }
01270 
01271     SG_FREE(features);
01272     features=f;
01273     max_string_length=window_size-skip;
01274 
01275     return num_vectors;
01276 }
01277 
01278 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01279 {
01280     return obtain_from_char_features(sf, start, p_order, gap, rev);
01281 }
01282 
01283 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
01284 {
01285     if (len!=-1)
01286     {
01287         if (len!=max_string_length)
01288             return false;
01289     }
01290     len=max_string_length;
01291 
01292     index_t num_str=get_num_vectors();
01293     for (int32_t i=0; i<num_str; i++)
01294     {
01295         if (get_vector_length(i)!=len)
01296             return false;
01297     }
01298 
01299     return true;
01300 }
01301 
01302 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
01303 {
01304     if (m_subset)
01305         SG_NOTIMPLEMENTED;
01306 
01307     ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01308 
01309     order=p_order;
01310     original_num_symbols=alphabet->get_num_symbols();
01311     int32_t max_val=alphabet->get_num_bits();
01312 
01313     if (p_order>1)
01314         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01315     else
01316         num_symbols=original_num_symbols;
01317 
01318     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01319 
01320     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01321         SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01322 
01323     ST mask=0;
01324     for (int32_t i=0; i<p_order*max_val; i++)
01325         mask= (mask<<1) | ((ST) 1);
01326 
01327     for (int32_t i=0; i<num_vectors; i++)
01328     {
01329         int32_t len=features[i].slen;
01330 
01331         if (len < p_order)
01332             SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01333 
01334         ST* str=features[i].string;
01335 
01336         // convert first word
01337         for (int32_t j=0; j<p_order; j++)
01338             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01339         str[0]=embed_word(&str[0], p_order);
01340 
01341         // convert the rest
01342         int32_t idx=0;
01343         for (int32_t j=p_order; j<len; j++)
01344         {
01345             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01346             str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01347             idx++;
01348         }
01349 
01350         features[i].slen=len-p_order+1;
01351     }
01352 
01353     compute_symbol_mask_table(max_val);
01354 }
01355 
01356 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
01357 {
01358     if (m_subset)
01359         SG_NOTIMPLEMENTED;
01360 
01361     SG_FREE(symbol_mask_table);
01362     symbol_mask_table=SG_MALLOC(ST, 256);
01363 
01364     uint64_t mask=0;
01365     for (int32_t i=0; i< (int64_t) max_val; i++)
01366         mask=(mask<<1) | 1;
01367 
01368     for (int32_t i=0; i<256; i++)
01369     {
01370         uint8_t bits=(uint8_t) i;
01371         symbol_mask_table[i]=0;
01372 
01373         for (int32_t j=0; j<8; j++)
01374         {
01375             if (bits & 1)
01376                 symbol_mask_table[i]|=mask<<(max_val*j);
01377 
01378             bits>>=1;
01379         }
01380     }
01381 }
01382 
01383 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
01384 {
01385     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01386 
01387     ST mask=0;
01388     for (uint32_t i=0; i<nbits; i++)
01389         mask=(mask<<1) | (ST) 1;
01390 
01391     for (int32_t i=0; i<len; i++)
01392     {
01393         ST w=(word & mask);
01394         seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01395         word>>=nbits;
01396     }
01397 }
01398 
01399 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
01400 {
01401     ST value=(ST) 0;
01402     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01403     for (int32_t i=0; i<len; i++)
01404     {
01405         value<<=nbits;
01406         value|=seq[i];
01407     }
01408 
01409     return value;
01410 }
01411 
01412 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length()
01413 {
01414     max_string_length=0;
01415     index_t num_str=get_num_vectors();
01416 
01417     for (int32_t i=0; i<num_str; i++)
01418     {
01419         max_string_length=CMath::max(max_string_length,
01420             features[subset_idx_conversion(i)].slen);
01421     }
01422 }
01423 
01424 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str)
01425 {
01426     int32_t l=str.slen;
01427     ST* s=SG_MALLOC(ST, l+1);
01428     memcpy(s, str.string, sizeof(ST)*l);
01429     s[l]='\0';
01430     return s;
01431 }
01432 
01433 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
01434 {
01435     ASSERT(features);
01436     ASSERT(num<get_num_vectors());
01437 
01438     int32_t real_num=subset_idx_conversion(num);
01439 
01440 
01441     features[real_num].slen=len ;
01442     features[real_num].string=string ;
01443 
01444     max_string_length=CMath::max(len, max_string_length);
01445 }
01446 
01447 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
01448 {
01449     int32_t nsym=get_num_symbols();
01450     int32_t slen=get_max_vector_length();
01451     int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01452     float64_t* h= SG_MALLOC(float64_t, sz);
01453     memset(h, 0, sz);
01454 
01455     float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
01456     memset(h_normalizer, 0, slen*sizeof(float64_t));
01457     int32_t num_str=get_num_vectors();
01458     for (int32_t i=0; i<num_str; i++)
01459     {
01460         int32_t len;
01461         bool free_vec;
01462         ST* vec=get_feature_vector(i, len, free_vec);
01463         for (int32_t j=0; j<len; j++)
01464         {
01465             h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01466             h_normalizer[j]++;
01467         }
01468         free_feature_vector(vec, i, free_vec);
01469     }
01470 
01471     if (normalize)
01472     {
01473         for (int32_t i=0; i<slen; i++)
01474         {
01475             for (int32_t j=0; j<nsym; j++)
01476             {
01477                 if (h_normalizer && h_normalizer[i])
01478                     h[int64_t(i)*nsym+j]/=h_normalizer[i];
01479             }
01480         }
01481     }
01482     SG_FREE(h_normalizer);
01483 
01484     *hist=h;
01485     *rows=nsym;
01486     *cols=slen;
01487 }
01488 
01489 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01490 {
01491     ASSERT(rows == get_num_symbols());
01492     cleanup();
01493     float64_t* randoms=SG_MALLOC(float64_t, cols);
01494     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
01495 
01496     for (int32_t i=0; i<num_vec; i++)
01497     {
01498         sf[i].string=SG_MALLOC(ST, cols);
01499         sf[i].slen=cols;
01500 
01501         CMath::random_vector(randoms, cols, 0.0, 1.0);
01502 
01503         for (int32_t j=0; j<cols; j++)
01504         {
01505             float64_t lik=hist[int64_t(j)*rows+0];
01506 
01507             int32_t c;
01508             for (c=0; c<rows-1; c++)
01509             {
01510                 if (randoms[j]<=lik)
01511                     break;
01512                 lik+=hist[int64_t(j)*rows+c+1];
01513             }
01514             sf[i].string[j]=alphabet->remap_to_char(c);
01515         }
01516     }
01517     SG_FREE(randoms);
01518     set_features(sf, num_vec, cols);
01519 }
01520 
01521 /*
01522 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
01523 {
01524     int *s;
01525     int32_t nStr=get_num_vectors();
01526 
01527     int32_t nfeat=0;
01528     for (int32_t i=0; i < nStr; ++i)
01529         nfeat += get_vector_length[i] - d1 -d2;
01530     SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
01531     int32_t c=0;
01532     for (int32_t i=0; i < nStr; ++i)
01533     {
01534     int32_t len;
01535     bool free_vec;
01536     ST* S=get_feature_vector(vec_num, len, free_vec);
01537     free_feature_vector(vec, vec_num, free_vec);
01538         int32_t n=len - d1 - d2;
01539         s=S[i];
01540         for (int32_t j=0; j < n; ++j)
01541         {
01542             F[c].feature1=s[j];
01543             F[c].feature2=s[j+d1];
01544             F[c].feature3=s[j+d1+d2];
01545             F[c].group=i;
01546             c++;
01547         }
01548     }
01549     ASSERT(nfeat==c);
01550     return F;
01551 }
01552 
01553 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
01554 {
01555     int i, j;
01556     int n, nfeat;
01557     int *group;
01558     int *features;
01559     int *s;
01560     int c;
01561     SSKFeatures *F;
01562 
01563     nfeat=0;
01564     for (i=0; i < nStr; ++i)
01565         nfeat += len[i] - d1;
01566     group=(int *)SG_MALLOC(nfeat*sizeof(int));
01567     features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
01568     c=0;
01569     for (i=0; i < nStr; ++i)
01570     {
01571         n=len[i] - d1;
01572         s=S[i];
01573         for (j=0; j < n; ++j)
01574         {
01575             features[c]=s[j];
01576             features[c+nfeat]=s[j+d1];
01577             group[c]=i;
01578             c++;
01579         }
01580     }
01581     if (nfeat!=c)
01582         printf("Something is wrong...\n");
01583     F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
01584     (*F).features=features;
01585     (*F).group=group;
01586     (*F).n=nfeat;
01587     return F;
01588 }
01589 */
01590 
01591 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(SGVector<index_t> indices)
01592 {
01593     /* string list to create new CStringFeatures from */
01594     SGStringList<ST> list_copy(indices.vlen, max_string_length);
01595 
01596     /* copy all features */
01597     for (index_t i=0; i<indices.vlen; ++i)
01598     {
01599         /* index with respect to possible subset */
01600         index_t real_idx=subset_idx_conversion(indices.vector[i]);
01601 
01602         /* copy string */
01603         SGString<ST> current_string=features[real_idx];
01604         SGString<ST> string_copy(current_string.slen);
01605         memcpy(string_copy.string, current_string.string,
01606             current_string.slen*sizeof(ST));
01607         list_copy.strings[i]=string_copy;
01608     }
01609 
01610     /* create copy instance */
01611     CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
01612 
01613     /* max string length may have changed */
01614     result->determine_maximum_string_length();
01615 
01616     return result;
01617 }
01618 
01619 template<class ST> void CStringFeatures<ST>::subset_changed_post()
01620 {
01621     /* max string length has to be updated */
01622     determine_maximum_string_length();
01623 }
01624 
01625 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
01626 {
01627     ASSERT(features && num<get_num_vectors());
01628 
01629     int32_t real_num=subset_idx_conversion(num);
01630 
01631     len=features[real_num].slen;
01632     if (len<=0)
01633         return NULL;
01634 
01635     ST* target=SG_MALLOC(ST, len);
01636     memcpy(target, features[real_num].string, len*sizeof(ST));
01637     return target;
01638 }
01639 
01640 template<class ST> void CStringFeatures<ST>::init()
01641 {
01642     set_generic<ST>();
01643 
01644     alphabet=NULL;
01645     num_vectors=0;
01646     features=NULL;
01647     single_string=NULL;
01648     length_of_single_string=0;
01649     max_string_length=0;
01650     order=0;
01651     symbol_mask_table=0;
01652     preprocess_on_get=false;
01653     feature_cache=NULL;
01654 
01655     m_parameters->add((CSGObject**) &alphabet, "alphabet");
01656     m_parameters->add_vector(&features, &num_vectors, "features",
01657             "This contains the array of features.");
01658     m_parameters->add_vector(&single_string,
01659             &length_of_single_string,
01660             "single_string",
01661             "Created by sliding window.");
01662     m_parameters->add(&max_string_length, "max_string_length",
01663             "Length of longest string.");
01664     m_parameters->add(&num_symbols, "num_symbols",
01665             "Number of used symbols.");
01666     m_parameters->add(&original_num_symbols, "original_num_symbols",
01667             "Original number of used symbols.");
01668     m_parameters->add(&order, "order",
01669             "Order used in higher order mapping.");
01670     m_parameters->add(&preprocess_on_get, "preprocess_on_get",
01671             "Preprocess on-the-fly?");
01672 
01673     /* TODO M_PARAMETERS->ADD?
01674      * /// order used in higher order mapping
01675      * ST* symbol_mask_table;
01676      */
01677 }
01678 
01683 template<> EFeatureType CStringFeatures<bool>::get_feature_type()
01684 {
01685     return F_BOOL;
01686 }
01687 
01692 template<> EFeatureType CStringFeatures<char>::get_feature_type()
01693 {
01694     return F_CHAR;
01695 }
01696 
01701 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01702 {
01703     return F_BYTE;
01704 }
01705 
01710 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type()
01711 {
01712     return F_SHORT;
01713 }
01714 
01719 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01720 {
01721     return F_WORD;
01722 }
01723 
01728 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type()
01729 {
01730     return F_INT;
01731 }
01732 
01737 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01738 {
01739     return F_UINT;
01740 }
01741 
01746 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type()
01747 {
01748     return F_LONG;
01749 }
01750 
01755 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01756 {
01757     return F_ULONG;
01758 }
01759 
01764 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type()
01765 {
01766     return F_SHORTREAL;
01767 }
01768 
01773 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type()
01774 {
01775     return F_DREAL;
01776 }
01777 
01782 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
01783 {
01784     return F_LONGREAL;
01785 }
01786 
01787 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01788 {
01789     return symbol;
01790 }
01791 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01792 {
01793     return symbol;
01794 }
01795 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01796 {
01797     return symbol;
01798 }
01799 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01800 {
01801     return symbol;
01802 }
01803 
01804 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01805 {
01806     return false;
01807 }
01808 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01809 {
01810     return 0;
01811 }
01812 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01813 {
01814     return 0;
01815 }
01816 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01817 {
01818     return 0;
01819 }
01820 
01821 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01822 {
01823     return symbol;
01824 }
01825 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01826 {
01827     return symbol;
01828 }
01829 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01830 {
01831     return symbol;
01832 }
01833 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01834 {
01835     return symbol;
01836 }
01837 
01838 #ifndef SUNOS
01839 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01840 {
01841     return false;
01842 }
01843 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01844 {
01845     return false;
01846 }
01847 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01848 {
01849     return false;
01850 }
01851 #endif
01852 
01853 template<>  void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01854 {
01855 }
01856 template<>  void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01857 {
01858 }
01859 template<>  void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01860 {
01861 }
01862 
01863 template<>  void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01864 {
01865 }
01866 template<>  void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01867 {
01868 }
01869 template<>  void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01870 {
01871 }
01872 
01873 template<>  float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01874 {
01875     return 0;
01876 }
01877 template<>  float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01878 {
01879     return 0;
01880 }
01881 template<>  floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01882 {
01883     return 0;
01884 }
01885 
01886 template<>  void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01887 {
01888 }
01889 template<>  void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01890 {
01891 }
01892 template<>  void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01893 {
01894 }
01895 #define LOAD(f_load, sg_type)                                               \
01896 template<> void CStringFeatures<sg_type>::load(CFile* loader)       \
01897 {                                                                           \
01898     SG_INFO( "loading...\n");                                               \
01899                                                                             \
01900     SG_SET_LOCALE_C;                                                    \
01901     SGString<sg_type>* strs;                                                \
01902     int32_t num_str;                                                        \
01903     int32_t max_len;                                                        \
01904     loader->f_load(strs, num_str, max_len);                                 \
01905     set_features(strs, num_str, max_len);                                   \
01906     SG_RESET_LOCALE;                                                    \
01907 }
01908 
01909 LOAD(get_string_list, bool)
01910 LOAD(get_string_list, char)
01911 LOAD(get_int8_string_list, int8_t)
01912 LOAD(get_string_list, uint8_t)
01913 LOAD(get_string_list, int16_t)
01914 LOAD(get_string_list, uint16_t)
01915 LOAD(get_string_list, int32_t)
01916 LOAD(get_uint_string_list, uint32_t)
01917 LOAD(get_long_string_list, int64_t)
01918 LOAD(get_ulong_string_list, uint64_t)
01919 LOAD(get_string_list, float32_t)
01920 LOAD(get_string_list, float64_t)
01921 LOAD(get_longreal_string_list, floatmax_t)
01922 #undef LOAD
01923 
01924 #define SAVE(f_write, sg_type)                                              \
01925 template<> void CStringFeatures<sg_type>::save(CFile* writer)       \
01926 {                                                                           \
01927     if (m_subset)                                                           \
01928         SG_ERROR("save() is not possible on subset");                       \
01929     SG_SET_LOCALE_C;                                                    \
01930     ASSERT(writer);                                                         \
01931     writer->f_write(features, num_vectors);                                 \
01932     SG_RESET_LOCALE;                                                    \
01933 }
01934 
01935 SAVE(set_string_list, bool)
01936 SAVE(set_string_list, char)
01937 SAVE(set_int8_string_list, int8_t)
01938 SAVE(set_string_list, uint8_t)
01939 SAVE(set_string_list, int16_t)
01940 SAVE(set_string_list, uint16_t)
01941 SAVE(set_string_list, int32_t)
01942 SAVE(set_uint_string_list, uint32_t)
01943 SAVE(set_long_string_list, int64_t)
01944 SAVE(set_ulong_string_list, uint64_t)
01945 SAVE(set_string_list, float32_t)
01946 SAVE(set_string_list, float64_t)
01947 SAVE(set_longreal_string_list, floatmax_t)
01948 #undef SAVE
01949 
01950 template <class ST> template <class CT>
01951 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
01952         int32_t p_order, int32_t gap, bool rev)
01953 {
01954     remove_subset();
01955     ASSERT(sf);
01956 
01957     CAlphabet* alpha=sf->get_alphabet();
01958     ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01959 
01960     this->order=p_order;
01961     cleanup();
01962 
01963     num_vectors=sf->get_num_vectors();
01964     ASSERT(num_vectors>0);
01965     max_string_length=sf->get_max_vector_length()-start;
01966     features=SG_MALLOC(SGString<ST>, num_vectors);
01967 
01968     SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01969             alpha->get_num_symbols_in_histogram());
01970 
01971     for (int32_t i=0; i<num_vectors; i++)
01972     {
01973         int32_t len=-1;
01974         bool vfree;
01975         CT* c=sf->get_feature_vector(i, len, vfree);
01976         ASSERT(!vfree); // won't work when preprocessors are attached
01977 
01978         features[i].string=SG_MALLOC(ST, len);
01979         features[i].slen=len;
01980 
01981         ST* str=features[i].string;
01982         for (int32_t j=0; j<len; j++)
01983             str[j]=(ST) alpha->remap_to_bin(c[j]);
01984     }
01985 
01986     original_num_symbols=alpha->get_num_symbols();
01987     int32_t max_val=alpha->get_num_bits();
01988 
01989     SG_UNREF(alpha);
01990 
01991     if (p_order>1)
01992         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01993     else
01994         num_symbols=original_num_symbols;
01995     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01996 
01997     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01998     {
01999         SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
02000         return false;
02001     }
02002 
02003     SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
02004     for (int32_t line=0; line<num_vectors; line++)
02005     {
02006         int32_t len=0;
02007         bool vfree;
02008         ST* fv=get_feature_vector(line, len, vfree);
02009         ASSERT(!vfree); // won't work when preprocessors are attached
02010 
02011         if (rev)
02012             CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
02013         else
02014             CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
02015 
02016         /* fix the length of the string -- hacky */
02017         features[line].slen-=start+gap ;
02018         if (features[line].slen<0)
02019             features[line].slen=0 ;
02020     }
02021 
02022     compute_symbol_mask_table(max_val);
02023 
02024     return true;
02025 }
02026 
02027 template class CStringFeatures<bool>;
02028 template class CStringFeatures<char>;
02029 template class CStringFeatures<int8_t>;
02030 template class CStringFeatures<uint8_t>;
02031 template class CStringFeatures<int16_t>;
02032 template class CStringFeatures<uint16_t>;
02033 template class CStringFeatures<int32_t>;
02034 template class CStringFeatures<uint32_t>;
02035 template class CStringFeatures<int64_t>;
02036 template class CStringFeatures<uint64_t>;
02037 template class CStringFeatures<float32_t>;
02038 template class CStringFeatures<float64_t>;
02039 template class CStringFeatures<floatmax_t>;
02040 
02041 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02042 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02043 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02044 
02045 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02046 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02047 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02048 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation