73 :
CFeatures(orig), num_vectors(orig.num_vectors),
74 single_string(orig.single_string),
75 length_of_single_string(orig.length_of_single_string),
76 max_string_length(orig.max_string_length),
77 num_symbols(orig.num_symbols),
78 original_num_symbols(orig.original_num_symbols),
79 order(orig.order), preprocess_on_get(false),
106 for (int32_t i=0; i<256; i++)
116 features(NULL), single_string(NULL), length_of_single_string(0),
117 max_string_length(0), order(0),
118 preprocess_on_get(false), feature_cache(NULL)
138 remove_all_subsets();
142 SG_FREE(single_string);
146 cleanup_feature_vectors(0, num_vectors-1);
160 SG_FREE(symbol_mask_table);
162 symbol_mask_table=NULL;
176 ASSERT(num<get_num_vectors())
180 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
181 SG_FREE(features[real_num].
string);
182 features[real_num].string=NULL;
183 features[real_num].slen=0;
185 determine_maximum_string_length();
191 if (features && get_num_vectors())
193 ASSERT(start<get_num_vectors())
194 ASSERT(stop<get_num_vectors())
196 for (int32_t i=start; i<=stop; i++)
198 int32_t real_num=m_subset_stack->subset_idx_conversion(i);
199 SG_FREE(features[real_num].
string);
200 features[real_num].string=NULL;
201 features[real_num].slen=0;
203 determine_maximum_string_length();
225 if (num>=get_num_vectors())
227 SG_ERROR(
"Index out of bounds (number of strings %d, you "
228 "requested %d)\n", get_num_vectors(), num);
233 ST* vec=get_feature_vector(num, l, free_vec);
234 ST* dst=SG_MALLOC(ST, l);
235 memcpy(dst, vec, l*
sizeof(ST));
236 free_feature_vector(vec, num, free_vec);
244 if (m_subset_stack->has_subsets())
245 SG_ERROR(
"A subset is set, cannot set feature vector\n")
247 if (num>=num_vectors)
249 SG_ERROR(
"Index out of bounds (number of strings %d, you "
250 "requested %d)\n", num_vectors, num);
254 SG_ERROR(
"String has zero or negative length\n")
256 cleanup_feature_vector(num);
257 features[num].slen=vector.
vlen;
258 features[num].string=SG_MALLOC(ST, vector.
vlen);
259 memcpy(features[num].
string, vector.
vector, vector.
vlen*
sizeof(ST));
261 determine_maximum_string_length();
266 preprocess_on_get=
true;
271 preprocess_on_get=
false;
277 if (num>=get_num_vectors())
278 SG_ERROR(
"Requested feature vector with index %d while total num is", num, get_num_vectors())
280 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
282 if (!preprocess_on_get)
285 len=features[real_num].slen;
286 return features[real_num].string;
290 SG_DEBUG(
"computing feature vector!\n")
291 ST* feat=compute_feature_vector(num, len);
294 if (get_num_preprocessors())
296 ST* tmp_feat_before=feat;
298 for (int32_t i=0; i<get_num_preprocessors(); i++)
303 SG_FREE(tmp_feat_before);
304 tmp_feat_before=feat;
327 num_feat=get_num_vectors();
328 num_vec=get_max_vector_length();
329 ASSERT(have_same_length())
331 SG_DEBUG(
"Allocating memory for transposed string features of size %ld\n",
332 int64_t(num_feat)*num_vec);
336 for (int32_t i=0; i<num_vec; i++)
338 sf[i].
string=SG_MALLOC(ST, num_feat);
342 for (int32_t i=0; i<num_feat; i++)
346 ST* vec=get_feature_vector(i, len, free_vec);
348 for (int32_t j=0; j<num_vec; j++)
349 sf[j].
string[i]=vec[j];
351 free_feature_vector(vec, i, free_vec);
358 if (num>=get_num_vectors())
361 "Trying to access string[%d] but num_str=%d\n", num,
365 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
368 feature_cache->unlock_entry(real_num);
376 if (num>=get_num_vectors())
379 "Trying to access string[%d] but num_str=%d\n", num,
383 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
386 feature_cache->unlock_entry(real_num);
391 ASSERT(vec_num<get_num_vectors())
395 ST* vec=get_feature_vector(vec_num, len, free_vec);
397 ST result=vec[feat_num];
398 free_feature_vector(vec, vec_num, free_vec);
405 ASSERT(vec_num<get_num_vectors())
409 ST* vec=get_feature_vector(vec_num, len, free_vec);
410 free_feature_vector(vec, vec_num, free_vec);
416 return max_string_length;
421 return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
435 return symbol_mask_table[mask] & symbol;
441 return (offset << (amount*alphabet->get_num_bits()));
447 return (symbol >> (amount*alphabet->get_num_bits()));
453 remove_all_subsets();
455 size_t blocksize=1024*1024;
456 size_t required_blocksize=0;
457 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
458 uint8_t* overflow=NULL;
459 int32_t overflow_len=0;
466 FILE* f=fopen(fname,
"ro");
473 SG_INFO(
"counting line numbers in file %s\n", fname)
475 size_t old_block_offs=0;
476 fseek(f, 0, SEEK_END);
477 size_t fsize=ftell(f);
483 SG_DEBUG(
"block_size=%ld file_size=%ld\n", blocksize, fsize)
486 while (sz == blocksize)
488 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
489 for (
size_t i=0; i<sz; i++)
492 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
495 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs);
496 old_block_offs=block_offs;
499 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t")
502 SG_INFO(
"found %d strings\n", num_vectors)
504 blocksize=required_blocksize;
505 dummy=SG_MALLOC(uint8_t, blocksize);
506 overflow=SG_MALLOC(uint8_t, blocksize);
512 while (sz == blocksize)
514 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
517 for (
size_t i=0; i<sz; i++)
519 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
521 int32_t len=i-old_sz;
523 max_string_length=
CMath::max(max_string_length, len+overflow_len);
525 features[lines].slen=len;
526 features[lines].string=SG_MALLOC(ST, len);
530 for (int32_t j=0; j<overflow_len; j++)
531 features[lines].
string[j]=alpha->
remap_to_bin(overflow[j]);
532 for (int32_t j=0; j<len; j++)
533 features[lines].
string[j+overflow_len]=alpha->
remap_to_bin(dummy[old_sz+j]);
539 for (int32_t j=0; j<overflow_len; j++)
540 features[lines].
string[j]=overflow[j];
541 for (int32_t j=0; j<len; j++)
542 features[lines].
string[j+overflow_len]=dummy[old_sz+j];
553 SG_PROGRESS(lines, 0, num_vectors, 1,
"LOADING:\t")
556 for (
size_t i=old_sz; i<sz; i++)
557 overflow[i-old_sz]=dummy[i];
559 overflow_len=sz-old_sz;
564 SG_INFO(
"file successfully read\n")
565 SG_INFO(
"max_string_length=%d\n", max_string_length)
566 SG_INFO(
"num_strings=%d\n", num_vectors)
580 num_symbols=alphabet->get_num_symbols();
585 remove_all_subsets();
601 if (len>0 && s[0]==
'>')
606 SG_ERROR(
"No fasta hunks (lines starting with '>') found\n")
611 num_symbols=alphabet->get_num_symbols();
624 int32_t spanned_lines=0;
629 SG_ERROR(
"Error reading fasta entry in line %d len=%ld", 4*i+1, len)
631 if (s[0]==
'>' || offs==f.
get_size())
640 len=fasta_len-spanned_lines;
641 strings[i].
string=SG_MALLOC(ST, len);
644 ST* str=strings[i].
string;
646 SG_DEBUG(
"'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len,
id, (int32_t) len, (int32_t) spanned_lines)
648 for (int32_t j=0; j<fasta_len; j++)
655 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
658 if (uint64_t(idx)>=len)
659 SG_ERROR(
"idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
673 return set_features(strings, num, max_len);
677 bool ignore_invalid,
bool bitremap_in_single_string)
679 remove_all_subsets();
691 SG_ERROR(
"Number of lines must be divisible by 4 in fastq files\n")
701 if (bitremap_in_single_string)
704 strings[0].
string=SG_MALLOC(ST, num);
711 original_num_symbols=alphabet->get_num_symbols();
712 str=SG_MALLOC(ST, len);
720 SG_ERROR(
"Error reading 'read' identifier in line %d", 4*i)
724 SG_ERROR(
"Error reading 'read' in line %d len=%ld", 4*i+1, len)
726 if (bitremap_in_single_string)
728 if (len!=(uint64_t) order)
729 SG_ERROR(
"read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
730 for (int32_t j=0; j<order; j++)
731 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
733 strings[0].
string[i]=embed_word(str, order);
737 strings[i].
string=SG_MALLOC(ST, len);
743 for (uint64_t j=0; j<len; j++)
745 if (alphabet->is_valid((uint8_t) s[j]))
753 for (uint64_t j=0; j<len; j++)
761 SG_ERROR(
"Error reading 'read' quality identifier in line %d", 4*i+2)
764 SG_ERROR(
"Error reading 'read' quality in line %d", 4*i+3)
767 if (bitremap_in_single_string)
771 max_string_length=max_len;
779 remove_all_subsets();
781 struct dirent **namelist;
788 n=scandir(dirname, &namelist, &
SGIO::filter, alphasort);
791 SG_ERROR(
"error calling scandir - no files found\n")
805 for (int32_t i=0; i<n; i++)
812 if (!stat(fname, &s) && s.st_size>0)
814 filesize=s.st_size/
sizeof(ST);
816 FILE* f=fopen(fname,
"ro");
819 ST* str=SG_MALLOC(ST, filesize);
820 SG_DEBUG(
"%s:%ld\n", fname, (int64_t) filesize)
821 if (fread(str,
sizeof(ST), filesize, f)!=(
size_t) filesize)
824 strings[num].
slen=filesize;
825 max_len=
CMath::max(max_len, strings[num].slen);
832 SG_ERROR(
"empty or non readable file \'%s\'\n", fname)
834 SG_FREE(namelist[i]);
838 if (num>0 && strings)
840 set_features(strings, num, max_len);
854 if (m_subset_stack->has_subsets())
855 SG_ERROR(
"Cannot call set_features() with subset.\n")
862 for (int32_t i=0; i<p_num_vectors; i++)
878 memcpy(features,p_features,
sizeof(
SGString<ST>)*p_num_vectors);
879 num_vectors = p_num_vectors;
880 max_string_length = p_max_string_length;
895 if (m_subset_stack->has_subsets())
896 SG_ERROR(
"Cannot call set_features() with subset.\n")
901 for (int32_t i=0; i<sf_num_str; i++)
905 new_features[i].
string=SG_MALLOC(ST, length);
906 memcpy(new_features[i].
string, sf->
features[real_i].
string, length);
907 new_features[i].
slen=length;
909 return append_features(new_features, sf_num_str,
915 if (m_subset_stack->has_subsets())
916 SG_ERROR(
"Cannot call set_features() with subset.\n")
919 return set_features(p_features, p_num_vectors, p_max_string_length);
924 for (int32_t i=0; i<p_num_vectors; i++)
933 for (int32_t i=0; i<p_num_vectors; i++)
934 alphabet->add_string_to_histogram( p_features[i].
string, p_features[i].
slen);
936 int32_t old_num_vectors=num_vectors;
937 num_vectors=old_num_vectors+p_num_vectors;
940 for (int32_t i=0; i<num_vectors; i++)
942 if (i<old_num_vectors)
944 new_features[i].
string=features[i].string;
945 new_features[i].
slen=features[i].slen;
949 new_features[i].
string=p_features[i-old_num_vectors].
string;
950 new_features[i].
slen=p_features[i-old_num_vectors].
slen;
956 this->features=new_features;
957 max_string_length=
CMath::max(max_string_length, p_max_string_length);
976 if (m_subset_stack->has_subsets())
977 SG_ERROR(
"get features() is not possible on subset")
980 max_str_len=max_string_length;
988 num_str=get_num_vectors();
989 max_str_len=max_string_length;
992 for (int32_t i=0; i<num_str; i++)
996 ST* vec=get_feature_vector(i, len, free_vec);
997 new_feat[i].
string=SG_MALLOC(ST, len);
998 new_feat[i].
slen=len;
999 memcpy(new_feat[i].
string, vec, ((
size_t) len) *
sizeof(ST));
1000 free_feature_vector(vec, i, free_vec);
1009 int32_t max_str_len;
1010 *dst=copy_features(num_vec, max_str_len);
1016 remove_all_subsets();
1020 if (!(file=fopen(src,
"r")))
1026 if (fread(&
id[0],
sizeof(
char), 1, file)!=1)
1029 if (fread(&
id[1],
sizeof(
char), 1, file)!=1)
1032 if (fread(&
id[2],
sizeof(
char), 1, file)!=1)
1035 if (fread(&
id[3],
sizeof(
char), 1, file)!=1)
1041 if (fread(&c,
sizeof(uint8_t), 1, file)!=1)
1042 SG_ERROR(
"failed to read compression type")
1047 if (fread(&a,
sizeof(uint8_t), 1, file)!=1)
1048 SG_ERROR(
"failed to read compression alphabet")
1051 if (fread(&num_vectors,
sizeof(int32_t), 1, file)!=1)
1052 SG_ERROR(
"failed to read compression number of vectors")
1055 if (fread(&max_string_length,
sizeof(int32_t), 1, file)!=1)
1056 SG_ERROR(
"failed to read maximum string length")
1057 ASSERT(max_string_length>0)
1062 for (int32_t i=0; i<num_vectors; i++)
1065 int32_t len_compressed;
1066 if (fread(&len_compressed,
sizeof(int32_t), 1, file)!=1)
1067 SG_ERROR(
"failed to read vector length compressed")
1069 int32_t len_uncompressed;
1070 if (fread(&len_uncompressed,
sizeof(int32_t), 1, file)!=1)
1071 SG_ERROR(
"failed to read vector length uncompressed")
1076 features[i].string=SG_MALLOC(ST, len_uncompressed);
1077 features[i].slen=len_uncompressed;
1078 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1079 if (fread(compressed,
sizeof(uint8_t), len_compressed, file)!=(
size_t) len_compressed)
1080 SG_ERROR(
"failed to read compressed data (expected %d bytes)", len_compressed)
1081 uint64_t uncompressed_size=len_uncompressed;
1082 uncompressed_size*=
sizeof(ST);
1083 compressor->
decompress(compressed, len_compressed,
1084 (uint8_t*) features[i].
string, uncompressed_size);
1085 SG_FREE(compressed);
1086 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*
sizeof(ST))
1090 int32_t offs=
CMath::ceil(2.0*
sizeof(int32_t)/
sizeof(ST));
1091 features[i].string=SG_MALLOC(ST, len_compressed+offs);
1092 features[i].slen=len_compressed+offs;
1093 int32_t* feat32ptr=((int32_t*) (features[i].
string));
1094 memset(features[i].
string, 0, offs*
sizeof(ST));
1095 feat32ptr[0]=(int32_t) len_compressed;
1096 feat32ptr[1]=(int32_t) len_uncompressed;
1097 uint8_t* compressed=(uint8_t*) (&features[i].
string[offs]);
1098 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1099 SG_ERROR(
"failed to read uncompressed data")
1111 if (m_subset_stack->has_subsets())
1112 SG_ERROR(
"save_compressed() is not possible on subset")
1116 if (!(file=fopen(dest,
"wb")))
1122 const char*
id=
"SGV0";
1123 fwrite(&
id[0],
sizeof(
char), 1, file);
1124 fwrite(&
id[1],
sizeof(
char), 1, file);
1125 fwrite(&
id[2],
sizeof(
char), 1, file);
1126 fwrite(&
id[3],
sizeof(
char), 1, file);
1129 uint8_t c=(uint8_t) compression;
1130 fwrite(&c,
sizeof(uint8_t), 1, file);
1132 uint8_t a=(uint8_t) alphabet->get_alphabet();
1133 fwrite(&a,
sizeof(uint8_t), 1, file);
1135 fwrite(&num_vectors,
sizeof(int32_t), 1, file);
1137 fwrite(&max_string_length,
sizeof(int32_t), 1, file);
1140 for (int32_t i=0; i<num_vectors; i++)
1144 ST* vec=get_feature_vector(i, len, vfree);
1146 uint8_t* compressed=NULL;
1147 uint64_t compressed_size=0;
1149 compressor->
compress((uint8_t*) vec, ((uint64_t) len)*
sizeof(ST),
1150 compressed, compressed_size, level);
1152 int32_t len_compressed=(int32_t) compressed_size;
1154 fwrite(&len_compressed,
sizeof(int32_t), 1, file);
1156 fwrite(&len,
sizeof(int32_t), 1, file);
1158 fwrite(compressed, compressed_size, 1, file);
1159 SG_FREE(compressed);
1161 free_feature_vector(vec, i, vfree);
1171 SG_DEBUG(
"force: %d\n", force_preprocessing)
1173 for (int32_t i=0; i<get_num_preprocessors(); i++)
1175 if ( (!is_preprocessed(i) || force_preprocessing) )
1177 set_preprocessed(i);
1195 if (m_subset_stack->has_subsets())
1200 ASSERT(num_vectors==1 || single_string)
1201 ASSERT(max_string_length>=window_size ||
1202 (single_string && length_of_single_string>=window_size));
1207 num_vectors= (length_of_single_string-window_size)/step_size + 1;
1208 else if (num_vectors==1)
1210 num_vectors= (max_string_length-window_size)/step_size + 1;
1211 length_of_single_string=max_string_length;
1216 for (int32_t i=0; i<num_vectors; i++)
1218 f[i].
string=&features[0].string[offs+skip];
1219 f[i].
slen=window_size-skip;
1222 single_string=features[0].string;
1225 max_string_length=window_size-skip;
1233 if (m_subset_stack->has_subsets())
1238 ASSERT(num_vectors==1 || single_string)
1239 ASSERT(max_string_length>=window_size ||
1240 (single_string && length_of_single_string>=window_size));
1250 len=length_of_single_string;
1253 single_string=features[0].string;
1254 len=max_string_length;
1255 length_of_single_string=max_string_length;
1259 for (int32_t i=0; i<num_vectors; i++)
1263 if (p>=0 && p<=len-window_size)
1265 f[i].
string=&features[0].string[p+skip];
1266 f[i].
slen=window_size-skip;
1271 max_string_length=len;
1272 features[0].slen=len;
1275 SG_ERROR(
"window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1276 window_size, i, p, len);
1283 max_string_length=window_size-skip;
1290 return obtain_from_char_features(sf, start, p_order, gap, rev);
1297 if (len!=max_string_length)
1300 len=max_string_length;
1302 index_t num_str=get_num_vectors();
1303 for (int32_t i=0; i<num_str; i++)
1305 if (get_vector_length(i)!=len)
1314 if (m_subset_stack->has_subsets())
1317 ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
1320 original_num_symbols=alphabet->get_num_symbols();
1321 int32_t max_val=alphabet->get_num_bits();
1326 num_symbols=original_num_symbols;
1328 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
1331 SG_WARNING(
"symbols did not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val)
1334 for (int32_t i=0; i<p_order*max_val; i++)
1335 mask= (mask<<1) | ((ST) 1);
1337 for (int32_t i=0; i<num_vectors; i++)
1339 int32_t len=features[i].slen;
1342 SG_ERROR(
"Sequence must be longer than order (%d vs. %d)\n", len, p_order)
1344 ST* str=features[i].string;
1347 for (int32_t j=0; j<p_order; j++)
1348 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1349 str[0]=embed_word(&str[0], p_order);
1353 for (int32_t j=p_order; j<len; j++)
1355 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1356 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1360 features[i].slen=len-p_order+1;
1363 compute_symbol_mask_table(max_val);
1368 if (m_subset_stack->has_subsets())
1371 SG_FREE(symbol_mask_table);
1372 symbol_mask_table=SG_MALLOC(ST, 256);
1373 symbol_mask_table_len=256;
1376 for (int32_t i=0; i< (int64_t) max_val; i++)
1379 for (int32_t i=0; i<256; i++)
1381 uint8_t bits=(uint8_t) i;
1382 symbol_mask_table[i]=0;
1384 for (int32_t j=0; j<8; j++)
1387 symbol_mask_table[i]|=mask<<(max_val*j);
1396 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1399 for (uint32_t i=0; i<nbits; i++)
1400 mask=(mask<<1) | (ST) 1;
1402 for (int32_t i=0; i<len; i++)
1405 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1413 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1414 for (int32_t i=0; i<len; i++)
1425 max_string_length=0;
1426 index_t num_str=get_num_vectors();
1428 for (int32_t i=0; i<num_str; i++)
1430 max_string_length=
CMath::max(max_string_length,
1431 features[m_subset_stack->subset_idx_conversion(i)].slen);
1438 ST* s=SG_MALLOC(ST, l+1);
1439 memcpy(s, str.
string,
sizeof(ST)*l);
1447 ASSERT(num<get_num_vectors())
1449 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1452 features[real_num].slen=len ;
1453 features[real_num].string=string ;
1455 max_string_length=
CMath::max(len, max_string_length);
1460 int32_t nsym=get_num_symbols();
1461 int32_t slen=get_max_vector_length();
1462 int64_t sz=int64_t(nsym)*slen*
sizeof(
float64_t);
1467 memset(h_normalizer, 0, slen*
sizeof(
float64_t));
1468 int32_t num_str=get_num_vectors();
1469 for (int32_t i=0; i<num_str; i++)
1473 ST* vec=get_feature_vector(i, len, free_vec);
1474 for (int32_t j=0; j<len; j++)
1476 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1479 free_feature_vector(vec, i, free_vec);
1484 for (int32_t i=0; i<slen; i++)
1486 for (int32_t j=0; j<nsym; j++)
1488 if (h_normalizer && h_normalizer[i])
1489 h[int64_t(i)*nsym+j]/=h_normalizer[i];
1493 SG_FREE(h_normalizer);
1502 ASSERT(rows == get_num_symbols())
1507 for (int32_t i=0; i<num_vec; i++)
1509 sf[i].
string=SG_MALLOC(ST, cols);
1514 for (int32_t j=0; j<cols; j++)
1519 for (c=0; c<rows-1; c++)
1521 if (randoms[j]<=lik)
1523 lik+=hist[int64_t(j)*rows+c+1];
1525 sf[i].
string[j]=alphabet->remap_to_char(c);
1529 set_features(sf, num_vec, cols);
1612 index_t real_idx=m_subset_stack->subset_idx_conversion(indices.
vector[i]);
1618 current_string.
slen*
sizeof(ST));
1619 list_copy.
strings[i]=string_copy;
1629 result->
order=order;
1640 determine_maximum_string_length();
1645 ASSERT(features && num<get_num_vectors())
1647 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1649 len=features[real_num].slen;
1653 ST* target=SG_MALLOC(ST, len);
1654 memcpy(target, features[real_num].
string, len*
sizeof(ST));
1666 length_of_single_string=0;
1667 max_string_length=0;
1669 preprocess_on_get=
false;
1671 symbol_mask_table=NULL;
1672 symbol_mask_table_len=0;
1674 original_num_symbols=0;
1676 m_parameters->add((
CSGObject**) &alphabet,
"alphabet");
1677 m_parameters->add_vector(&features, &num_vectors,
"features",
1678 "This contains the array of features.");
1679 m_parameters->add_vector(&single_string,
1680 &length_of_single_string,
1682 "Created by sliding window.");
1683 m_parameters->add(&max_string_length,
"max_string_length",
1684 "Length of longest string.");
1685 m_parameters->add(&num_symbols,
"num_symbols",
1686 "Number of used symbols.");
1687 m_parameters->add(&original_num_symbols,
"original_num_symbols",
1688 "Original number of used symbols.");
1689 m_parameters->add(&order,
"order",
1690 "Order used in higher order mapping.");
1691 m_parameters->add(&preprocess_on_get,
"preprocess_on_get",
1692 "Preprocess on-the-fly?");
1694 m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len,
"mask_table",
"Symbol mask table - using in higher order mapping");
1913 #define LOAD(f_load, sg_type) \
1914 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1916 SG_INFO("loading...\n") \
1919 SGString<sg_type>* strs; \
1922 loader->f_load(strs, num_str, max_len); \
1923 set_features(strs, num_str, max_len); \
1927 LOAD(get_string_list,
bool)
1928 LOAD(get_string_list,
char)
1929 LOAD(get_string_list, int8_t)
1930 LOAD(get_string_list, uint8_t)
1931 LOAD(get_string_list, int16_t)
1932 LOAD(get_string_list, uint16_t)
1933 LOAD(get_string_list, int32_t)
1934 LOAD(get_string_list, uint32_t)
1935 LOAD(get_string_list, int64_t)
1936 LOAD(get_string_list, uint64_t)
1942 #define SAVE(f_write, sg_type) \
1943 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1945 if (m_subset_stack->has_subsets()) \
1946 SG_ERROR("save() is not possible on subset") \
1949 writer->f_write(features, num_vectors); \
1953 SAVE(set_string_list,
bool)
1954 SAVE(set_string_list,
char)
1955 SAVE(set_string_list, int8_t)
1956 SAVE(set_string_list, uint8_t)
1957 SAVE(set_string_list, int16_t)
1958 SAVE(set_string_list, uint16_t)
1959 SAVE(set_string_list, int32_t)
1960 SAVE(set_string_list, uint32_t)
1961 SAVE(set_string_list, int64_t)
1962 SAVE(set_string_list, uint64_t)
1968 template <
class ST>
template <
class CT>
1970 int32_t p_order, int32_t gap,
bool rev)
1972 remove_all_subsets();
1978 this->order=p_order;
1989 for (int32_t i=0; i<num_vectors; i++)
1996 features[i].string=SG_MALLOC(ST, len);
1997 features[i].slen=len;
1999 ST* str=features[i].string;
2000 for (int32_t j=0; j<len; j++)
2012 num_symbols=original_num_symbols;
2013 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
2017 SG_ERROR(
"symbol does not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val)
2021 SG_DEBUG(
"translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap,
sizeof(ST))
2022 for (int32_t line=0; line<num_vectors; line++)
2026 ST* fv=get_feature_vector(line, len, vfree);
2030 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2032 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2035 features[line].slen-=start+gap ;
2036 if (features[line].slen<0)
2037 features[line].slen=0 ;
2040 compute_symbol_mask_table(max_val);