51 SG_DEBUG(
"using dictionary of %d words\n", size)
97 ASSERT((1<<(
sizeof(uint16_t)*8)) > alen)
103 memset(dic, 0, num_symbols*
sizeof(int32_t));
105 for (int32_t i=0; i<alen; i++)
118 for (int32_t i=0; i<num_symbols; i++)
121 result+=dic[i]*dic[i];
130 int32_t idx_a, int32_t idx_b,
bool do_sort)
133 bool free_av, free_bv;
148 avec=SG_MALLOC(uint16_t, alen);
149 memcpy(avec, av,
sizeof(uint16_t)*alen);
157 bvec=SG_MALLOC(uint16_t, blen);
158 memcpy(bvec, bv,
sizeof(uint16_t)*blen);
169 SG_ERROR(
"not all preprocessors have been applied to training (%d/%d)"
182 while (left_idx < alen && right_idx < blen)
184 if (avec[left_idx]==bvec[right_idx])
186 uint16_t sym=avec[left_idx];
188 while (left_idx< alen && avec[left_idx]==sym)
191 while (right_idx< blen && bvec[right_idx]==sym)
196 else if (avec[left_idx]<bvec[right_idx])
204 while (left_idx < alen && right_idx < blen)
206 if (avec[left_idx]==bvec[right_idx])
208 int32_t old_left_idx=left_idx;
209 int32_t old_right_idx=right_idx;
211 uint16_t sym=avec[left_idx];
213 while (left_idx< alen && avec[left_idx]==sym)
216 while (right_idx< blen && bvec[right_idx]==sym)
219 result+=((
float64_t) (left_idx-old_left_idx))*
222 else if (avec[left_idx]<bvec[right_idx])
246 get_feature_vector(vec_idx, len, free_vec);
253 for (j=1; j<len; j++)
255 if (vec[j]==vec[j-1])
259 normalize_lhs(weight, vec_idx);
263 normalize_lhs(weight, vec_idx);
267 for (j=1; j<len; j++)
269 if (vec[j]==vec[j-1])
273 normalize_lhs(weight*(j-last_j), vec_idx);
278 normalize_lhs(weight*(len-last_j), vec_idx);
293 int32_t count, int32_t* IDX,
float64_t* weights)
304 SG_DEBUG(
"initializing CCommWordStringKernel optimization\n")
306 for (int32_t i=0; i<count; i++)
308 if ( (i % (count/10+1)) == 0)
320 SG_DEBUG(
"deleting CCommWordStringKernel optimization\n")
330 SG_ERROR(
"CCommWordStringKernel optimization not initialized\n")
338 get_feature_vector(i, len, free_vec);
345 for (j=1; j<len; j++)
347 if (vec[j]==vec[j-1])
357 for (j=1; j<len; j++)
359 if (vec[j]==vec[j-1])
376 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
float64_t* target,
377 int32_t num_suppvec, int32_t* IDX,
float64_t* alphas,
bool do_init)
393 for (int32_t i=0; i<order; i++)
396 SG_DEBUG(
"num_words:%d, order:%d, len:%d sz:%d (len*sz:%d)\n", num_words, order,
397 num_feat, num_sym, num_feat*num_sym);
400 target=SG_MALLOC(
float64_t, num_feat*num_sym);
401 memset(target, 0, num_feat*num_sym*
sizeof(
float64_t));
406 uint32_t kmer_mask=0;
407 uint32_t words=
CMath::pow((int32_t) num_words,(int32_t) order);
409 for (int32_t o=0; o<max_degree; o++)
412 offset+=
CMath::pow((int32_t) num_words,(int32_t) o+1);
416 for (int32_t p=-o; p<order; p++)
418 int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
419 uint32_t imer_mask=kmer_mask;
420 uint32_t jmer_mask=kmer_mask;
439 imer_mask=(kmer_mask>>(num_bits*o_sym));
440 jmer_mask=(kmer_mask>>(num_bits*jl));
444 1.0/
CMath::pow((int32_t) num_words,(int32_t) m_sym);
446 for (uint32_t i=0; i<words; i++)
448 uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
450 if (p>=0 && p<order-o)
453 #ifdef DEBUG_COMMSCORING
454 SG_PRINT(
"o=%d/%d p=%d/%d i=0x%x x=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d marg=%g o_sym:%d m_sym:%d weight(",
455 o,order, p,order, i, x, imer_mask, jmer_mask, kmer_mask, il, ir, marginalizer, o_sym, m_sym);
457 SG_PRINT(
"%c%c%c%c/%c%c%c%c)+=%g/%g\n",
468 for (uint32_t j=0; j< (uint32_t)
CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
470 uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
471 #ifdef DEBUG_COMMSCORING
473 SG_PRINT(
"o=%d/%d p=%d/%d i=0x%x j=0x%x x=0x%x c=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d jl=%d marg=%g o_sym:%d m_sym:%d weight(",
474 o,order, p,order, i, j, x, c, imer_mask, jmer_mask, kmer_mask, il, ir, jl, marginalizer, o_sym, m_sym);
475 SG_PRINT(
"%c%c%c%c/%c%c%c%c)+=%g/%g\n",
489 for (int32_t i=1; i<num_feat; i++)
490 memcpy(&target[num_sym*i], target, num_sym*
sizeof(
float64_t));
499 int32_t &result_len, int32_t num_suppvec, int32_t* IDX,
float64_t* alphas)
508 int64_t total_len=((int64_t) num_feat) * num_words;
515 result_len=num_feat+order-1;
520 char* result=SG_MALLOC(
char, result_len);
521 int32_t* bt=SG_MALLOC(int32_t, total_len);
524 for (int64_t i=0; i<total_len; i++)
530 for (int32_t t=0; t<num_words; t++)
534 for (int32_t i=1; i<num_feat; i++)
536 for (int32_t t1=0; t1<num_words; t1++)
550 uint16_t suffix=(uint16_t) t1 >> num_bits;
554 uint16_t t=suffix | sym << (num_bits*(order-1));
560 if (sc > max_score || max_idx==-1)
568 score[num_words*i + t1]=max_score;
569 bt[num_words*i + t1]=max_idx;
575 max_score=score[num_words*(num_feat-1) + 0];
576 for (int32_t t=1; t<num_words; t++)
578 float64_t sc=score[num_words*(num_feat-1) + t];
586 SG_DEBUG(
"max_idx:%i, max_score:%f\n", max_idx, max_score)
588 for (int32_t i=result_len-1; i>=num_feat; i--)
591 for (int32_t i=num_feat-1; i>=0; i--)
594 max_idx=bt[num_words*i + max_idx];
603 void CCommWordStringKernel::init()
617 "Dictionary for applying kernel.");
619 "If signum(counts) is used instead of counts.",
MS_AVAILABLE);
621 "use_dict_diagonal_optimization",
"If K(x,x) is computed potentially "
virtual int32_t get_max_vector_length()
SGVector< ST > get_feature_vector(int32_t num)
virtual float64_t compute_optimized(int32_t idx)
virtual bool init_optimization(int32_t count, int32_t *IDX, float64_t *weights)
int32_t * dict_diagonal_optimization
virtual bool init_dictionary(int32_t size)
virtual bool set_normalizer(CKernelNormalizer *normalizer)
EAlphabet get_alphabet() const
virtual float64_t normalize_rhs(float64_t value, int32_t idx_rhs)=0
void set_is_initialized(bool p_init)
The class Alphabet implements an alphabet and alphabet utility functions.
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
floatmax_t get_original_num_symbols()
float64_t * dictionary_weights
virtual float64_t compute_helper(int32_t idx_a, int32_t idx_b, bool do_sort)
bool get_is_initialized()
int32_t get_num_preprocessors() const
virtual bool delete_optimization()
ST get_masked_symbols(ST symbol, uint8_t mask)
floatmax_t get_num_symbols()
virtual float64_t compute_diag(int32_t idx_a)
virtual ~CCommWordStringKernel()
CAlphabet * get_alphabet()
virtual bool init(CFeatures *l, CFeatures *r)
virtual void clear_normal()
int32_t get_num_bits() const
virtual bool init_normalizer()
CFeatures * rhs
feature vectors to occur on right hand side
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
all of classes and functions are contained in the shogun namespace
virtual float64_t * compute_scoring(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *target, int32_t num_suppvec, int32_t *IDX, float64_t *alphas, bool do_init=true)
int32_t get_num_preprocessed() const
CFeatures * lhs
feature vectors to occur on left hand side
The class Features is the base class of all feature objects.
char * compute_consensus(int32_t &num_feat, int32_t num_suppvec, int32_t *IDX, float64_t *alphas)
CKernelNormalizer * normalizer
uint8_t remap_to_char(uint8_t c)
bool use_dict_diagonal_optimization
static void radix_sort(T *array, int32_t size)
virtual void add_to_normal(int32_t idx, float64_t weight)
Template class StringKernel, is the base class of all String Kernels.
static int32_t pow(bool x, int32_t n)
friend class CSqrtDiagKernelNormalizer