00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _WEIGHTEDDEGREESTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREESTRINGKERNEL_H___
00014
00015 #include <shogun/lib/common.h>
00016 #include <shogun/lib/Trie.h>
00017 #include <shogun/kernel/StringKernel.h>
00018 #include <shogun/kernel/MultitaskKernelMklNormalizer.h>
00019 #include <shogun/features/StringFeatures.h>
00020
00021 namespace shogun
00022 {
00023
/** Selects the weighting scheme of the weighted degree kernel.
 *
 * Every enumerator carries an explicitly assigned integer value.
 * NOTE(review): the E_BLOCK_* names mirror the init_block_weights_*()
 * members of CWeightedDegreeStringKernel; the exact mapping lives in the
 * implementation file and should be confirmed there.
 */
enum EWDKernType
{
    /** plain weighted degree weighting */
    E_WD              = 0,
    /** presumably externally supplied weights -- confirm against set_weights() */
    E_EXTERNAL        = 1,

    /** block weights: constant */
    E_BLOCK_CONST     = 2,
    /** block weights: linear */
    E_BLOCK_LINEAR    = 3,
    /** block weights: squared polynomial */
    E_BLOCK_SQPOLY    = 4,
    /** block weights: cubic polynomial */
    E_BLOCK_CUBICPOLY = 5,
    /** block weights: exponential */
    E_BLOCK_EXP       = 6,
    /** block weights: logarithmic */
    E_BLOCK_LOG       = 7
};
00037
00038
00053 class CWeightedDegreeStringKernel: public CStringKernel<char>
00054 {
00055 public:
00056
00060 CWeightedDegreeStringKernel();
00061
00062
00068 CWeightedDegreeStringKernel(int32_t degree, EWDKernType type=E_WD);
00069
00075 CWeightedDegreeStringKernel(float64_t* weights, int32_t degree);
00076
00083 CWeightedDegreeStringKernel(
00084 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree);
00085
00086 virtual ~CWeightedDegreeStringKernel();
00087
00094 virtual bool init(CFeatures* l, CFeatures* r);
00095
00097 virtual void cleanup();
00098
00106 EWDKernType get_type() const
00107 {
00108 return type;
00109 }
00110
00115 virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREE; }
00116
00121 virtual const char* get_name() const {
00122 return "WeightedDegreeStringKernel";
00123 }
00124
00132 inline virtual bool init_optimization(
00133 int32_t count, int32_t *IDX, float64_t* alphas)
00134 {
00135 return init_optimization(count, IDX, alphas, -1);
00136 }
00137
00148 virtual bool init_optimization(
00149 int32_t count, int32_t *IDX, float64_t* alphas, int32_t tree_num);
00150
00155 virtual bool delete_optimization();
00156
00162 virtual float64_t compute_optimized(int32_t idx)
00163 {
00164 if (get_is_initialized())
00165 return compute_by_tree(idx);
00166
00167 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00168 return 0;
00169 }
00170
00175 static void* compute_batch_helper(void* p);
00176
00187 virtual void compute_batch(
00188 int32_t num_vec, int32_t* vec_idx, float64_t* target,
00189 int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00190 float64_t factor=1.0);
00191
00195 inline virtual void clear_normal()
00196 {
00197 if (get_is_initialized())
00198 {
00199
00200 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00201 SG_ERROR("not implemented");
00202
00203 tries->delete_trees(max_mismatch==0);
00204 set_is_initialized(false);
00205 }
00206 }
00207
00213 inline virtual void add_to_normal(int32_t idx, float64_t weight)
00214 {
00215
00216 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00217 SG_ERROR("not implemented");
00218
00219 if (max_mismatch==0)
00220 add_example_to_tree(idx, weight);
00221 else
00222 add_example_to_tree_mismatch(idx, weight);
00223
00224 set_is_initialized(true);
00225 }
00226
00231 inline virtual int32_t get_num_subkernels()
00232 {
00233 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00234 return ((CMultitaskKernelMklNormalizer*)normalizer)->get_num_betas();
00235 if (position_weights!=NULL)
00236 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ;
00237 if (length==0)
00238 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize);
00239 return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00240 }
00241
00247 inline void compute_by_subkernel(
00248 int32_t idx, float64_t * subkernel_contrib)
00249 {
00250
00251 if (get_is_initialized())
00252 {
00253
00254 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00255 SG_ERROR("not implemented");
00256
00257 compute_by_tree(idx, subkernel_contrib);
00258 return ;
00259 }
00260
00261 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00262 }
00263
00269 inline const float64_t* get_subkernel_weights(int32_t& num_weights)
00270 {
00271
00272 num_weights = get_num_subkernels();
00273
00274 SG_FREE(weights_buffer);
00275 weights_buffer = SG_MALLOC(float64_t, num_weights);
00276
00277 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00278 for (int32_t i=0; i<num_weights; i++)
00279 weights_buffer[i] = ((CMultitaskKernelMklNormalizer*)normalizer)->get_beta(i);
00280 else if (position_weights!=NULL)
00281 for (int32_t i=0; i<num_weights; i++)
00282 weights_buffer[i] = position_weights[i*mkl_stepsize];
00283 else
00284 for (int32_t i=0; i<num_weights; i++)
00285 weights_buffer[i] = weights[i*mkl_stepsize];
00286
00287 return weights_buffer;
00288 }
00289
00295 inline void set_subkernel_weights(
00296 float64_t* weights2, int32_t num_weights2)
00297 {
00298 int32_t num_weights = get_num_subkernels();
00299 if (num_weights!=num_weights2)
00300 SG_ERROR( "number of weights do not match\n");
00301
00302
00303 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00304 for (int32_t i=0; i<num_weights; i++)
00305 ((CMultitaskKernelMklNormalizer*)normalizer)->set_beta(i, weights2[i]);
00306 else if (position_weights!=NULL)
00307 {
00308 for (int32_t i=0; i<num_weights; i++)
00309 {
00310 for (int32_t j=0; j<mkl_stepsize; j++)
00311 {
00312 if (i*mkl_stepsize+j<seq_length)
00313 position_weights[i*mkl_stepsize+j] = weights2[i];
00314 }
00315 }
00316 }
00317 else if (length==0)
00318 {
00319 for (int32_t i=0; i<num_weights; i++)
00320 {
00321 for (int32_t j=0; j<mkl_stepsize; j++)
00322 {
00323 if (i*mkl_stepsize+j<get_degree())
00324 weights[i*mkl_stepsize+j] = weights2[i];
00325 }
00326 }
00327 }
00328 else
00329 {
00330 for (int32_t i=0; i<num_weights; i++)
00331 {
00332 for (int32_t j=0; j<mkl_stepsize; j++)
00333 {
00334 if (i*mkl_stepsize+j<get_degree()*length)
00335 weights[i*mkl_stepsize+j] = weights2[i];
00336 }
00337 }
00338 }
00339 }
00340
00345 virtual bool set_normalizer(CKernelNormalizer* normalizer_) {
00346
00347 if (normalizer_ && strcmp(normalizer_->get_name(),"MultitaskKernelTreeNormalizer")==0) {
00348 unset_property(KP_LINADD);
00349 unset_property(KP_BATCHEVALUATION);
00350 }
00351 else
00352 {
00353 set_property(KP_LINADD);
00354 set_property(KP_BATCHEVALUATION);
00355 }
00356
00357
00358 return CStringKernel<char>::set_normalizer(normalizer_);
00359
00360 }
00361
00362
00368 float64_t *compute_abs_weights(int32_t & len);
00369
00376 void compute_by_tree(int32_t idx, float64_t *LevelContrib);
00377
00382 bool is_tree_initialized() { return tree_initialized; }
00383
00389 inline float64_t *get_degree_weights(int32_t& d, int32_t& len)
00390 {
00391 d=degree;
00392 len=length;
00393 return weights;
00394 }
00395
00401 inline float64_t *get_weights(int32_t& num_weights)
00402 {
00403
00404 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00405 SG_ERROR("not implemented");
00406
00407 if (position_weights!=NULL)
00408 {
00409 num_weights = seq_length ;
00410 return position_weights ;
00411 }
00412 if (length==0)
00413 num_weights = degree ;
00414 else
00415 num_weights = degree*length ;
00416 return weights;
00417 }
00418
00424 inline float64_t *get_position_weights(int32_t& len)
00425 {
00426 len=seq_length;
00427 return position_weights;
00428 }
00429
00435 bool set_wd_weights_by_type(EWDKernType type);
00436
00441 inline void set_wd_weights(SGVector<float64_t> new_weights)
00442 {
00443 set_weights(SGMatrix<float64_t>(new_weights.vector,new_weights.vlen,0));
00444 }
00445
00450 bool set_weights(SGMatrix<float64_t> new_weights);
00451
00458 bool set_position_weights(float64_t* pws, int32_t len);
00459
00464 bool init_block_weights();
00465
00470 bool init_block_weights_from_wd();
00471
00476 bool init_block_weights_from_wd_external();
00477
00482 bool init_block_weights_const();
00483
00488 bool init_block_weights_linear();
00489
00494 bool init_block_weights_sqpoly();
00495
00500 bool init_block_weights_cubicpoly();
00501
00506 bool init_block_weights_exp();
00507
00512 bool init_block_weights_log();
00513
00518 bool delete_position_weights()
00519 {
00520 SG_FREE(position_weights);
00521 position_weights=NULL;
00522 return true;
00523 }
00524
00530 bool set_max_mismatch(int32_t max);
00531
00536 inline int32_t get_max_mismatch() const { return max_mismatch; }
00537
00543 inline bool set_degree(int32_t deg) { degree=deg; return true; }
00544
00549 inline int32_t get_degree() const { return degree; }
00550
00556 inline bool set_use_block_computation(bool block)
00557 {
00558 block_computation=block;
00559 return true;
00560 }
00561
00566 inline bool get_use_block_computation() { return block_computation; }
00567
00573 inline bool set_mkl_stepsize(int32_t step)
00574 {
00575 if (step<1)
00576 SG_ERROR("Stepsize must be a positive integer\n");
00577 mkl_stepsize=step;
00578 return true;
00579 }
00580
00585 inline int32_t get_mkl_stepsize() { return mkl_stepsize; }
00586
00592 inline bool set_which_degree(int32_t which)
00593 {
00594 which_degree=which;
00595 return true;
00596 }
00597
00602 inline int32_t get_which_degree() { return which_degree; }
00603
00604 protected:
00606 void create_empty_tries();
00607
00613 void add_example_to_tree(int32_t idx, float64_t weight);
00614
00621 void add_example_to_single_tree(
00622 int32_t idx, float64_t weight, int32_t tree_num);
00623
00629 void add_example_to_tree_mismatch(int32_t idx, float64_t weight);
00630
00637 void add_example_to_single_tree_mismatch(
00638 int32_t idx, float64_t weight, int32_t tree_num);
00639
00645 float64_t compute_by_tree(int32_t idx);
00646
00655 float64_t compute(int32_t idx_a, int32_t idx_b);
00656
00665 float64_t compute_with_mismatch(
00666 char* avec, int32_t alen, char* bvec, int32_t blen);
00667
00676 float64_t compute_without_mismatch(
00677 char* avec, int32_t alen, char* bvec, int32_t blen);
00678
00687 float64_t compute_without_mismatch_matrix(
00688 char* avec, int32_t alen, char* bvec, int32_t blen);
00689
00698 float64_t compute_using_block(char* avec, int32_t alen,
00699 char* bvec, int32_t blen);
00700
00702 virtual void remove_lhs();
00703
00704 private:
00707 void init();
00708
00709 protected:
00713 float64_t* weights;
00715 int32_t weights_degree;
00717 int32_t weights_length;
00718
00719
00721 float64_t* position_weights;
00723 int32_t position_weights_len;
00725 float64_t* weights_buffer;
00727 int32_t mkl_stepsize;
00729 int32_t degree;
00731 int32_t length;
00732
00734 int32_t max_mismatch;
00736 int32_t seq_length;
00737
00739 bool initialized;
00740
00742 bool block_computation;
00743
00745 float64_t* block_weights;
00747 EWDKernType type;
00749 int32_t which_degree;
00750
00752 CTrie<DNATrie>* tries;
00753
00755 bool tree_initialized;
00756
00758 CAlphabet* alphabet;
00759 };
00760
00761 }
00762
00763 #endif