00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef _KERNEL_H___
00016 #define _KERNEL_H___
00017
00018 #include "lib/common.h"
00019 #include "lib/Signal.h"
00020 #include "lib/File.h"
00021 #include "lib/Mathematics.h"
00022 #include "base/SGObject.h"
00023 #include "features/Features.h"
00024 #include "kernel/KernelNormalizer.h"
00025
00026 #include <vector>
00027 #include <set>
00028 #include <string>
00029
00030 namespace shogun
00031 {
00032 class CFile;
00033 class CFeatures;
00034 class CKernelNormalizer;
00035 enum EFeatureType;
00036 enum EFeatureClass;
00037
00038 #ifdef USE_SHORTREAL_KERNELCACHE
00039 typedef float32_t KERNELCACHE_ELEM;
00040 #else
00041 typedef float64_t KERNELCACHE_ELEM;
00042 #endif
00043
00044 typedef int64_t KERNELCACHE_IDX;
00045
00046
00047 enum EOptimizationType
00048 {
00049 FASTBUTMEMHUNGRY,
00050 SLOWBUTMEMEFFICIENT
00051 };
00052
00053 enum EKernelType
00054 {
00055 K_UNKNOWN = 0,
00056 K_LINEAR = 10,
00057 K_POLY = 20,
00058 K_GAUSSIAN = 30,
00059 K_GAUSSIANSHIFT = 32,
00060 K_GAUSSIANMATCH = 33,
00061 K_HISTOGRAM = 40,
00062 K_SALZBERG = 41,
00063 K_LOCALITYIMPROVED = 50,
00064 K_SIMPLELOCALITYIMPROVED = 60,
00065 K_FIXEDDEGREE = 70,
00066 K_WEIGHTEDDEGREE = 80,
00067 K_WEIGHTEDDEGREEPOS = 81,
00068 K_WEIGHTEDDEGREERBF = 82,
00069 K_WEIGHTEDCOMMWORDSTRING = 90,
00070 K_POLYMATCH = 100,
00071 K_ALIGNMENT = 110,
00072 K_COMMWORDSTRING = 120,
00073 K_COMMULONGSTRING = 121,
00074 K_SPECTRUMMISMATCHRBF = 122,
00075 K_COMBINED = 140,
00076 K_AUC = 150,
00077 K_CUSTOM = 160,
00078 K_SIGMOID = 170,
00079 K_CHI2 = 180,
00080 K_DIAG = 190,
00081 K_CONST = 200,
00082 K_DISTANCE = 220,
00083 K_LOCALALIGNMENT = 230,
00084 K_PYRAMIDCHI2 = 240,
00085 K_OLIGO = 250,
00086 K_MATCHWORD = 260,
00087 K_TPPK = 270,
00088 K_REGULATORYMODULES = 280,
00089 K_SPARSESPATIALSAMPLE = 290,
00090 K_HISTOGRAMINTERSECTION = 300
00091 };
00092
00093 enum EKernelProperty
00094 {
00095 KP_NONE = 0,
00096 KP_LINADD = 1,
00097 KP_KERNCOMBINATION = 2,
00098 KP_BATCHEVALUATION = 4
00099 };
00100
00102 template <class T> struct K_THREAD_PARAM
00103 {
00105 CKernel* kernel;
00107 int32_t start;
00109 int32_t end;
00111 int32_t total_start;
00113 int32_t total_end;
00115 int32_t m;
00117 int32_t n;
00119 T* result;
00121 bool symmetric;
00123 bool verbose;
00124 };
00125
00126 class CSVM;
00127
00153 class CKernel : public CSGObject
00154 {
00155 friend class CVarianceKernelNormalizer;
00156 friend class CSqrtDiagKernelNormalizer;
00157 friend class CAvgDiagKernelNormalizer;
00158 friend class CRidgeKernelNormalizer;
00159 friend class CFirstElementKernelNormalizer;
00160 friend class CMultitaskKernelNormalizer;
00161 friend class CMultitaskKernelMklNormalizer;
00162 friend class CMultitaskKernelMaskNormalizer;
00163 friend class CMultitaskKernelMaskPairNormalizer;
00164 friend class CTanimotoKernelNormalizer;
00165 friend class CDiceKernelNormalizer;
00166 friend class CZeroMeanCenterKernelNormalizer;
00167
00168 public:
00169
00173 CKernel();
00174
00175
00180 CKernel(int32_t size);
00181
00188 CKernel(CFeatures* l, CFeatures* r, int32_t size);
00189
00190 virtual ~CKernel();
00191
00199 inline float64_t kernel(int32_t idx_a, int32_t idx_b)
00200 {
00201 if (idx_a<0 || idx_b<0 || idx_a>=num_lhs || idx_b>=num_rhs)
00202 {
00203 SG_ERROR("Index out of Range: idx_a=%d/%d idx_b=%d/%d\n",
00204 idx_a,num_lhs, idx_b,num_rhs);
00205 }
00206
00207 return normalizer->normalize(compute(idx_a, idx_b), idx_a, idx_b);
00208 }
00209
00216 void get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n);
00217
00218
00224 virtual std::vector<float64_t> get_kernel_col(int32_t j)
00225 {
00226
00227 std::vector<float64_t> col = std::vector<float64_t>(num_rhs);
00228
00229 for (int32_t i=0; i!=num_rhs; i++)
00230 {
00231 col[i] = kernel(i,j);
00232 }
00233
00234 return col;
00235
00236 }
00237
00238
00244 virtual std::vector<float64_t> get_kernel_row(int32_t i)
00245 {
00246
00247 std::vector<float64_t> row = std::vector<float64_t>(num_lhs);
00248
00249 for (int32_t j=0; j!=num_lhs; j++)
00250 {
00251 row[j] = kernel(i,j);
00252 }
00253
00254 return row;
00255
00256 }
00257
00258
00266 template <class T>
00267 T* get_kernel_matrix(int32_t &m, int32_t &n, T* target)
00268 {
00269 T* result = NULL;
00270
00271 if (!has_features())
00272 SG_ERROR( "no features assigned to kernel\n");
00273
00274 if (target && (m!=get_num_vec_lhs() ||
00275 n!=get_num_vec_rhs()) )
00276 {
00277 SG_ERROR( "kernel matrix size mismatch\n");
00278 }
00279
00280 m=get_num_vec_lhs();
00281 n=get_num_vec_rhs();
00282
00283 int64_t total_num = int64_t(m)*n;
00284
00285
00286 bool symmetric= (lhs && lhs==rhs && m==n);
00287
00288 SG_DEBUG( "returning kernel matrix of size %dx%d\n", m, n);
00289
00290 if (target)
00291 result=target;
00292 else
00293 result=new T[total_num];
00294
00295 int32_t num_threads=parallel->get_num_threads();
00296 if (num_threads < 2)
00297 {
00298 K_THREAD_PARAM<T> params;
00299 params.kernel=this;
00300 params.result=result;
00301 params.start=0;
00302 params.end=m;
00303 params.total_start=0;
00304 params.total_end=total_num;
00305 params.n=n;
00306 params.m=m;
00307 params.symmetric=symmetric;
00308 params.verbose=true;
00309 get_kernel_matrix_helper<T>((void*) ¶ms);
00310 }
00311 else
00312 {
00313 pthread_t* threads = new pthread_t[num_threads-1];
00314 K_THREAD_PARAM<T>* params = new K_THREAD_PARAM<T>[num_threads];
00315 int64_t step= total_num/num_threads;
00316
00317 int32_t t;
00318
00319 for (t=0; t<num_threads-1; t++)
00320 {
00321 params[t].kernel = this;
00322 params[t].result = result;
00323 params[t].start = compute_row_start(t*step, n, symmetric);
00324 params[t].end = compute_row_start((t+1)*step, n, symmetric);
00325 params[t].total_start=t*step;
00326 params[t].total_end=(t+1)*step;
00327 params[t].n=n;
00328 params[t].m=m;
00329 params[t].symmetric=symmetric;
00330 params[t].verbose=false;
00331 pthread_create(&threads[t], NULL,
00332 CKernel::get_kernel_matrix_helper<T>, (void*)¶ms[t]);
00333 }
00334
00335 params[t].kernel = this;
00336 params[t].result = result;
00337 params[t].start = compute_row_start(t*step, n, symmetric);
00338 params[t].end = m;
00339 params[t].total_start=t*step;
00340 params[t].total_end=total_num;
00341 params[t].n=n;
00342 params[t].m=m;
00343 params[t].symmetric=symmetric;
00344 params[t].verbose=true;
00345 get_kernel_matrix_helper<T>(¶ms[t]);
00346
00347 for (t=0; t<num_threads-1; t++)
00348 pthread_join(threads[t], NULL);
00349
00350 delete[] params;
00351 delete[] threads;
00352 }
00353
00354 SG_DONE();
00355
00356 return result;
00357 }
00358
00359
00370 virtual bool init(CFeatures* lhs, CFeatures* rhs);
00371
00376 virtual bool set_normalizer(CKernelNormalizer* normalizer);
00377
00382 virtual CKernelNormalizer* get_normalizer();
00383
00387 virtual bool init_normalizer();
00388
00395 virtual void cleanup();
00396
00401 void load(CFile* loader);
00402
00407 void save(CFile* writer);
00408
00413 inline CFeatures* get_lhs() { SG_REF(lhs); return lhs; }
00414
00419 inline CFeatures* get_rhs() { SG_REF(rhs); return rhs; }
00420
00425 virtual inline int32_t get_num_vec_lhs()
00426 {
00427 return num_lhs;
00428 }
00429
00434 virtual inline int32_t get_num_vec_rhs()
00435 {
00436 return num_rhs;
00437 }
00438
00443 virtual inline bool has_features()
00444 {
00445 return lhs && rhs;
00446 }
00447
00452 inline bool get_lhs_equals_rhs()
00453 {
00454 return lhs_equals_rhs;
00455 }
00456
00458 virtual void remove_lhs_and_rhs();
00459
00461 virtual void remove_lhs();
00462
00464 virtual void remove_rhs();
00465
00473 virtual EKernelType get_kernel_type()=0 ;
00474
00481 virtual EFeatureType get_feature_type()=0;
00482
00489 virtual EFeatureClass get_feature_class()=0;
00490
00495 inline void set_cache_size(int32_t size)
00496 {
00497 cache_size = size;
00498 #ifdef USE_SVMLIGHT
00499 cache_reset();
00500 #endif //USE_SVMLIGHT
00501 }
00502
00507 inline int32_t get_cache_size() { return cache_size; }
00508
00509 #ifdef USE_SVMLIGHT
00510
00511 inline void cache_reset() { resize_kernel_cache(cache_size); }
00512
00517 inline int32_t get_max_elems_cache() { return kernel_cache.max_elems; }
00518
00523 inline int32_t get_activenum_cache() { return kernel_cache.activenum; }
00524
00532 void get_kernel_row(
00533 int32_t docnum, int32_t *active2dnum, float64_t *buffer,
00534 bool full_line=false);
00535
00540 void cache_kernel_row(int32_t x);
00541
00547 void cache_multiple_kernel_rows(int32_t* key, int32_t varnum);
00548
00550 void kernel_cache_reset_lru();
00551
00558 void kernel_cache_shrink(
00559 int32_t totdoc, int32_t num_shrink, int32_t *after);
00560
00566 void resize_kernel_cache(KERNELCACHE_IDX size,
00567 bool regression_hack=false);
00568
00573 inline void set_time(int32_t t)
00574 {
00575 kernel_cache.time=t;
00576 }
00577
00583 inline int32_t kernel_cache_touch(int32_t cacheidx)
00584 {
00585 if(kernel_cache.index[cacheidx] != -1)
00586 {
00587 kernel_cache.lru[kernel_cache.index[cacheidx]]=kernel_cache.time;
00588 return(1);
00589 }
00590 return(0);
00591 }
00592
00598 inline int32_t kernel_cache_check(int32_t cacheidx)
00599 {
00600 return(kernel_cache.index[cacheidx] >= 0);
00601 }
00602
00607 inline int32_t kernel_cache_space_available()
00608 {
00609 return(kernel_cache.elems < kernel_cache.max_elems);
00610 }
00611
00617 void kernel_cache_init(int32_t size, bool regression_hack=false);
00618
00620 void kernel_cache_cleanup();
00621
00622 #endif //USE_SVMLIGHT
00623
00625 void list_kernel();
00626
00632 inline bool has_property(EKernelProperty p) { return (properties & p) != 0; }
00633
00637 virtual void clear_normal();
00638
00644 virtual void add_to_normal(int32_t vector_idx, float64_t weight);
00645
00650 inline EOptimizationType get_optimization_type() { return opt_type; }
00651
00656 virtual inline void set_optimization_type(EOptimizationType t) { opt_type=t;}
00657
00662 inline bool get_is_initialized() { return optimization_initialized; }
00663
00671 virtual bool init_optimization(
00672 int32_t count, int32_t *IDX, float64_t *weights);
00673
00678 virtual bool delete_optimization();
00679
00685 bool init_optimization_svm(CSVM * svm) ;
00686
00692 virtual float64_t compute_optimized(int32_t vector_idx);
00693
00702 virtual void compute_batch(
00703 int32_t num_vec, int32_t* vec_idx, float64_t* target,
00704 int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00705 float64_t factor=1.0);
00706
00711 inline float64_t get_combined_kernel_weight() { return combined_kernel_weight; }
00712
00717 inline void set_combined_kernel_weight(float64_t nw) { combined_kernel_weight=nw; }
00718
00723 virtual int32_t get_num_subkernels();
00724
00730 virtual void compute_by_subkernel(
00731 int32_t vector_idx, float64_t * subkernel_contrib);
00732
00738 virtual const float64_t* get_subkernel_weights(int32_t& num_weights);
00739
00745 virtual void set_subkernel_weights(
00746 float64_t* weights, int32_t num_weights);
00747
00748 protected:
00753 inline void set_property(EKernelProperty p)
00754 {
00755 properties |= p;
00756 }
00757
00762 inline void unset_property(EKernelProperty p)
00763 {
00764 properties &= (properties | p) ^ p;
00765 }
00766
00771 inline void set_is_initialized(bool p_init) { optimization_initialized=p_init; }
00772
00783 virtual float64_t compute(int32_t x, int32_t y)=0;
00784
00791 int32_t compute_row_start(int64_t offs, int32_t n, bool symmetric)
00792 {
00793 int32_t i_start;
00794
00795 if (symmetric)
00796 i_start=(int32_t) CMath::floor(n-CMath::sqrt(CMath::sq((float64_t) n)-offs));
00797 else
00798 i_start=(int32_t) (offs/int64_t(n));
00799
00800 return i_start;
00801 }
00802
00807 template <class T>
00808 static void* get_kernel_matrix_helper(void* p)
00809 {
00810 K_THREAD_PARAM<T>* params= (K_THREAD_PARAM<T>*) p;
00811 int32_t i_start=params->start;
00812 int32_t i_end=params->end;
00813 CKernel* k=params->kernel;
00814 T* result=params->result;
00815 bool symmetric=params->symmetric;
00816 int32_t n=params->n;
00817 int32_t m=params->m;
00818 bool verbose=params->verbose;
00819 int64_t total_start=params->total_start;
00820 int64_t total_end=params->total_end;
00821 int64_t total=total_start;
00822
00823 for (int32_t i=i_start; i<i_end; i++)
00824 {
00825 int32_t j_start=0;
00826
00827 if (symmetric)
00828 j_start=i;
00829
00830 for (int32_t j=j_start; j<n; j++)
00831 {
00832 float64_t v=k->kernel(i,j);
00833 result[i+j*m]=v;
00834
00835 if (symmetric && i!=j)
00836 result[j+i*m]=v;
00837
00838 if (verbose)
00839 {
00840 total++;
00841
00842 if (symmetric && i!=j)
00843 total++;
00844
00845 if (total%100 == 0)
00846 k->SG_PROGRESS(total, total_start, total_end);
00847
00848 if (CSignal::cancel_computations())
00849 break;
00850 }
00851 }
00852
00853 }
00854
00855 return NULL;
00856 }
00857
00866 virtual void load_serializable_post() throw (ShogunException);
00867
00876 virtual void save_serializable_pre() throw (ShogunException);
00877
00886 virtual void save_serializable_post() throw (ShogunException);
00887
00888 private:
00891 void init();
00892
00893
00894 #ifdef USE_SVMLIGHT
00895
00896 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00897 struct KERNEL_CACHE {
00899 int32_t *index;
00901 int32_t *invindex;
00903 int32_t *active2totdoc;
00905 int32_t *totdoc2active;
00907 int32_t *lru;
00909 int32_t *occu;
00911 int32_t elems;
00913 int32_t max_elems;
00915 int32_t time;
00917 int32_t activenum;
00918
00920 KERNELCACHE_ELEM *buffer;
00922 KERNELCACHE_IDX buffsize;
00923 };
00924
00926 struct S_KTHREAD_PARAM
00927 {
00929 CKernel* kernel;
00931 KERNEL_CACHE* kernel_cache;
00933 KERNELCACHE_ELEM** cache;
00935 int32_t* uncached_rows;
00937 int32_t num_uncached;
00939 uint8_t* needs_computation;
00941 int32_t start;
00943 int32_t end;
00945 int32_t num_vectors;
00946 };
00947 #endif // DOXYGEN_SHOULD_SKIP_THIS
00948
00950 static void* cache_multiple_kernel_row_helper(void* p);
00951
00953 void kernel_cache_free(int32_t cacheidx);
00954 int32_t kernel_cache_malloc();
00955 int32_t kernel_cache_free_lru();
00956 KERNELCACHE_ELEM *kernel_cache_clean_and_malloc(int32_t cacheidx);
00957 #endif //USE_SVMLIGHT
00958
00959
00960 protected:
00962 int32_t cache_size;
00963
00964 #ifdef USE_SVMLIGHT
00965
00966 KERNEL_CACHE kernel_cache;
00967 #endif //USE_SVMLIGHT
00968
00971 KERNELCACHE_ELEM* kernel_matrix;
00972
00974 CFeatures* lhs;
00976 CFeatures* rhs;
00977
00979 bool lhs_equals_rhs;
00980
00982 int32_t num_lhs;
00984 int32_t num_rhs;
00985
00987 float64_t combined_kernel_weight;
00988
00990 bool optimization_initialized;
00994 EOptimizationType opt_type;
00995
00997 uint64_t properties;
00998
01001 CKernelNormalizer* normalizer;
01002 };
01003
01004 }
01005 #endif