00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
#include <shogun/kernel/DistantSegmentsKernel.h>
#include <string>
#include <vector>
00014
00015 using namespace shogun;
00016
00017 CDistantSegmentsKernel::CDistantSegmentsKernel() : CStringKernel<char>(),
00018 m_delta(0), m_theta(0)
00019 {
00020 init();
00021 }
00022
00023 CDistantSegmentsKernel::CDistantSegmentsKernel(int32_t size, int32_t delta,
00024 int32_t theta) : CStringKernel<char>(), m_delta(delta), m_theta(theta)
00025 {
00026 init();
00027 }
00028
00029 CDistantSegmentsKernel::CDistantSegmentsKernel(CStringFeatures<char>* l,
00030 CStringFeatures<char>* r, int32_t size, int32_t delta, int32_t theta) :
00031 CStringKernel<char>(), m_delta(delta), m_theta(theta)
00032 {
00033 init();
00034 CStringKernel<char>::init(l, r);
00035 }
00036
00037 bool CDistantSegmentsKernel::init(CFeatures* l, CFeatures* r)
00038 {
00039 CKernel::init(l, r);
00040 return init_normalizer();
00041 }
00042
00043 void CDistantSegmentsKernel::init()
00044 {
00045 SG_ADD(&m_delta, "delta", "Delta parameter of the DS-Kernel", MS_AVAILABLE);
00046 SG_ADD(&m_theta, "theta", "Theta parameter of the DS-Kernel", MS_AVAILABLE);
00047 }
00048
00049 float64_t CDistantSegmentsKernel::compute(int32_t idx_a, int32_t idx_b)
00050 {
00051 bool free_a, free_b;
00052 int32_t aLength=0, bLength=0;
00053 char* a=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, aLength,
00054 free_a);
00055 char* b=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, bLength,
00056 free_b);
00057 ASSERT(a && b);
00058
00059 if ((aLength<1)||(bLength<1))
00060 SG_ERROR("Empty sequences");
00061
00062 float64_t result=compute(a, aLength, b, bLength, m_delta, m_theta);
00063
00064 ((CStringFeatures<char>*) lhs)->free_feature_vector(a, idx_a, free_a);
00065 ((CStringFeatures<char>*) rhs)->free_feature_vector(b, idx_b, free_b);
00066
00067 return result;
00068 }
00069
00070 int32_t CDistantSegmentsKernel::bin(int32_t j, int32_t i)
00071 {
00072 if (i>j)
00073 return 0;
00074 if (i==3 && j>=3)
00075 {
00076 return j*(j-1)*(j-2)/6;
00077 }
00078 else if (i==2 && j>=2)
00079 {
00080 return j*(j-1)/2;
00081 }
00082 return 0;
00083 }
00084
00085 int32_t CDistantSegmentsKernel::compute(char* s, int32_t sLength, char* t,
00086 int32_t tLength, int32_t delta_m, int32_t theta_m)
00087 {
00088 int32_t c=0;
00089 int32_t* i_=SG_MALLOC(int32_t, delta_m+1);
00090 int32_t* l_=SG_MALLOC(int32_t, delta_m+1);
00091 for (int32_t j_s=0; j_s<=(int32_t) sLength-1; j_s++)
00092 {
00093 for (int32_t j_t=0; j_t<=(int32_t) tLength-1; j_t++)
00094 {
00095 if (s[j_s-1+1]==t[j_t-1+1])
00096 {
00097 int32_t n=CMath::min(CMath::min(sLength-j_s, tLength-j_t), delta_m);
00098 int32_t k=-1;
00099 int32_t i=1;
00100 while (i<=n)
00101 {
00102 k++;
00103 i_[2*k]=i;
00104 i++;
00105 while (i<=n&&s[j_s-1+i]==t[j_t-1+i])
00106 i++;
00107 i_[2*k+1]=i;
00108 l_[k]=i_[2*k+1]-i_[2*k]+1;
00109 i++;
00110 while (i<=n&&s[j_s-1+i]!=t[j_t-1+i])
00111 i++;
00112 }
00113 c+=bin(l_[0], 3)-2*bin(l_[0]-theta_m, 3)
00114 +bin(l_[0]-2*theta_m, 3);
00115 int32_t c1=0;
00116 for (int32_t r=1; r<=k; r++)
00117 {
00118 c1+=bin(l_[r], 2)-bin(l_[r]-theta_m, 2);
00119 }
00120 c+=CMath::min(theta_m, i_[1]-i_[0])*c1;
00121 }
00122 }
00123 }
00124 delete l_;
00125 delete i_;
00126 return c;
00127 }