00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/kernel/string/DistantSegmentsKernel.h>
00013 #include <string>
00014
00015 using namespace shogun;
00016
00017 CDistantSegmentsKernel::CDistantSegmentsKernel() : CStringKernel<char>(),
00018 m_delta(0), m_theta(0)
00019 {
00020 init();
00021 }
00022
00023 CDistantSegmentsKernel::CDistantSegmentsKernel(int32_t size, int32_t delta,
00024 int32_t theta) : CStringKernel<char>(size), m_delta(delta),
00025 m_theta(theta)
00026 {
00027 init();
00028 }
00029
00030 CDistantSegmentsKernel::CDistantSegmentsKernel(CStringFeatures<char>* l,
00031 CStringFeatures<char>* r, int32_t size, int32_t delta, int32_t theta) :
00032 CStringKernel<char>(size), m_delta(delta), m_theta(theta)
00033 {
00034 init();
00035 CStringKernel<char>::init(l, r);
00036 }
00037
00038 bool CDistantSegmentsKernel::init(CFeatures* l, CFeatures* r)
00039 {
00040 CKernel::init(l, r);
00041 return init_normalizer();
00042 }
00043
00044 void CDistantSegmentsKernel::init()
00045 {
00046 SG_ADD(&m_delta, "delta", "Delta parameter of the DS-Kernel", MS_AVAILABLE);
00047 SG_ADD(&m_theta, "theta", "Theta parameter of the DS-Kernel", MS_AVAILABLE);
00048 }
00049
00050 float64_t CDistantSegmentsKernel::compute(int32_t idx_a, int32_t idx_b)
00051 {
00052 bool free_a, free_b;
00053 int32_t aLength=0, bLength=0;
00054 char* a=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, aLength,
00055 free_a);
00056 char* b=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, bLength,
00057 free_b);
00058 ASSERT(a && b);
00059
00060 if ((aLength<1)||(bLength<1))
00061 SG_ERROR("Empty sequences");
00062
00063 float64_t result=compute(a, aLength, b, bLength, m_delta, m_theta);
00064
00065 ((CStringFeatures<char>*) lhs)->free_feature_vector(a, idx_a, free_a);
00066 ((CStringFeatures<char>*) rhs)->free_feature_vector(b, idx_b, free_b);
00067
00068 return result;
00069 }
00070
00071 int32_t CDistantSegmentsKernel::bin(int32_t j, int32_t i)
00072 {
00073 if (i>j)
00074 return 0;
00075 if (i==3 && j>=3)
00076 {
00077 return j*(j-1)*(j-2)/6;
00078 }
00079 else if (i==2 && j>=2)
00080 {
00081 return j*(j-1)/2;
00082 }
00083 return 0;
00084 }
00085
00086 int32_t CDistantSegmentsKernel::compute(char* s, int32_t sLength, char* t,
00087 int32_t tLength, int32_t delta_m, int32_t theta_m)
00088 {
00089 int32_t c=0;
00090 int32_t* i_=SG_MALLOC(int32_t, delta_m+1);
00091 int32_t* l_=SG_MALLOC(int32_t, delta_m+1);
00092 for (int32_t j_s=0; j_s<=(int32_t) sLength-1; j_s++)
00093 {
00094 for (int32_t j_t=0; j_t<=(int32_t) tLength-1; j_t++)
00095 {
00096 if (s[j_s-1+1]==t[j_t-1+1])
00097 {
00098 int32_t n=CMath::min(CMath::min(sLength-j_s, tLength-j_t), delta_m);
00099 int32_t k=-1;
00100 int32_t i=1;
00101 while (i<=n)
00102 {
00103 k++;
00104 i_[2*k]=i;
00105 i++;
00106 while (i<=n&&s[j_s-1+i]==t[j_t-1+i])
00107 i++;
00108 i_[2*k+1]=i;
00109 l_[k]=i_[2*k+1]-i_[2*k]+1;
00110 i++;
00111 while (i<=n&&s[j_s-1+i]!=t[j_t-1+i])
00112 i++;
00113 }
00114 c+=bin(l_[0], 3)-2*bin(l_[0]-theta_m, 3)
00115 +bin(l_[0]-2*theta_m, 3);
00116 int32_t c1=0;
00117 for (int32_t r=1; r<=k; r++)
00118 {
00119 c1+=bin(l_[r], 2)-bin(l_[r]-theta_m, 2);
00120 }
00121 c+=CMath::min(theta_m, i_[1]-i_[0])*c1;
00122 }
00123 }
00124 }
00125 SG_FREE(l_);
00126 SG_FREE(i_);
00127 return c;
00128 }