00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "lib/common.h"
00012 #include "kernel/CommUlongStringKernel.h"
00013 #include "kernel/SqrtDiagKernelNormalizer.h"
00014 #include "features/StringFeatures.h"
00015 #include "lib/io.h"
00016
00017 using namespace shogun;
00018
00019 CCommUlongStringKernel::CCommUlongStringKernel(int32_t size, bool us)
00020 : CStringKernel<uint64_t>(size), use_sign(us)
00021 {
00022 properties |= KP_LINADD;
00023 clear_normal();
00024
00025 set_normalizer(new CSqrtDiagKernelNormalizer());
00026 }
00027
00028 CCommUlongStringKernel::CCommUlongStringKernel(
00029 CStringFeatures<uint64_t>* l, CStringFeatures<uint64_t>* r, bool us,
00030 int32_t size)
00031 : CStringKernel<uint64_t>(size), use_sign(us)
00032 {
00033 properties |= KP_LINADD;
00034 clear_normal();
00035 set_normalizer(new CSqrtDiagKernelNormalizer());
00036 init(l,r);
00037 }
00038
00039 CCommUlongStringKernel::~CCommUlongStringKernel()
00040 {
00041 cleanup();
00042 }
00043
00044 void CCommUlongStringKernel::remove_lhs()
00045 {
00046 delete_optimization();
00047
00048 #ifdef SVMLIGHT
00049 if (lhs)
00050 cache_reset();
00051 #endif
00052
00053 lhs = NULL ;
00054 rhs = NULL ;
00055 }
00056
00057 void CCommUlongStringKernel::remove_rhs()
00058 {
00059 #ifdef SVMLIGHT
00060 if (rhs)
00061 cache_reset();
00062 #endif
00063
00064 rhs = lhs;
00065 }
00066
00067 bool CCommUlongStringKernel::init(CFeatures* l, CFeatures* r)
00068 {
00069 CStringKernel<uint64_t>::init(l,r);
00070 return init_normalizer();
00071 }
00072
00073 void CCommUlongStringKernel::cleanup()
00074 {
00075 delete_optimization();
00076 clear_normal();
00077 CKernel::cleanup();
00078 }
00079
00080 float64_t CCommUlongStringKernel::compute(int32_t idx_a, int32_t idx_b)
00081 {
00082 int32_t alen, blen;
00083 bool free_avec, free_bvec;
00084 uint64_t* avec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00085 uint64_t* bvec=((CStringFeatures<uint64_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00086
00087 float64_t result=0;
00088
00089 int32_t left_idx=0;
00090 int32_t right_idx=0;
00091
00092 if (use_sign)
00093 {
00094 while (left_idx < alen && right_idx < blen)
00095 {
00096 if (avec[left_idx]==bvec[right_idx])
00097 {
00098 uint64_t sym=avec[left_idx];
00099
00100 while (left_idx< alen && avec[left_idx]==sym)
00101 left_idx++;
00102
00103 while (right_idx< blen && bvec[right_idx]==sym)
00104 right_idx++;
00105
00106 result++;
00107 }
00108 else if (avec[left_idx]<bvec[right_idx])
00109 left_idx++;
00110 else
00111 right_idx++;
00112 }
00113 }
00114 else
00115 {
00116 while (left_idx < alen && right_idx < blen)
00117 {
00118 if (avec[left_idx]==bvec[right_idx])
00119 {
00120 int32_t old_left_idx=left_idx;
00121 int32_t old_right_idx=right_idx;
00122
00123 uint64_t sym=avec[left_idx];
00124
00125 while (left_idx< alen && avec[left_idx]==sym)
00126 left_idx++;
00127
00128 while (right_idx< blen && bvec[right_idx]==sym)
00129 right_idx++;
00130
00131 result+=((float64_t) (left_idx-old_left_idx)) * ((float64_t) (right_idx-old_right_idx));
00132 }
00133 else if (avec[left_idx]<bvec[right_idx])
00134 left_idx++;
00135 else
00136 right_idx++;
00137 }
00138 }
00139 ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00140 ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00141
00142 return result;
00143 }
00144
00145 void CCommUlongStringKernel::add_to_normal(int32_t vec_idx, float64_t weight)
00146 {
00147 int32_t t=0;
00148 int32_t j=0;
00149 int32_t k=0;
00150 int32_t last_j=0;
00151 int32_t len=-1;
00152 bool free_vec;
00153 uint64_t* vec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(vec_idx, len, free_vec);
00154
00155 if (vec && len>0)
00156 {
00157
00158 uint64_t* dic= (uint64_t*) malloc(
00159 (len+dictionary.get_num_elements())*sizeof(uint64_t));
00160 float64_t* dic_weights= (float64_t*) malloc(
00161 (len+dictionary.get_num_elements())*sizeof(float64_t));
00162
00163 if (use_sign)
00164 {
00165 for (j=1; j<len; j++)
00166 {
00167 if (vec[j]==vec[j-1])
00168 continue;
00169
00170 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
00171 }
00172
00173 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
00174
00175 while (k<dictionary.get_num_elements())
00176 {
00177 dic[t]=dictionary[k];
00178 dic_weights[t]=dictionary_weights[k];
00179 t++;
00180 k++;
00181 }
00182 }
00183 else
00184 {
00185 for (j=1; j<len; j++)
00186 {
00187 if (vec[j]==vec[j-1])
00188 continue;
00189
00190 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
00191 last_j = j;
00192 }
00193
00194 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
00195
00196 while (k<dictionary.get_num_elements())
00197 {
00198 dic[t]=dictionary[k];
00199 dic_weights[t]=dictionary_weights[k];
00200 t++;
00201 k++;
00202 }
00203 }
00204
00205 dictionary.set_array(dic, t, len+dictionary.get_num_elements());
00206 dictionary_weights.set_array(dic_weights, t, len+dictionary.get_num_elements());
00207 }
00208 ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(vec, vec_idx, free_vec);
00209
00210 set_is_initialized(true);
00211 }
00212
00213 void CCommUlongStringKernel::clear_normal()
00214 {
00215 dictionary.resize_array(0);
00216 dictionary_weights.resize_array(0);
00217 set_is_initialized(false);
00218 }
00219
00220 bool CCommUlongStringKernel::init_optimization(
00221 int32_t count, int32_t *IDX, float64_t * weights)
00222 {
00223 clear_normal();
00224
00225 if (count<=0)
00226 {
00227 set_is_initialized(true);
00228 SG_DEBUG( "empty set of SVs\n");
00229 return true;
00230 }
00231
00232 SG_DEBUG( "initializing CCommUlongStringKernel optimization\n");
00233
00234 for (int32_t i=0; i<count; i++)
00235 {
00236 if ( (i % (count/10+1)) == 0)
00237 SG_PROGRESS(i, 0, count);
00238
00239 add_to_normal(IDX[i], weights[i]);
00240 }
00241
00242 SG_PRINT( "Done. \n");
00243
00244 set_is_initialized(true);
00245 return true;
00246 }
00247
00248 bool CCommUlongStringKernel::delete_optimization()
00249 {
00250 SG_DEBUG( "deleting CCommUlongStringKernel optimization\n");
00251 clear_normal();
00252 return true;
00253 }
00254
00255
00256
00257 float64_t CCommUlongStringKernel::compute_optimized(int32_t i)
00258 {
00259 float64_t result = 0;
00260 int32_t j, last_j=0;
00261 int32_t old_idx = 0;
00262
00263 if (!get_is_initialized())
00264 {
00265 SG_ERROR( "CCommUlongStringKernel optimization not initialized\n");
00266 return 0 ;
00267 }
00268
00269
00270
00271 int32_t alen = -1;
00272 bool free_avec;
00273 uint64_t* avec=((CStringFeatures<uint64_t>*) rhs)->
00274 get_feature_vector(i, alen, free_avec);
00275
00276 if (avec && alen>0)
00277 {
00278 if (use_sign)
00279 {
00280 for (j=1; j<alen; j++)
00281 {
00282 if (avec[j]==avec[j-1])
00283 continue;
00284
00285 int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[j-1]);
00286
00287 if (idx!=-1)
00288 {
00289 if (dictionary[idx+old_idx] == avec[j-1])
00290 result += dictionary_weights[idx+old_idx];
00291
00292 old_idx+=idx;
00293 }
00294 }
00295
00296 int32_t idx = CMath::binary_search(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[alen-1]);
00297 if (idx!=-1)
00298 result += dictionary_weights[idx+old_idx];
00299 }
00300 else
00301 {
00302 for (j=1; j<alen; j++)
00303 {
00304 if (avec[j]==avec[j-1])
00305 continue;
00306
00307 int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[j-1]);
00308
00309 if (idx!=-1)
00310 {
00311 if (dictionary[idx+old_idx] == avec[j-1])
00312 result += dictionary_weights[idx+old_idx]*(j-last_j);
00313
00314 old_idx+=idx;
00315 }
00316
00317 last_j = j;
00318 }
00319
00320 int32_t idx = CMath::binary_search(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[alen-1]);
00321 if (idx!=-1)
00322 result += dictionary_weights[idx+old_idx]*(alen-last_j);
00323 }
00324 }
00325
00326 ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(avec, i, free_avec);
00327
00328 return normalizer->normalize_rhs(result, i);
00329 }