SHOGUN  4.1.0
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
CommUlongStringKernel.cpp
浏览该文件的文档.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
16 
17 using namespace shogun;
18 
20 : CStringKernel<uint64_t>(size), use_sign(us)
21 {
23  clear_normal();
24 
26 }
27 
30  int32_t size)
31 : CStringKernel<uint64_t>(size), use_sign(us)
32 {
34  clear_normal();
36  init(l,r);
37 }
38 
40 {
41  cleanup();
42 }
43 
45 {
47 
48 #ifdef SVMLIGHT
49  if (lhs)
50  cache_reset();
51 #endif
52 
53  lhs = NULL ;
54  rhs = NULL ;
55 }
56 
58 {
59 #ifdef SVMLIGHT
60  if (rhs)
61  cache_reset();
62 #endif
63 
64  rhs = lhs;
65 }
66 
67 bool CCommUlongStringKernel::init(CFeatures* l, CFeatures* r)
68 {
70  return init_normalizer();
71 }
72 
74 {
76  clear_normal();
78 }
79 
80 float64_t CCommUlongStringKernel::compute(int32_t idx_a, int32_t idx_b)
81 {
82  int32_t alen, blen;
83  bool free_avec, free_bvec;
84  uint64_t* avec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
85  uint64_t* bvec=((CStringFeatures<uint64_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
86 
87  float64_t result=0;
88 
89  int32_t left_idx=0;
90  int32_t right_idx=0;
91 
92  if (use_sign)
93  {
94  while (left_idx < alen && right_idx < blen)
95  {
96  if (avec[left_idx]==bvec[right_idx])
97  {
98  uint64_t sym=avec[left_idx];
99 
100  while (left_idx< alen && avec[left_idx]==sym)
101  left_idx++;
102 
103  while (right_idx< blen && bvec[right_idx]==sym)
104  right_idx++;
105 
106  result++;
107  }
108  else if (avec[left_idx]<bvec[right_idx])
109  left_idx++;
110  else
111  right_idx++;
112  }
113  }
114  else
115  {
116  while (left_idx < alen && right_idx < blen)
117  {
118  if (avec[left_idx]==bvec[right_idx])
119  {
120  int32_t old_left_idx=left_idx;
121  int32_t old_right_idx=right_idx;
122 
123  uint64_t sym=avec[left_idx];
124 
125  while (left_idx< alen && avec[left_idx]==sym)
126  left_idx++;
127 
128  while (right_idx< blen && bvec[right_idx]==sym)
129  right_idx++;
130 
131  result+=((float64_t) (left_idx-old_left_idx)) * ((float64_t) (right_idx-old_right_idx));
132  }
133  else if (avec[left_idx]<bvec[right_idx])
134  left_idx++;
135  else
136  right_idx++;
137  }
138  }
139  ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
140  ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
141 
142  return result;
143 }
144 
146 {
147  int32_t t=0;
148  int32_t j=0;
149  int32_t k=0;
150  int32_t last_j=0;
151  int32_t len=-1;
152  bool free_vec;
153  uint64_t* vec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(vec_idx, len, free_vec);
154 
155  if (vec && len>0)
156  {
157  int32_t max_len = len+dictionary.vlen;
158  SGVector<uint64_t> dic(max_len);
159  SGVector<float64_t> dic_weights(max_len);
160 
161  if (use_sign)
162  {
163  for (j=1; j<len; j++)
164  {
165  if (vec[j]==vec[j-1])
166  continue;
167 
168  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
169  }
170 
171  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
172 
173  while (k<dictionary.vlen)
174  {
175  dic[t]=dictionary[k];
176  dic_weights[t]=dictionary_weights[k];
177  t++;
178  k++;
179  }
180  }
181  else
182  {
183  for (j=1; j<len; j++)
184  {
185  if (vec[j]==vec[j-1])
186  continue;
187 
188  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
189  last_j = j;
190  }
191 
192  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
193 
194  while (k<dictionary.vlen)
195  {
196  dic[t]=dictionary[k];
197  dic_weights[t]=dictionary_weights[k];
198  t++;
199  k++;
200  }
201  }
202 
203  dic.resize_vector(t);
204  dic_weights.resize_vector(t);
205  dictionary = dic;
206  dictionary_weights = dic_weights;
207  }
208  ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(vec, vec_idx, free_vec);
209 
210  set_is_initialized(true);
211 }
212 
214 {
217  set_is_initialized(false);
218 }
219 
221  int32_t count, int32_t *IDX, float64_t * weights)
222 {
223  clear_normal();
224 
225  if (count<=0)
226  {
227  set_is_initialized(true);
228  SG_DEBUG("empty set of SVs\n")
229  return true;
230  }
231 
232  SG_DEBUG("initializing CCommUlongStringKernel optimization\n")
233 
234  for (int32_t i=0; i<count; i++)
235  {
236  if ( (i % (count/10+1)) == 0)
237  SG_PROGRESS(i, 0, count)
238 
239  add_to_normal(IDX[i], weights[i]);
240  }
241 
242  SG_PRINT("Done. \n")
243 
244  set_is_initialized(true);
245  return true;
246 }
247 
249 {
250  SG_DEBUG("deleting CCommUlongStringKernel optimization\n")
251  clear_normal();
252  return true;
253 }
254 
255 // binary search for each feature. trick: as features are sorted save last found idx in old_idx and
256 // only search in the remainder of the dictionary
258 {
259  float64_t result = 0;
260  int32_t j, last_j=0;
261  int32_t old_idx = 0;
262 
263  if (!get_is_initialized())
264  {
265  SG_ERROR("CCommUlongStringKernel optimization not initialized\n")
266  return 0 ;
267  }
268 
269 
270 
271  int32_t alen = -1;
272  bool free_avec;
273  uint64_t* avec=((CStringFeatures<uint64_t>*) rhs)->
274  get_feature_vector(i, alen, free_avec);
275 
276  if (avec && alen>0)
277  {
278  if (use_sign)
279  {
280  for (j=1; j<alen; j++)
281  {
282  if (avec[j]==avec[j-1])
283  continue;
284 
285  int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[j-1]);
286 
287  if (idx!=-1)
288  {
289  if (dictionary[idx+old_idx] == avec[j-1])
290  result += dictionary_weights[idx+old_idx];
291 
292  old_idx+=idx;
293  }
294  }
295 
296  int32_t idx = CMath::binary_search(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[alen-1]);
297  if (idx!=-1)
298  result += dictionary_weights[idx+old_idx];
299  }
300  else
301  {
302  for (j=1; j<alen; j++)
303  {
304  if (avec[j]==avec[j-1])
305  continue;
306 
307  int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[j-1]);
308 
309  if (idx!=-1)
310  {
311  if (dictionary[idx+old_idx] == avec[j-1])
312  result += dictionary_weights[idx+old_idx]*(j-last_j);
313 
314  old_idx+=idx;
315  }
316 
317  last_j = j;
318  }
319 
320  int32_t idx = CMath::binary_search(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[alen-1]);
321  if (idx!=-1)
322  result += dictionary_weights[idx+old_idx]*(alen-last_j);
323  }
324  }
325 
326  ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(avec, i, free_avec);
327 
328  return normalizer->normalize_rhs(result, i);
329 }
virtual void cleanup()
Definition: Kernel.cpp:173
Template class StringFeatures implements a list of strings.
Definition: WDSVMOcas.h:25
#define SG_PROGRESS(...)
Definition: SGIO.h:142
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:150
virtual float64_t compute_optimized(int32_t idx)
static int32_t binary_search(T *output, int32_t size, T elem)
Definition: Math.h:1809
virtual float64_t normalize_rhs(float64_t value, int32_t idx_rhs)=0
static int32_t binary_search_max_lower_equal(T *output, int32_t size, T elem)
Definition: Math.h:1877
#define SG_ERROR(...)
Definition: SGIO.h:129
void cache_reset()
Definition: Kernel.h:602
void set_is_initialized(bool p_init)
Definition: Kernel.h:899
uint64_t properties
Definition: Kernel.h:1082
virtual bool init(CFeatures *l, CFeatures *r)
bool get_is_initialized()
Definition: Kernel.h:753
CCommUlongStringKernel(int32_t size=10, bool use_sign=false)
float64_t compute(int32_t idx_a, int32_t idx_b)
index_t vlen
Definition: SGVector.h:494
#define SG_PRINT(...)
Definition: SGIO.h:137
double float64_t
Definition: common.h:50
virtual void add_to_normal(int32_t idx, float64_t weight)
virtual bool init_optimization(int32_t count, int32_t *IDX, float64_t *weights)
virtual bool init_normalizer()
Definition: Kernel.cpp:168
CFeatures * rhs
feature vectors to occur on right hand side
Definition: Kernel.h:1061
#define SG_DEBUG(...)
Definition: SGIO.h:107
SGVector< float64_t > dictionary_weights
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
CFeatures * lhs
feature vectors to occur on left hand side
Definition: Kernel.h:1059
The class Features is the base class of all feature objects.
Definition: Features.h:68
CKernelNormalizer * normalizer
Definition: Kernel.h:1086
void resize_vector(int32_t n)
Definition: SGVector.cpp:259
friend class CSqrtDiagKernelNormalizer
Definition: Kernel.h:161
void merge_dictionaries(int32_t &t, int32_t j, int32_t &k, uint64_t *vec, SGVector< uint64_t > dic, SGVector< float64_t > dic_weights, float64_t weight, int32_t vec_idx)
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26

SHOGUN 机器学习工具包 - 项目文档