SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
SalzbergWordStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Gunnar Raetsch
8  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #include <shogun/lib/common.h>
12 #include <shogun/io/SGIO.h>
16 #include <shogun/labels/Labels.h>
19 
20 using namespace shogun;
21 
// Default constructor: cache size 0; init() puts all members into a
// well-defined empty state (NULL pointers, zero counters, 0.5/0.5 priors).
// NOTE(review): the signature line (listing line 22) is missing from this
// extraction -- presumably CSalzbergWordStringKernel::CSalzbergWordStringKernel().
23 : CStringKernel<uint16_t>(0)
24 {
25  init();
26 }
27 
// Constructor taking a cache size, a plugin estimate and (optionally) labels.
// NOTE(review): the signature line (listing line 28) and the body of the
// "if (labels)" branch (listing line 35) are missing from this extraction;
// line 35 presumably calls set_prior_probs_from_labels(labels) -- confirm
// against the repository source.
29 : CStringKernel<uint16_t>(size)
30 {
31  init();
32  estimate=pie;
33 
34  if (labels)
36 }
37 
// Constructor taking left/right string features, a plugin estimate and
// (optionally) labels; immediately calls init(l, r) to set up the kernel.
// NOTE(review): the start of the signature (listing lines 38-39) and the body
// of the "if (labels)" branch (listing line 47) are missing from this
// extraction; line 47 presumably calls set_prior_probs_from_labels(labels).
// Also note estimate is set twice (member-init list and line 44) -- harmless
// but redundant.
40  CPluginEstimate* pie, CLabels* labels)
41 : CStringKernel<uint16_t>(10),estimate(pie)
42 {
43  init();
44  estimate=pie;
45 
46  if (labels)
48 
49  init(l, r);
50 }
51 
// Destructor: releases all heap buffers via cleanup().
// NOTE(review): the signature line (listing line 52) is missing from this
// extraction -- presumably CSalzbergWordStringKernel::~CSalzbergWordStringKernel().
53 {
54  cleanup();
55 }
56 
// Initialize the Salzberg word-string kernel on feature sets p_l/p_r.
// Precomputes, from the lhs (training) features:
//   - per-parameter mean and variance of the Salzberg "value" statistic,
//   - sum_m2_s2 = sum_i mean_i^2/variance_i (constant kernel term),
//   - per-vector linear terms ld_mean_lhs/ld_mean_rhs,
//   - diagonal normalizers sqrtdiag_lhs/sqrtdiag_rhs.
// Returns false if no valid estimate is available or the parameter counts
// mismatch; otherwise true via init_normalizer().
// NOTE(review): several listing lines are missing from this extraction
// (59-60, 62, 68, 73, 87-88, 127, 233, 235, 266); the hedged notes below mark
// each gap -- confirm against the repository source before editing.
57 bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r)
58 {
// NOTE(review): listing lines 59-60 and 62 are missing -- presumably the
// casts of p_l/p_r to CStringFeatures<uint16_t>* named l and r (the ASSERTs
// below reference l and r, not p_l/p_r).
61  ASSERT(l)
63  ASSERT(r)
64 
65  int32_t i;
66  initialized=false;
67 
// Drop any buffers from a previous init() before reallocating.
// NOTE(review): listing lines 68 and 73 are missing -- likely guards of the
// form "if (sqrtdiag_lhs!=sqrtdiag_rhs)" / "if (ld_mean_lhs!=ld_mean_rhs)"
// so the rhs buffer is not freed when it aliases the lhs buffer.
69  SG_FREE(sqrtdiag_rhs);
70  sqrtdiag_rhs=NULL;
71  SG_FREE(sqrtdiag_lhs);
72  sqrtdiag_lhs=NULL;
74  SG_FREE(ld_mean_rhs);
75  ld_mean_rhs=NULL;
76  SG_FREE(ld_mean_lhs);
77  ld_mean_lhs=NULL;
78 
79  sqrtdiag_lhs=SG_MALLOC(float64_t, l->get_num_vectors());
80  ld_mean_lhs=SG_MALLOC(float64_t, l->get_num_vectors());
81 
// Diagonal normalizers start at 1 so compute() is usable before they are set.
82  for (i=0; i<l->get_num_vectors(); i++)
83  sqrtdiag_lhs[i]=1;
84 
85  if (l==r)
86  {
// NOTE(review): listing lines 87-88 are missing -- presumably aliasing the
// rhs buffers to the lhs ones (sqrtdiag_rhs=sqrtdiag_lhs; ld_mean_rhs=ld_mean_lhs;).
89  }
90  else
91  {
92  sqrtdiag_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
93  for (i=0; i<r->get_num_vectors(); i++)
94  sqrtdiag_rhs[i]=1;
95 
96  ld_mean_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
97  }
98 
// Keep unaliased copies; ld_mean_lhs/rhs are temporarily repointed below
// while the diagonal is computed, then restored from these.
99  float64_t* l_ld_mean_lhs=ld_mean_lhs;
100  float64_t* l_ld_mean_rhs=ld_mean_rhs;
101 
102  //from our knowledge first normalize variance to 1 and then norm=1 does the job
103  if (!initialized)
104  {
105  int32_t num_vectors=l->get_num_vectors();
106  num_symbols=(int32_t) l->get_num_symbols();
107  int32_t llen=l->get_vector_length(0);
108  int32_t rlen=r->get_vector_length(0);
109  num_params=(int32_t) llen*l->get_num_symbols();
110  int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols();
111  if ((!estimate) || (!estimate->check_models()))
112  {
113  SG_ERROR("no estimate available\n")
114  return false ;
115  } ;
116  if (num_params2!=estimate->get_num_params())
117  {
118  SG_ERROR("number of parameters of estimate and feature representation do not match\n")
119  return false ;
120  } ;
121 
122  SG_FREE(variance);
123  SG_FREE(mean);
124  mean=SG_MALLOC(float64_t, num_params);
125  ASSERT(mean)
126  variance=SG_MALLOC(float64_t, num_params);
// NOTE(review): listing line 127 is missing -- presumably ASSERT(variance),
// matching the ASSERT(mean) pattern above.
128 
129  for (i=0; i<num_params; i++)
130  {
131  mean[i]=0;
132  variance[i]=0;
133  }
134 
135 
// First pass over the lhs vectors: accumulate the per-(position,symbol) mean
// of value = theta_p/(pos_prior*theta_p + neg_prior*theta_n).
136  // compute mean
137  for (i=0; i<num_vectors; i++)
138  {
139  int32_t len;
140  bool free_vec;
141  uint16_t* vec=l->get_feature_vector(i, len, free_vec);
142 
143  for (int32_t j=0; j<len; j++)
144  {
145  int32_t idx=compute_index(j, vec[j]);
146  float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
147  float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
148  float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
149 
150  mean[idx] += value/num_vectors ;
151  }
152  l->free_feature_vector(vec, i, free_vec);
153  }
154 
// Second pass: accumulate variance. The inner loop iterates k over 4 symbols
// -- NOTE(review): the hard-coded 4 looks DNA-specific; confirm it should not
// be num_symbols.
155  // compute variance
156  for (i=0; i<num_vectors; i++)
157  {
158  int32_t len;
159  bool free_vec;
160  uint16_t* vec=l->get_feature_vector(i, len, free_vec);
161 
162  for (int32_t j=0; j<len; j++)
163  {
164  for (int32_t k=0; k<4; k++)
165  {
166  int32_t idx=compute_index(j, k);
167  if (k!=vec[j])
168  variance[idx]+=mean[idx]*mean[idx]/num_vectors;
169  else
170  {
171  float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
172  float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
173  float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
174 
175  variance[idx] += CMath::sq(value-mean[idx])/num_vectors;
176  }
177  }
178  }
179  l->free_feature_vector(vec, i, free_vec);
180  }
181 
182 
// Tiny variances are treated as numerical noise and clamped to 1, so no
// parameter can blow up the kernel value.
183  // compute sum_i m_i^2/s_i^2
184  sum_m2_s2=0 ;
185  for (i=0; i<num_params; i++)
186  {
187  if (variance[i]<1e-14) // then it is likely to be numerical inaccuracy
188  variance[i]=1 ;
189 
190  //fprintf(stderr, "%i: mean=%1.2e std=%1.2e\n", i, mean[i], std[i]) ;
191  sum_m2_s2 += mean[i]*mean[i]/(variance[i]) ;
192  } ;
193  }
194 
// Precompute the linear term for each lhs vector:
// ld_mean_lhs[i] = -sum_j value_j * mean[idx_j]/variance[idx_j].
195  // compute sum of
196  //result -= feature*mean[a_idx]/variance[a_idx] ;
197 
198  for (i=0; i<l->get_num_vectors(); i++)
199  {
200  int32_t alen ;
201  bool free_avec;
202  uint16_t* avec=l->get_feature_vector(i, alen, free_avec);
203  float64_t result=0 ;
204  for (int32_t j=0; j<alen; j++)
205  {
206  int32_t a_idx = compute_index(j, avec[j]) ;
207  float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[j], j) ;
208  float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[j], j) ;
209  float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
210 
211  if (variance[a_idx]!=0)
212  result-=value*mean[a_idx]/variance[a_idx];
213  }
214  ld_mean_lhs[i]=result ;
215 
216  l->free_feature_vector(avec, i, free_avec);
217  }
218 
// Same linear term for the rhs vectors when they are distinct from lhs.
219  if (ld_mean_lhs!=ld_mean_rhs)
220  {
221  // compute sum of
222  //result -= feature*mean[b_idx]/variance[b_idx] ;
223  for (i=0; i<r->get_num_vectors(); i++)
224  {
225  int32_t alen;
226  bool free_avec;
227  uint16_t* avec=r->get_feature_vector(i, alen, free_avec);
228  float64_t result=0;
229 
230  for (int32_t j=0; j<alen; j++)
231  {
232  int32_t a_idx = compute_index(j, avec[j]) ;
// NOTE(review): listing lines 233 and 235 are missing -- they carried the
// starts of the theta_p/theta_n declarations, wrapped onto the continuation
// lines below (log_derivative_pos/neg_obsolete(avec[j], j)).
234  avec[j], j) ;
236  avec[j], j) ;
237  float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));
238 
// NOTE(review): unlike the lhs loop (listing line 211), this division has no
// variance[a_idx]!=0 guard -- inconsistent; harmless after the 1e-14 clamp
// above, but worth unifying.
239  result -= value*mean[a_idx]/variance[a_idx] ;
240  }
241 
242  ld_mean_rhs[i]=result;
243  r->free_feature_vector(avec, i, free_avec);
244  }
245  }
246 
// Temporarily point lhs/rhs (and both ld_mean buffers) at l so compute(i,i)
// yields the unnormalized diagonal; restored below.
247  //warning hacky
248  //
249  this->lhs=l;
250  this->rhs=l;
251  ld_mean_lhs = l_ld_mean_lhs ;
252  ld_mean_rhs = l_ld_mean_lhs ;
253 
254  //compute normalize to 1 values
255  for (i=0; i<lhs->get_num_vectors(); i++)
256  {
257  sqrtdiag_lhs[i]=sqrt(compute(i,i));
258 
259  //trap divide by zero exception
260  if (sqrtdiag_lhs[i]==0)
261  sqrtdiag_lhs[i]=1e-16;
262  }
263 
264  // if lhs is different from rhs (train/test data)
265  // compute also the normalization for rhs
// NOTE(review): listing line 266 is missing -- presumably the condition
// "if (sqrtdiag_lhs!=sqrtdiag_rhs)" guarding this block.
267  {
268  this->lhs=r;
269  this->rhs=r;
270  ld_mean_lhs = l_ld_mean_rhs ;
271  ld_mean_rhs = l_ld_mean_rhs ;
272 
273  //compute normalize to 1 values
274  for (i=0; i<rhs->get_num_vectors(); i++)
275  {
276  sqrtdiag_rhs[i]=sqrt(compute(i,i));
277 
278  //trap divide by zero exception
279  if (sqrtdiag_rhs[i]==0)
280  sqrtdiag_rhs[i]=1e-16;
281  }
282  }
283 
// Restore the real lhs/rhs pairing and the per-side linear-term buffers.
284  this->lhs=l;
285  this->rhs=r;
286  ld_mean_lhs = l_ld_mean_lhs ;
287  ld_mean_rhs = l_ld_mean_rhs ;
288 
289  initialized = true ;
290  return init_normalizer();
291 }
292 
// Free all buffers allocated by init() and reset the pointers to NULL, so
// cleanup() is safe to call repeatedly (SG_FREE on NULL is a no-op).
// NOTE(review): the signature line (listing line 293) and listing lines
// 307-308 and 315 are missing from this extraction; 308 is presumably
// "if (ld_mean_lhs!=ld_mean_rhs)" (mirroring the sqrtdiag guard below, since
// the rhs buffer may alias the lhs one), and 315 presumably chains to the
// base class cleanup.
294 {
295  SG_FREE(variance);
296  variance=NULL;
297 
298  SG_FREE(mean);
299  mean=NULL;
300 
// rhs may alias lhs when train==test; only free it when distinct.
301  if (sqrtdiag_lhs != sqrtdiag_rhs)
302  SG_FREE(sqrtdiag_rhs);
303  sqrtdiag_rhs=NULL;
304 
305  SG_FREE(sqrtdiag_lhs);
306  sqrtdiag_lhs=NULL;
309  SG_FREE(ld_mean_rhs);
310  ld_mean_rhs=NULL;
311 
312  SG_FREE(ld_mean_lhs);
313  ld_mean_lhs=NULL;
314 
316 }
317 
318 float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b)
319 {
320  int32_t alen, blen;
321  bool free_avec, free_bvec;
322  uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
323  uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
324  // can only deal with strings of same length
325  ASSERT(alen==blen)
326 
327  float64_t result = sum_m2_s2 ; // does not contain 0-th element
328 
329  for (int32_t i=0; i<alen; i++)
330  {
331  if (avec[i]==bvec[i])
332  {
333  int32_t a_idx = compute_index(i, avec[i]) ;
334 
335  float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[i], i) ;
336  float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[i], i) ;
337  float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
338 
339  result += value*value/variance[a_idx] ;
340  }
341  }
342  result += ld_mean_lhs[idx_a] + ld_mean_rhs[idx_b] ;
343 
344  ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
345  ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
346 
347  if (initialized)
348  result /= (sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]) ;
349 
350  return result;
351 }
352 
// Derive the positive/negative class priors from binary labels by counting
// +1 and -1 entries and passing the resulting fractions on.
// NOTE(review): the signature line (listing line 353) and listing line 372
// are missing from this extraction; line 372 presumably opens the call
// "set_prior_probs(" whose argument lines (373-374) are visible below.
354 {
355  ASSERT(labels)
356  ASSERT(labels->get_label_type() == LT_BINARY)
357  labels->ensure_valid();
358 
// Count the +1 and -1 labels; anything else is ignored by both branches.
359  int32_t num_pos=0, num_neg=0;
360  for (int32_t i=0; i<labels->get_num_labels(); i++)
361  {
362  if (((CBinaryLabels*) labels)->get_int_label(i)==1)
363  num_pos++;
364  if (((CBinaryLabels*) labels)->get_int_label(i)==-1)
365  num_neg++;
366  }
367 
368  SG_INFO("priors: pos=%1.3f (%i) neg=%1.3f (%i)\n",
369  (float64_t) num_pos/(num_pos+num_neg), num_pos,
370  (float64_t) num_neg/(num_pos+num_neg), num_neg);
371 
373  (float64_t)num_pos/(num_pos+num_neg),
374  (float64_t)num_neg/(num_pos+num_neg));
375 }
376 
377 void CSalzbergWordStringKernel::init()
378 {
379  estimate=NULL;
380  mean=NULL;
381  variance=NULL;
382 
383  sqrtdiag_lhs=NULL;
384  sqrtdiag_rhs=NULL;
385 
386  ld_mean_lhs=NULL;
387  ld_mean_rhs=NULL;
388 
389  num_params=0;
390  num_symbols=0;
391  sum_m2_s2=0;
392  pos_prior=0.5;
393 
394  neg_prior=0.5;
395  initialized=false;
396 }
SGVector< ST > get_feature_vector(int32_t num)
#define SG_INFO(...)
Definition: SGIO.h:118
virtual void cleanup()
Definition: Kernel.cpp:173
virtual ELabelType get_label_type() const =0
binary labels +1/-1
Definition: LabelTypes.h:18
float64_t log_derivative_neg_obsolete(uint16_t obs, int32_t pos)
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual int32_t get_num_labels() const =0
virtual int32_t get_num_vectors() const
static T sq(T x)
Definition: Math.h:450
int32_t compute_index(int32_t position, uint16_t symbol)
virtual int32_t get_num_vectors() const =0
#define SG_ERROR(...)
Definition: SGIO.h:129
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
float64_t compute(int32_t idx_a, int32_t idx_b)
#define ASSERT(x)
Definition: SGIO.h:201
double float64_t
Definition: common.h:50
float64_t log_derivative_pos_obsolete(uint16_t obs, int32_t pos)
virtual bool init_normalizer()
Definition: Kernel.cpp:168
virtual bool init(CFeatures *l, CFeatures *r)
CFeatures * rhs
feature vectors to occur on right hand side
Definition: Kernel.h:1061
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
void set_prior_probs(float64_t pos_prior_, float64_t neg_prior_)
CFeatures * lhs
feature vectors to occur on left hand side
Definition: Kernel.h:1059
The class Features is the base class of all feature objects.
Definition: Features.h:68
class PluginEstimate
Binary Labels for binary classification.
Definition: BinaryLabels.h:37
virtual void ensure_valid(const char *context=NULL)=0
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
virtual int32_t get_vector_length(int32_t vec_num)

SHOGUN Machine Learning Toolbox - Documentation