ssl.cpp
/* Copyright 2006 Vikas Sindhwani (vikass@cs.uchicago.edu)
   SVM-lin: Fast SVM Solvers for Supervised and Semi-supervised Learning

   This file is part of SVM-lin.

   SVM-lin is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   SVM-lin is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with SVM-lin (see gpl.txt); if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <algorithm>

#include <shogun/io/SGIO.h>
#include <shogun/classifier/svm/ssl.h> /* assumed header path; declares data, options, vector_double, vector_int, Delta */
#include <shogun/mathematics/Math.h>
#include <shogun/features/DotFeatures.h>

namespace shogun
{
void ssl_train(struct data *Data,
    struct options *Options,
    struct vector_double *Weights,
    struct vector_double *Outputs)
{
    // initialize
    initialize(Weights,Data->n,0.0);
    initialize(Outputs,Data->m,0.0);
    vector_int *Subset = SG_MALLOC(vector_int, 1);
    initialize(Subset,Data->m);
    // call the right algorithm
    int32_t optimality = 0;
    switch(Options->algo)
    {
        case -1:
            SG_SINFO("Regularized Least Squares Regression (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case RLS:
            SG_SINFO("Regularized Least Squares Classification (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case SVM:
            SG_SINFO("Modified Finite Newton L2-SVM (L2-SVM-MFN)\n");
            optimality=L2_SVM_MFN(Data,Options,Weights,Outputs,0);
            break;
        case TSVM:
            SG_SINFO("Transductive L2-SVM (TSVM)\n");
            optimality=TSVM_MFN(Data,Options,Weights,Outputs);
            break;
        case DA_SVM:
            SG_SINFO("Deterministic Annealing Semi-supervised L2-SVM (DAS3VM)\n");
            optimality=DA_S3VM(Data,Options,Weights,Outputs);
            break;
        default:
            SG_SERROR("Algorithm unspecified\n");
    }

    if (!optimality)
        SG_SWARNING("SSL-Algorithm terminated without reaching optimum.\n");

    SG_FREE(Subset->vec);
    SG_FREE(Subset);
    return;
}

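/* Usage sketch (editor's illustration, not part of SVM-lin): a hypothetical
 * caller fills the structs declared in ssl.h and invokes ssl_train(), which
 * allocates Weights and Outputs itself.  Field names below are the ones this
 * file actually reads; concrete values are placeholders.
 *
 *   struct data d;
 *   d.features = feats;        // some CDotFeatures*, bias folded into last dim
 *   d.Y = labels;              // +1/-1 for labeled, 0 for unlabeled examples
 *   d.C = costs;               // per-example cost array of length d.m
 *   d.m = n_examples; d.n = n_dims; d.l = n_labeled; d.u = n_unlabeled;
 *
 *   struct options opts;
 *   opts.algo = SVM;           // one of RLS, SVM, TSVM, DA_SVM
 *   opts.lambda = 1.0;         // regularization strength
 *   opts.lambda_u = 1.0;       // weight on the unlabeled loss (TSVM/DA_SVM)
 *   opts.cgitermax = CGITERMAX;
 *   opts.epsilon = EPSILON;
 *   opts.bias = 1.0;           // TSVM/DA_SVM additionally read opts.R and opts.S
 *
 *   struct vector_double weights, outputs;
 *   ssl_train(&d, &opts, &weights, &outputs);
 */

/* CGLS: conjugate gradient for the regularized least-squares subproblem
 *     min_beta  lambda*||beta||^2 + sum_{i in Subset} C_i*(Y_i - x_i.beta)^2
 * restricted to the active Subset and warm-started from Weights/Outputs.
 * Iteration stops when the gradient norm ||r||^2 drops below
 * epsilon^2*||z||^2 or after cgitermax steps; returns 1 on optimality. */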
int32_t CGLS(
    const struct data *Data, const struct options *Options,
    const struct vector_int *Subset, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    SG_SDEBUG("CGLS starting...");

    /* Disassemble the structures */
    int32_t active = Subset->d;
    int32_t *J = Subset->vec;
    CDotFeatures* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n = Data->n;
    float64_t lambda = Options->lambda;
    int32_t cgitermax = Options->cgitermax;
    float64_t epsilon = Options->epsilon;
    float64_t *beta = Weights->vec;
    float64_t *o = Outputs->vec;
    // initialize z
    float64_t *z = SG_MALLOC(float64_t, active);
    float64_t *q = SG_MALLOC(float64_t, active);
    int32_t ii=0;
    for (int32_t i = active ; i-- ;)
    {
        ii=J[i];
        z[i] = C[ii]*(Y[ii] - o[ii]);
    }
    float64_t *r = SG_MALLOC(float64_t, n);
    for (int32_t i = n ; i-- ;)
        r[i] = 0.0;
    for (int32_t j=0; j < active; j++)
    {
        features->add_to_dense_vec(z[j], J[j], r, n-1);
        r[n-1]+=Options->bias*z[j]; //bias (modelled as last dim)
    }
    float64_t *p = SG_MALLOC(float64_t, n);
    float64_t omega1 = 0.0;
    for (int32_t i = n ; i-- ;)
    {
        r[i] -= lambda*beta[i];
        p[i] = r[i];
        omega1 += r[i]*r[i];
    }
    float64_t omega_p = omega1;
    float64_t omega_q = 0.0;
    float64_t inv_omega2 = 1/omega1;
    float64_t scale = 0.0;
    float64_t omega_z=0.0;
    float64_t gamma = 0.0;
    int32_t cgiter = 0;
    int32_t optimality = 0;
    float64_t epsilon2 = epsilon*epsilon;
    // iterate
    while(cgiter < cgitermax)
    {
        cgiter++;
        omega_q=0.0;
        float64_t t=0.0;
        int32_t i,j;
        // #pragma omp parallel for private(i,j)
        for (i=0; i < active; i++)
        {
            ii=J[i];
            t=features->dense_dot(ii, p, n-1);
            t+=Options->bias*p[n-1]; //bias (modelled as last dim)
            q[i]=t;
            omega_q += C[ii]*t*t;
        }
        gamma = omega1/(lambda*omega_p + omega_q);
        inv_omega2 = 1/omega1;
        for (i = n ; i-- ;)
        {
            r[i] = 0.0;
            beta[i] += gamma*p[i];
        }
        omega_z=0.0;
        for (i = active ; i-- ;)
        {
            ii=J[i];
            o[ii] += gamma*q[i];
            z[i] -= gamma*C[ii]*q[i];
            omega_z+=z[i]*z[i];
        }
        for (j=0; j < active; j++)
        {
            t=z[j];

            features->add_to_dense_vec(t, J[j], r, n-1);
            r[n-1]+=Options->bias*t; //bias (modelled as last dim)
        }
        omega1 = 0.0;
        for (i = n ; i-- ;)
        {
            r[i] -= lambda*beta[i];
            omega1 += r[i]*r[i];
        }
        if(omega1 < epsilon2*omega_z)
        {
            optimality=1;
            break;
        }
        omega_p=0.0;
        scale=omega1*inv_omega2;
        for (i = n ; i-- ;)
        {
            p[i] = r[i] + p[i]*scale;
            omega_p += p[i]*p[i];
        }
    }
    SG_SDEBUG("...Done.");
    SG_SINFO("CGLS converged in %d iteration(s)", cgiter);

    SG_FREE(z);
    SG_FREE(q);
    SG_FREE(r);
    SG_FREE(p);
    return optimality;
}

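/* L2_SVM_MFN: modified finite Newton method for the primal L2-SVM,
 *     min_w  0.5*lambda*||w||^2 + 0.5*sum_i C_i*max(0, 1 - Y_i*o_i)^2,
 * where o_i = w.x_i and the bias is folded into the last dimension.  Each
 * iteration solves a CGLS subproblem on the current active set
 * {i : Y_i*o_i < 1} and then takes an exact line-search step; `ini`
 * distinguishes cold starts from warm starts. */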
int32_t L2_SVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs,
    int32_t ini)
{
    /* Disassemble the structures */
    CDotFeatures* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n = Data->n;
    int32_t m = Data->m;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    vector_int *ActiveSubset = SG_MALLOC(vector_int, 1);
    ActiveSubset->vec = SG_MALLOC(int32_t, m);
    ActiveSubset->d = m;
    // initialize
    if(ini==0)
    {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else
        epsilon = Options->epsilon;
    for (int32_t i=0;i<n;i++) F+=w[i]*w[i];
    F=0.5*lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    for (int32_t i=0; i<m ; i++)
    {
        diff=1-Y[i]*o[i];
        if(diff>0)
        {
            ActiveSubset->vec[active]=i;
            active++;
            F+=0.5*C[i]*diff*diff;
        }
        else
        {
            ActiveSubset->vec[inactive]=i;
            inactive--;
        }
    }
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = SG_MALLOC(vector_double, 1);
    vector_double *Outputs_bar = SG_MALLOC(vector_double, 1);
    float64_t *w_bar = SG_MALLOC(float64_t, n);
    float64_t *o_bar = SG_MALLOC(float64_t, m);
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m;
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)\n", iter, active, F);
        for (int32_t i=n; i-- ;)
            w_bar[i]=w[i];
        for (int32_t i=m; i-- ;)
            o_bar[i]=o[i];

        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for (int32_t i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];

            t=features->dense_dot(ii, w_bar, n-1);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for (int32_t i=0;i<m;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
                opt2=(opt2 && (Y[ii]*o_bar[ii]<=1+epsilon));
            else
                opt2=(opt2 && (Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // l
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for (int32_t i=n; i-- ;)
                    w[i]=w_bar[i];
                for (int32_t i=m; i-- ;)
                    o[i]=o_bar[i];
                SG_FREE(ActiveSubset->vec);
                SG_FREE(ActiveSubset);
                SG_FREE(o_bar);
                SG_FREE(w_bar);
                SG_FREE(Weights_bar);
                SG_FREE(Outputs_bar);
                SG_SINFO("L2_SVM_MFN converged (optimality) in %d iteration(s)", iter);
                return 1;
            }
        }
        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m);
        SG_SDEBUG("LINE_SEARCH delta = %f\n", delta);
        F_old=F;
        F=0.0;
        for (int32_t i=n; i-- ;)
        {
            w[i]+=delta*(w_bar[i]-w[i]);
            F+=w[i]*w[i];
        }
        F=0.5*lambda*F;
        active=0;
        inactive=m-1;
        for (int32_t i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            diff=1-Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=0.5*C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<RELATIVE_STOP_EPS*CMath::abs(F_old))
        {
            // release the work buffers before the early return
            SG_FREE(ActiveSubset->vec);
            SG_FREE(ActiveSubset);
            SG_FREE(o_bar);
            SG_FREE(w_bar);
            SG_FREE(Weights_bar);
            SG_FREE(Outputs_bar);
            SG_SINFO("L2_SVM_MFN converged (rel. criterion) in %d iterations", iter);
            return 2;
        }
    }
    SG_FREE(ActiveSubset->vec);
    SG_FREE(ActiveSubset);
    SG_FREE(o_bar);
    SG_FREE(w_bar);
    SG_FREE(Weights_bar);
    SG_FREE(Outputs_bar);
    SG_SINFO("L2_SVM_MFN converged (max iter exceeded) in %d iterations", iter);
    return 0;
}

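/* line_search: exact minimization of the piecewise-quadratic MFN objective
 * along the segment w + delta*(w_bar - w).  L and R accumulate the
 * directional derivative contributions at the segment endpoints; the
 * breakpoints where examples enter or leave the active set are sorted and
 * crossed until the derivative changes sign, and the zero crossing is then
 * found by linear interpolation (the returned -L/(R-L)). */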
float64_t line_search(float64_t *w,
    float64_t *w_bar,
    float64_t lambda,
    float64_t *o,
    float64_t *o_bar,
    float64_t *Y,
    float64_t *C,
    int32_t d, /* data dimensionality -- 'n' */
    int32_t l) /* number of examples */
{
    float64_t omegaL = 0.0;
    float64_t omegaR = 0.0;
    float64_t diff=0.0;
    for (int32_t i=d; i--; )
    {
        diff=w_bar[i]-w[i];
        omegaL+=w[i]*diff;
        omegaR+=w_bar[i]*diff;
    }
    omegaL=lambda*omegaL;
    omegaR=lambda*omegaR;
    float64_t L=0.0;
    float64_t R=0.0;
    int32_t ii=0;
    for (int32_t i=0;i<l;i++)
    {
        if(Y[i]*o[i]<1)
        {
            diff=C[i]*(o_bar[i]-o[i]);
            L+=(o[i]-Y[i])*diff;
            R+=(o_bar[i]-Y[i])*diff;
        }
    }
    L+=omegaL;
    R+=omegaR;
    Delta* deltas=SG_MALLOC(Delta, l);
    int32_t p=0;
    for (int32_t i=0;i<l;i++)
    {
        diff=Y[i]*(o_bar[i]-o[i]);

        if(Y[i]*o[i]<1)
        {
            if(diff>0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=-1;
                p++;
            }
        }
        else
        {
            if(diff<0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=1;
                p++;
            }
        }
    }
    std::sort(deltas,deltas+p);
    float64_t delta_prime=0.0;
    for (int32_t i=0;i<p;i++)
    {
        delta_prime = L + deltas[i].delta*(R-L);
        if(delta_prime>=0)
            break;
        ii=deltas[i].index;
        diff=(deltas[i].s)*C[ii]*(o_bar[ii]-o[ii]);
        L+=diff*(o[ii]-Y[ii]);
        R+=diff*(o_bar[ii]-Y[ii]);
    }
    SG_FREE(deltas);
    return (-L/(R-L));
}

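/* TSVM_MFN: transductive L2-SVM.  Trains on the labeled data only, uses the
 * resulting weights to label the fraction R of unlabeled examples with the
 * largest outputs as positive, then alternates label switching with
 * L2_SVM_MFN retraining while annealing the unlabeled-loss weight lambda_0
 * up to lambda_u by factors of TSVM_ANNEALING_RATE. */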
int32_t TSVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs)
{
    /* Setup labeled-only examples and train L2_SVM_MFN */
    struct data *Data_Labeled = SG_MALLOC(data, 1);
    struct vector_double *Outputs_Labeled = SG_MALLOC(vector_double, 1);
    initialize(Outputs_Labeled,Data->l,0.0);
    SG_SDEBUG("Initializing weights, unknown labels");
    GetLabeledData(Data_Labeled,Data); /* gets labeled data and sets C=1/l */
    L2_SVM_MFN(Data_Labeled, Options, Weights,Outputs_Labeled,0);
    /* Use this weight vector to classify R*u unlabeled examples as
       positive */
    int32_t p=0,q=0;
    float64_t t=0.0;
    int32_t *JU = SG_MALLOC(int32_t, Data->u);
    float64_t *ou = SG_MALLOC(float64_t, Data->u);
    float64_t lambda_0 = TSVM_LAMBDA_SMALL;
    for (int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {
            t=Data->features->dense_dot(i, Weights->vec, Data->n-1);
            t+=Options->bias*Weights->vec[Data->n-1]; //bias (modelled as last dim)

            Outputs->vec[i]=t;
            Data->C[i]=lambda_0*1.0/Data->u;
            JU[q]=i;
            ou[q]=t;
            q++;
        }
        else
        {
            Outputs->vec[i]=Outputs_Labeled->vec[p];
            Data->C[i]=1.0/Data->l;
            p++;
        }
    }
    // the labeled-only copies are no longer needed
    SG_FREE(Outputs_Labeled->vec);
    SG_FREE(Outputs_Labeled);
    SG_FREE(Data_Labeled);
    int32_t threshold_index=int32_t((1-Options->R)*Data->u - 1);
    std::nth_element(ou,ou+threshold_index,ou+Data->u);
    float64_t thresh=ou[threshold_index];
    SG_FREE(ou);
    for (int32_t i=0;i<Data->u;i++)
    {
        if(Outputs->vec[JU[i]]>thresh)
            Data->Y[JU[i]]=1.0;
        else
            Data->Y[JU[i]]=-1.0;
    }
    for (int32_t i=0;i<Data->n;i++)
        Weights->vec[i]=0.0;
    for (int32_t i=0;i<Data->m;i++)
        Outputs->vec[i]=0.0;
    L2_SVM_MFN(Data,Options,Weights,Outputs,0);
    int32_t num_switches=0;
    int32_t s=0;
    int32_t last_round=0;
    while(lambda_0 <= Options->lambda_u)
    {
        int32_t iter2=0;
        while(1)
        {
            s=switch_labels(Data->Y,Outputs->vec,JU,Data->u,Options->S);
            if(s==0) break;
            iter2++;
            SG_SDEBUG("****** lambda_0 = %f iteration = %d ************************************\n", lambda_0, iter2);
            SG_SDEBUG("Optimizing unknown labels. switched %d labels.\n", s);
            num_switches+=s;
            SG_SDEBUG("Optimizing weights\n");
            L2_SVM_MFN(Data,Options,Weights,Outputs,1);
        }
        if(last_round==1) break;
        lambda_0=TSVM_ANNEALING_RATE*lambda_0;
        if(lambda_0 >= Options->lambda_u)
        {
            lambda_0 = Options->lambda_u;
            last_round=1;
        }
        for (int32_t i=0;i<Data->u;i++)
            Data->C[JU[i]]=lambda_0*1.0/Data->u;
        SG_SDEBUG("****** lambda0 increased to %f%% of lambda_u = %f ************************\n", lambda_0*100/Options->lambda_u, Options->lambda_u);
        SG_SDEBUG("Optimizing weights\n");
        L2_SVM_MFN(Data,Options,Weights,Outputs,1);
    }
    SG_SDEBUG("Total Number of Switches = %d\n", num_switches);
    /* reset labels */
    for (int32_t i=0;i<Data->u;i++)
        Data->Y[JU[i]] = 0.0;
    float64_t F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    SG_SDEBUG("Objective Value = %f\n", F);
    SG_FREE(JU);
    return num_switches;
}

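/* switch_labels: greedily swaps the labels of up to S pairs of unlabeled
 * examples (one currently positive, one currently negative, both inside the
 * margin), taking the pairs in sorted order of their outputs for as long as
 * the positive example's output is below the negated output of the negative
 * one; returns the number of pairs switched. */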
int32_t switch_labels(float64_t* Y, float64_t* o, int32_t* JU, int32_t u, int32_t S)
{
    int32_t npos=0;
    int32_t nneg=0;
    for (int32_t i=0;i<u;i++)
    {
        if((Y[JU[i]]>0) && (o[JU[i]]<1.0)) npos++;
        if((Y[JU[i]]<0) && (-o[JU[i]]<1.0)) nneg++;
    }
    Delta* positive=SG_MALLOC(Delta, npos);
    Delta* negative=SG_MALLOC(Delta, nneg);
    int32_t p=0;
    int32_t n=0;
    int32_t ii=0;
    for (int32_t i=0;i<u;i++)
    {
        ii=JU[i];
        if((Y[ii]>0.0) && (o[ii]<1.0))
        {
            positive[p].delta=o[ii];
            positive[p].index=ii;
            positive[p].s=0;
            p++;
        }
        if((Y[ii]<0.0) && (-o[ii]<1.0))
        {
            negative[n].delta=-o[ii];
            negative[n].index=ii;
            negative[n].s=0;
            n++;
        }
    }
    std::sort(positive,positive+npos);
    std::sort(negative,negative+nneg);
    int32_t s=-1;
    while(1)
    {
        s++;
        // check the array bounds before indexing into positive/negative
        if((s>=S) || (s>=npos) || (s>=nneg) || (positive[s].delta>=-negative[s].delta))
            break;
        Y[positive[s].index]=-1.0;
        Y[negative[s].index]= 1.0;
    }
    SG_FREE(positive);
    SG_FREE(negative);
    return s;
}

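/* DA_S3VM: deterministic annealing semi-supervised SVM.  Maintains a belief
 * p_i that each unlabeled example is positive; alternates optimize_p (at
 * temperature T) with optimize_w until the KL divergence between successive
 * p vectors stabilizes, tracks the best transductive cost seen so far, and
 * cools T by DA_ANNEALING_RATE until the entropy of p drops below epsilon. */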
int32_t DA_S3VM(
    struct data *Data, struct options *Options, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    float64_t T = DA_INIT_TEMP*Options->lambda_u;
    int32_t iter1 = 0, iter2 = 0;
    float64_t *p = SG_MALLOC(float64_t, Data->u);
    float64_t *q = SG_MALLOC(float64_t, Data->u);
    float64_t *g = SG_MALLOC(float64_t, Data->u);
    float64_t F,F_min;
    float64_t *w_min = SG_MALLOC(float64_t, Data->n);
    float64_t *o_min = SG_MALLOC(float64_t, Data->m);
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t kl_divergence = 1.0;
    /* initialize */
    SG_SDEBUG("Initializing weights, p");
    for (int32_t i=0;i<Data->u; i++)
        p[i] = Options->R;
    /* record which examples are unlabeled */
    int32_t *JU = SG_MALLOC(int32_t, Data->u);
    int32_t j=0;
    for (int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {
            JU[j]=i;
            j++;
        }
    }
    float64_t H = entropy(p,Data->u);
    optimize_w(Data,p,Options,Weights,Outputs,0);
    F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    F_min = F;
    for (int32_t i=0;i<Weights->d;i++)
        w_min[i]=w[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o_min[i]=o[i];
    while((iter1 < DA_OUTER_ITERMAX) && (H > Options->epsilon))
    {
        iter1++;
        iter2=0;
        kl_divergence=1.0;
        while((iter2 < DA_INNER_ITERMAX) && (kl_divergence > Options->epsilon))
        {
            iter2++;
            for (int32_t i=0;i<Data->u;i++)
            {
                q[i]=p[i];
                g[i] = Options->lambda_u*((o[JU[i]] > 1 ? 0 : (1 - o[JU[i]])*(1 - o[JU[i]])) - (o[JU[i]]< -1 ? 0 : (1 + o[JU[i]])*(1 + o[JU[i]])));
            }
            SG_SDEBUG("Optimizing p.\n");
            optimize_p(g,Data->u,T,Options->R,p);
            kl_divergence=KL(p,q,Data->u);
            SG_SDEBUG("Optimizing weights\n");
            optimize_w(Data,p,Options,Weights,Outputs,1);
            F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
            if(F < F_min)
            {
                F_min = F;
                for (int32_t i=0;i<Weights->d;i++)
                    w_min[i]=w[i];
                for (int32_t i=0;i<Outputs->d;i++)
                    o_min[i]=o[i];
            }
            SG_SDEBUG("***** outer_iter = %d T = %g inner_iter = %d kl = %g cost = %g *****\n",iter1,T,iter2,kl_divergence,F);
        }
        H = entropy(p,Data->u);
        SG_SDEBUG("***** Finished outer_iter = %d T = %g Entropy = %g ***\n", iter1,T,H);
        T = T/DA_ANNEALING_RATE;
    }
    for (int32_t i=0;i<Weights->d;i++)
        w[i]=w_min[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o[i]=o_min[i];
    /* may want to reset the original Y */
    SG_FREE(p);
    SG_FREE(q);
    SG_FREE(g);
    SG_FREE(JU);
    SG_FREE(w_min);
    SG_FREE(o_min);
    SG_SINFO("(min) Objective Value = %f", F_min);
    return 1;
}

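/* optimize_w: weighted L2-SVM-MFN inner solver for DA_S3VM.  Each unlabeled
 * example appears twice in the expanded problem of size m+u: once as a
 * positive with cost lambda_u/u * p_j and once as a negative with cost
 * lambda_u/u * (1-p_j); labeled examples keep cost 1/l.  Otherwise the
 * active-set / CGLS / line-search loop mirrors L2_SVM_MFN. */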
int32_t optimize_w(
    const struct data *Data, const float64_t *p, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs, int32_t ini)
{
    int32_t i,j;
    CDotFeatures* features=Data->features;
    int32_t n = Data->n;
    int32_t m = Data->m;
    int32_t u = Data->u;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = SG_MALLOC(float64_t, m+u);
    float64_t *Y = SG_MALLOC(float64_t, m+u);
    float64_t *C = SG_MALLOC(float64_t, m+u);
    int32_t *labeled_indices = SG_MALLOC(int32_t, m);
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    float64_t lambda_u_by_u = Options->lambda_u/u;
    vector_int *ActiveSubset = SG_MALLOC(vector_int, 1);
    ActiveSubset->vec = SG_MALLOC(int32_t, m);
    ActiveSubset->d = m;
    // initialize
    if(ini==0)
    {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else
        epsilon = Options->epsilon;

    for (i=0;i<n;i++) F+=w[i]*w[i];
    F=lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    float64_t temp1;
    float64_t temp2;

    j = 0;
    for (i=0; i<m ; i++)
    {
        o[i]=Outputs->vec[i];
        if(Data->Y[i]==0.0)
        {
            labeled_indices[i]=0;
            o[m+j]=o[i];
            Y[i]=1;
            Y[m+j]=-1;
            C[i]=lambda_u_by_u*p[j];
            C[m+j]=lambda_u_by_u*(1-p[j]);
            ActiveSubset->vec[active]=i;
            active++;
            diff = 1 - CMath::abs(o[i]);
            if(diff>0)
            {
                Data->Y[i] = 2*p[j]-1;
                Data->C[i] = lambda_u_by_u;
                temp1 = (1 - o[i]);
                temp2 = (1 + o[i]);
                F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
            }
            else
            {
                if(o[i]>0)
                {
                    Data->Y[i] = -1.0;
                    Data->C[i] = C[m+j];
                }
                else
                {
                    Data->Y[i] = 1.0;
                    Data->C[i] = C[i];
                }
                temp1 = (1-Data->Y[i]*o[i]);
                F+= Data->C[i]*temp1*temp1;
            }
            j++;
        }
        else
        {
            labeled_indices[i]=1;
            Y[i]=Data->Y[i];
            C[i]=1.0/Data->l;
            Data->C[i]=1.0/Data->l;
            diff=1-Data->Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=Data->C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
    }
    F=0.5*F;
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = SG_MALLOC(vector_double, 1);
    vector_double *Outputs_bar = SG_MALLOC(vector_double, 1);
    float64_t *w_bar = SG_MALLOC(float64_t, n);
    float64_t *o_bar = SG_MALLOC(float64_t, m+u);
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m; /* read only the top m ; bottom u will be copies */
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)", iter, active, F);
        for (i=n; i-- ;)
            w_bar[i]=w[i];

        for (i=m+u; i-- ;)
            o_bar[i]=o[i];
        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for (i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];
            t=features->dense_dot(ii, w_bar, n-1);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        // make o_bar consistent in the bottom half
        j=0;
        for (i=0; i<m;i++)
        {
            if(labeled_indices[i]==0)
            {
                o_bar[m+j]=o_bar[i];
                j++;
            }
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for (i=0; i < m ;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
            {
                if(labeled_indices[ii]==1)
                    opt2=(opt2 && (Data->Y[ii]*o_bar[ii]<=1+epsilon));
                else
                {
                    if(CMath::abs(o[ii])<1)
                        opt2=(opt2 && (CMath::abs(o_bar[ii])<=1+epsilon));
                    else
                        opt2=(opt2 && (CMath::abs(o_bar[ii])>=1-epsilon));
                }
            }
            else
                opt2=(opt2 && (Data->Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // l
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f\n", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for (i=n; i-- ;)
                    w[i]=w_bar[i];
                for (i=m; i-- ;)
                    Outputs->vec[i]=o_bar[i];
                for (i=m; i-- ;)
                {
                    if(labeled_indices[i]==0)
                        Data->Y[i]=0.0;
                }
                SG_FREE(ActiveSubset->vec);
                SG_FREE(ActiveSubset);
                SG_FREE(o_bar);
                SG_FREE(w_bar);
                SG_FREE(o);
                SG_FREE(Weights_bar);
                SG_FREE(Outputs_bar);
                SG_FREE(Y);
                SG_FREE(C);
                SG_FREE(labeled_indices);
                SG_SINFO("L2_SVM_MFN converged in %d iteration(s)", iter);
                return 1;
            }
        }

        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m+u);
        SG_SDEBUG("LINE_SEARCH delta = %f", delta);
        F_old=F;
        F=0.0;
        for (i=0;i<n;i++)
        {
            w[i]+=delta*(w_bar[i]-w[i]);
            F+=w[i]*w[i];
        }
        F=lambda*F;
        j=0;
        active=0;
        inactive=m-1;
        for (i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            if(labeled_indices[i]==0)
            {
                o[m+j]=o[i];
                ActiveSubset->vec[active]=i;
                active++;
                diff = 1 - CMath::abs(o[i]);
                if(diff>0)
                {
                    Data->Y[i] = 2*p[j]-1;
                    Data->C[i] = lambda_u_by_u;
                    temp1 = (1 - o[i]);
                    temp2 = (1 + o[i]);
                    F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
                }
                else
                {
                    if(o[i]>0)
                    {
                        Data->Y[i] = -1;
                        Data->C[i] = C[m+j];
                    }
                    else
                    {
                        Data->Y[i] = +1;
                        Data->C[i] = C[i];
                    }
                    temp1=(1-Data->Y[i]*o[i]);
                    F+= Data->C[i]*temp1*temp1;
                }
                j++;
            }
            else
            {
                diff=1-Data->Y[i]*o[i];
                if(diff>0)
                {
                    ActiveSubset->vec[active]=i;
                    active++;
                    F+=Data->C[i]*diff*diff;
                }
                else
                {
                    ActiveSubset->vec[inactive]=i;
                    inactive--;
                }
            }
        }
        F=0.5*F;
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<EPSILON)
            break;
    }
    for (i=m; i-- ;)
    {
        Outputs->vec[i]=o[i];
        if(labeled_indices[i]==0)
            Data->Y[i]=0.0;
    }
    SG_FREE(ActiveSubset->vec);
    SG_FREE(labeled_indices);
    SG_FREE(ActiveSubset);
    SG_FREE(o_bar);
    SG_FREE(w_bar);
    SG_FREE(o);
    SG_FREE(Weights_bar);
    SG_FREE(Outputs_bar);
    SG_FREE(Y);
    SG_FREE(C);
    SG_SINFO("L2_SVM_MFN converged in %d iterations", iter);
    return 0;
}

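/* optimize_p: given effective gradients g on the unlabeled examples, sets
 * p_i = 1/(1 + exp((g_i - nu)/T)), where the multiplier nu is chosen so that
 * the mean of p equals r (the desired positive-class fraction).  The root of
 * B(nu) = (1/u)*sum_i p_i(nu) - r is bracketed by [nu_minus, nu_plus] and
 * located by a safeguarded Newton/bisection iteration. */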
void optimize_p(
    const float64_t* g, int32_t u, float64_t T, float64_t r, float64_t* p)
{
    int32_t iter=0;
    float64_t epsilon=1e-10;
    int32_t maxiter=500;
    float64_t nu_minus=g[0];
    float64_t nu_plus=g[0];
    for (int32_t i=0;i<u;i++)
    {
        if(g[i]<nu_minus) nu_minus=g[i];
        if(g[i]>nu_plus) nu_plus=g[i];
    }

    float64_t b=T*log((1-r)/r);
    nu_minus-=b;
    nu_plus-=b;
    float64_t nu=(nu_plus+nu_minus)/2;
    float64_t Bnu=0.0;
    float64_t BnuPrime=0.0;
    float64_t s=0.0;
    float64_t tmp=0.0;
    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(!CMath::is_infinity(s))
        {
            tmp=1.0/(1.0+s);
            Bnu+=tmp;
            BnuPrime+=s*tmp*tmp;
        }
    }
    Bnu=Bnu/u;
    Bnu-=r;
    BnuPrime=BnuPrime/(T*u);
    float64_t nuHat=0.0;
    while((CMath::abs(Bnu)>epsilon) && (iter < maxiter))
    {
        iter++;
        if(CMath::abs(BnuPrime)>0.0)
            nuHat=nu-Bnu/BnuPrime;
        // fall back to bisection when the derivative vanishes or the
        // Newton step would leave the bracket [nu_minus, nu_plus]
        if((CMath::abs(BnuPrime)==0.0) || (nuHat>nu_plus) || (nuHat<nu_minus))
            nu=(nu_minus+nu_plus)/2.0;
        else
            nu=nuHat;
        Bnu=0.0;
        BnuPrime=0.0;
        for (int32_t i=0;i<u;i++)
        {
            s=exp((g[i]-nu)/T);
            if(!CMath::is_infinity(s))
            {
                tmp=1.0/(1.0+s);
                Bnu+=tmp;
                BnuPrime+=s*tmp*tmp;
            }
        }
        Bnu=Bnu/u;
        Bnu-=r;
        BnuPrime=BnuPrime/(T*u);
        if(Bnu<0)
            nu_minus=nu;
        else
            nu_plus=nu;
        if(CMath::abs(nu_minus-nu_plus)<epsilon)
            break;
    }
    if(CMath::abs(Bnu)>epsilon)
        SG_SWARNING("Warning (Root): root not found to required precision\n");

    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(CMath::is_infinity(s)) p[i]=0.0;
        else p[i]=1.0/(1.0+s);
    }
    SG_SINFO(" root (nu) = %f B(nu) = %f", nu, Bnu);
}

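/* transductive_cost: evaluates the S3VM objective
 *     0.5*(lambda*||w||^2 + (lambda_u/u)*sum_unlabeled L(|o_i|)
 *          + (1/l)*sum_labeled L(y_i*o_i)),
 * with L(t) = max(0, 1-t)^2; unlabeled examples are those with Y == 0. */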
float64_t transductive_cost(
    float64_t normWeights, float64_t *Y, float64_t *Outputs, int32_t m,
    float64_t lambda, float64_t lambda_u)
{
    float64_t F1=0.0, F2=0.0, o=0.0, y=0.0;
    int32_t u=0,l=0;
    for (int32_t i=0;i<m;i++)
    {
        o=Outputs[i];
        y=Y[i];
        if(y==0.0)
        {
            F1 += CMath::abs(o) > 1 ? 0 : (1 - CMath::abs(o))*(1 - CMath::abs(o));
            u++;
        }
        else
        {
            F2 += y*o > 1 ? 0 : (1-y*o)*(1-y*o);
            l++;
        }
    }
    float64_t F;
    F = 0.5*(lambda*normWeights + lambda_u*F1/u + F2/l);
    return F;
}

float64_t entropy(const float64_t *p, int32_t u)
{
    float64_t h=0.0;
    float64_t q=0.0;
    for (int32_t i=0;i<u;i++)
    {
        q=p[i];
        if(q>0 && q<1)
            h+= -(q*CMath::log2(q) + (1-q)*CMath::log2(1-q));
    }
    return h/u;
}

float64_t KL(const float64_t *p, const float64_t *q, int32_t u)
{
    float64_t h=0.0;
    float64_t p1=0.0;
    float64_t q1=0.0;
    float64_t g=0.0;
    for (int32_t i=0;i<u;i++)
    {
        p1=p[i];
        q1=q[i];
        // clamp both arguments away from 0 and 1
        if(p1>1-1e-8) p1-=1e-8;
        if(p1<1e-8) p1+=1e-8;
        if(q1>1-1e-8) q1-=1e-8;
        if(q1<1e-8) q1+=1e-8;
        g= (p1*CMath::log2(p1/q1) + (1-p1)*CMath::log2((1-p1)/(1-q1)));
        if(CMath::abs(g)<1e-12 || CMath::is_nan(g)) g=0.0;
        h+=g;
    }
    return h/u;
}

/********************** UTILITIES ********************/
float64_t norm_square(const vector_double *A)
{
    float64_t x=0.0, t=0.0;
    for (int32_t i=0;i<A->d;i++)
    {
        t=A->vec[i];
        x+=t*t;
    }
    return x;
}

void initialize(struct vector_double *A, int32_t k, float64_t a)
{
    float64_t *vec = SG_MALLOC(float64_t, k);
    for (int32_t i=0;i<k;i++)
        vec[i]=a;
    A->vec = vec;
    A->d = k;
    return;
}

void initialize(struct vector_int *A, int32_t k)
{
    int32_t *vec = SG_MALLOC(int32_t, k);
    for (int32_t i=0;i<k;i++)
        vec[i]=i;
    A->vec = vec;
    A->d = k;
    return;
}

void GetLabeledData(struct data *D, const struct data *Data)
{
    /* FIXME
    int32_t *J = SG_MALLOC(int32_t, Data->l);
    D->C = SG_MALLOC(float64_t, Data->l);
    D->Y = SG_MALLOC(float64_t, Data->l);
    int32_t nz=0;
    int32_t k=0;
    int32_t rowptrs_=Data->l;
    for (int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]!=0.0)
        {
            J[k]=i;
            D->Y[k]=Data->Y[i];
            D->C[k]=1.0/Data->l;
            nz+=(Data->rowptr[i+1] - Data->rowptr[i]);
            k++;
        }
    }
    D->val = SG_MALLOC(float64_t, nz);
    D->colind = SG_MALLOC(int32_t, nz);
    D->rowptr = new int32_t[rowptrs_+1];
    nz=0;
    for (int32_t i=0;i<Data->l;i++)
    {
        D->rowptr[i]=nz;
        for (int32_t j=Data->rowptr[J[i]];j<Data->rowptr[J[i]+1];j++)
        {
            D->val[nz] = Data->val[j];
            D->colind[nz] = Data->colind[j];
            nz++;
        }
    }
    D->rowptr[rowptrs_]=nz;
    D->nz=nz;
    D->l=Data->l;
    D->m=Data->l;
    D->n=Data->n;
    D->u=0;
    SG_FREE(J); */
}
} /* namespace shogun */
