lbfgs.cpp (SHOGUN v2.0.0)
/*
 * Limited memory BFGS (L-BFGS).
 *
 * Copyright (c) 1990, Jorge Nocedal
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

/*
This library is a C port of the FORTRAN implementation of the Limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
The original FORTRAN source code is available at:
http://www.ece.northwestern.edu/~nocedal/lbfgs.html

The L-BFGS algorithm is described in:
 - Jorge Nocedal.
   Updating Quasi-Newton Matrices with Limited Storage.
   <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
 - Dong C. Liu and Jorge Nocedal.
   On the limited memory BFGS method for large scale optimization.
   <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503--528, 1989.

The line search algorithms used in this implementation are described in:
 - John E. Dennis and Robert B. Schnabel.
   <i>Numerical Methods for Unconstrained Optimization and Nonlinear
   Equations</i>, Englewood Cliffs, 1983.
 - Jorge J. More and David J. Thuente.
   Line search algorithms with guaranteed sufficient decrease.
   <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
   pp. 286--307, 1994.

This library also implements the Orthant-Wise Limited-memory Quasi-Newton
(OWL-QN) method presented in:
 - Galen Andrew and Jianfeng Gao.
   Scalable training of L1-regularized log-linear models.
   In <i>Proceedings of the 24th International Conference on Machine
   Learning (ICML 2007)</i>, pp. 33--40, 2007.

I would like to thank the original author, Jorge Nocedal, who has been
distributing this efficient and instructive implementation under an
open-source license.
*/
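
/*
Example usage (a minimal sketch, not part of the original source; it assumes
the lbfgs_evaluate_t/lbfgs_progress_t callback signatures declared in the
accompanying lbfgs.h, and uses a toy quadratic objective purely for
illustration):

    #include <shogun/optimization/lbfgs/lbfgs.h>

    static float64_t evaluate(
        void *instance, const float64_t *x, float64_t *g,
        const int32_t n, const float64_t step)
    {
        // f(x) = sum_i (x[i] - 1)^2, with gradient g[i] = 2 (x[i] - 1).
        float64_t fx = 0.;
        for (int32_t i = 0; i < n; ++i) {
            g[i] = 2. * (x[i] - 1.);
            fx += (x[i] - 1.) * (x[i] - 1.);
        }
        return fx;
    }

    void run()
    {
        const int32_t n = 10;
        float64_t x[n];
        for (int32_t i = 0; i < n; ++i) x[i] = 0.;

        lbfgs_parameter_t param;
        lbfgs_parameter_init(&param);   // fill in the defaults

        float64_t fx;
        // NULL progress callback and NULL instance are both accepted.
        int32_t ret = lbfgs(n, x, &fx, evaluate, NULL, NULL, &param);
        // ret == LBFGS_SUCCESS (0) on convergence; negative on error.
    }
*/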

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>

#include <shogun/optimization/lbfgs/lbfgs.h>
#include <shogun/lib/SGVector.h>

namespace shogun
{

#define min2(a, b) ((a) <= (b) ? (a) : (b))
#define max2(a, b) ((a) >= (b) ? (a) : (b))
#define max3(a, b, c) max2(max2((a), (b)), (c))

struct tag_callback_data {
    int32_t n;
    void *instance;
    lbfgs_evaluate_t proc_evaluate;
    lbfgs_progress_t proc_progress;
};
typedef struct tag_callback_data callback_data_t;

struct tag_iteration_data {
    float64_t alpha;
    float64_t *s;     /* [n] */
    float64_t *y;     /* [n] */
    float64_t ys;     /* vecdot(y, s) */
};
typedef struct tag_iteration_data iteration_data_t;

static const lbfgs_parameter_t _defparam = {
    6, 1e-5, 0, 1e-5,
    0, LBFGS_LINESEARCH_DEFAULT, 40,
    1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
    0.0, 0, -1,
};
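
/*
    For reference, the initializer above fills the lbfgs_parameter_t fields
    in declaration order (listed here as a reading aid, following the field
    order declared in the accompanying lbfgs.h):
        m = 6, epsilon = 1e-5, past = 0, delta = 1e-5,
        max_iterations = 0, linesearch = LBFGS_LINESEARCH_DEFAULT,
        max_linesearch = 40, min_step = 1e-20, max_step = 1e20,
        ftol = 1e-4, wolfe = 0.9, gtol = 0.9, xtol = 1e-16,
        orthantwise_c = 0.0, orthantwise_start = 0, orthantwise_end = -1.
 */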

/* Forward function declarations. */

typedef int32_t (*line_search_proc)(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    );

static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    );

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    );

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    );


void lbfgs_parameter_init(lbfgs_parameter_t *param)
{
    memcpy(param, &_defparam, sizeof(*param));
}

int32_t lbfgs(
    int32_t n,
    float64_t *x,
    float64_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param
    )
{
    int32_t ret;
    int32_t i, j, k, ls, end, bound;
    float64_t step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int32_t m = param.m;

    float64_t *xp = NULL;
    float64_t *g = NULL, *gp = NULL, *pg = NULL;
    float64_t *d = NULL, *w = NULL, *pf = NULL;
    iteration_data_t *lm = NULL, *it = NULL;
    float64_t ys, yy;
    float64_t xnorm, gnorm, beta;
    float64_t fx = 0.;
    float64_t rate = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct a callback data. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = SG_CALLOC(float64_t, n);
    g = SG_CALLOC(float64_t, n);
    gp = SG_CALLOC(float64_t, n);
    d = SG_CALLOC(float64_t, n);
    w = SG_CALLOC(float64_t, n);
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    if (param.orthantwise_c != 0.) {
        /* Allocate working space for OW-LQN. */
        pg = SG_CALLOC(float64_t, n);
        if (pg == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate limited memory storage. */
    lm = SG_CALLOC(iteration_data_t, m);
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0; i < m; ++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = SG_CALLOC(float64_t, n);
        it->y = SG_CALLOC(float64_t, n);
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (0 < param.past) {
        pf = SG_CALLOC(float64_t, param.past);
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variables and add it to the objective value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the direction;
        we assume the initial hessian matrix H_0 as the identity matrix.
     */
    if (param.orthantwise_c == 0.) {
        std::copy(g, g+n, d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    } else {
        std::copy(pg, pg+n, d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    }

    /*
        Make sure that the initial variables are not a minimizer.
     */
    xnorm = SGVector<float64_t>::twonorm(x, n);
    if (param.orthantwise_c == 0.) {
        gnorm = SGVector<float64_t>::twonorm(g, n);
    } else {
        gnorm = SGVector<float64_t>::twonorm(pg, n);
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
     */
    step = 1.0 / SGVector<float64_t>::twonorm(d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        std::copy(x, x+n, xp);
        std::copy(g, g+n, gp);

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            std::copy(xp, xp+n, x);
            std::copy(gp, gp+n, g);
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        xnorm = SGVector<float64_t>::twonorm(x, n);
        if (param.orthantwise_c == 0.) {
            gnorm = SGVector<float64_t>::twonorm(g, n);
        } else {
            gnorm = SGVector<float64_t>::twonorm(pg, n);
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max(1, |x|) < \epsilon
         */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < \delta
         */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
         */
        it = &lm[end];
        SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
        SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
         */
        ys = SGVector<float64_t>::dot(it->y, it->s, n);
        yy = SGVector<float64_t>::dot(it->y, it->y, n);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
            This is described in page 779 of:
            Jorge Nocedal.
            Updating Quasi-Newton Matrices with Limited Storage.
            Mathematics of Computation, Vol. 35, No. 151,
            pp. 773--782, 1980.
         */
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of gradients. */
            std::copy(g, g+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        } else {
            std::copy(pg, pg+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        }

        j = end;
        for (i = 0; i < bound; ++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            it->alpha = SGVector<float64_t>::dot(it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
        }

        SGVector<float64_t>::scale_vector(ys / yy, d, n);

        for (i = 0; i < bound; ++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            beta = SGVector<float64_t>::dot(it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }
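
        /*
            At this point d holds -(H_k g_k) from the two-loop recursion
            above; written out (reading aid only, following Nocedal 1980):

                q <- -g_k                          (steepest direction)
                for j = k-1, ..., k-bound:
                    \alpha_j <- \rho_j s_j^t q;  q <- q - \alpha_j y_j
                q <- (y^t s / y^t y) q             (scale by H_0 = \gamma I)
                for j = k-bound, ..., k-1:
                    \beta_j <- \rho_j y_j^t q;   q <- q + (\alpha_j - \beta_j) s_j
                d <- q
         */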

        /*
            Constrain the search direction for orthant-wise updates.
         */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start; i < param.orthantwise_end; ++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
         */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    SG_FREE(pf);

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0; i < m; ++i) {
            SG_FREE(lm[i].s);
            SG_FREE(lm[i].y);
        }
        SG_FREE(lm);
    }
    SG_FREE(pg);
    SG_FREE(w);
    SG_FREE(d);
    SG_FREE(gp);
    SG_FREE(g);
    SG_FREE(xp);

    return ret;
}


static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    float64_t width, dg;
    float64_t finit, dginit = 0., dgtest;
    const float64_t dec = 0.5, inc = 2.1;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = param->ftol * dginit;

    for (;;) {
        std::copy(xp, xp+n, x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        ++count;

        if (*f > finit + *stp * dgtest) {
            width = dec;
        } else {
            /* The sufficient decrease condition (Armijo condition). */
            if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) {
                /* Exit with the Armijo condition. */
                return count;
            }

            /* Check the Wolfe condition. */
            dg = SGVector<float64_t>::dot(g, s, n);
            if (dg < param->wolfe * dginit) {
                width = inc;
            } else {
                if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) {
                    /* Exit with the regular Wolfe condition. */
                    return count;
                }

                /* Check the strong Wolfe condition. */
                if (dg > -param->wolfe * dginit) {
                    width = dec;
                } else {
                    /* Exit with the strong Wolfe condition. */
                    return count;
                }
            }
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}
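
/*
    For reference (a reading aid, not part of the original source): with
    \phi(a) = f(x + a d) and dginit = \phi'(0) = g^t d < 0, the exit tests
    above correspond to:

        Armijo:        \phi(a) <= \phi(0) + a * ftol * dginit
        Wolfe:         \phi'(a) >= wolfe * dginit
        strong Wolfe:  |\phi'(a)| <= wolfe * |dginit|

    Backtracking multiplies the step by dec = 0.5 when a test fails towards
    smaller steps, and by inc = 2.1 when the Wolfe curvature test asks for
    a longer step.
 */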


static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t i, count = 0;
    float64_t width = 0.5, norm = 0.;
    float64_t finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0; i < n; ++i) {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }

    for (;;) {
        /* Update the current point. */
        std::copy(xp, xp+n, x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        /* Compute the L1 norm of the variables and add it to the objective value. */
        norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += norm * param->orthantwise_c;

        ++count;

        dgtest = 0.;
        for (i = 0; i < n; ++i) {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + param->ftol * dgtest) {
            /* The sufficient decrease condition. */
            return count;
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}


static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    int32_t brackt, stage1, uinfo = 0;
    float64_t dg;
    float64_t stx, fx, dgx;
    float64_t sty, fy, dgy;
    float64_t fxm, dgxm, fym, dgym, fm, dgm;
    float64_t finit, ftest1, dginit, dgtest;
    float64_t width, prev_width;
    float64_t stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = param->ftol * dginit;
    width = param->max_step - param->min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the value of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
     */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;) {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
         */
        if (brackt) {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        } else {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step in the range of [stpmin, stpmax]. */
        if (*stp < param->min_step) *stp = param->min_step;
        if (param->max_step < *stp) *stp = param->max_step;

        /*
            If an unusual termination is to occur then let
            stp be the lowest point obtained so far.
         */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
            *stp = stx;
        }

        /*
            Compute the current value of x:
                x <- x + (*stp) * s.
         */
        std::copy(xp, xp+n, x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
        dg = SGVector<float64_t>::dot(g, s, n);

        ftest1 = finit + *stp * dgtest;
        ++count;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
            /* Rounding errors prevent further progress. */
            return LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= param->xtol * stmax) {
            /* Relative width of the interval of uncertainty is at most xtol. */
            return LBFGSERR_WIDTHTOOSMALL;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return count;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
         */
        if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
            stage1 = 0;
        }

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
         */
        if (stage1 && ftest1 < *f && *f <= fx) {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        } else {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /*
            Force a sufficient decrease in the interval of uncertainty.
         */
        if (brackt) {
            if (0.66 * prev_width <= fabs(sty - stx)) {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return LBFGSERR_LOGICERROR;
}


/**
 * Define the local variables for computing minimizers.
 */
#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

/**
 * Find a minimizer of an interpolated cubic function, given two points
 * (u, fu) and (v, fv) and the derivatives du = f'(u) and dv = f'(v).
 * The result is stored in cm.
 */
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

/**
 * Find a minimizer of an interpolated cubic function, clipping the result
 * to the interval [xmin, xmax].
 */
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

/**
 * Find a minimizer of an interpolated quadratic function through
 * (u, fu) and (v, fv) with derivative du = f'(u).
 */
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

/**
 * Find a minimizer of an interpolated quadratic function given the
 * derivatives du = f'(u) and dv = f'(v) (secant on the derivative).
 */
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;
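
/*
    Sanity check (added as a reading aid; not in the original source).
    For QUARD_MINIMIZER, the quadratic through (u, fu) with slope du at u
    and passing through (v, fv) is

        q(t) = fu + du (t - u) + c (t - u)^2,
        c    = ((fv - fu) - du (v - u)) / (v - u)^2,

    whose stationary point t* = u - du / (2 c) expands to exactly

        t* = u + du (v - u)^2 / (2 ((fu - fv) + du (v - u))),

    i.e. the macro's (u) + (du) / (((fu)-(fv))/a + (du)) / 2 * a with
    a = v - u. For QUARD_MINIMIZER2, the derivative is interpolated
    linearly between du and dv, and t* is its zero crossing:
    t* = v + dv (u - v) / (dv - du).
 */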

#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)

/**
 * Update a safeguarded trial value and interval for line search
 * (following More and Thuente, 1994).
 *
 *  x, fx, dx: endpoint with the best function value so far, f(x), f'(x).
 *  y, fy, dy: the other endpoint of the interval of uncertainty.
 *  t, ft, dt: the trial value, f(t), f'(t); *t is overwritten on return.
 *  tmin, tmax: bounds for the trial value. brackt is set if a minimizer
 *  has been bracketed. Returns zero on a normal termination.
 */
static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    )
{
    int32_t bound;
    int32_t dsign = fsigndiff(dt, dx);
    float64_t mc;     /* minimizer of an interpolated cubic. */
    float64_t mq;     /* minimizer of an interpolated quadratic. */
    float64_t newt;   /* new trial value. */
    USES_MINIMIZER;   /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors. */
    if (*brackt) {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
            /* The trial value t is out of the interval. */
            return LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x)) {
            /* The function must decrease from x. */
            return LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin) {
            /* Incorrect tmin and tmax specified. */
            return LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /*
        Trial value selection.
     */
    if (*fx < *ft) {
        /*
            Case 1: a higher function value.
            The minimum is bracketed. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
         */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x)) {
            newt = mc;
        } else {
            newt = mc + 0.5 * (mq - mc);
        }
    } else if (dsign) {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is bracketed. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
         */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t)) {
            newt = mc;
        } else {
            newt = mq;
        }
    } else if (fabs(*dt) < fabs(*dx)) {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is only used if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is bracketed
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
         */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt) {
            if (fabs(*t - mc) < fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        } else {
            if (fabs(*t - mc) > fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        }
    } else {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not bracketed, the step
            is either tmin or tmax, else the cubic minimizer is taken.
         */
        bound = 0;
        if (*brackt) {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        } else if (*x < *t) {
            newt = tmax;
        } else {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
     */
    if (*fx < *ft) {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    } else {
        /* Case c */
        if (dsign) {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value in [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
     */
    if (*brackt && bound) {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y) {
            if (mq < newt) newt = mq;
        } else {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}


static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    )
{
    int32_t i;
    float64_t norm = 0.;

    for (i = start; i < n; ++i) {
        norm += fabs(x[i]);
    }

    return norm;
}

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    /* Pass the gradient through unchanged before the orthant-wise range. */
    for (i = 0; i < start; ++i) {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start; i < end; ++i) {
        if (x[i] < 0.) {
            /* Differentiable. */
            pg[i] = g[i] - c;
        } else if (0. < x[i]) {
            /* Differentiable. */
            pg[i] = g[i] + c;
        } else {
            if (g[i] < -c) {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            } else if (c < g[i]) {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            } else {
                pg[i] = 0.;
            }
        }
    }

    /* Pass the gradient through unchanged after the orthant-wise range. */
    for (i = end; i < n; ++i) {
        pg[i] = g[i];
    }
}
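
/*
    Example (a reading aid, not in the original source): with c = 1 and
    x[i] = 0, a gradient component g[i] = -3 gives pg[i] = -3 + 1 = -2
    (the right partial derivative of f(x) + c|x|), g[i] = 3 gives
    pg[i] = 3 - 1 = 2, and any g[i] in [-1, 1] gives pg[i] = 0, since the
    subdifferential of the L1 term contains zero there.
 */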

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    for (i = start; i < end; ++i) {
        if (d[i] * sign[i] <= 0) {
            d[i] = 0;
        }
    }
}

}
