lbfgs.cpp
/*
 * Limited memory BFGS (L-BFGS).
 *
 * Copyright (c) 1990, Jorge Nocedal
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

/*
This library is a C port of the FORTRAN implementation of the Limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
The original FORTRAN source code is available at:
http://www.ece.northwestern.edu/~nocedal/lbfgs.html

The L-BFGS algorithm is described in:
    - Jorge Nocedal.
      Updating Quasi-Newton Matrices with Limited Storage.
      <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
    - Dong C. Liu and Jorge Nocedal.
      On the limited memory BFGS method for large scale optimization.
      <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503--528, 1989.

The line search algorithms used in this implementation are described in:
    - John E. Dennis and Robert B. Schnabel.
      <i>Numerical Methods for Unconstrained Optimization and Nonlinear
      Equations</i>, Englewood Cliffs, 1983.
    - Jorge J. More and David J. Thuente.
      Line search algorithm with guaranteed sufficient decrease.
      <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
      pp. 286--307, 1994.

This library also implements the Orthant-Wise Limited-memory Quasi-Newton
(OWL-QN) method presented in:
    - Galen Andrew and Jianfeng Gao.
      Scalable training of L1-regularized log-linear models.
      In <i>Proceedings of the 24th International Conference on Machine
      Learning (ICML 2007)</i>, pp. 33--40, 2007.

I would like to thank the original author, Jorge Nocedal, who has been
distributing the efficient and explanatory implementation under an open
source license.
*/
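
/*
Usage sketch (illustrative only, not part of the library): minimizing the
convex quadratic f(x) = sum_i (x[i] - 1)^2 through the lbfgs() entry point
defined below. The callback signature is assumed to match the
lbfgs_evaluate_t typedef declared in shogun/optimization/lbfgs/lbfgs.h;
passing NULL for the progress callback and the parameter struct selects
the defaults in _defparam.

    static float64_t evaluate(
        void *instance, const float64_t *x, float64_t *g,
        const int n, const float64_t step)
    {
        float64_t fx = 0.;
        for (int i = 0;i < n;++i) {
            g[i] = 2. * (x[i] - 1.);           // gradient of (x_i - 1)^2
            fx += (x[i] - 1.) * (x[i] - 1.);
        }
        return fx;
    }

    float64_t fx = 0.;
    float64_t x[8] = {0.};                     // start from the origin
    int32_t ret = lbfgs(8, x, &fx, evaluate, NULL, NULL, NULL);
    // On success ret == LBFGS_SUCCESS and each x[i] is 1 up to the
    // epsilon tolerance.
*/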

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring> /* for memcpy */

#include <shogun/optimization/lbfgs/lbfgs.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/common.h>

namespace shogun
{

#define min2(a, b)      ((a) <= (b) ? (a) : (b))
#define max2(a, b)      ((a) >= (b) ? (a) : (b))
#define max3(a, b, c)   max2(max2((a), (b)), (c))

struct callback_data_t {
    int32_t n;
    void *instance;
    lbfgs_evaluate_t proc_evaluate;
    lbfgs_progress_t proc_progress;
};

struct iteration_data_t {
    float64_t alpha;
    float64_t *s;     /* [n] */
    float64_t *y;     /* [n] */
    float64_t ys;     /* vecdot(y, s) */
};

static const lbfgs_parameter_t _defparam = {
    6, 1e-5, 0, 1e-5,
    0, LBFGS_LINESEARCH_DEFAULT, 40,
    1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
    0.0, 0, -1,
};

/* Forward function declarations. */

typedef int32_t (*line_search_proc)(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    );

static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    );

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    );

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    );

void lbfgs_parameter_init(lbfgs_parameter_t *param)
{
    memcpy(param, &_defparam, sizeof(*param));
}
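
/*
Parameter sketch (illustrative): enabling the OWL-QN path for an
L1-regularized objective. Per the validation logic in lbfgs() below, a
nonzero orthantwise_c requires the backtracking line search.

    lbfgs_parameter_t param;
    lbfgs_parameter_init(&param);
    param.orthantwise_c = 1.0;                        // L1 penalty weight
    param.linesearch = LBFGS_LINESEARCH_BACKTRACKING; // required for OWL-QN
    int32_t ret = lbfgs(n, x, &fx, evaluate, NULL, NULL, &param);
*/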

int32_t lbfgs(
    int32_t n,
    float64_t *x,
    float64_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param
    )
{
    int32_t ret;
    int32_t i, j, k, ls, end, bound;
    float64_t step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int32_t m = param.m;

    float64_t *xp = NULL;
    float64_t *g = NULL, *gp = NULL, *pg = NULL;
    float64_t *d = NULL, *w = NULL, *pf = NULL;
    iteration_data_t *lm = NULL, *it = NULL;
    float64_t ys, yy;
    float64_t xnorm, gnorm, beta;
    float64_t fx = 0.;
    float64_t rate = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct a callback data. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = SG_CALLOC(float64_t, n);
    g = SG_CALLOC(float64_t, n);
    gp = SG_CALLOC(float64_t, n);
    d = SG_CALLOC(float64_t, n);
    w = SG_CALLOC(float64_t, n);
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    if (param.orthantwise_c != 0.) {
        /* Allocate working space for OWL-QN. */
        pg = SG_CALLOC(float64_t, n);
        if (pg == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate limited memory storage. */
    lm = SG_CALLOC(iteration_data_t, m);
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0;i < m;++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = SG_CALLOC(float64_t, n);
        it->y = SG_CALLOC(float64_t, n);
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (0 < param.past) {
        pf = SG_CALLOC(float64_t, param.past);
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variables and add it to the objective value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the direction;
        we assume the initial hessian matrix H_0 as the identity matrix.
    */
    if (param.orthantwise_c == 0.) {
        std::copy(g,g+n,d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    } else {
        std::copy(pg,pg+n,d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    }

    /*
        Make sure that the initial variables are not a minimizer.
    */
    xnorm = SGVector<float64_t>::twonorm(x, n);
    if (param.orthantwise_c == 0.) {
        gnorm = SGVector<float64_t>::twonorm(g, n);
    } else {
        gnorm = SGVector<float64_t>::twonorm(pg, n);
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /*
        Compute the initial step:
            step = 1.0 / sqrt(vecdot(d, d, n))
    */
    step = 1.0 / SGVector<float64_t>::twonorm(d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        std::copy(x,x+n,xp);
        std::copy(g,g+n,gp);

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            std::copy(xp,xp+n,x);
            std::copy(gp,gp+n,g);
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        xnorm = SGVector<float64_t>::twonorm(x, n);
        if (param.orthantwise_c == 0.) {
            gnorm = SGVector<float64_t>::twonorm(g, n);
        } else {
            gnorm = SGVector<float64_t>::twonorm(pg, n);
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max2(1, |x|) < \epsilon
        */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < \delta
        */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
        */
        it = &lm[end];
        SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
        SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
        */
        ys = SGVector<float64_t>::dot(it->y, it->s, n);
        yy = SGVector<float64_t>::dot(it->y, it->y, n);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
            This is described in page 779 of:
                Jorge Nocedal.
                Updating Quasi-Newton Matrices with Limited Storage.
                Mathematics of Computation, Vol. 35, No. 151,
                pp. 773--782, 1980.
        */
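        /*
            Explanatory sketch of the two-loop recursion implemented below
            (standard L-BFGS, with rho_j = 1 / (y_j^T s_j)):

                q <- -g                              (steepest descent seed)
                for j = newest ... oldest:
                    alpha_j <- rho_j * s_j^T q
                    q <- q - alpha_j * y_j
                q <- (y^T s / y^T y) * q             (H_0 scaling)
                for j = oldest ... newest:
                    beta <- rho_j * y_j^T q
                    q <- q + (alpha_j - beta) * s_j
                d <- q                               (search direction)
        */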
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of gradients. */
            std::copy(g, g+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        } else {
            std::copy(pg, pg+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        }

        j = end;
        for (i = 0;i < bound;++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            it->alpha = SGVector<float64_t>::dot(it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
        }

        SGVector<float64_t>::scale_vector(ys / yy, d, n);

        for (i = 0;i < bound;++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            beta = SGVector<float64_t>::dot(it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }

        /*
            Constrain the search direction for orthant-wise updates.
        */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
        */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    SG_FREE(pf);

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0;i < m;++i) {
            SG_FREE(lm[i].s);
            SG_FREE(lm[i].y);
        }
        SG_FREE(lm);
    }
    SG_FREE(pg);
    SG_FREE(w);
    SG_FREE(d);
    SG_FREE(gp);
    SG_FREE(g);
    SG_FREE(xp);

    return ret;
}
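
/*
Progress-callback sketch (illustrative): the signature is assumed to match
the lbfgs_progress_t typedef from lbfgs.h, as invoked above through
cd.proc_progress. Returning a non-zero value cancels the optimization.

    static int progress(
        void *instance, const float64_t *x, const float64_t *g,
        const float64_t fx, const float64_t xnorm, const float64_t gnorm,
        const float64_t step, int n, int k, int ls)
    {
        printf("iter %d: fx=%f, xnorm=%f, gnorm=%f, step=%f\n",
               k, fx, xnorm, gnorm, step);
        return 0;  // continue optimizing
    }
*/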


static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    float64_t width, dg;
    float64_t finit, dginit = 0., dgtest;
    const float64_t dec = 0.5, inc = 2.1;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = param->ftol * dginit;

    for (;;) {
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        ++count;

        if (*f > finit + *stp * dgtest) {
            width = dec;
        } else {
            /* The sufficient decrease condition (Armijo condition). */
            if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) {
                /* Exit with the Armijo condition. */
                return count;
            }

            /* Check the Wolfe condition. */
            dg = SGVector<float64_t>::dot(g, s, n);
            if (dg < param->wolfe * dginit) {
                width = inc;
            } else {
                if(param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) {
                    /* Exit with the regular Wolfe condition. */
                    return count;
                }

                /* Check the strong Wolfe condition. */
                if(dg > -param->wolfe * dginit) {
                    width = dec;
                } else {
                    /* Exit with the strong Wolfe condition. */
                    return count;
                }
            }
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations reached. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}
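
/*
The three exit tests above, written out (explanatory note, not original to
this file): let phi(a) = f(x + a*s), so phi'(0) = dginit < 0.

    sufficient decrease (Armijo):  phi(a) <= phi(0) + ftol * a * phi'(0)
    Wolfe (curvature):             phi'(a) >= wolfe * phi'(0)
    strong Wolfe:                  |phi'(a)| <= -wolfe * phi'(0)

Each failed test shrinks (width = dec) or grows (width = inc) the step.
*/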


static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t i, count = 0;
    float64_t width = 0.5, norm = 0.;
    float64_t finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0;i < n;++i) {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }

    for (;;) {
        /* Update the current point. */
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        /* Compute the L1 norm of the variables and add it to the objective value. */
        norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += norm * param->orthantwise_c;

        ++count;

        dgtest = 0.;
        for (i = 0;i < n;++i) {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + param->ftol * dgtest) {
            /* The sufficient decrease condition. */
            return count;
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations reached. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}


static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    int32_t brackt, stage1, uinfo = 0;
    float64_t dg;
    float64_t stx, fx, dgx;
    float64_t sty, fy, dgy;
    float64_t fxm, dgxm, fym, dgym, fm, dgm;
    float64_t finit, ftest1, dginit, dgtest;
    float64_t width, prev_width;
    float64_t stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = param->ftol * dginit;
    width = param->max_step - param->min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the value of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
    */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;) {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
        */
        if (brackt) {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        } else {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step in the range of [stpmin, stpmax]. */
        if (*stp < param->min_step) *stp = param->min_step;
        if (param->max_step < *stp) *stp = param->max_step;

        /*
            If an unusual termination is to occur then let
            stp be the lowest point obtained so far.
        */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
            *stp = stx;
        }

        /*
            Compute the current value of x:
                x <- x + (*stp) * s.
        */
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
        dg = SGVector<float64_t>::dot(g, s, n);

        ftest1 = finit + *stp * dgtest;
        ++count;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
            /* Rounding errors prevent further progress. */
            return LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= param->xtol * stmax) {
            /* Relative width of the interval of uncertainty is at most xtol. */
            return LBFGSERR_WIDTHTOOSMALL;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations reached. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return count;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
        */
        if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
            stage1 = 0;
        }

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
        */
        if (stage1 && ftest1 < *f && *f <= fx) {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        } else {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /*
            Force a sufficient decrease in the interval of uncertainty.
        */
        if (brackt) {
            if (0.66 * prev_width <= fabs(sty - stx)) {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return LBFGSERR_LOGICERROR;
}


/**
 * Define the local variables for computing minimizers.
 */
#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

/**
 * Find a minimizer of an interpolated cubic function.
 *  @param  cm      The minimizer of the interpolated cubic.
 *  @param  u       The value of one point, u.
 *  @param  fu      The value of f(u).
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  fv      The value of f(v).
 *  @param  dv      The value of f'(v).
 */
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

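/*
In exact arithmetic the expression above reduces to the classical Hermite
cubic minimizer (explanatory note; see e.g. Nocedal and Wright, "Numerical
Optimization"): with theta = 3*(fu - fv)/(v - u) + du + dv and
gamma = sqrt(theta^2 - du*dv),

    cm = u + (v - u) * (gamma - du + theta) / (2*gamma - du + dv)

The scaling by s = max(|theta|, |du|, |dv|) only guards the square root
against overflow.
*/
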
/**
 * Find a minimizer of an interpolated cubic function, safeguarded by
 * the bounds [xmin, xmax].
 *  @param  cm      The minimizer of the interpolated cubic.
 *  @param  u       The value of one point, u.
 *  @param  fu      The value of f(u).
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  fv      The value of f(v).
 *  @param  dv      The value of f'(v).
 *  @param  xmin    The minimum value.
 *  @param  xmax    The maximum value.
 */
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

/**
 * Find a minimizer of an interpolated quadratic function.
 *  @param  qm      The minimizer of the interpolated quadratic.
 *  @param  u       The value of one point, u.
 *  @param  fu      The value of f(u).
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  fv      The value of f(v).
 */
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

/**
 * Find a minimizer of an interpolated quadratic function from the
 * derivatives at two points (secant step).
 *  @param  qm      The minimizer of the interpolated quadratic.
 *  @param  u       The value of one point, u.
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  dv      The value of f'(v).
 */
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;

#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)

/**
 * Update a safeguarded trial value and interval for line search.
 *
 *  The parameter x represents the step with the least function value.
 *  The parameter t represents the current step. This function assumes
 *  that the derivative at x is negative in the direction of the step,
 *  i.e., the function value decreases from x along the step.
 *  If brackt is set to true, the minimizer has been bracketed in
 *  an interval of uncertainty with endpoints between x and y.
 *
 *  @param  x       The pointer to the value of one endpoint.
 *  @param  fx      The pointer to the value of f(x).
 *  @param  dx      The pointer to the value of f'(x).
 *  @param  y       The pointer to the value of another endpoint.
 *  @param  fy      The pointer to the value of f(y).
 *  @param  dy      The pointer to the value of f'(y).
 *  @param  t       The pointer to the value of the trial value, t.
 *  @param  ft      The pointer to the value of f(t).
 *  @param  dt      The pointer to the value of f'(t).
 *  @param  tmin    The minimum value for the trial value, t.
 *  @param  tmax    The maximum value for the trial value, t.
 *  @param  brackt  The pointer to the predicate if the trial value is
 *                  bracketed.
 *  @retval int     Status value. Zero indicates a normal termination.
 *
 *  @see
 *      Jorge J. More and David J. Thuente. Line search algorithm with
 *      guaranteed sufficient decrease. ACM Transactions on Mathematical
 *      Software (TOMS), Vol. 20, No. 3, pp. 286--307, 1994.
 */
static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    )
{
    int32_t bound;
    int32_t dsign = fsigndiff(dt, dx);
    float64_t mc;    /* minimizer of an interpolated cubic. */
    float64_t mq;    /* minimizer of an interpolated quadratic. */
    float64_t newt;  /* new trial value. */
    USES_MINIMIZER;  /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors. */
    if (*brackt) {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
            /* The trial value t is out of the interval. */
            return LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x)) {
            /* The function must decrease from x. */
            return LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin) {
            /* Incorrect tmin and tmax specified. */
            return LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /*
        Trial value selection.
    */
    if (*fx < *ft) {
        /*
            Case 1: a higher function value.
            The minimum is brackt. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
        */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x)) {
            newt = mc;
        } else {
            newt = mc + 0.5 * (mq - mc);
        }
    } else if (dsign) {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is brackt. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
        */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t)) {
            newt = mc;
        } else {
            newt = mq;
        }
    } else if (fabs(*dt) < fabs(*dx)) {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is only used if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is brackt
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
        */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt) {
            if (fabs(*t - mc) < fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        } else {
            if (fabs(*t - mc) > fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        }
    } else {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not brackt, the step
            is either tmin or tmax, else the cubic minimizer is taken.
        */
        bound = 0;
        if (*brackt) {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        } else if (*x < *t) {
            newt = tmax;
        } else {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
    */
    if (*fx < *ft) {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    } else {
        /* Case c */
        if (dsign) {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value in [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
    */
    if (*brackt && bound) {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y) {
            if (mq < newt) newt = mq;
        } else {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}


static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    )
{
    int32_t i;
    float64_t norm = 0.;

    for (i = start;i < n;++i) {
        norm += fabs(x[i]);
    }

    return norm;
}

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    /* The gradient is used as-is outside the orthant-wise range. */
    for (i = 0;i < start;++i) {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start;i < end;++i) {
        if (x[i] < 0.) {
            /* Differentiable. */
            pg[i] = g[i] - c;
        } else if (0. < x[i]) {
            /* Differentiable. */
            pg[i] = g[i] + c;
        } else {
            if (g[i] < -c) {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            } else if (c < g[i]) {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            } else {
                pg[i] = 0.;
            }
        }
    }

    for (i = end;i < n;++i) {
        pg[i] = g[i];
    }
}
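
/*
Example of the pseudo-gradient at a non-differentiable point (explanatory):
with c = 1 and x[i] = 0, a plain gradient g[i] = 0.3 lies inside [-c, c],
so neither one-sided move decreases f(x) + c*|x|_1 and pg[i] = 0; whereas
g[i] = -2.5 gives pg[i] = -2.5 + 1 = -1.5, the right partial derivative.
*/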

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    for (i = start;i < end;++i) {
        if (d[i] * sign[i] <= 0) {
            d[i] = 0;
        }
    }
}

} /* namespace shogun */
