SHOGUN 3.2.1
lbfgs.cpp
/*
 * Limited memory BFGS (L-BFGS).
 *
 * Copyright (c) 1990, Jorge Nocedal
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

/*
This library is a C port of the FORTRAN implementation of the Limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
The original FORTRAN source code is available at:
http://www.ece.northwestern.edu/~nocedal/lbfgs.html

The L-BFGS algorithm is described in:
 - Jorge Nocedal.
   Updating Quasi-Newton Matrices with Limited Storage.
   <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
 - Dong C. Liu and Jorge Nocedal.
   On the limited memory BFGS method for large scale optimization.
   <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503--528, 1989.

The line search algorithms used in this implementation are described in:
 - John E. Dennis and Robert B. Schnabel.
   <i>Numerical Methods for Unconstrained Optimization and Nonlinear
   Equations</i>, Englewood Cliffs, 1983.
 - Jorge J. More and David J. Thuente.
   Line search algorithm with guaranteed sufficient decrease.
   <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
   pp. 286--307, 1994.

This library also implements the Orthant-Wise Limited-memory Quasi-Newton
(OWL-QN) method presented in:
 - Galen Andrew and Jianfeng Gao.
   Scalable training of L1-regularized log-linear models.
   In <i>Proceedings of the 24th International Conference on Machine
   Learning (ICML 2007)</i>, pp. 33--40, 2007.

I would like to thank the original author, Jorge Nocedal, who has been
distributing the efficient and explanatory implementation under an open
source license.
*/
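
/*
Example usage (an illustrative sketch, not part of the library source).
It assumes the declarations in shogun/optimization/lbfgs/lbfgs.h; the
names `evaluate' and `run_example' are placeholders.

    static float64_t evaluate(
        void *instance, const float64_t *x, float64_t *g,
        const int32_t n, const float64_t step)
    {
        // f(x) = sum_i (x[i] - 1)^2, with the gradient written into g.
        float64_t fx = 0.;
        for (int32_t i = 0; i < n; ++i) {
            g[i] = 2. * (x[i] - 1.);
            fx += (x[i] - 1.) * (x[i] - 1.);
        }
        return fx;
    }

    static int32_t run_example()
    {
        const int32_t n = 10;
        float64_t fx;
        float64_t *x = SG_CALLOC(float64_t, n);  // start from x = 0

        lbfgs_parameter_t param;
        lbfgs_parameter_init(&param);            // copy the defaults (_defparam)
        // For OWL-QN (L1 regularization), one would additionally set:
        //   param.orthantwise_c = 1.0;
        //   param.linesearch = LBFGS_LINESEARCH_BACKTRACKING;

        // NULL progress and step-adjustment callbacks are accepted.
        int32_t ret = lbfgs(n, x, &fx, evaluate, NULL, NULL, &param, NULL);

        SG_FREE(x);
        return ret;                              // LBFGS_SUCCESS (0) on convergence
    }
*/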

#include <algorithm>
#include <cstdio>
#include <cmath>
#include <string.h>

#include <shogun/optimization/lbfgs/lbfgs.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/common.h>
#include <shogun/lib/memory.h>

namespace shogun
{

#define min2(a, b)      ((a) <= (b) ? (a) : (b))
#define max2(a, b)      ((a) >= (b) ? (a) : (b))
#define max3(a, b, c)   max2(max2((a), (b)), (c))

struct tag_callback_data {
    int32_t n;
    void *instance;
    lbfgs_evaluate_t proc_evaluate;
    lbfgs_progress_t proc_progress;
    lbfgs_adjust_step_t proc_adjust_step;
};
typedef struct tag_callback_data callback_data_t;

struct tag_iteration_data {
    float64_t alpha;
    float64_t *s;     /* [n] */
    float64_t *y;     /* [n] */
    float64_t ys;     /* vecdot(y, s) */
};
typedef struct tag_iteration_data iteration_data_t;

static const lbfgs_parameter_t _defparam = {
    6, 1e-5, 0, 0.0,
    0, LBFGS_LINESEARCH_DEFAULT, 40,
    1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
    0.0, 0, 1,
};

/* Forward function declarations. */

typedef int32_t (*line_search_proc)(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    );

static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    );

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    );

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    );


void lbfgs_parameter_init(lbfgs_parameter_t *param)
{
    memcpy(param, &_defparam, sizeof(*param));
}

int32_t lbfgs(
    int32_t n,
    float64_t *x,
    float64_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param,
    lbfgs_adjust_step_t proc_adjust_step
    )
{
    int32_t ret;
    int32_t i, j, k, ls, end, bound;
    float64_t step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int32_t m = param.m;

    float64_t *xp = NULL;
    float64_t *g = NULL, *gp = NULL, *pg = NULL;
    float64_t *d = NULL, *w = NULL, *pf = NULL;
    iteration_data_t *lm = NULL, *it = NULL;
    float64_t ys, yy;
    float64_t xnorm, gnorm, beta;
    float64_t fx = 0.;
    float64_t rate = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct a callback data. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;
    cd.proc_adjust_step = proc_adjust_step;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = SG_CALLOC(float64_t, n);
    g = SG_CALLOC(float64_t, n);
    gp = SG_CALLOC(float64_t, n);
    d = SG_CALLOC(float64_t, n);
    w = SG_CALLOC(float64_t, n);
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    if (param.orthantwise_c != 0.) {
        /* Allocate working space for OWL-QN. */
        pg = SG_CALLOC(float64_t, n);
        if (pg == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate limited memory storage. */
    lm = SG_CALLOC(iteration_data_t, m);
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0; i < m; ++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = SG_CALLOC(float64_t, n);
        it->y = SG_CALLOC(float64_t, n);
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (0 < param.past) {
        pf = SG_CALLOC(float64_t, param.past);
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variables and add it to the objective value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the direction;
        we assume the initial Hessian matrix H_0 as the identity matrix.
    */
    if (param.orthantwise_c == 0.) {
        std::copy(g, g+n, d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    } else {
        std::copy(pg, pg+n, d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    }

    /*
        Make sure that the initial variables are not a minimizer.
    */
    xnorm = SGVector<float64_t>::twonorm(x, n);
    if (param.orthantwise_c == 0.) {
        gnorm = SGVector<float64_t>::twonorm(g, n);
    } else {
        gnorm = SGVector<float64_t>::twonorm(pg, n);
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
    */
    step = 1.0 / SGVector<float64_t>::twonorm(d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        std::copy(x, x+n, xp);
        std::copy(g, g+n, gp);

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            std::copy(xp, xp+n, x);
            std::copy(gp, gp+n, g);
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        xnorm = SGVector<float64_t>::twonorm(x, n);
        if (param.orthantwise_c == 0.) {
            gnorm = SGVector<float64_t>::twonorm(g, n);
        } else {
            gnorm = SGVector<float64_t>::twonorm(pg, n);
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max(1, |x|) < \epsilon
        */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for the stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < \delta
        */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
        */
        it = &lm[end];
        SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
        SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the Hessian matrix H_0 (Cholesky factor).
        */
        ys = SGVector<float64_t>::dot(it->y, it->s, n);
        yy = SGVector<float64_t>::dot(it->y, it->y, n);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
            This is described in page 779 of:
            Jorge Nocedal.
            Updating Quasi-Newton Matrices with Limited Storage.
            Mathematics of Computation, Vol. 35, No. 151,
            pp. 773--782, 1980.
        */
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of the gradient. */
            std::copy(g, g+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        } else {
            std::copy(pg, pg+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        }

        j = end;
        for (i = 0; i < bound; ++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            it->alpha = SGVector<float64_t>::dot(it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
        }

        SGVector<float64_t>::scale_vector(ys / yy, d, n);

        for (i = 0; i < bound; ++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            beta = SGVector<float64_t>::dot(it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            SGVector<float64_t>::add(d, 1, d, it->alpha - beta, it->s, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }

        /*
            Constrain the search direction for orthant-wise updates.
        */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start; i < param.orthantwise_end; ++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
        */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    SG_FREE(pf);

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0; i < m; ++i) {
            SG_FREE(lm[i].s);
            SG_FREE(lm[i].y);
        }
        SG_FREE(lm);
    }
    SG_FREE(pg);
    SG_FREE(w);
    SG_FREE(d);
    SG_FREE(gp);
    SG_FREE(g);
    SG_FREE(xp);

    return ret;
}


static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    float64_t width, dg;
    float64_t finit, dginit = 0., dgtest;
    const float64_t dec = 0.5, inc = 2.1;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = param->ftol * dginit;

    for (;;) {
        std::copy(xp, xp+n, x);
        if (cd->proc_adjust_step)
            *stp = cd->proc_adjust_step(cd->instance, x, s, cd->n, *stp);

        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        ++count;

        if (*f > finit + *stp * dgtest) {
            width = dec;
        } else {
            /* The sufficient decrease condition (Armijo condition). */
            if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) {
                /* Exit with the Armijo condition. */
                return count;
            }

            /* Check the Wolfe condition. */
            dg = SGVector<float64_t>::dot(g, s, n);
            if (dg < param->wolfe * dginit) {
                width = inc;
            } else {
                if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) {
                    /* Exit with the regular Wolfe condition. */
                    return count;
                }

                /* Check the strong Wolfe condition. */
                if (dg > -param->wolfe * dginit) {
                    width = dec;
                } else {
                    /* Exit with the strong Wolfe condition. */
                    return count;
                }
            }
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}


static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t i, count = 0;
    float64_t width = 0.5, norm = 0.;
    float64_t finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0; i < n; ++i) {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }

    for (;;) {
        /* Update the current point. */
        std::copy(xp, xp+n, x);
        if (cd->proc_adjust_step)
            *stp = cd->proc_adjust_step(cd->instance, x, s, cd->n, *stp);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        /* Compute the L1 norm of the variables and add it to the objective value. */
        norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += norm * param->orthantwise_c;

        ++count;

        dgtest = 0.;
        for (i = 0; i < n; ++i) {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + param->ftol * dgtest) {
            /* The sufficient decrease condition. */
            return count;
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}


static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    int32_t brackt, stage1, uinfo = 0;
    float64_t dg;
    float64_t stx, fx, dgx;
    float64_t sty, fy, dgy;
    float64_t fxm, dgxm, fym, dgym, fm, dgm;
    float64_t finit, ftest1, dginit, dgtest;
    float64_t width, prev_width;
    float64_t stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = param->ftol * dginit;
    width = param->max_step - param->min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the values of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
    */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;) {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
        */
        if (brackt) {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        } else {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step in the range of [stpmin, stpmax]. */
        if (*stp < param->min_step) *stp = param->min_step;
        if (param->max_step < *stp) *stp = param->max_step;

        /*
            If an unusual termination is to occur then let
            stp be the lowest point obtained so far.
        */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
            *stp = stx;
        }

        /*
            Compute the current value of x:
                x <- x + (*stp) * s.
        */
        std::copy(xp, xp+n, x);
        if (cd->proc_adjust_step)
            *stp = cd->proc_adjust_step(cd->instance, x, s, cd->n, *stp);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
        dg = SGVector<float64_t>::dot(g, s, n);

        ftest1 = finit + *stp * dgtest;
        ++count;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
            /* Rounding errors prevent further progress. */
            return LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= param->xtol * stmax) {
            /* Relative width of the interval of uncertainty is at most xtol. */
            return LBFGSERR_WIDTHTOOSMALL;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return count;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
        */
        if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
            stage1 = 0;
        }

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
        */
        if (stage1 && ftest1 < *f && *f <= fx) {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        } else {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /*
            Force a sufficient decrease in the interval of uncertainty.
        */
        if (brackt) {
            if (0.66 * prev_width <= fabs(sty - stx)) {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return LBFGSERR_LOGICERROR;
}


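/*
 * Declare the local temporaries shared by the interpolation macros below.
 */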
#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

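/*
 * Find a minimizer of an interpolated cubic function.
 *  cm:         the computed minimizer.
 *  u, fu, du:  a point, its function value, and its derivative.
 *  v, fv, dv:  another point, its function value, and its derivative.
 */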
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

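/*
 * Find a minimizer of an interpolated cubic function, clipped to
 * the interval [xmin, xmax] when the cubic has no interior minimizer.
 *  cm:         the computed minimizer.
 *  u, fu, du:  a point, its function value, and its derivative.
 *  v, fv, dv:  another point, its function value, and its derivative.
 *  xmin, xmax: the lower and upper bounds on the minimizer.
 */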
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

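/*
 * Find a minimizer of an interpolated quadratic function.
 *  qm:         the computed minimizer.
 *  u, fu, du:  a point, its function value, and its derivative.
 *  v, fv:      another point and its function value.
 */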
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

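/*
 * Find a minimizer of an interpolated quadratic function from two
 * derivative values (a secant step).
 *  qm:     the computed minimizer.
 *  u, du:  a point and its derivative.
 *  v, dv:  another point and its derivative.
 */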
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;

#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)

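/*
 * Update a safeguarded trial value and the interval of uncertainty for
 * the More-Thuente line search. The pairs (x, fx, dx) and (y, fy, dy)
 * hold the endpoints of the interval, (t, ft, dt) the current trial
 * point, and brackt indicates whether a minimizer has been bracketed.
 * Returns 0 on success, or an LBFGSERR_* status code on error.
 */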
static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    )
{
    int32_t bound;
    int32_t dsign = fsigndiff(dt, dx);
    float64_t mc;   /* minimizer of an interpolated cubic. */
    float64_t mq;   /* minimizer of an interpolated quadratic. */
    float64_t newt; /* new trial value. */
    USES_MINIMIZER; /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors. */
    if (*brackt) {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
            /* The trial value t is out of the interval. */
            return LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x)) {
            /* The function must decrease from x. */
            return LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin) {
            /* Incorrect tmin and tmax specified. */
            return LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /*
        Trial value selection.
    */
    if (*fx < *ft) {
        /*
            Case 1: a higher function value.
            The minimum is bracketed. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
        */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x)) {
            newt = mc;
        } else {
            newt = mc + 0.5 * (mq - mc);
        }
    } else if (dsign) {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is bracketed. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
        */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t)) {
            newt = mc;
        } else {
            newt = mq;
        }
    } else if (fabs(*dt) < fabs(*dx)) {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is only used if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is bracketed
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
        */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt) {
            if (fabs(*t - mc) < fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        } else {
            if (fabs(*t - mc) > fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        }
    } else {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not bracketed, the step
            is either tmin or tmax, else the cubic minimizer is taken.
        */
        bound = 0;
        if (*brackt) {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        } else if (*x < *t) {
            newt = tmax;
        } else {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
    */
    if (*fx < *ft) {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    } else {
        /* Case c */
        if (dsign) {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value in [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
    */
    if (*brackt && bound) {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y) {
            if (mq < newt) newt = mq;
        } else {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}


static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    )
{
    int32_t i;
    float64_t norm = 0.;

    for (i = start; i < n; ++i) {
        norm += fabs(x[i]);
    }

    return norm;
}

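/*
 * Compute the pseudo-gradient of f(x) + c * |x|_1 for the variables in
 * [start, end). Where x[i] != 0 the L1 term is differentiable; at
 * x[i] == 0 the one-sided derivative that yields a descent direction
 * is taken, or the component is set to zero.
 */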
static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    /* Copy the gradient for the variables before the orthant-wise range. */
    for (i = 0; i < start; ++i) {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start; i < end; ++i) {
        if (x[i] < 0.) {
            /* Differentiable. */
            pg[i] = g[i] - c;
        } else if (0. < x[i]) {
            /* Differentiable. */
            pg[i] = g[i] + c;
        } else {
            if (g[i] < -c) {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            } else if (c < g[i]) {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            } else {
                pg[i] = 0.;
            }
        }
    }

    for (i = end; i < n; ++i) {
        pg[i] = g[i];
    }
}

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    for (i = start; i < end; ++i) {
        if (d[i] * sign[i] <= 0) {
            d[i] = 0;
        }
    }
}

}
