lbfgs.cpp

/*
 *      Limited memory BFGS (L-BFGS).
 *
 * Copyright (c) 1990, Jorge Nocedal
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

/*
This library is a C port of the FORTRAN implementation of the Limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
The original FORTRAN source code is available at:
http://www.ece.northwestern.edu/~nocedal/lbfgs.html

The L-BFGS algorithm is described in:
    - Jorge Nocedal.
      Updating Quasi-Newton Matrices with Limited Storage.
      <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
    - Dong C. Liu and Jorge Nocedal.
      On the limited memory BFGS method for large scale optimization.
      <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503--528, 1989.

The line search algorithms used in this implementation are described in:
    - John E. Dennis and Robert B. Schnabel.
      <i>Numerical Methods for Unconstrained Optimization and Nonlinear
      Equations</i>, Englewood Cliffs, 1983.
    - Jorge J. More and David J. Thuente.
      Line search algorithm with guaranteed sufficient decrease.
      <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
      pp. 286--307, 1994.

This library also implements the Orthant-Wise Limited-memory Quasi-Newton
(OWL-QN) method presented in:
    - Galen Andrew and Jianfeng Gao.
      Scalable training of L1-regularized log-linear models.
      In <i>Proceedings of the 24th International Conference on Machine
      Learning (ICML 2007)</i>, pp. 33--40, 2007.

I would like to thank the original author, Jorge Nocedal, who has been
distributing an efficient and well-documented implementation under an
open-source license.
*/
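
/*
A minimal usage sketch (illustrative only, not part of the library). The
callback signatures are inferred from the call sites in this file and assume
the typedefs lbfgs_evaluate_t and lbfgs_progress_t declared in lbfgs.h. The
evaluate callback returns the objective value at x and fills g with the
gradient; this hypothetical example minimizes f(x) = sum_i (x_i - 1)^2:

    static float64_t evaluate(
        void *instance, const float64_t *x, float64_t *g,
        const int32_t n, const float64_t step)
    {
        float64_t fx = 0.;
        for (int32_t i = 0;i < n;++i) {
            fx += (x[i] - 1.0) * (x[i] - 1.0);
            g[i] = 2.0 * (x[i] - 1.0);
        }
        return fx;
    }

    // ... in the caller, with x pointing to n initial variables:
    lbfgs_parameter_t param;
    lbfgs_parameter_init(&param);
    float64_t fx = 0.;
    int32_t ret = lbfgs(n, x, &fx, evaluate, NULL, NULL, &param);
*/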

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>

#include <shogun/optimization/lbfgs/lbfgs.h>
#include <shogun/lib/SGVector.h>

namespace shogun
{

#define min2(a, b)      ((a) <= (b) ? (a) : (b))
#define max2(a, b)      ((a) >= (b) ? (a) : (b))
#define max3(a, b, c)   max2(max2((a), (b)), (c))

struct tag_callback_data {
    int32_t n;
    void *instance;
    lbfgs_evaluate_t proc_evaluate;
    lbfgs_progress_t proc_progress;
};
typedef struct tag_callback_data callback_data_t;

struct tag_iteration_data {
    float64_t alpha;
    float64_t *s;     /* [n] */
    float64_t *y;     /* [n] */
    float64_t ys;     /* vecdot(y, s) */
};
typedef struct tag_iteration_data iteration_data_t;

static const lbfgs_parameter_t _defparam = {
    6, 1e-5, 0, 1e-5,
    0, LBFGS_LINESEARCH_DEFAULT, 40,
    1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
    0.0, 0, -1,
};
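
/*
For reference, the positional initializers above should correspond, in order,
to: m, epsilon, past, delta, max_iterations, linesearch, max_linesearch,
min_step, max_step, ftol, wolfe, gtol, xtol, orthantwise_c,
orthantwise_start, orthantwise_end (assuming the member order of
lbfgs_parameter_t as declared in lbfgs.h).
*/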

/* Forward function declarations. */

typedef int32_t (*line_search_proc)(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    );

static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    );

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    );

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    );


void lbfgs_parameter_init(lbfgs_parameter_t *param)
{
    memcpy(param, &_defparam, sizeof(*param));
}

int32_t lbfgs(
    int32_t n,
    float64_t *x,
    float64_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param
    )
{
    int32_t ret;
    int32_t i, j, k, ls, end, bound;
    float64_t step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int32_t m = param.m;

    float64_t *xp = NULL;
    float64_t *g = NULL, *gp = NULL, *pg = NULL;
    float64_t *d = NULL, *w = NULL, *pf = NULL;
    iteration_data_t *lm = NULL, *it = NULL;
    float64_t ys, yy;
    float64_t xnorm, gnorm, beta;
    float64_t fx = 0.;
    float64_t rate = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct the callback data. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = SG_CALLOC(float64_t, n);
    g = SG_CALLOC(float64_t, n);
    gp = SG_CALLOC(float64_t, n);
    d = SG_CALLOC(float64_t, n);
    w = SG_CALLOC(float64_t, n);
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    if (param.orthantwise_c != 0.) {
        /* Allocate working space for OWL-QN. */
        pg = SG_CALLOC(float64_t, n);
        if (pg == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate limited memory storage. */
    lm = SG_CALLOC(iteration_data_t, m);
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0;i < m;++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = SG_CALLOC(float64_t, n);
        it->y = SG_CALLOC(float64_t, n);
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (0 < param.past) {
        pf = SG_CALLOC(float64_t, param.past);
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variables and add it to the objective value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the initial direction;
        we assume that the initial Hessian matrix H_0 is the identity matrix.
     */
    if (param.orthantwise_c == 0.) {
        std::copy(g,g+n,d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    } else {
        std::copy(pg,pg+n,d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    }

    /*
       Make sure that the initial variables are not a minimizer.
     */
    xnorm = SGVector<float64_t>::twonorm(x, n);
    if (param.orthantwise_c == 0.) {
        gnorm = SGVector<float64_t>::twonorm(g, n);
    } else {
        gnorm = SGVector<float64_t>::twonorm(pg, n);
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
     */
    step = 1.0 / SGVector<float64_t>::twonorm(d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        std::copy(x,x+n,xp);
        std::copy(g,g+n,gp);

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            std::copy(xp,xp+n,x);
            std::copy(gp,gp+n,g);
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        xnorm = SGVector<float64_t>::twonorm(x, n);
        if (param.orthantwise_c == 0.) {
            gnorm = SGVector<float64_t>::twonorm(g, n);
        } else {
            gnorm = SGVector<float64_t>::twonorm(pg, n);
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max(1, |x|) < \epsilon
         */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < \delta
         */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
         */
        it = &lm[end];
        SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
        SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the initial Hessian matrix H_0.
         */
        ys = SGVector<float64_t>::dot(it->y, it->s, n);
        yy = SGVector<float64_t>::dot(it->y, it->y, n);
        it->ys = ys;


        /*
            Recursive formula to compute dir = -(H \cdot g).
                This is described in page 779 of:
                Jorge Nocedal.
                Updating Quasi-Newton Matrices with Limited Storage.
                Mathematics of Computation, Vol. 35, No. 151,
                pp. 773--782, 1980.
         */
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;
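
        /*
            Informal summary of the two-loop recursion implemented below:
            starting from q = -g (or -pg for OWL-QN), the backward loop
            computes alpha_j = rho_j * s_j^T q and q <- q - alpha_j * y_j
            over the most recent `bound` pairs; q is then scaled by ys/yy,
            the usual approximation of the initial Hessian H_0; the forward
            loop adds (alpha_j - beta_j) * s_j with beta_j = rho_j * y_j^T q,
            leaving d = -(H \cdot g).
         */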

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of gradients. */
            std::copy(g, g+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        } else {
            std::copy(pg, pg+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        }

        j = end;
        for (i = 0;i < bound;++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            it->alpha = SGVector<float64_t>::dot(it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
        }

        SGVector<float64_t>::scale_vector(ys / yy, d, n);

        for (i = 0;i < bound;++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            beta = SGVector<float64_t>::dot(it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }

        /*
            Constrain the search direction for orthant-wise updates.
         */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
         */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    SG_FREE(pf);

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0;i < m;++i) {
            SG_FREE(lm[i].s);
            SG_FREE(lm[i].y);
        }
        SG_FREE(lm);
    }
    SG_FREE(pg);
    SG_FREE(w);
    SG_FREE(d);
    SG_FREE(gp);
    SG_FREE(g);
    SG_FREE(xp);

    return ret;
}



static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    float64_t width, dg;
    float64_t finit, dginit = 0., dgtest;
    const float64_t dec = 0.5, inc = 2.1;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = param->ftol * dginit;
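
    /*
        Summary of the acceptance tests applied below (dginit = g^T s < 0):
          - Armijo:       f(x + stp*s) <= finit + stp * ftol * dginit
          - Wolfe:        g(x + stp*s)^T s >= wolfe * dginit (plus Armijo)
          - strong Wolfe: |g(x + stp*s)^T s| <= -wolfe * dginit (plus Armijo)
        The step is multiplied by `dec` or `inc` until the test selected by
        param->linesearch passes or a step/iteration bound is hit.
     */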

    for (;;) {
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        ++count;

        if (*f > finit + *stp * dgtest) {
            width = dec;
        } else {
            /* The sufficient decrease condition (Armijo condition). */
            if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) {
                /* Exit with the Armijo condition. */
                return count;
            }

            /* Check the Wolfe condition. */
            dg = SGVector<float64_t>::dot(g, s, n);
            if (dg < param->wolfe * dginit) {
                width = inc;
            } else {
                if(param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) {
                    /* Exit with the regular Wolfe condition. */
                    return count;
                }

                /* Check the strong Wolfe condition. */
                if(dg > -param->wolfe * dginit) {
                    width = dec;
                } else {
                    /* Exit with the strong Wolfe condition. */
                    return count;
                }
            }
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}



static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t i, count = 0;
    float64_t width = 0.5, norm = 0.;
    float64_t finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0;i < n;++i) {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }
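
    /*
        Note: for OWL-QN the caller passes the pseudo-gradient in gp, so
        wp[i] carries the sign of the orthant to explore: the sign of xp[i],
        or, at xp[i] == 0, the sign of the steepest descent direction -gp[i].
     */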

    for (;;) {
        /* Update the current point. */
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        /* Compute the L1 norm of the variables and add it to the objective value. */
        norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += norm * param->orthantwise_c;

        ++count;

        dgtest = 0.;
        for (i = 0;i < n;++i) {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + param->ftol * dgtest) {
            /* The sufficient decrease condition. */
            return count;
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}



static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    int32_t brackt, stage1, uinfo = 0;
    float64_t dg;
    float64_t stx, fx, dgx;
    float64_t sty, fy, dgy;
    float64_t fxm, dgxm, fym, dgym, fm, dgm;
    float64_t finit, ftest1, dginit, dgtest;
    float64_t width, prev_width;
    float64_t stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = param->ftol * dginit;
    width = param->max_step - param->min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the value of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
    */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;) {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
         */
        if (brackt) {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        } else {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step in the range of [stpmin, stpmax]. */
        if (*stp < param->min_step) *stp = param->min_step;
        if (param->max_step < *stp) *stp = param->max_step;

        /*
            If an unusual termination is to occur then let
            stp be the lowest point obtained so far.
         */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
            *stp = stx;
        }

        /*
            Compute the current value of x:
                x <- x + (*stp) * s.
         */
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
        dg = SGVector<float64_t>::dot(g, s, n);

        ftest1 = finit + *stp * dgtest;
        ++count;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
            /* Rounding errors prevent further progress. */
            return LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= param->xtol * stmax) {
            /* Relative width of the interval of uncertainty is at most xtol. */
            return LBFGSERR_WIDTHTOOSMALL;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return count;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
         */
        if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
            stage1 = 0;
        }
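
        /*
            Here the "modified function" is psi(stp) = f(xp + stp*s) - stp * dgtest,
            i.e. the objective minus the Armijo line (dgtest = ftol * dginit);
            a nonpositive psi value with a nonnegative derivative certifies that
            sufficient decrease has been achieved, after which the original
            function values are interpolated directly.
         */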

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
         */
        if (stage1 && ftest1 < *f && *f <= fx) {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        } else {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /*
            Force a sufficient decrease in the interval of uncertainty.
         */
        if (brackt) {
            if (0.66 * prev_width <= fabs(sty - stx)) {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return LBFGSERR_LOGICERROR;
}



#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

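/*
 * Find a minimizer of an interpolated cubic function.
 *  cm          The minimizer of the interpolated cubic.
 *  u, fu, du   A point, its function value f(u), and its derivative f'(u).
 *  v, fv, dv   Another point, its function value f(v), and its derivative f'(v).
 */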
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

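/*
 * Find a minimizer of an interpolated cubic function, with the result
 * safeguarded to the interval [xmin, xmax].
 *  cm          The minimizer of the interpolated cubic.
 *  u, fu, du   A point, its function value f(u), and its derivative f'(u).
 *  v, fv, dv   Another point, its function value f(v), and its derivative f'(v).
 *  xmin        The minimum value.
 *  xmax        The maximum value.
 */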
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

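/*
 * Find a minimizer of an interpolated quadratic function fitted to
 * f(u), f'(u), and f(v).
 *  qm      The minimizer of the interpolated quadratic.
 */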
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

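/*
 * Find a minimizer of an interpolated quadratic function fitted to
 * f'(u) and f'(v) (the secant step).
 *  qm      The minimizer of the interpolated quadratic.
 */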
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;

#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)

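/*
 * Update a safeguarded trial value and interval for the line search:
 * x holds the endpoint with the least function value, t the current trial
 * step, and y the other endpoint; tmin and tmax bound the trial value;
 * *brackt is set once a minimizer has been bracketed between x and y.
 * Returns zero on success, or a negative LBFGSERR status value.
 */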
static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    )
{
    int32_t bound;
    int32_t dsign = fsigndiff(dt, dx);
    float64_t mc; /* minimizer of an interpolated cubic. */
    float64_t mq; /* minimizer of an interpolated quadratic. */
    float64_t newt;   /* new trial value. */
    USES_MINIMIZER;     /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors. */
    if (*brackt) {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
            /* The trial value t is out of the interval. */
            return LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x)) {
            /* The function must decrease from x. */
            return LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin) {
            /* Incorrect tmin and tmax specified. */
            return LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /*
        Trial value selection.
     */
    if (*fx < *ft) {
        /*
            Case 1: a higher function value.
            The minimum is bracketed. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
         */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x)) {
            newt = mc;
        } else {
            newt = mc + 0.5 * (mq - mc);
        }
    } else if (dsign) {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is bracketed. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
         */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t)) {
            newt = mc;
        } else {
            newt = mq;
        }
    } else if (fabs(*dt) < fabs(*dx)) {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is only used if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is bracketed
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
         */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt) {
            if (fabs(*t - mc) < fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        } else {
            if (fabs(*t - mc) > fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        }
    } else {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not bracketed, the step
            is either tmin or tmax, else the cubic minimizer is taken.
         */
        bound = 0;
        if (*brackt) {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        } else if (*x < *t) {
            newt = tmax;
        } else {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
     */
    if (*fx < *ft) {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    } else {
        /* Case c */
        if (dsign) {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value in [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
     */
    if (*brackt && bound) {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y) {
            if (mq < newt) newt = mq;
        } else {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}



static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    )
{
    int32_t i;
    float64_t norm = 0.;

    for (i = start;i < n;++i) {
        norm += fabs(x[i]);
    }

    return norm;
}

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    /* Copy the gradient for the variables before the orthant-wise range. */
    for (i = 0;i < start;++i) {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start;i < end;++i) {
        if (x[i] < 0.) {
            /* Differentiable. */
            pg[i] = g[i] - c;
        } else if (0. < x[i]) {
            /* Differentiable. */
            pg[i] = g[i] + c;
        } else {
            if (g[i] < -c) {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            } else if (c < g[i]) {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            } else {
                pg[i] = 0.;
            }
        }
    }

    for (i = end;i < n;++i) {
        pg[i] = g[i];
    }
}

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    for (i = start;i < end;++i) {
        if (d[i] * sign[i] <= 0) {
            d[i] = 0;
        }
    }
}

}