/* macopt library release 1.1 gradient-based optimizer Copyright (c) 2002 David J.C. MacKay and Steve Waterhouse and Mark Gibbs This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA GNU licenses are here : http://www.gnu.org/licenses/licenses.html Author contact details are here : http://www.inference.phy.cam.ac.uk/mackay/c/macopt.html mackay@mrao.cam.ac.uk */ #include "r.h" #include "macopt.h" /* http://www.inference.phy.cam.ac.uk/mackay/c/macopt.html mackay@mrao.cam.ac.uk Please do not use macopt without understanding a little about how it works; there are some control parameters which the user MUST set! David MacKay's optimizer, based on conjugate gradient ideas, but using bracketing of the zero of the inner product (gradient).(line_search_direction) to do the line minimization. Only derivative calculations are required. The length of the first step in the line search (often set to "1.0" in other code) is adapted here so that, if 0.00001 is a better step size, it soon cottons on to that and saves ~log(10000) bracketing operations. The result is that (with rich set to 0) the program can use as few as 2 derivatives per line search. (If rich is set to 1, it does an extra derivative calculation at the beginning of each line search making a minimum of 3 per line search. Set rich=0 if you think that the surface is locally quite quadratic.) If the program does average 2 derivatives per line search then it must be superior to most cg methods including use of Rbackprop (which costs 2 derivatives straight off) A possible modification: where the function can be returned at same time as the dfunction --- there is nothing clever to do with the value, but it could be used as a sanity check and a convergence criterion. See http://131.111.48.24/mackay/c/macopt.html for further discussion. NB: The value of "tol" is totally arbitrary and must be set by you to a value that works well for your problem. It depends completely on the typical value of the gradient / step size. Tol specifies a magnitude of gradient at which a halt is called. or a step size. This program MINIMIZES a function. ********************************************** Modified and converted to C++ class by Steve Waterhouse 8th April 1997. ********************************************** This version with some modifications by Mark Gibbs. (MNG) */ Macopt::Macopt(int n, int _verbose, double _tolerance, int _itmax, int _rich) : a_n(n), a_verbose(_verbose), a_tol(_tolerance), a_itmax(_itmax), a_rich(_rich) { a_g = new double[n+1] ; a_h = new double[n+1] ; a_xi = new double[n+1] ; a_pt = new double[n+1] ; /* scratch vector for sole use of macprod */ a_gx = new double[n+1] ; /* scratch gradients */ a_gy = new double[n+1] ; /* used by maclinmin and macprod */ /* if verbose = 1 then there is one report for each line minimization. if verbose = 2 then there is an additional report for each step of the line minimization. if verbose = 3 then extra debugging routines kick in. */ a_end_if_small_step = 1 ; /* Change this to 0/1 if you prefer */ a_stepmax = 0.5 ; a_grad_tol_tiny = 1e-16 ; /* Probably not worth fiddling with */ a_step_tol_tiny = 0.0 ; /* Probably not worth fiddling with */ a_linmin_maxits = 20 ; /* Probably not worth fiddling with */ a_lastx = 0.01 ; /* only has a transient effect */ a_lastx_default = 0.01 ; /* -- defines typical distance in parameter space at which the line minimum is expected; both these should be set. the default is consulted if something goes badly wrong and a reset is demanded. */ /* don't fiddle with the following, unless you really mean it */ a_linmin_g1 = 2.0 ; a_linmin_g2 = 1.25 ; a_linmin_g3 = 0.5 ; a_restart = 0 ; } Macopt::~Macopt() { free(a_g); free(a_h); free(a_xi); free(a_pt); free(a_gx); free(a_gy); } void Macopt::macoptII (double *p, /* starting vector */ int n /* number of dimensions */ ) { int j ; double gg , gam , dgg ; double *g , *h , *xi ; int end_if_small_grad = 1 - a_end_if_small_step ; double step , tmpd ; /* A total of 7 double * 1..n are used by this optimizer. p is provided when the optimizer is called pt is used by the line minimizer as the temporary vector. this could be cut out with minor rewriting, using p alone g, h and xi are used by the cg method as in NR - could one of these be cut out? the line minimizer uses an extra gx and gy to evaluate two gradients. */ g = a_g ; h = a_h ; xi = a_xi ; dfunc( p , xi ); macopt_restart ( 1 ) ; for ( a_its = 1 ; a_its <= a_itmax ; a_its ++ ) { for ( gg = 0.0 , j = 1 ; j <= n ; j ++ ) gg += g[j]*g[j]; /* find the magnitude of the old gradient */ a_gtyp = sqrt ( gg / (double)(n) ) ; if ( a_verbose > 0 ) printf ( "mac_it %d of %d : gg = %6.3g tol = %6.3g: ", a_its , a_itmax , gg , a_tol ) ; if ( ( end_if_small_grad && ( gg <= a_tol ) ) || ( gg <= a_grad_tol_tiny ) ) { // macopt_free ( a ) ; if ( a_verbose > 0 ) printf ("\n"); return; } step = maclinminII ( p ) ; if ( a_restart == 0 ) { if ( a_verbose > 1 ) printf (" (step %9.5g)",step); if ( a_verbose > 0 ) printf ("\n"); if ( ( a_end_if_small_step && ( step <= a_tol ) ) || ( step <= a_step_tol_tiny ) ) { // macopt_free ( a ) ; return; } } /* if we are feeling rich, evaluate the gradient at the new `minimum'. alternatively, linmin has already estimated this gradient by linear combination of the last two evaluations and left it in xi */ if ( a_rich || a_restart ) { dfunc( p , xi ) ; } if ( a_restart ) { fprintf(stderr,"Restarting macopt\n" ) ; macopt_restart ( 0 ) ; /* this is not quite right should distinguish whether there was an overrun indicating that the value of lastx needs to be bigger / smaller; in which case resetting lastx to default value may be a bad idea, giving an endless loop of resets */ } else { dgg=0.0; for ( j = 1 ; j <= n ; j ++ ) { dgg += ( xi[j] + g[j] ) * xi[j] ; } gam = dgg / gg ; for ( tmpd = 0.0 , j = 1 ; j <= n ; j ++ ) { g[j] = -xi[j]; /* g stores (-) the most recent gradient */ xi[j] = h[j] = g[j] + gam * h[j] ; /* h stores xi, the current line direction */ /* check that the inner product of gradient and line search is < 0 */ tmpd -= xi[j] * g[j] ; } if ( tmpd > 0.0 || a_verbose > 2 ) { fprintf(stderr,"new line search has inner prod %9.4g\n", tmpd ) ; } if ( tmpd > 0.0 ) { if ( a_rich == 0 ) { fprintf (stderr, "Setting rich to 1; " ) ; a_rich = 1 ; } a_restart = 2 ; /* signifies that g[j] = -xi[j] is already done */ fprintf(stderr,"Restarting macopt (2)\n" ) ; macopt_restart ( 0 ) ; } } } fprintf(stderr,"Reached iteration limit in macopt; continuing.\n"); // macopt_free ( a ) ; return; } /* NB this leaves the best value of p in the p vector, but the function has not been evaluated there if rich=0 */ /* maclinmin. Method: evaluate gradient at a sequence of points and calculate the inner product with the line search direction. Continue until a bracketing is achieved ( i.e a change in sign ). */ double Macopt::maclinminII ( double *p ) { int n = a_n ; double x , y ; double s , t , m ; int its = 1 , i ; double step , tmpd ; double *gx = a_gx , *gy = a_gy ; /* at x=0, the gradient (uphill) satisfies s < 0 */ if ( a_verbose > 2 ) { /* check this is true: (no need to do this really as it is already checked at the end of the main loop of macopt) */ /* #define TESTS 5 x = a_lastx / a_gtyp ; fprintf (stderr, "inner product at:\n" ) ; for ( i = -TESTS ; i <= TESTS ; i += 2 ) { step = x * 2.0 * (double) i / (double) TESTS ; fprintf (stderr, "%9.5g %9.5g\n" , step , tmpd = macprodII ( p , gy , step ) ) ; } */ fprintf (stderr, "inner product at 0 = %9.4g\n" , tmpd = macprodII ( p , gy , 0.0 ) ) ; if ( tmpd > 0.0 ) { a_restart = 1 ; return 0.0 ; } } x = a_lastx / a_gtyp ; /* MNG alteration : Here we check to make sure that the step in x is not going to change the value of the hyperparameters by anything more than 1.0 on the first go. This is to avoid initial massive steps that are making things screw up */ double max = 0.0; double x_new = x; for(i=1; i <= n; ++i){ double temp1 = fabs(a_xi[i]*x); if( temp1 > max && temp1 > 1.0){ max = temp1; x_new = fabs(1.0 / a_xi[i]); } } if( x != x_new ){ if(a_verbose > 1) fprintf(stderr,"Note : maclinmin step truncated\n"); x = x_new; } s = macprodII ( p , gx , x ) ; if ( s < 0 ) { /* we need to go further */ do { y = x * a_linmin_g1 ; t = macprodII ( p , gy , y ) ; if ( a_verbose > 1 ) printf ("s = %6.3g: t = %6.3g; x = %6.3g y = %6.3g\n",s, t , x , y ); if ( t >= 0.0 ) break ; x = y ; s = t ; a_gunused = gx ; gx = gy ; gy = a_gunused ; its++ ; /* replaces: for ( i = 1 ; i <= n ; i ++ ) gx[i] = gy[i] ; */ } while ( its <= a_linmin_maxits ) ; } else if ( s > 0 ) { /* need to step back inside interval */ do { y = x * a_linmin_g3 ; t = macprodII ( p , gy , y ) ; if ( a_verbose > 1 ) printf ("s = %6.3g: t = %6.3g; x = %6.3g y = %6.3g\n",s, t , x , y ); if ( t <= 0.0 ) break ; x = y ; s = t ; a_gunused = gx ; gx = gy ; gy = a_gunused ; its ++ ; } while ( its <= a_linmin_maxits ) ; } else { /* hole in one s = 0.0 */ t = 1.0 ; y = x; } if ( its > a_linmin_maxits ) { fprintf (stderr, "Warning! maclinmin overran" ); /* this can happen where the function goes \_ and doesn't buck up again; it also happens if the initial `gradient' does not satisfy gradient.`gradient' > 0, so that there is no minimum in the supposed downhill direction. I don't know if this actually happens... If it does then I guess a_rich should be 1. If the overrun is because too big a step was taken then the interpolation should be made between zero and the most recent measurement. If the overrun is because too small a step was taken then the best place to go is the most distant point. I will assume that this doesn't happen for the moment. Also need to check up what happens to t and s in the case of overrun. And gx and gy. Maybe sort this out when writing a macopt that makes use of the gradient at zero? */ fprintf (stderr, "- inner product at 0 = %9.4g\n" , tmpd = macprodII ( p , gy , 0.0 ) ) ; if ( tmpd > 0 && a_rich == 0 ) { fprintf (stderr, "setting rich to 1\n" ) ; a_rich = 1 ; } if ( tmpd > 0 ) a_restart = 1 ; } /* MNG - modified routine for(i=1; i <= 4; ++i){ if( s < 0 ){ y_old = y; s_old = - s ; t_old = t; m = ( s_old + t_old ) ; s_old /= m ; t_old /= m ; m = s_old * y_old + t_old * x ; t_new = macprodII ( p , gx , m , dfunc , arg , a ) ; if( t_new < 0 ){ s = t_new; x = m; } if( t_new > 0 ){ t = t_new; y = x; } } if( s > 0 ){ y_old = y; s_old = s ; t_old = -t; m = ( s_old + t_old ) ; s_old /= m ; t_old /= m ; m = s_old * y_old + t_old * x ; t_new = macprodII ( p , gx , m , dfunc , arg , a ) ; if( t_new > 0 ){ s = t_new; x = m; } if( t_new < 0 ){ t = t_new; y = x; } } } End of MNG addition */ /* Linear interpolate between the last two. This assumes that x and y do bracket. */ if ( s < 0.0 ) s = - s ; if ( t < 0.0 ) t = - t ; m = ( s + t ) ; s /= m ; t /= m ; m = s * y + t * x ; /* evaluate the step length, not that it necessarily means anything */ for ( step = 0.0 , i = 1 ; i <= n ; i ++ ) { tmpd = m * a_xi[i] ; p[i] += tmpd ; /* this is the point where the parameter vector steps */ step += fabs ( tmpd ) ; a_xi[i] = s * gy[i] + t * gx[i] ; /* send back the estimated gradient in xi (NB not like linmin) */ } a_lastx = m * a_linmin_g2 * a_gtyp ; return ( step / (double) ( n ) ) ; } double Macopt::macprodII ( double *p , double *gy , double y ) { double *pt = a_pt ; double *xi = a_xi ; /* finds pt = p + y xi and gets gy there, returning gy . xi */ int n = a_n ; int i; double s = 0.0 ; for ( i = 1 ; i <= n ; i ++ ) pt[i] = p[i] + y * xi[i] ; dfunc( pt , gy ) ; for ( i = 1 ; i <= n ; i ++ ) s += gy[i] * xi[i] ; return s ; } void Macopt::macopt_restart ( int start ) /* if start == 1 then this is the start of a fresh macopt, not a restart */ { int j , n=a_n ; double *g, *h, *xi ; g = a_g ; h = a_h ; xi = a_xi ; if ( start == 0 ) a_lastx = a_lastx_default ; /* it is assumed that dfunc( p , xi ) ; has happened */ for ( j = 1 ; j <= n ; j ++ ) { if ( a_restart != 2 ) g[j] = -xi[j] ; xi[j] = h[j] = g[j] ; } a_restart = 0 ; } void Macopt::maccheckgrad /* Examines objective function and d_objective function to see if they agree for a step of size epsilon */ (double *p, int n, double epsilon, int stopat /* stop at this component. If 0, do the lot. */ ) { int j; double f1; double *g,*h; double tmpp ; f1 = func(p); // printf("f1 = %f\n", f1); h = new double[n+1]; g = new double[n+1]; dfunc(p, g); if ( stopat <= 0 || stopat > n ) stopat = n ; printf("Testing gradient evaluation\n"); printf(" analytic 1st_diffs difference\n"); for ( j = 1 ; j <= stopat ; j ++ ) { tmpp = p[j] ; p[j] += epsilon ; h[j] = func(p) - f1 ; // printf("h = %f\n", h[j]); p[j] = tmpp ; printf("%2d %9.5g %9.5g %9.5g\n" , j , g[j] , h[j]/epsilon , g[j] - h[j]/epsilon ); fflush(stdout) ; } free(h); free(g); // free_dvector(h,1,n); // free_dvector(g,1,n); printf(" -------- ---------\n"); } /* Last modified: Tue Dec 17 17:49:44 1996 */