# bfgs.R -- smooth function minimization with the BFGS algorithm
# by Andrew Clausen <clausen@econ.upenn.edu> in 2007.
#
# This file implements the Broyden-Fletcher-Goldfarb-Shanno algorithm for
# function minimization.  It uses the More-Thuente linesearch algorithm which
# is implemented in linesearch.R.
#
# It requires a derivative which can be obtained
#  * numerically from numericDeriv or the numDeriv package, or
#  * symbolically from either the built-in D and deriv commands, the Ryacas
# package, or my Deriv.r script.
#
# If you do not specify a derivative, then "numericDeriv" is used.
#
# Advantages over optim()'s "BFGS" method:
#  * the linesearch algorithm (More-Thuente) quickly finds a region of interest
# to zoom into.  Moreover, it strikes a much better balance between finding
# a point that adequately improves upon the old point, but doesn't waste too
# much time finding a much better point.  (It uses the standard Wolfe
# conditions with weak parameters.)
#  * the linesearch algorithm uses interpolation, so it finds an acceptable
# point more quickly.
#  * implements "box" constraints.
#  * easier to understand and modify the code, partly because it's written in R.
#  * apparently has fewer numerical stability problems.  (Not sure why!)
# In particular, bfgs() survives large sample sizes in tests/mle-norm.R.
#
# Disadvantages:
#  * more overhead, because it's written in R.

source("linesearch.R")

norm <- function(x) max(abs(x))

# collects statistics on how many times f is called.
call.counter <- function(f, name, environment)
{
	assign(name, 0, envir=environment)
	function(x)
	{
		assign(name, 1 + get(name, envir=environment),
		       envir=environment)
		f(x)
	}
}

# Returns a function wrapper of f that caches old values.  (i.e. memoization)
function.cache <- function(f)
{
	cache.env <- new.env()
	cache <- list()
	cache$n <- 0
	cache$params <- list()
	cache$vals <- list()
	assign("cache", cache, envir=cache.env)

	function(x)
	{
		cache <- get("cache", envir=cache.env)
		if (cache$n > 0) {
			for (i in cache$n:max(1, (cache$n-100))) {
				if (all(x == cache$params[[i]]))
					return(cache$vals[[i]])
			}
		}
		cache$n <- cache$n + 1
		cache$params[[cache$n]] <- x
		cache$vals[[cache$n]] <- f(x)
		assign("cache", cache, envir=cache.env)
		cache$vals[[cache$n]]
	}
}

# Wrapper for numericDeriv that returns the derivative function of g.
# (numericDeriv only evalutes the derivative.)
numericDeriv_ <- function(f, grad=FALSE)
{
	rho <- new.env()
	assign("f", f, envir=rho)
	function(x)
	{
		assign("x", x, envir=rho)
		result <- attr(numericDeriv(quote(f(x)), "x", rho), "gradient")
		if (grad)
			result <- as.vector(result)
		result
	}
}

# We require each coordinate of x satisfy
#
#	x in [min.x, max.x].
#
# This function returns max.lambda such that for all lambda in [0, max.lambda],
#
#	(x + s * max.lambda) in [min.x, max.x].
calc.constraint <- function(x, s, min.x, max.x)
{
	stopifnot((x >= min.x) && (x <= max.x))

	min.constraints <- (x - min.x) / (-s)
	min.constraints <- subset(min.constraints, min.constraints > 0)

	max.constraints <- (max.x - x) / s
	max.constraints <- subset(max.constraints, max.constraints > 0)

	constraints <- c(min.constraints, max.constraints)
	if (all(is.infinite(constraints)))
		return(Inf)
	min(constraints)
}

# Implements the Broyden-Fletcher-Goldfarb-Shanno algorithm for function
# minimization.  That is, it attempts to find
#
#	argmin_{x s.t. min.x < x < max.x coord-wise} f(x)
#
# Apart from returning the maximizer x, it also returns the function value f(x)
# the gradient f'(x), and the inverse hessian f''(x)^{-1}.
bfgs <- function(x0, f_, g_,
		 min.x=rep(-Inf, length(x0)),
		 max.x=rep(Inf, length(x0)),
		 prec=0.00001, verbose=FALSE)
{
	count.env <- new.env()
	f <- function.cache(call.counter(function(x) f_(x), "f", count.env))
	if (missing(g_))
		g_ <- numericDeriv_(f, grad=TRUE)
	g <- function.cache(call.counter(function(x) g_(x), "g", count.env))

	x <- x0
	I <- diag(rep(1, length(x)))
	H <- I

	iter <- 0
	while (norm(g(x)) > prec) {
		iter <- iter + 1
		if (verbose)
			cat(c("\niter", iter, f(x), x, "\n"))

		# minimize in the direction of p
		p <- as.vector(- H %*% g(x))
		phi <- function(alpha) f(x + alpha * p)
		phi_ <- function(alpha) as.numeric(g(x + alpha * p) %*% p)
		max.alpha <- calc.constraint(x, p, min.x, max.x)
		alpha <- linesearch(phi, phi_, 1, max.alpha)
		if (alpha == 0) {
			warning("Lost precision")
			break
		}

		x_ <- x + alpha * p
		s <- x_ - x
		y <- g(x_) - g(x)

		rho <- 1 / as.numeric(t(y) %*% s)
		J <- I - s %*% t(y) * rho
		K <- I - y %*% t(s) * rho
		H <- J %*% H %*% K + s %*% t(s) * rho

		x <- x_
	}

	list(par=x, value=f(x), grad=g(x), inv.hessian=H,
	     counts=c(`function`=get("f", envir=count.env),
		      gradient=get("g", envir=count.env)))
}
