The following program illustrates the large nonlinear least squares solvers on a system with significant sparse structure in the Jacobian. The cost function is given by
\Phi(x) &= 1/2 \sum_{i=1}^{p+1} f_i^2 f_i &= \sqrt{\alpha} (x_i - 1), 1 \le i \le p f_{p+1} &= ||x||^2 - 1/4
with \alpha = 10^{-5}. The residual f_{p+1} imposes a constraint on the p parameters x, to ensure that ||x||^2 \approx {1 \over 4}. The (p+1)-by-p Jacobian for this system is given by
J(x) = [ \sqrt{alpha} I_p; 2 x^T ]
and the normal equations matrix is given by
J^T J = [ \alpha I_p + 4 x x^T ]
Finally, the second directional derivative of f for the geodesic acceleration method is given by
fvv = [ 0; 2 ||v||^2 ]
Since the upper p-by-p block of J is diagonal, this sparse structure should be exploited in the nonlinear solver. For comparison, the following program solves the system for p = 2000 using the dense direct Cholesky solver based on the normal equations matrix J^T J, as well as the iterative Steihaug-Toint solver, based on sparse matrix-vector products J u and J^T u. The program output is shown below.
Method NITER NFEV NJUEV NJTJEV NAEV Init Cost Final cost cond(J) Final |x|^2 Time (s) levenberg-marquardt 25 31 26 26 0 7.1218e+18 1.9555e-02 447.50 2.5044e-01 46.28 levenberg-marquardt+accel 22 23 45 23 22 7.1218e+18 1.9555e-02 447.64 2.5044e-01 33.92 dogleg 37 87 36 36 0 7.1218e+18 1.9555e-02 447.59 2.5044e-01 56.05 double-dogleg 35 88 34 34 0 7.1218e+18 1.9555e-02 447.62 2.5044e-01 52.65 2D-subspace 37 88 36 36 0 7.1218e+18 1.9555e-02 447.71 2.5044e-01 59.75 steihaug-toint 35 88 345 0 0 7.1218e+18 1.9555e-02 inf 2.5044e-01 0.09
The first five rows use methods based on factoring the dense J^T J matrix while the last row uses the iterative Steihaug-Toint method. While the number of Jacobian matrix-vector products (NJUEV) is less for the dense methods, the added time to construct and factor the J^T J matrix (NJTJEV) results in a much larger runtime than the iterative method (see last column).
The program is given below.
#include <stdlib.h> #include <stdio.h> #include <sys/time.h> #include <gsl/gsl_vector.h> #include <gsl/gsl_matrix.h> #include <gsl/gsl_blas.h> #include <gsl/gsl_multilarge_nlinear.h> #include <gsl/gsl_spblas.h> #include <gsl/gsl_spmatrix.h> /* parameters for functions */ struct model_params { double alpha; gsl_spmatrix *J; }; /* penalty function */ int penalty_f (const gsl_vector * x, void *params, gsl_vector * f) { struct model_params *par = (struct model_params *) params; const double sqrt_alpha = sqrt(par->alpha); const size_t p = x->size; size_t i; double sum = 0.0; for (i = 0; i < p; ++i) { double xi = gsl_vector_get(x, i); gsl_vector_set(f, i, sqrt_alpha*(xi - 1.0)); sum += xi * xi; } gsl_vector_set(f, p, sum - 0.25); return GSL_SUCCESS; } int penalty_df (CBLAS_TRANSPOSE_t TransJ, const gsl_vector * x, const gsl_vector * u, void * params, gsl_vector * v, gsl_matrix * JTJ) { struct model_params *par = (struct model_params *) params; const size_t p = x->size; size_t j; /* store 2*x in last row of J */ for (j = 0; j < p; ++j) { double xj = gsl_vector_get(x, j); gsl_spmatrix_set(par->J, p, j, 2.0 * xj); } /* compute v = op(J) u */ if (v) gsl_spblas_dgemv(TransJ, 1.0, par->J, u, 0.0, v); if (JTJ) { gsl_vector_view diag = gsl_matrix_diagonal(JTJ); /* compute J^T J = [ alpha*I_p + 4 x x^T ] */ gsl_matrix_set_zero(JTJ); /* store 4 x x^T in lower half of JTJ */ gsl_blas_dsyr(CblasLower, 4.0, x, JTJ); /* add alpha to diag(JTJ) */ gsl_vector_add_constant(&diag.vector, par->alpha); } return GSL_SUCCESS; } int penalty_fvv (const gsl_vector * x, const gsl_vector * v, void *params, gsl_vector * fvv) { const size_t p = x->size; double normv = gsl_blas_dnrm2(v); gsl_vector_set_zero(fvv); gsl_vector_set(fvv, p, 2.0 * normv * normv); (void)params; /* avoid unused parameter warning */ return GSL_SUCCESS; } void solve_system(const gsl_vector *x0, gsl_multilarge_nlinear_fdf *fdf, gsl_multilarge_nlinear_parameters *params) { const gsl_multilarge_nlinear_type *T = gsl_multilarge_nlinear_trust; const size_t max_iter = 200; const double xtol = 1.0e-8; const double gtol = 1.0e-8; const double ftol = 1.0e-8; const size_t n = fdf->n; const size_t p = fdf->p; gsl_multilarge_nlinear_workspace *work = gsl_multilarge_nlinear_alloc(T, params, n, p); gsl_vector * f = gsl_multilarge_nlinear_residual(work); gsl_vector * x = gsl_multilarge_nlinear_position(work); int info; double chisq0, chisq, rcond, xsq; struct timeval tv0, tv1; gettimeofday(&tv0, NULL); /* initialize solver */ gsl_multilarge_nlinear_init(x0, fdf, work); /* store initial cost */ gsl_blas_ddot(f, f, &chisq0); /* iterate until convergence */ gsl_multilarge_nlinear_driver(max_iter, xtol, gtol, ftol, NULL, NULL, &info, work); gettimeofday(&tv1, NULL); /* store final cost */ gsl_blas_ddot(f, f, &chisq); /* compute final ||x||^2 */ gsl_blas_ddot(x, x, &xsq); /* store cond(J(x)) */ gsl_multilarge_nlinear_rcond(&rcond, work); /* print summary */ fprintf(stderr, "%-25s %-5zu %-4zu %-5zu %-6zu %-4zu %-10.4e %-10.4e %-7.2f %-11.4e %.2f\n", gsl_multilarge_nlinear_trs_name(work), gsl_multilarge_nlinear_niter(work), fdf->nevalf, fdf->nevaldfu, fdf->nevaldf2, fdf->nevalfvv, chisq0, chisq, 1.0 / rcond, xsq, (tv1.tv_sec - tv0.tv_sec) + 1.0e-6 * (tv1.tv_usec - tv0.tv_usec)); gsl_multilarge_nlinear_free(work); } int main (void) { const size_t p = 2000; const size_t n = p + 1; gsl_vector *f = gsl_vector_alloc(n); gsl_vector *x = gsl_vector_alloc(p); /* allocate sparse Jacobian matrix with 2*p non-zero elements in triplet format */ gsl_spmatrix *J = gsl_spmatrix_alloc_nzmax(n, p, 2 * p, GSL_SPMATRIX_TRIPLET); gsl_multilarge_nlinear_fdf fdf; gsl_multilarge_nlinear_parameters fdf_params = gsl_multilarge_nlinear_default_parameters(); struct model_params params; size_t i; params.alpha = 1.0e-5; params.J = J; /* define function to be minimized */ fdf.f = penalty_f; fdf.df = penalty_df; fdf.fvv = penalty_fvv; fdf.n = n; fdf.p = p; fdf.params = ¶ms; for (i = 0; i < p; ++i) { /* starting point */ gsl_vector_set(x, i, i + 1.0); /* store sqrt(alpha)*I_p in upper p-by-p block of J */ gsl_spmatrix_set(J, i, i, sqrt(params.alpha)); } fprintf(stderr, "%-25s %-4s %-4s %-5s %-6s %-4s %-10s %-10s %-7s %-11s %-10s\n", "Method", "NITER", "NFEV", "NJUEV", "NJTJEV", "NAEV", "Init Cost", "Final cost", "cond(J)", "Final |x|^2", "Time (s)"); fdf_params.scale = gsl_multilarge_nlinear_scale_levenberg; fdf_params.trs = gsl_multilarge_nlinear_trs_lm; solve_system(x, &fdf, &fdf_params); fdf_params.trs = gsl_multilarge_nlinear_trs_lmaccel; solve_system(x, &fdf, &fdf_params); fdf_params.trs = gsl_multilarge_nlinear_trs_dogleg; solve_system(x, &fdf, &fdf_params); fdf_params.trs = gsl_multilarge_nlinear_trs_ddogleg; solve_system(x, &fdf, &fdf_params); fdf_params.trs = gsl_multilarge_nlinear_trs_subspace2D; solve_system(x, &fdf, &fdf_params); fdf_params.trs = gsl_multilarge_nlinear_trs_cgst; solve_system(x, &fdf, &fdf_params); gsl_vector_free(f); gsl_vector_free(x); gsl_spmatrix_free(J); return 0; }