---
title: "rSPDE vs exact Matern covariance: timing and memory"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{rSPDE vs exact Matern covariance: timing and memory}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, message=FALSE, warning=FALSE}
library(rSPDE)
library(fmesher)
library(Matrix)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(gtable)
library(tidyr)
```

## Introduction

This vignette compares the computational cost of exact dense Matérn likelihood evaluation with the sparse rSPDE approximation. For the R-INLA implementation of rSPDE, see [R-INLA implementation of the rational SPDE approach](https://davidbolin.github.io/rSPDE/articles/rspde_inla.html). For the inlabru implementation, see [inlabru implementation of the rational SPDE approach](https://davidbolin.github.io/rSPDE/articles/rspde_inlabru.html). For a general introduction, see [An introduction to the rSPDE package](https://davidbolin.github.io/rSPDE/articles/rSPDE.html). For covariance-based and operator-based workflows, see [Rational approximation with the rSPDE package](https://davidbolin.github.io/rSPDE/articles/rspde_cov.html) and [Operator-based rational approximation](https://davidbolin.github.io/rSPDE/articles/rspde_base.html). For additional modeling examples, see [Anisotropic models](https://davidbolin.github.io/rSPDE/articles/anisotropic.html), [Intrinsic models in the rSPDE package](https://davidbolin.github.io/rSPDE/articles/intrinsic.html), [Spatio-temporal models](https://davidbolin.github.io/rSPDE/articles/spacetime.html), and [Rational approximations without finite element approximations](https://davidbolin.github.io/rSPDE/articles/rspde_nofem.html).

## rSPDE sparse approximation

rSPDE replaces dense covariance computations with sparse precision matrix computations by constructing a Gaussian Markov random field (GMRF) representation. The resulting linear algebra is substantially cheaper in both time and memory than dense covariance factorizations. The approximation order $m$ controls accuracy; the approximation error decreases exponentially fast with $m$, so small orders often provide high accuracy at low computational cost.

## Benchmark setup

We consider a Gaussian model
\[
y_i = u(s_i) + \varepsilon_i,\qquad \varepsilon_i \sim \mathcal{N}(0,\sigma_e^2),
\]
where $u$ is a Mat\'ern field on the unit square and the observations are taken at $N$ random locations $s_i \in [0,1]^2$. We use
$N \in \{1000,2000,\ldots,10000,12500,15000,20000,25000,30000\}$. For each $N$, we build a triangulation with `fm_mesh_2d` using `cutoff = 0.0025`, construct the observation matrix with `fm_basis`, and compare the cost of evaluating:

- the exact dense Mat\'ern likelihood, based on the Cholesky factorization of the covariance matrix, and
- the rSPDE sparse log-likelihood for approximation orders $m = 1,\ldots,4$.

For each $m$, the rSPDE timing is averaged over 10 runs. When R is linked to FlexiBLAS and MKL or vecLib is available, we evaluate the exact model under Netlib (up to $N=8000$) and under MKL/vecLib for all sample sizes, while running rSPDE under Netlib for all sample sizes. Otherwise, both exact and rSPDE timings use the base R BLAS.

Netlib BLAS is single-threaded, whereas MKL is multi-threaded. To keep the overall cost manageable, the exact Netlib evaluations are only computed up to $N=8000$, since dense Cholesky factorization becomes prohibitively expensive at larger $N$ in a single-threaded setting.

```{r timing-benchmark-data, echo=TRUE, eval = FALSE}
loglike_exact_chol <- function(y, Sigma_matern, sigma_e) {
  S <- Sigma_matern
  diag(S) <- diag(S) + sigma_e^2
  chol_U <- tryCatch(chol(S), error = function(e) NULL)
  if (is.null(chol_U)) {
    return(-Inf)
  }
  log_det <- 2 * sum(log(diag(chol_U)))
  z <- backsolve(chol_U, y, transpose = TRUE)
  -0.5 * (length(y) * log(2 * pi) + log_det + sum(z^2))
}

has_flexiblas <- function() {
  if (!requireNamespace("flexiblas", quietly = TRUE)) {
    return(FALSE)
  }
  out <- try(flexiblas::flexiblas_list(), silent = TRUE)
  !inherits(out, "try-error")
}

list_backends <- function() {
  if (!has_flexiblas()) {
    return(character())
  }
  flexiblas::flexiblas_list()
}

has_backend <- function(pattern, backends) {
  any(grepl(pattern, tolower(backends)))
}

pick_backend <- function(target, backends) {
  nm_low <- tolower(backends)
  target_low <- tolower(target)
  idx <- which(nm_low == target_low)
  if (!length(idx) && target_low == "veclib") {
    idx <- which(grepl("veclib|vec|apple", nm_low))
  }
  if (!length(idx) && target_low == "mkl") {
    idx <- which(nm_low == "mklopenmp")
    if (!length(idx)) {
      idx <- which(grepl("mklopenmp", nm_low))
    }
    if (!length(idx)) {
      idx <- which(grepl("mkl", nm_low))
    }
  }
  if (!length(idx) && target_low == "netlib") {
    idx <- which(grepl("netlib", nm_low))
  }
  if (!length(idx)) stop(paste("FlexiBLAS backend not found:", target))
  backends[idx[1]]
}

set_backend <- function(backend_name) {
  if (!has_flexiblas()) {
    return(invisible(NULL))
  }
  loaded <- flexiblas::flexiblas_list_loaded()
  if (!any(tolower(loaded) == tolower(backend_name))) {
    flexiblas::flexiblas_load_backend(backend_name)
    loaded <- flexiblas::flexiblas_list_loaded()
  }
  idx <- which(tolower(loaded) == tolower(backend_name))
  if (!length(idx)) stop(paste("FlexiBLAS backend not loaded:", backend_name))
  flexiblas::flexiblas_switch(idx[1])
}

backend_label <- function(name) {
  if (name == "base") {
    return("Base")
  }
  if (grepl("mkl", name, ignore.case = TRUE)) {
    return("MKL")
  }
  if (grepl("netlib", name, ignore.case = TRUE)) {
    return("Netlib")
  }
  if (grepl("veclib|vec|apple", name, ignore.case = TRUE)) {
    return("VecLib")
  }
  name
}

sys <- Sys.info()
arch <- R.version$arch
machine <- sys[["machine"]]

is_mac <- sys[["sysname"]] == "Darwin"
is_arm64 <- grepl("arm64|aarch64", arch) || grepl("arm64|aarch64", machine)
is_intel <- grepl("x86_64|amd64|i386", arch) || grepl("x86_64|amd64|i386", machine)

flex_backends <- list_backends()
use_flexi <- length(flex_backends) > 0

bench_plan <- list()
has_netlib <- use_flexi && has_backend("netlib", flex_backends)
has_mkl <- use_flexi && has_backend("mkl", flex_backends)
has_veclib <- use_flexi && has_backend("veclib|vec|apple", flex_backends)

if (has_netlib && is_intel && has_mkl) {
  bench_plan$exact_backends <- c("netlib", "mkl")
  bench_plan$rspde_backend <- "netlib"
} else if (has_netlib && is_mac && is_arm64 && has_veclib) {
  bench_plan$exact_backends <- c("netlib", "veclib")
  bench_plan$rspde_backend <- "netlib"
} else {
  bench_plan$exact_backends <- "base"
  bench_plan$rspde_backend <- "base"
}

sample_sizes <- c(seq(1000, 10000, by = 1000), 12500, 15000, 20000, 25000, 30000)
sigma_true <- 1.5
range_true <- 0.2
nu_true <- 0.8
sigma_e <- 0.5
kappa_true <- sqrt(8 * nu_true) / range_true

benchmark_data <- lapply(sample_sizes, function(N) {
  set.seed(123 + N)
  list(
    N = N,
    locs = matrix(runif(N * 2), ncol = 2),
    Y = rnorm(N)
  )
})

max_ram_usage <- function(expr) {
  start <- gc(verbose = FALSE, reset = TRUE)
  eval(substitute(expr), envir = parent.frame())
  end <- gc(verbose = FALSE, reset = FALSE)
  max_col <- which(colnames(end) == "max used")
  if (length(max_col) == 0) stop("gc output does not include 'max used'")
  mb_col <- max_col + 1
  end["Vcells", mb_col] - start["Vcells", mb_col]
}

run_exact <- function(locs, Y, backend_name) {
  if (backend_name != "base") {
    set_backend(pick_backend(backend_name, flex_backends))
  }
  timing_env <- new.env(parent = emptyenv())
  timing_env$elapsed <- NA_real_
  peak_mb <- max_ram_usage({
    Sigma_field <- rSPDE::matern.covariance(
      h = as.matrix(dist(locs)),
      kappa = kappa_true, nu = nu_true, sigma = sigma_true
    )
    timing_env$elapsed <- system.time({
      loglike_exact_chol(Y, Sigma_field, sigma_e)
    })["elapsed"]
  })
  list(elapsed_sec = as.numeric(timing_env$elapsed), peak_ram = peak_mb)
}

run_rspde <- function(locs, Y, backend_name, m) {
  if (backend_name != "base") {
    set_backend(pick_backend(backend_name, flex_backends))
  }
  timing_env <- new.env(parent = emptyenv())
  timing_env$elapsed <- NA_real_
  peak_mb <- max_ram_usage({
    mesh <- fm_mesh_2d(loc = locs, cutoff = 0.0025)
    A <- fm_basis(mesh, locs)
    obj <- matern.operators(
      mesh = mesh, nu = nu_true, range = range_true, sigma = sigma_true,
      m = m, parameterization = "matern"
    )
    timing_env$elapsed <- system.time({
      rSPDE:::rSPDE.matern.loglike(object = obj, Y = Y, A = A, sigma.e = sigma_e)
    })["elapsed"]
  })
  list(elapsed_sec = as.numeric(timing_env$elapsed), peak_ram = peak_mb)
}

bench_rows <- list()
for (bm in benchmark_data) {
  N <- bm$N
  locs <- bm$locs
  Y <- bm$Y

  for (backend_name in bench_plan$exact_backends) {
    if (backend_name == "netlib" && N > 8000) next
    exact_vec <- run_exact(locs, Y, backend_name)
    bench_rows[[length(bench_rows) + 1]] <- data.frame(
      N = N,
      backend = backend_label(backend_name),
      method = "Exact",
      elapsed_sec = exact_vec$elapsed_sec,
      peak_ram = exact_vec$peak_ram
    )
  }

  for (m in 1:4) {
    rspde_runs <- lapply(seq_len(10), function(i) run_rspde(locs, Y, bench_plan$rspde_backend, m))
    rspde_elapsed <- vapply(rspde_runs, function(x) x$elapsed_sec, numeric(1))
    rspde_peak <- vapply(rspde_runs, function(x) x$peak_ram, numeric(1))
    rspde_net <- list(elapsed_sec = mean(rspde_elapsed), peak_ram = mean(rspde_peak))
    bench_rows[[length(bench_rows) + 1]] <- data.frame(
      N = N,
      backend = backend_label(bench_plan$rspde_backend),
      method = paste0("rSPDE m=", m),
      elapsed_sec = rspde_net$elapsed_sec,
      peak_ram = rspde_net$peak_ram
    )
  }
}

bench_df <- bind_rows(bench_rows)
results_all <- bench_df %>% mutate(expr = method, median = elapsed_sec)
mem_df <- bench_df %>% mutate(method = method)
if (!"ram_gib" %in% names(mem_df) && "peak_ram" %in% names(mem_df)) {
  divisor <- if (max(mem_df$peak_ram, na.rm = TRUE) > 1e4) 1e5 else 1024
  mem_df$ram_gib <- mem_df$peak_ram / divisor
}

dir.create("cache", showWarnings = FALSE, recursive = TRUE)
saveRDS(list(results_all = results_all, mem_df = mem_df, sample_sizes = sample_sizes),
  file = "cache/timing_benchmark_results.rds"
)
```

## Results

![Timing benchmark and peak memory usage for exact dense Matern likelihood evaluation versus rSPDE sparse approximations in a simplified Gaussian setting.](figures/timing-benchmark-1.png){width=100%}