This vignette considers the Heffernan and Tawn (2004) conditional model for extremes in the form of Shooter et al. (2019), so for \(Y_i \sim Laplace\)
\[\begin{equation} Z_{j \mid i} = \dfrac{Y_{j \mid i} - \alpha_{j \mid i} y_i}{y_i^{\beta_{j \mid i}}} \sim N(\mu_{j \mid i}, \sigma_{j \mid i}^2) \tag{1.1} \end{equation}\]
with \(-1 \leq \alpha_{j \mid i} \leq 1\), \(0 \leq \beta_{j \mid i} \leq 1\), \(\infty < \mu_{j \mid i} < \infty\) and \(0 < \sigma_{j \mid i} < \infty\).
For a regression-type approach, suppose that
\[ Y_{j \mid i} \sim N(\alpha_{j \mid i} y_i + y_i^{\beta_{j \mid i}} \mu_{j \mid i}, y_i^{2\beta_{j \mid i}} \sigma_{j \mid i}^2). \]
To illustrate a GAM-based approach, consider the spatial setting in which we’re interested in extremes at location \(\mathbf{s}\) given location \(\mathbf{s}_0\), so that
\[\begin{equation} Y(\mathbf{s}) \mid y(\mathbf{s}_0) \sim N(\alpha(\mathbf{s}) y(\mathbf{s}_0) + \mu(\mathbf{s}) y(\mathbf{s}_0)^{\beta(\mathbf{s})}, y(\mathbf{s}_0)^{2 \beta(\mathbf{s})} \sigma(\mathbf{s})^2). \label\tag{1.2} \end{equation}\]
Then, if we want to use a GAM-based formulation we can assume link functions \(h_*()\) such that \(\alpha(\mathbf{s}) = h_\alpha(\eta_\alpha(\mathbf{s}))\), \(\beta(\mathbf{s}) = h_\beta(\eta_\beta(\mathbf{s}))\), \(\mu(\mathbf{s}) = h_\mu(\eta_\mu(\mathbf{s}))\) and \(\sigma(\mathbf{s}) = h_\sigma(\eta_\sigma(\mathbf{s}))\), with \(h_\alpha(w) = 2 / (1 + \text{e}^{-w}) - 1\), \(h_\beta(w) = 1 / (1 + \text{e}^{-w})\), \(h_\mu(w) = w\) and \(h_\sigma(w) = \text{e}^w\).
For the GAM part of the model we’ll assume that \[ \eta_*(\mathbf{s}) = \gamma_{*0} + g_*(\mathbf{s}) \] for intercept \(\gamma_{*0}\) and where \(g_*(\mathbf{s})\) denotes a thin-plate regression spline, except for \(\beta\), which we assume to be constant.
Here we’ll model MERRA-2’s air surface skin temperatures, as used in Lee et al. (2025).
evgam
# remotes::install_github('byoungman/evgam')
library(evgam)
We’ll load the data, which arrive as separate yearly files, and then combine them.
fls <- list.files('./Average surface skin temperature/', full.names = TRUE)
lst <- list()
for (i in seq_along(fls)) {
load(fls[i])
lst[[i]] <- list(dates = dates, locations = locations, day_SFT_max = day_SFT_max)
}
dates <- do.call(c, lapply(lst, '[[', 'dates'))
day_SFT_max <- t(do.call(rbind, lapply(lst, '[[', 'day_SFT_max')))
transform things to Laplace scale
day_SFT_max_F <- t(apply(day_SFT_max, 1, function(x) (rank(x) - .5) / length(x)))
qlaplace <- function(p) {
ifelse(p <= .5, log(2 * p), -log(2 * (1 - p)))
}
data_L <- qlaplace(day_SFT_max_F)
and then put its meta data in data.frame
format
grid_df <- data.frame(lon = locations[, 1], lat = locations[, 2])
d <- nrow(grid_df)
We now need to condition on some site. Let’s choose the one in the middle.
cond <- with(locations, which(lon == median(lon) & lat == median(lat)))
lon0 <- locations[cond, 1]
lat0 <- locations[cond, 2]
Then we want to find which data exceed some threshold at that site. We’ll choose the threshold to be the 0.8 quantile.
threshold <- qlaplace(0.8)
exceed <- which(data_L[cond, ] > threshold)
Next we set up the conditioning data (I’ll probably write a fairly straightforward function to do this in future)
cond_mat <- matrix(data_L[cond, ], nrow(data_L), ncol(data_L), byrow = TRUE)
ev_df_mat <- data.frame(lon = grid_df$lon[-cond], lat = grid_df$lat[-cond])
ev_df_mat$y <- data_L[-cond, exceed]
ev_df_mat$x <- cond_mat[-cond, exceed]
and then set up our formula for the conditional extreme value model’s four transformed parameters.
fmla_condex <- list(
y ~ s(lon, lat, k = 10),
~ 1,
~ s(lon, lat, k = 10),
~ s(lon, lat, k = 10)
)
The following fits the model
m1 <- evgam(fmla_condex, data = ev_df_mat, family = 'condex', args = list(x = ev_df_mat$x))
Here’s a plot of the resulting parameter estimates.
library(lattice)
pred1 <- grid_df
pred1 <- cbind(pred1, predict(m1, newdata = grid_df, type = 'response'))
panel_fn <- function(...) {
panel.levelplot(...)
panel.xyplot(lon0, lat0, pch = 19, col = 1, cex = 1.5)
}
l1 <- levelplot(alpha ~ lon * lat, pred1, main = 'alpha', panel = panel_fn)
l2 <- levelplot(beta ~ lon * lat, pred1, main = 'beta', panel = panel_fn)
l3 <- levelplot(location ~ lon * lat, pred1, main = 'mu', panel = panel_fn)
l4 <- levelplot(scale ~ lon * lat, pred1, main = 'sigma', panel = panel_fn)
print(l1, split = c(1, 1, 2, 2), more = TRUE)
print(l2, split = c(2, 1, 2, 2), more = TRUE)
print(l3, split = c(1, 2, 2, 2), more = TRUE)
print(l4, split = c(2, 2, 2, 2))
To fit based on multiple conditioning sites, we can adopt the model
\[\begin{equation} Y(\mathbf{s}- \mathbf{s}_{0i}) \mid y(\mathbf{s}_{0i}) \sim N(\alpha(\mathbf{s}- \mathbf{s}_{0i}) y(\mathbf{s}_{0i}) + \mu(\mathbf{s}- \mathbf{s}_{0i}) y(\mathbf{s}_{0i})^{\beta(\mathbf{s}- \mathbf{s}_{0i})}, y(\mathbf{s}_{0i})^{2 \beta(\mathbf{s}- \mathbf{s}_{0i})} \sigma(\mathbf{s}- \mathbf{s}_{0i})^2), \tag{2.1} \end{equation}\]
where \(i = 1, \ldots, m\) indexes conditioning site.
The following creates a data.frame
comprising \(m = 5\) conditioning sites (which can almost certainly be done much more compactly).
ncond <- 5
cond_vec <- round(nrow(grid_df) * (1:ncond - .5) / ncond)
ncol <- max(rowSums(data_L[cond_vec, ] > threshold))
mat0 <- matrix(NA, nrow(data_L) - 1, ncol)
ev_df_list <- list()
for (i in 1:ncond) {
cond <- cond_vec[i]
lon0 <- grid_df[cond, 'lon']
lat0 <- grid_df[cond, 'lat']
ev_df_i <- data.frame(lon = grid_df$lon[-cond], lat = grid_df$lat[-cond])
ev_df_i$lon2 <- ev_df_i$lon - lon0
ev_df_i$lat2 <- ev_df_i$lat - lat0
matx <- maty <- mat0
exceed <- which(data_L[cond, ] > threshold)
maty[, seq_along(exceed)] <- data_L[-cond, exceed]
ev_df_i$y <- maty
cond_mat <- matrix(data_L[cond, ], nrow(data_L), ncol(data_L), byrow = TRUE)
matx[, seq_along(exceed)] <- cond_mat[-cond, exceed]
ev_df_i$x <- matx
ev_df_i$weights <- 0 * maty+ 1 / ncond
ev_df_list[[i]] <- ev_df_i
}
ev_df_comp <- do.call(rbind, lapply(ev_df_list, function(x) x[, c('lon', 'lat', 'lon2', 'lat2')]))
ev_df_comp$y <- do.call(rbind, lapply(ev_df_list, function(x) x$y))
ev_df_comp$x <- do.call(rbind, lapply(ev_df_list, function(x) x$x))
ev_df_comp$weights <- do.call(rbind, lapply(ev_df_list, function(x) x$weights))
We then update the model formula to allow for the centered coordinates and then fit the model
fmla_condex2 <- list(
y ~ s(lon2, lat2, k = 10),
~ 1,
~ s(lon2, lat2, k = 10),
~ s(lon2, lat2, k = 10)
)
m2 <- evgam(fmla_condex2, data = ev_df_comp, family = 'condex',
args = list(x = ev_df_comp$x, weights = ev_df_comp$weights))
which takes about 15 minutes.
Finally, we’ll plot the estimates from the centered model.
cond <- round(nrow(grid_df) / 2)
lon0 <- grid_df[cond, 'lon']
lat0 <- grid_df[cond, 'lat']
pred2 <- grid_df
pred2$lon2 <- pred2$lon - lon0
pred2$lat2 <- pred2$lat - lat0
pred2 <- cbind(pred2, predict(m2, newdata = pred2, type = 'response'))
panel_fn2 <- function(...) {
panel.levelplot(...)
panel.xyplot(0, 0, pch = 19, col = 1, cex = 1.5)
}
l1 <- levelplot(alpha ~ lon2 * lat2, pred2, main = 'alpha', panel = panel_fn2)
l2 <- levelplot(beta ~ lon2 * lat2, pred2, main = 'beta', panel = panel_fn2)
l3 <- levelplot(location ~ lon2 * lat2, pred2, main = 'mu', panel = panel_fn2)
l4 <- levelplot(scale ~ lon2 * lat2, pred2, main = 'sigma', panel = panel_fn2)
print(l1, split = c(1, 1, 2, 2), more = TRUE)
print(l2, split = c(2, 1, 2, 2), more = TRUE)
print(l3, split = c(1, 2, 2, 2), more = TRUE)
print(l4, split = c(2, 2, 2, 2))
I thank Jordan Richards, Jenny Wadsworth, Emma Simpson and Simon Brown for their insights into the conditional extreme value model, which have fed into the code, these data used, and this vignette.