knitr::opts_chunk$set(cache = TRUE, dev = c("png", "pdf"))
library(here)
source(here("src", "model_linear_validation_grampians.R"))
library(PRROC)
Examining the use of area under the precision-recall curve (AUC-PR). This ignores true negatives and instead compares precision, TP/(TP + FP), and recall (or sensitivity), TP/(TP+FN), (the fraction of the presences that are correctly recalled). Data to predict to.
df <- rbind(
mds, # All southeast
mdg[names(mds)] # All grampians
)
Predict for each taxa at each location
df$pred <-
predict(model_grampians, df, type = "response", allow.new.levels = TRUE)
Split the data (including preds) by taxa and subregion.
split_df <-
split(df[c("pred", "y")], df[c("taxon", "ibra_subregion")], drop = TRUE)
Extract a single taxa and a single region
acc_bat <- split_df$`Angophora costata subsp. costata.Bateman`
egn_hnf <- split_df$`Eucalyptus (goniocalyx|nortonii).Highlands-Northern Fall`
Prevalence
mean(acc_bat$y)
[1] 0.02681992
mean(egn_hnf$y)
[1] 0.0911188
Histogram of predictions.
hist(acc_bat$pred)

pr_acc_bat <- pr.curve(
scores.class0 = acc_bat$pred, weights.class0 = acc_bat$y, curve = TRUE,
max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
pr_acc_bat
Precision-recall curve
Area under curve (Integral):
0.02306758
Relative area under curve (Integral):
0.009666909
Area under curve (Davis & Goadrich):
0.02306224
Relative area under curves (Davis & Goadrich):
0.009662118
Curve for scores from 1.963065e-05 to 0.9333909
( can be plotted with plot(x) )
Maximum AUC:
1 1
Minimum AUC:
0.01353148 0.01353086
AUC of a random classifier:
0.02681992 0.02681992
plot(
pr_acc_bat, max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
fill.area = TRUE
)

roc_acc_bat <- roc.curve(
scores.class0 = acc_bat$pred, weights.class0 = acc_bat$y, curve = TRUE,
max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
roc_acc_bat
ROC curve
Area under curve:
0.46991
Relative area under curve:
0.46991
Curve for scores from 1.963065e-05 to 0.9333909
( can be plotted with plot(x) )
Maximum AUC:
1
Minimum AUC:
0
AUC of a random classifier:
0.5
plot(
roc_acc_bat, max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
fill.area = TRUE
)

pr_egn_hnf <- pr.curve(
scores.class0 = egn_hnf$pred, weights.class0 = egn_hnf$y, curve = TRUE,
max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
plot(
pr_egn_hnf , max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
fill.area = TRUE
)

roc_egn_hnf <- roc.curve(
scores.class0 = egn_hnf$pred, weights.class0 = egn_hnf$y, curve = TRUE,
max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
plot(
roc_egn_hnf, max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
fill.area = TRUE
)

auprc <- function(x) {
AUCPR <- pr.curve(
scores.class0 = x[,1], weights.class0 = x[,2], max.compute = TRUE,
min.compute = TRUE, rand.compute = TRUE
)
AUCPR$aucpr.rel <- (AUCPR$auc.integral - AUCPR$min$auc.integral) /
(AUCPR$max$auc.integral - AUCPR$min$auc.integral)
c(
aucpr = AUCPR$auc.integral, aucpr.min = AUCPR$min$auc.integral,
aucpr.rnd = AUCPR$rand$auc.integral, aucpr.rel = AUCPR$aucpr.rel
)
}
auprc(acc_bat)
aucpr aucpr.min aucpr.rnd aucpr.rel
0.023067582 0.013531480 0.026819923 0.009666909
aucpr <- t(sapply(split_df, auprc))
prev <- sapply(split_df, function(x) c(prev = mean(x[, 2])))
perf2 <- cbind(subset(perf, !random_effect), cbind(prev, aucpr))
# should have used aa left join or merge...
# left_join(aucpr,prev)
# pairs(perf2[, 4:8])
prdist <- data.frame(
perf2[
c(
"taxon", "ibra_subregion", "aucpr", "aucpr.min", "aucpr.rnd", "aucpr.rel",
"prev"
)
],
`Geographic (km)`= dists[as.character(perf2$ibra_subregion)],
Environmental = kldists[as.character(perf2$ibra_subregion)],
Community = community_dist[as.character(perf2$ibra_subregion)],
"Dummy" = "Dummy",
check.names = FALSE
)
prdist <- subset(prdist, ibra_subregion != "Greater Grampians" )
# & !random_effect
prdist <- gather(
prdist, "distance", "value", `Geographic (km)`, Environmental, Community
)
prdist <- na.omit(prdist)
# prdistmean <- summarise(
# group_by(prdist, ibra_subregion, distance),
# meanpr = mean(aucpr), value = mean(value)
# )
relprdistmedian <- summarise(
group_by(prdist, ibra_subregion, distance),
medianpr = median(aucpr), value = median(value)
)
prdistplot <-
ggplot(prdist) +
aes(aucpr, value) +
# geom_vline(xintercept = .5, color = "grey", size = 1.5) +
geom_point(alpha=.1) +
geom_point(
aes(medianpr), data = relprdistmedian, size = 2, shape = 21, fill = "white"
) +
ylab("Distance") +
facet_grid(. ~ distance, scales = "free_x") +
coord_flip() +
scale_x_sqrt(limits = c(0, .75)) +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
strip.background = element_blank()
)
aucprbw <-
ggplot(prdist) +
aes(x = Dummy, y = aucpr) +
geom_boxplot(fill = "grey") +
facet_grid(. ~ Dummy) +
scale_x_discrete(labels = "0.95") +
xlab("Dummy") +
theme_bw() +
scale_y_sqrt(limits = c(0, .75)) +
theme(
axis.title.x = element_text(color = "transparent"),
axis.text.x = element_text(color = "transparent"),
axis.ticks.x = element_line(color = "transparent"),
strip.background = element_blank(),
strip.text.x = element_text(color = "transparent"),
plot.margin = margin(5.5, 0, 5.5, 5.5)
)
grid.arrange(aucprbw, prdistplot, nrow = 1, widths = c(.14, .86))

Relative AUC-PR version
relprdistmedian <- summarise(
group_by(prdist, ibra_subregion, distance),
medianpr = median(aucpr.rel), value = median(value)
)
relprdistplot <-
ggplot(prdist) +
aes(aucpr.rel, value) +
# geom_vline(xintercept = .5, color = "grey", size = 1.5) +
geom_point(alpha=.1) +
geom_point(
aes(medianpr), data = relprdistmedian, size = 2, shape = 21, fill = "white"
) +
ylab("Distance") +
facet_grid(. ~ distance, scales = "free_x") +
coord_flip() +
scale_x_sqrt(limits = c(0, .75)) +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
strip.background = element_blank()
)
aucpr.relbw <-
ggplot(prdist) +
aes(x = Dummy, y = aucpr.rel) +
geom_boxplot(fill = "grey") +
facet_grid(. ~ Dummy) +
scale_x_discrete(labels = "0.95") +
ylab("Relative AUCPR") +
xlab("Dummy") +
theme_bw() +
scale_y_sqrt(limits = c(.0, .75)) +
theme(
axis.title.x = element_text(color = "transparent"),
axis.text.x = element_text(color = "transparent"),
axis.ticks.x = element_line(color = "transparent"),
strip.background = element_blank(),
strip.text.x = element_text(color = "transparent"),
plot.margin = margin(5.5, 0, 5.5, 5.5)
)
grid.arrange(aucpr.relbw, relprdistplot, nrow = 1, widths = c(.14, .86))

# a <- subset(
# prdist,
# ibra_subregion != "Greater Grampians" & distance == "Geographic (km)"
# )
# a <- subset(
# prdist, ibra_subregion != "Greater Grampians" & distance == "Community"
# )
a <- subset(
prdist,
ibra_subregion != "Greater Grampians" & distance == "Environmental"
)
perf3 <- merge(
subset(perf2, ibra_subregion != "Greater Grampians" & !random_effect), a
)
Plot relations between AUC-PR and prevalence.
ggplot(data = perf3) +
geom_point(mapping = aes(x = prev, y = aucpr), color = "blue", alpha = 1 / 4) +
scale_x_log10()+
scale_y_log10() +
geom_line(
aes(x = prev, y = 1+ ((1-prev)*log(1-prev) ) / prev), color = "red",
linetype = "dotted"
)

Plot for relative AUC-PR.
ggplot(data = perf3) +
geom_point(
mapping = aes(x = aucpr.rnd, y = aucpr.rel), color = "blue", alpha = 1 / 4
) +
scale_y_sqrt() +
scale_x_sqrt()

# geom_abline(slope=1, color = "red", linetype = "dotted")
Plot AUC-ROC
ggplot(data = perf3) +
geom_point(mapping = aes(x = prev, y = AUC), color = "blue", alpha = 1 / 4) +
scale_x_log10()

ggplot(data = perf3) +
geom_point(
mapping = aes(x = aucpr.rel, y = AUC), color = "blue", alpha = 1 / 4
) +
scale_y_sqrt() +
scale_x_sqrt()

# library(nlme)
model_prE <- lmer(
log(aucpr) ~
1 + scale(value) + scale(log10(prev)) + (1 | ibra_subregion) + (1 | taxon),
data = perf3
)
plot(model_prE)

summary(model_prE)
Linear mixed model fit by REML ['lmerMod']
Formula:
log(aucpr) ~ 1 + scale(value) + scale(log10(prev)) + (1 | ibra_subregion) +
(1 | taxon)
Data: perf3
REML criterion at convergence: 1324
Scaled residuals:
Min 1Q Median 3Q Max
-2.5747 -0.5790 -0.1312 0.3951 5.1032
Random effects:
Groups Name Variance Std.Dev.
taxon (Intercept) 0.16250 0.4031
ibra_subregion (Intercept) 0.03419 0.1849
Residual 0.50661 0.7118
Number of obs: 560, groups: taxon, 82; ibra_subregion, 18
Fixed effects:
Estimate Std. Error t value
(Intercept) -3.55059 0.07218 -49.192
scale(value) 0.08307 0.05220 1.591
scale(log10(prev)) 1.74619 0.03564 48.996
Correlation of Fixed Effects:
(Intr) scl(v)
scale(valu) -0.004
scl(lg10()) 0.034 0.093
Strong effects of prevalence on each, but no effects of any distance, point estimates were posistive in each case and < 0.1 and SE > 0.5 * effect. Scaled effects of prevalence were ~1.7 +/- 0.04, so large and certain effects.
model_aucrocG <- lmer(
AUC ~
1 + scale(value) + scale(log10(prev)) + (1 | ibra_subregion) + (1 | taxon),
data =perf3
)
plot(model_aucrocG)

summary(model_aucrocG)
Linear mixed model fit by REML ['lmerMod']
Formula: AUC ~ 1 + scale(value) + scale(log10(prev)) + (1 | ibra_subregion) +
(1 | taxon)
Data: perf3
REML criterion at convergence: -759.7
Scaled residuals:
Min 1Q Median 3Q Max
-2.99551 -0.71856 -0.06671 0.72832 2.21426
Random effects:
Groups Name Variance Std.Dev.
taxon (Intercept) 0.0009645 0.03106
ibra_subregion (Intercept) 0.0010214 0.03196
Residual 0.0132363 0.11505
Number of obs: 560, groups: taxon, 82; ibra_subregion, 18
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.6759597 0.0098977 68.295
scale(value) 0.0006611 0.0087494 0.076
scale(log10(prev)) -0.0467494 0.0054271 -8.614
Correlation of Fixed Effects:
(Intr) scl(v)
scale(valu) -0.010
scl(lg10()) 0.008 0.082
Prevalence has a clearly negative effect on AUC-ROC, larger than taxon or subregion REs. but prevalence effect (-0.05 +/-0.005) smaller than residual.
# STAN model of AUCPR
# model_pr <- stan_glmer(
# aucpr ~ value + log10(prev) + (1 | ibra_subregion/taxon),
# family = mgcv::betar,
# iter = 1000, chains = 3,
# data = prdist,
# subset = ibra_subregion != "Greater Grampians" &
# distance == "Geographic (km)"
#)
# summary(model_pr)
# summary(model_pr, regex_pars = "^[^b]", probs = c(.025, .975))
#' ---
#' title: "Examining precision recall"
#' author: "Peter A. Vesk & William K. Morris"
#' date: "`r Sys.Date()`"
#' output:
#'   rmarkdown::html_notebook:
#'     code_folding: hide
#' ---

#+ setup, message=FALSE, warning=FALSE, fig.keep="none", fig.show="hide"
knitr::opts_chunk$set(cache = TRUE, dev = c("png", "pdf"))
library(here)
source(here("src", "model_linear_validation_grampians.R"))
library(PRROC)

#' Examining the use of area under the precision-recall curve (AUC-PR).
#' This ignores true negatives and instead compares precision, TP/(TP + FP),
#' and recall (or sensitivity), TP/(TP+FN), (the fraction of the presences that
#' are correctly recalled).

#' Data to predict to.
#+ df
df <- rbind(
  mds,            # All southeast
  mdg[names(mds)] # All grampians
)

#' Predict for each taxa at each location
#+ predict
df$pred <-
  predict(model_grampians, df, type = "response", allow.new.levels = TRUE)

#' Split the data (including preds) by taxa and subregion.
#+ split
split_df <-
  split(df[c("pred", "y")], df[c("taxon", "ibra_subregion")], drop = TRUE)

#' Extract a single taxa and a single region
#+ extract
acc_bat <- split_df$`Angophora costata subsp. costata.Bateman`
egn_hnf <- split_df$`Eucalyptus (goniocalyx|nortonii).Highlands-Northern Fall`

#' ## Prevalence
#+ prevalence
mean(acc_bat$y)
mean(egn_hnf$y)

#' ## Histogram of predictions.
#+ hist
hist(acc_bat$pred)

#+ precision-recall-curve
pr_acc_bat <- pr.curve(
  scores.class0 = acc_bat$pred, weights.class0 = acc_bat$y, curve = TRUE,
  max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
pr_acc_bat

#+ plot-precision-recall
plot(
  pr_acc_bat, max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
  fill.area = TRUE
)

#+ receiver-operator-curve
roc_acc_bat <- roc.curve(
  scores.class0 = acc_bat$pred, weights.class0 = acc_bat$y, curve = TRUE,
  max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
roc_acc_bat

#+ plot-receiver-operator
plot(
  roc_acc_bat, max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
  fill.area = TRUE
)

#+ plot-precision-recall-2
pr_egn_hnf <- pr.curve(
  scores.class0 = egn_hnf$pred, weights.class0 = egn_hnf$y, curve = TRUE,
  max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
plot(
  pr_egn_hnf , max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
  fill.area = TRUE
)

#+ plot-receiver-operator-2
roc_egn_hnf <- roc.curve(
  scores.class0 = egn_hnf$pred, weights.class0 = egn_hnf$y, curve = TRUE,
  max.compute = TRUE, min.compute = TRUE, rand.compute = TRUE
)
plot(
  roc_egn_hnf, max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE,
  fill.area = TRUE
)

#+ auprc
auprc <- function(x)  {
  AUCPR <- pr.curve(
    scores.class0 = x[,1], weights.class0 = x[,2], max.compute = TRUE,
    min.compute = TRUE, rand.compute = TRUE
  )
  AUCPR$aucpr.rel <- (AUCPR$auc.integral - AUCPR$min$auc.integral) /
    (AUCPR$max$auc.integral - AUCPR$min$auc.integral)
  c(
    aucpr = AUCPR$auc.integral, aucpr.min = AUCPR$min$auc.integral,
    aucpr.rnd = AUCPR$rand$auc.integral, aucpr.rel = AUCPR$aucpr.rel
  )
}

auprc(acc_bat)

#+ dist-plot
aucpr <- t(sapply(split_df, auprc))
prev <- sapply(split_df, function(x) c(prev = mean(x[, 2])))

perf2 <- cbind(subset(perf, !random_effect), cbind(prev, aucpr))
# should have used aa left join or merge...
# left_join(aucpr,prev)
# pairs(perf2[, 4:8])

prdist <- data.frame(
  perf2[
    c(
      "taxon", "ibra_subregion", "aucpr", "aucpr.min", "aucpr.rnd", "aucpr.rel",
      "prev"
    )
  ],
  `Geographic (km)`= dists[as.character(perf2$ibra_subregion)],
  Environmental = kldists[as.character(perf2$ibra_subregion)],
  Community = community_dist[as.character(perf2$ibra_subregion)],
  "Dummy" = "Dummy",
  check.names = FALSE
)

prdist <- subset(prdist, ibra_subregion != "Greater Grampians" )
# & !random_effect

prdist <- gather(
  prdist, "distance", "value", `Geographic (km)`, Environmental, Community
)

prdist <- na.omit(prdist)

# prdistmean <- summarise(
#   group_by(prdist, ibra_subregion, distance),
#   meanpr = mean(aucpr), value = mean(value)
# )

relprdistmedian <- summarise(
  group_by(prdist, ibra_subregion, distance),
  medianpr = median(aucpr), value = median(value)
)

prdistplot <-
  ggplot(prdist) +
  aes(aucpr, value) +
  # geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medianpr), data = relprdistmedian, size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  scale_x_sqrt(limits = c(0, .75)) +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank()
  )

aucprbw <-
  ggplot(prdist) +
  aes(x = Dummy, y = aucpr) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  scale_y_sqrt(limits = c(0, .75)) +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(aucprbw, prdistplot, nrow = 1, widths = c(.14, .86))

#' ## Relative AUC-PR version
#+ dist-plot-relpr, message = FALSE, warning = FALSE, fig.width = 7.5, fig.height = 3.5
relprdistmedian <- summarise(
  group_by(prdist, ibra_subregion, distance),
  medianpr = median(aucpr.rel), value = median(value)
)

relprdistplot <-
  ggplot(prdist) +
  aes(aucpr.rel, value) +
  # geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medianpr), data = relprdistmedian, size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  scale_x_sqrt(limits = c(0, .75)) +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank()
  )

aucpr.relbw <-
  ggplot(prdist) +
  aes(x = Dummy, y = aucpr.rel) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_x_discrete(labels = "0.95") +
  ylab("Relative AUCPR") +
  xlab("Dummy") +
  theme_bw() +
  scale_y_sqrt(limits = c(.0, .75)) +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(aucpr.relbw, relprdistplot, nrow = 1, widths = c(.14, .86))

#+ models
# a <- subset(
#   prdist,
#   ibra_subregion != "Greater Grampians" & distance == "Geographic (km)"
# )
# a <- subset(
#   prdist, ibra_subregion != "Greater Grampians" & distance == "Community"
# )
a <- subset(
  prdist,
  ibra_subregion != "Greater Grampians" & distance == "Environmental"
)

perf3 <- merge(
  subset(perf2, ibra_subregion != "Greater Grampians" & !random_effect), a
)

#' Plot relations between AUC-PR and prevalence.
#+ plot-perf3
ggplot(data = perf3) +
geom_point(mapping = aes(x = prev, y = aucpr), color = "blue", alpha = 1 / 4) +
scale_x_log10()+
scale_y_log10() +
geom_line(
  aes(x = prev, y = 1+ ((1-prev)*log(1-prev) ) / prev), color = "red",
  linetype = "dotted"
)

#' Plot for relative AUC-PR.
#+ plot-rel-perf3
ggplot(data = perf3) +
geom_point(
  mapping = aes(x = aucpr.rnd, y = aucpr.rel), color = "blue", alpha = 1 / 4
) +
scale_y_sqrt() +
scale_x_sqrt()
# geom_abline(slope=1, color = "red", linetype = "dotted")

#' Plot AUC-ROC
ggplot(data = perf3) +
geom_point(mapping = aes(x = prev, y = AUC), color = "blue", alpha = 1 / 4) +
scale_x_log10()

ggplot(data = perf3) +
geom_point(
  mapping = aes(x = aucpr.rel, y = AUC), color = "blue", alpha = 1 / 4
) +
scale_y_sqrt() +
scale_x_sqrt()

#+ model-aucpr
# library(nlme)
model_prE <- lmer(
  log(aucpr) ~
    1 + scale(value) + scale(log10(prev)) + (1 | ibra_subregion) + (1 | taxon),
  data = perf3
)

plot(model_prE)
summary(model_prE)
#' Strong effects of prevalence on each, but no effects of any distance, point
#' estimates were posistive in each case and < 0.1 and SE > 0.5 * effect.
#' Scaled effects of prevalence were ~1.7 +/- 0.04, so large and certain
#' effects.

#+ model-aucro
model_aucrocG <- lmer(
  AUC ~
    1 + scale(value) + scale(log10(prev)) + (1 | ibra_subregion) + (1 | taxon),
  data =perf3
  )
plot(model_aucrocG)
summary(model_aucrocG)
#' Prevalence has a clearly negative effect on AUC-ROC, larger than taxon or
#' subregion REs. but prevalence effect (-0.05 +/-0.005) smaller than residual.


# STAN model of AUCPR
#+ modelpr, results = "hide", message = FALSE
# model_pr <- stan_glmer(
#  aucpr ~ value + log10(prev) + (1 | ibra_subregion/taxon),
#  family = mgcv::betar,
#  iter = 1000, chains = 3,
#  data = prdist,
#  subset = ibra_subregion != "Greater Grampians" &
#    distance == "Geographic (km)"
#)

# summary(model_pr)

# summary(model_pr, regex_pars = "^[^b]", probs = c(.025, .975))
